关于C#:转换不带分隔符的平面文件

Transform Flat File without delimiters

我想将平面文件test.txt转换为平面文件test-output.txt。

场景如下:

输入示例:test.txt

1
2
3
4
5
6
7
8
9
10
11
12
COD/ID:37
PRJ/NAME: Josephy Murphy
PRJ/EMAIL: jmurphy@email.com
PRJ/DESCRIPTION: test37, test37, test37 ...

COD/ID:38
PRJ/NAME: Paul Newman
PRJ/EMAIL: pnewman@email.com
PRJ/DESCRIPTION: test38, test38, test38 ...

.
.

示例输出:test-output.txt(不带标签的管道分隔符)

1
2
3
4
37|Josephy Murphy|jmurphy@email.com|test37, test37, test37 ...
38|Paul Newman|pnewman@email.com|test38, test38, test38 ...
.
.

截图链接:
test.txt
test-output.txt

我想将此文件导入SQL Server。但是要导入的文件test.txt(15,000,000行)本身并不是带分隔符的格式。
我将使用SSIS导入数据,但必须为CSV格式或带分隔符的其他格式。
我考虑过使用REGEX或SSIS脚本组件。我知道带有格式化文本的SSIS文件的导入过程,但是此文件未格式化。


以正则表达式为例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    class Program
    {
        private static Regex reg = new Regex(@"COD/ID:\\s(?<id>\\d+)\
\
PRJ/NAME:\\s(?<name>.+?)\
\
PRJ/EMAIL:\\s(?\\S+?@\\S+?\\.\\S+?)\
\
PRJ/DESCRIPTION:\\s(?<description>.*?)(?:\
|$)"
);

        static void Main(string[] args)
        {
            string original = @"
COD/ID: 37
PRJ/NAME: Josephy Murphy
PRJ/EMAIL: [email protected]
PRJ/DESCRIPTION: test37, test37, test37 ...

COD/ID: 38
PRJ/NAME: Paul Newman
PRJ/EMAIL: [email protected]
PRJ/DESCRIPTION: test38, test38, test38 ..."
;


            string result = string.Join(
               "\
"
,
                reg.Matches(original)
                .Cast<Match>()
                .Select(m => string.Format("{0}|{1}|{2}|{3}",m.Groups["id"].Value,m.Groups["name"].Value,m.Groups["email"].Value,m.Groups["description"].Value)));
            Console.WriteLine(result);
        }
    }

编辑

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/// <summary>
/// Streams a tagged flat file record by record and writes one pipe-delimited line per
/// record, so the whole input never has to be held in memory.
/// </summary>
class Program
{
    // One record = four consecutive tagged lines. Line breaks are matched as \r?\n so the
    // pattern works whatever newline StringBuilder.AppendLine produced, and whitespace
    // after each tag is optional ("COD/ID:37" and "COD/ID: 37" both match).
    private static readonly Regex reg = new Regex(
        @"COD/ID:[ \t]*(?<id>\d+)\r?\n" +
        @"PRJ/NAME:[ \t]*(?<name>.+?)\r?\n" +
        @"PRJ/EMAIL:[ \t]*(?<email>\S+?@\S+?\.\S+?)\r?\n" +
        @"PRJ/DESCRIPTION:[ \t]*(?<description>.*?)(?:\r?\n|$)");

    /// <summary>
    /// Reads the input file line by line, buffering non-blank lines until a blank line
    /// (or end of file) ends the record, then writes the converted record.
    /// </summary>
    static void Main(string[] args)
    {
        StringBuilder intermediateStringBuilder = new StringBuilder();

        using (StreamReader reader = new StreamReader(@"YourInputPath.txt", true))
        using (StreamWriter writer = new StreamWriter("YourOutputPath.txt"))
        {
            string line;
            // ReadLine returning null is the reliable end-of-stream signal.
            while ((line = reader.ReadLine()) != null)
            {
                if (!string.IsNullOrWhiteSpace(line))
                {
                    intermediateStringBuilder.AppendLine(line);
                }
                else
                {
                    WriteToFile(intermediateStringBuilder, writer);
                }
            }

            // Flush the last record when the file does not end with a blank line.
            WriteToFile(intermediateStringBuilder, writer);
        }
    }

    /// <summary>
    /// Converts one buffered record to "id|name|email|description".
    /// </summary>
    /// <param name="block">The lines of a single record, newline separated.</param>
    /// <returns>The pipe-delimited line, or null when the block is not a valid record.</returns>
    internal static string FormatRecord(string block)
    {
        Match m = reg.Match(block);
        if (!m.Success)
        {
            return null;
        }

        return string.Format(
            "{0}|{1}|{2}|{3}",
            m.Groups["id"].Value,
            m.Groups["name"].Value,
            m.Groups["email"].Value,
            m.Groups["description"].Value);
    }

    /// <summary>
    /// Writes the buffered record (if any) to <paramref name="writer"/> and clears the buffer.
    /// </summary>
    /// <param name="intermediateStringBuilder">Buffer holding the lines of the current record.</param>
    /// <param name="writer">Destination for the converted line.</param>
    private static void WriteToFile(StringBuilder intermediateStringBuilder, StreamWriter writer)
    {
        // Nothing buffered (consecutive or trailing blank lines): do not emit a "|||" row.
        if (intermediateStringBuilder.Length == 0)
        {
            return;
        }

        string record = FormatRecord(intermediateStringBuilder.ToString());
        intermediateStringBuilder.Clear();

        // Blocks that are not complete records are skipped rather than written as an
        // all-empty row that would pollute the import.
        if (record != null)
        {
            writer.WriteLine(record);
        }
    }
}


在这种情况下,您可以在不使用正则表达式的情况下执行此操作,因为上下文是已知的。

使用此:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/// <summary>
/// One parsed record: the id, name, e-mail and description of an entry.
/// </summary>
public class EntryN
{
    public string id { get; set; }
    public string name { get; set; }
    public string email { get; set; }
    public string description { get; set; }

    /// <summary>
    /// Creates an entry whose four fields are all empty strings.
    /// </summary>
    public EntryN()
    {
        id = string.Empty;
        name = string.Empty;
        email = string.Empty;
        description = string.Empty;
    }

    /// <summary>
    /// Renders the entry as "id|name|email|description".
    /// </summary>
    /// <returns>The four fields joined with a pipe; a null field renders as empty.</returns>
    public string ToLine()
    {
        string[] fields = new string[] { id, name, email, description };
        return string.Join("|", fields);
    }
}

// Reads the tagged flat file into a list of EntryN, one entry per record; records are
// separated by blank lines. `resulted` then holds the pipe-delimited line of each entry.
var entries = new List<EntryN>();
using (var sl = new StreamReader(@"c:\\YOURPATH.txt", true))
{
    // Tag prefixes; the value is whatever follows the prefix, so "COD/ID:37" and
    // "COD/ID: 37" both yield "37" (fixed offsets would drop a character or throw).
    const string idTag = "COD/ID:";
    const string nameTag = "PRJ/NAME:";
    const string emailTag = "PRJ/EMAIL:";
    const string descriptionTag = "PRJ/DESCRIPTION:";

    var entry = new EntryN();
    // True once at least one tagged line was stored in `entry`; EntryN has no value
    // equality, so comparing against `new EntryN()` cannot detect an empty entry.
    var hasData = false;
    string line;
    while ((line = sl.ReadLine()) != null)
    {
        if (line.StartsWith(idTag, System.StringComparison.Ordinal))
        {
            entry.id = line.Substring(idTag.Length).Trim();
            hasData = true;
        }
        else if (line.StartsWith(nameTag, System.StringComparison.Ordinal))
        {
            entry.name = line.Substring(nameTag.Length).Trim();
            hasData = true;
        }
        else if (line.StartsWith(emailTag, System.StringComparison.Ordinal))
        {
            entry.email = line.Substring(emailTag.Length).Trim();
            hasData = true;
        }
        else if (line.StartsWith(descriptionTag, System.StringComparison.Ordinal))
        {
            entry.description = line.Substring(descriptionTag.Length).Trim();
            hasData = true;
        }
        else if (line.Trim().Length == 0 && hasData)
        {
            // A blank line ends the current record; repeated blank lines add nothing.
            entries.Add(entry);
            entry = new EntryN();
            hasData = false;
        }
    }

    // Keep the last record when the file does not end with a blank line.
    if (hasData)
    {
        entries.Add(entry);
    }
}

var resulted = entries.Select(p => p.ToLine()).ToList();

输出:

(此处原为输出结果的截图,图片未能保留。)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// Streams INPUT_FILE and writes one "id|name|email|description" line to OUTPUT_FILE per
// record, without keeping the records in memory. Records are separated by blank lines.
var id = string.Empty;
var name = string.Empty;
var email = string.Empty;
var description = string.Empty;
using (var sw = new StreamWriter(@"OUTPUT_FILE", false, Encoding.UTF8))
using (var sl = new StreamReader(@"INPUT_FILE", true))
{
    // Tag prefixes; the value is whatever follows the prefix, so "COD/ID:37" and
    // "COD/ID: 37" both yield "37" (fixed offsets would drop a character or throw).
    const string idTag = "COD/ID:";
    const string nameTag = "PRJ/NAME:";
    const string emailTag = "PRJ/EMAIL:";
    const string descriptionTag = "PRJ/DESCRIPTION:";

    // True while a record has been read but not yet written.
    var pending = false;
    string line;
    while ((line = sl.ReadLine()) != null)
    {
        if (line.StartsWith(idTag, System.StringComparison.Ordinal))
        {
            id = line.Substring(idTag.Length).Trim();
            pending = true;
        }
        else if (line.StartsWith(nameTag, System.StringComparison.Ordinal))
        {
            name = line.Substring(nameTag.Length).Trim();
            pending = true;
        }
        else if (line.StartsWith(emailTag, System.StringComparison.Ordinal))
        {
            email = line.Substring(emailTag.Length).Trim();
            pending = true;
        }
        else if (line.StartsWith(descriptionTag, System.StringComparison.Ordinal))
        {
            description = line.Substring(descriptionTag.Length).Trim();
            pending = true;
        }
        else if (line.Trim().Length == 0 && pending)
        {
            // A blank line ends the current record; repeated blank lines write nothing.
            sw.WriteLine("{0}|{1}|{2}|{3}", id, name, email, description);
            id = name = email = description = string.Empty;
            pending = false;
        }
    }

    // Flush the last record when the file does not end with a blank line, even if some
    // of its fields are empty, which is the same rule applied to records mid-file.
    if (pending)
    {
        sw.WriteLine("{0}|{1}|{2}|{3}", id, name, email, description);
    }
}