Transform Flat File without delimiters
我想将平面文件test.txt转换为平面文件test-output.txt。
具体情况如下:
输入示例:test.txt
COD/ID:37
PRJ/NAME: Josephy Murphy
PRJ/EMAIL: jmurphy@email.com
PRJ/DESCRIPTION: test37, test37, test37 ...

COD/ID:38
PRJ/NAME: Paul Newman
PRJ/EMAIL: pnewman@email.com
PRJ/DESCRIPTION: test38, test38, test38 ...

.
.
示例输出:test-output.txt(以竖线"|"分隔,不含字段标签)
37|Josephy Murphy|jmurphy@email.com|test37, test37, test37 ...
38|Paul Newman|pnewman@email.com|test38, test38, test38 ...
.
.
截图链接:
test.txt
test-output.txt
我想将此文件导入SQL Server。但是要导入的文件test.txt(15,000,000行)本身并不是带分隔符的格式。
我将使用SSIS导入数据,但必须为CSV格式或带分隔符的其他格式。
我考虑过使用REGEX或SSIS脚本组件。我知道带有格式化文本的SSIS文件的导入过程,但是此文件未格式化。
以正则表达式为例:
/// <summary>
/// Demo: converts labelled records (COD/ID, PRJ/NAME, PRJ/EMAIL, PRJ/DESCRIPTION,
/// one label per line) held in a string into pipe-delimited lines using a regex.
/// </summary>
class Program
{
    // One record is four consecutive labelled lines.
    //  - Single backslashes: inside a verbatim string "\\s" would match a literal
    //    backslash followed by 's', so the pattern could never match.
    //  - [ \t]* after each label accepts both "COD/ID:37" and "COD/ID: 37".
    //  - \r?\n accepts Windows and Unix line endings.
    //  - The e-mail group is named so that m.Groups["email"] is populated.
    private static readonly Regex reg = new Regex(
        @"COD/ID:[ \t]*(?<id>\d+)[ \t]*\r?\n" +
        @"PRJ/NAME:[ \t]*(?<name>[^\r\n]+?)[ \t]*\r?\n" +
        @"PRJ/EMAIL:[ \t]*(?<email>\S+?@\S+?\.\S+?)[ \t]*\r?\n" +
        @"PRJ/DESCRIPTION:[ \t]*(?<description>[^\r\n]*?)[ \t]*(?:\r?\n|$)");

    static void Main(string[] args)
    {
        // Sample input, built line by line so that the line endings do not
        // depend on how this source file happens to be saved.
        string original = string.Join("\n", new[]
        {
            "COD/ID: 37",
            "PRJ/NAME: Josephy Murphy",
            "PRJ/EMAIL: jmurphy@email.com",
            "PRJ/DESCRIPTION: test37, test37, test37 ...",
            "",
            "COD/ID: 38",
            "PRJ/NAME: Paul Newman",
            "PRJ/EMAIL: pnewman@email.com",
            "PRJ/DESCRIPTION: test38, test38, test38 ..."
        });

        Console.WriteLine(ToPipeDelimited(original));
    }

    /// <summary>
    /// Extracts every record found in <paramref name="text"/> and renders it as
    /// "id|name|email|description"; records are joined with "\n".
    /// </summary>
    /// <param name="text">Raw labelled text; may be null or empty.</param>
    /// <returns>The pipe-delimited lines, or an empty string when nothing matches.</returns>
    internal static string ToPipeDelimited(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        return string.Join(
            "\n",
            reg.Matches(text)
               .Cast<Match>()
               .Select(m => string.Format(
                   "{0}|{1}|{2}|{3}",
                   m.Groups["id"].Value,
                   m.Groups["name"].Value,
                   m.Groups["email"].Value,
                   m.Groups["description"].Value)));
    }
}
编辑
/// <summary>
/// Streams a labelled flat file (COD/ID, PRJ/NAME, PRJ/EMAIL, PRJ/DESCRIPTION,
/// records separated by blank lines) into a pipe-delimited file, one record at
/// a time, so the whole input never has to be held in memory.
/// </summary>
class Program
{
    // Single backslashes (verbatim string), a named e-mail group, optional
    // blanks after each label and \r?\n so the pattern matches whatever
    // Environment.NewLine the StringBuilder appended.
    private static readonly Regex reg = new Regex(
        @"COD/ID:[ \t]*(?<id>\d+)[ \t]*\r?\n" +
        @"PRJ/NAME:[ \t]*(?<name>[^\r\n]+?)[ \t]*\r?\n" +
        @"PRJ/EMAIL:[ \t]*(?<email>\S+?@\S+?\.\S+?)[ \t]*\r?\n" +
        @"PRJ/DESCRIPTION:[ \t]*(?<description>[^\r\n]*?)[ \t]*(?:\r?\n|$)");

    static void Main(string[] args)
    {
        using (StreamReader reader = new StreamReader(@"YourInputPath.txt", true))
        using (StreamWriter writer = new StreamWriter("YourOutputPath.txt"))
        {
            Transform(reader, writer);
        }
    }

    /// <summary>
    /// Reads labelled records from <paramref name="reader"/> and writes one
    /// "id|name|email|description" line per record to <paramref name="writer"/>.
    /// </summary>
    /// <param name="reader">Source text; a blank line ends the current record.</param>
    /// <param name="writer">Destination for the pipe-delimited lines.</param>
    internal static void Transform(TextReader reader, TextWriter writer)
    {
        StringBuilder intermediateStringBuilder = new StringBuilder();

        // ReadLine() returning null is the end-of-input signal; Peek() > 0
        // would also stop early on a NUL character.
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!string.IsNullOrWhiteSpace(line))
            {
                intermediateStringBuilder.AppendLine(line);
            }
            else
            {
                WriteToFile(intermediateStringBuilder, writer);
            }
        }

        // Flush the last record when the input does not end with a blank line.
        WriteToFile(intermediateStringBuilder, writer);
    }

    /// <summary>
    /// Writes the record buffered in <paramref name="intermediateStringBuilder"/>
    /// (if any) as one pipe-delimited line, then clears the buffer.
    /// </summary>
    private static void WriteToFile(StringBuilder intermediateStringBuilder, TextWriter writer)
    {
        // Nothing buffered (consecutive or trailing blank lines): emit nothing
        // instead of an empty "|||" row.
        if (intermediateStringBuilder.Length == 0)
        {
            return;
        }

        Match m = reg.Match(intermediateStringBuilder.ToString());
        intermediateStringBuilder.Clear();

        // A block that does not look like a record is skipped rather than
        // written as an empty "|||" row.
        if (!m.Success)
        {
            return;
        }

        writer.WriteLine(
            "{0}|{1}|{2}|{3}",
            m.Groups["id"].Value,
            m.Groups["name"].Value,
            m.Groups["email"].Value,
            m.Groups["description"].Value);
    }
}
在这种情况下,您可以在不使用正则表达式的情况下执行此操作,因为上下文是已知的。
使用此:
/// <summary>
/// One record of the labelled input file: the values found after the
/// COD/ID:, PRJ/NAME:, PRJ/EMAIL: and PRJ/DESCRIPTION: labels.
/// </summary>
public class EntryN
{
    public string id { get; set; }
    public string name { get; set; }
    public string email { get; set; }
    public string description { get; set; }

    /// <summary>Creates an entry whose four fields are all empty strings.</summary>
    public EntryN()
    {
        this.id = this.name = this.email = this.description = string.Empty;
    }

    /// <summary>Renders the entry as "id|name|email|description".</summary>
    public string ToLine()
    {
        return this.id + "|" + this.name + "|" + this.email + "|" + this.description;
    }

    /// <summary>
    /// Parses every record from <paramref name="reader"/>. A blank line ends the
    /// current record; runs of blank lines and a trailing blank line do not
    /// produce empty entries.
    /// </summary>
    /// <param name="reader">Source of the labelled lines.</param>
    /// <returns>The parsed entries, in input order.</returns>
    public static List<EntryN> ReadAll(TextReader reader)
    {
        var entries = new List<EntryN>();
        var entry = new EntryN();

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            if (line.Length == 0)
            {
                if (entry.HasData())
                {
                    entries.Add(entry);
                    entry = new EntryN();
                }
            }
            else
            {
                entry.Apply(line);
            }
        }

        // Last record when the input does not end with a blank line.
        // (Equals(new EntryN()) would be reference equality and always false.)
        if (entry.HasData())
        {
            entries.Add(entry);
        }

        return entries;
    }

    // True once at least one field has been filled in.
    private bool HasData()
    {
        return this.id.Length > 0 || this.name.Length > 0
            || this.email.Length > 0 || this.description.Length > 0;
    }

    // Stores the value of a labelled line in the matching field; lines with
    // an unknown label are ignored.
    private void Apply(string line)
    {
        string value;
        if (TryGetValue(line, "COD/ID:", out value))
        {
            this.id = value;
        }
        else if (TryGetValue(line, "PRJ/NAME:", out value))
        {
            this.name = value;
        }
        else if (TryGetValue(line, "PRJ/EMAIL:", out value))
        {
            this.email = value;
        }
        else if (TryGetValue(line, "PRJ/DESCRIPTION:", out value))
        {
            this.description = value;
        }
    }

    // Cuts at the label's own length instead of a hard-coded offset, so
    // "COD/ID:37" and "COD/ID: 37" both yield "37" and a bare label yields "".
    private static bool TryGetValue(string line, string label, out string value)
    {
        if (line.StartsWith(label, System.StringComparison.Ordinal))
        {
            value = line.Substring(label.Length).Trim();
            return true;
        }

        value = null;
        return false;
    }
}

// Usage (place inside a method): load all entries, then render them.
var entries = new List<EntryN>();
using (var sl = new StreamReader(@"c:\\YOURPATH.txt", true))
{
    entries.AddRange(EntryN.ReadAll(sl));
}

var resulted = entries.Select(p => p.ToLine()).ToList();
输出:
编辑:另一种写法不使用单独的类,而是边读边直接写入输出文件,无需创建额外的字符串:
// Streams INPUT_FILE to OUTPUT_FILE, turning each labelled record
// (COD/ID:, PRJ/NAME:, PRJ/EMAIL:, PRJ/DESCRIPTION:; records separated by
// blank lines) into one "id|name|email|description" line.
// Field values are written as-is: a '|' inside a value is not escaped.
const string IdLabel = "COD/ID:";
const string NameLabel = "PRJ/NAME:";
const string EmailLabel = "PRJ/EMAIL:";
const string DescriptionLabel = "PRJ/DESCRIPTION:";

var id = string.Empty;
var name = string.Empty;
var email = string.Empty;
var description = string.Empty;

using (var sw = new StreamWriter(@"OUTPUT_FILE", false, Encoding.UTF8))
using (var sl = new StreamReader(@"INPUT_FILE", true))
{
    string line;
    while ((line = sl.ReadLine()) != null)
    {
        line = line.Trim();

        if (line.Length == 0)
        {
            // A blank line ends the record. Only write when something was
            // collected, so runs of blank lines do not emit "|||" rows.
            if (new[] { id, name, email, description }.Any(p => p.Length > 0))
            {
                sw.WriteLine(string.Format("{0}|{1}|{2}|{3}", id, name, email, description));
                id = name = email = description = string.Empty;
            }
        }
        // Cut at the label's own length rather than a fixed offset, so
        // "COD/ID:37" and "COD/ID: 37" both yield "37".
        else if (line.StartsWith(IdLabel, System.StringComparison.Ordinal))
        {
            id = line.Substring(IdLabel.Length).Trim();
        }
        else if (line.StartsWith(NameLabel, System.StringComparison.Ordinal))
        {
            name = line.Substring(NameLabel.Length).Trim();
        }
        else if (line.StartsWith(EmailLabel, System.StringComparison.Ordinal))
        {
            email = line.Substring(EmailLabel.Length).Trim();
        }
        else if (line.StartsWith(DescriptionLabel, System.StringComparison.Ordinal))
        {
            description = line.Substring(DescriptionLabel.Length).Trim();
        }
    }

    // Flush the last record when the file does not end with a blank line.
    // Same rule as inside the loop: write if any field was collected, so a
    // record with one empty field is not silently dropped.
    if (new[] { id, name, email, description }.Any(p => p.Length > 0))
    {
        sw.WriteLine(string.Format("{0}|{1}|{2}|{3}", id, name, email, description));
    }
}