How can I Convert HTML to Text in C#?
我正在寻找C#代码以将HTML文档转换为纯文本。
我不是在寻找简单的标记剥离方法,而是会输出纯文本并合理保留原始布局的东西。
输出应如下所示:
W3C的Html2Txt
我看过HTML Agility Pack,但我认为这不是我所需要的。有人还有其他建议吗?
编辑:我只是从CodePlex下载HTML Agility Pack,并运行了Html2Txt项目。真令人失望(至少是将html转换为文本的模块)!它所做的只是剥离标签,展平表等。输出看起来与生成的Html2Txt @ W3C有所不同。不幸的是,该源似乎不可用。
我一直在寻找是否有更现成的(canned,即开箱即用的)解决方案可用。
编辑2:谢谢大家的建议。 FlySwat向我提示了我要走的方向。我可以使用
关于HtmlAgilityPack,这里留一条备注以供后人参考。该项目包含一个将HTML解析为文本的示例,但正如OP所指出的那样,它处理空白的方式完全不符合HTML编写者的预期。其他人在本问题下提到了一些完整的文本渲染解决方案,而本方案并不属于此类(当前形式下它甚至无法处理表格),但它轻量且快速,这正是我为HTML电子邮件创建简单文本版本所需要的全部。
using System;
using System.IO;
using System.Text.RegularExpressions;
using HtmlAgilityPack;

// Small but important modification to the HtmlAgilityPack sample class:
// https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs

/// <summary>
/// Converts an HtmlAgilityPack document to plain text, emitting line breaks for
/// block-level elements, bullets/numbers for list items, and bracketed
/// alt text / URLs for images and links.
/// </summary>
public static class HtmlToText
{
    // Line terminator written around block-level elements and for <br>.
    private const string NewLine = "\r\n";

    // Runs of two or more whitespace characters inside a text node; each run is collapsed to one space.
    private static readonly Regex WhitespaceRun = new Regex(@"\s{2,}");

    /// <summary>Loads the HTML file at <paramref name="path"/> and converts it to text.</summary>
    /// <param name="path">Path of the HTML file, passed to <c>HtmlDocument.Load</c>.</param>
    /// <returns>The text produced by <see cref="ConvertDoc"/>.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses the HTML markup in <paramref name="html"/> and converts it to text.</summary>
    /// <param name="html">HTML markup, passed to <c>HtmlDocument.LoadHtml</c>.</param>
    /// <returns>The text produced by <see cref="ConvertDoc"/>.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Converts an already-loaded document to text.</summary>
    /// <param name="doc">The document whose <c>DocumentNode</c> is walked.</param>
    /// <returns>Everything written while walking the document.</returns>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Converts every child of <paramref name="node"/> in document order.</summary>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>
    /// Converts <paramref name="node"/> and its descendants, starting with fresh
    /// whitespace-tracking state ("no text written yet").
    /// </summary>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>
    /// Converts a single node to text and recurses into its children.
    /// </summary>
    /// <param name="node">The node to convert.</param>
    /// <param name="outText">Receives the generated text.</param>
    /// <param name="textInfo">
    /// Whitespace/list state shared by the inline content of the current block;
    /// block-level elements hand a fresh instance to their children.
    /// </param>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // check the text is meaningful and not a bunch of whitespaces
                if (html.Length == 0)
                {
                    break;
                }

                // At the start of a block, or right after a space was written,
                // leading whitespace would only duplicate spacing: drop it.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                // Collapse whitespace runs to a single space (so adjacent words are not glued together),
                // then decode entities.
                outText.Write(HtmlEntity.DeEntitize(WhitespaceRun.Replace(html.TrimEnd(), " ")));

                // Trailing whitespace was trimmed above; re-emit it as exactly one space.
                textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
                if (textInfo.LastCharWasSpace)
                {
                    outText.Write(' ');
                }
                break;

            case HtmlNodeType.Element:
                string endElementString = null; // written after the element's children, if set
                bool isInline;                  // inline elements share the parent's whitespace state
                bool skip = false;              // true => children are not converted
                int listIndex = 0;              // 0 => bulleted list items, >0 => next number for <ol> items
                switch (node.Name)
                {
                    case "nav":
                        skip = true;
                        isInline = false;
                        break;

                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "p": // stylistic - adjust as you tend to use
                        if (textInfo.IsFirstTextOfDocWritten)
                        {
                            outText.Write(NewLine);
                        }
                        endElementString = NewLine;
                        isInline = false;
                        break;

                    case "br":
                        outText.Write(NewLine);
                        skip = true;
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;

                    case "a":
                        if (node.Attributes.Contains("href"))
                        {
                            string href = node.Attributes["href"].Value.Trim();
                            // Only append the URL when the link text does not already contain it.
                            if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                            {
                                endElementString = "<" + href + ">";
                            }
                        }
                        isInline = true;
                        break;

                    case "li":
                        if (textInfo.ListIndex > 0)
                        {
                            outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                        }
                        else
                        {
                            // using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                            outText.Write("\r\n*\t");
                        }
                        isInline = false;
                        break;

                    case "ol":
                        listIndex = 1;
                        goto case "ul";

                    case "ul": // not handling nested lists any differently at this stage - that is getting close to rendering problems
                        endElementString = NewLine;
                        isInline = false;
                        break;

                    case "img": // inline-block in reality
                        if (node.Attributes.Contains("alt"))
                        {
                            outText.Write('[' + node.Attributes["alt"].Value);
                            endElementString = "]";
                        }
                        if (node.Attributes.Contains("src"))
                        {
                            outText.Write('<' + node.Attributes["src"].Value + '>');
                        }
                        isInline = true;
                        break;

                    default:
                        isInline = true;
                        break;
                }

                if (!skip && node.HasChildNodes)
                {
                    ConvertContentTo(
                        node,
                        outText,
                        isInline
                            ? textInfo
                            : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
        }
    }
}

/// <summary>
/// Mutable state describing the text written so far within the current block;
/// used to decide how whitespace and list markers are emitted.
/// </summary>
internal class PreceedingDomTextInfo
{
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>False until text has been written in this block (and again right after a &lt;br&gt;).</summary>
    public bool WritePrecedingWhiteSpace { get; set; }

    /// <summary>True when the last text node written ended in whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>Flag object passed on to nested instances; set once any text has been written.</summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Next number for an ordered-list item, or 0 for bulleted items.</summary>
    public int ListIndex { get; set; }
}

/// <summary>
/// Reference-type holder for a bool so that several <see cref="PreceedingDomTextInfo"/>
/// instances can observe the same flag.
/// </summary>
internal class BoolWrapper
{
    public BoolWrapper() { }

    public bool Value { get; set; }

    public static implicit operator bool(BoolWrapper boolWrapper)
    {
        return boolWrapper.Value;
    }

    public static implicit operator BoolWrapper(bool boolWrapper)
    {
        return new BoolWrapper { Value = boolWrapper };
    }
}
例如,以下HTML代码...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | <!DOCTYPE HTML> <html> <head> </head> <body> <header> Whatever Inc. </header> <main> <p> Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things: </p> <ol> <li> Please confirm this is your email by replying. </li> <li> Then perform this step. </li> </ol> <p> Please solve this <img alt="complex equation" src="http://upload.wikimedia.org/wikipedia/commons/8/8d/First_Equation_Ever.png"/>. Then, in any order, could you please: </p> <ul> <li> a point. </li> <li> another point, with a <a href="http://en.wikipedia.org/wiki/Hyperlink">hyperlink</a>. </li> </ul> <p> Sincerely, </p> <p> The whatever.com team </p> </main> <footer> Ph: 000 000 000<br/> mail: whatever st </footer> </body> </html> |
...将转换为:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | Whatever Inc. Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things: 1. Please confirm this is your email by replying. 2. Then perform this step. Please solve this [complex equation<http://upload.wikimedia.org/wikipedia/commons/8/8d/First_Equation_Ever.png>]. Then, in any order, could you please: * a point. * another point, with a hyperlink<http://en.wikipedia.org/wiki/Hyperlink>. Sincerely, The whatever.com team Ph: 000 000 000 mail: whatever st |
...相对于:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | Whatever Inc. Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things: Please confirm this is your email by replying. Then perform this step. Please solve this . Then, in any order, could you please: a point. another point, with a hyperlink. Sincerely, The whatever.com team Ph: 000 000 000 mail: whatever st |
您可以使用此:
/// <summary>
/// Removes everything that looks like an HTML tag from <paramref name="HTMLText"/>.
/// </summary>
/// <param name="HTMLText">The markup to strip.</param>
/// <param name="decode">
/// When true (the default), HTML entities in the stripped text are decoded
/// with <c>HttpUtility.HtmlDecode</c>; when false the stripped text is returned as-is.
/// </param>
/// <returns>The text with all <c>&lt;...&gt;</c> sequences removed.</returns>
public static string StripHTML(string HTMLText, bool decode = true)
{
    // A "tag" is '<', one or more characters other than '>', then '>'.
    string withoutTags = Regex.Replace(HTMLText, "<[^>]+>", "", RegexOptions.IgnoreCase);

    if (!decode)
    {
        return withoutTags;
    }

    return HttpUtility.HtmlDecode(withoutTags);
}
更新
感谢您的评论,我已更新以改进此功能
我从可靠的消息来源得知,如果您正在.Net中进行HTML解析,则应该再次查看HTML敏捷包。
http://www.codeplex.com/htmlagilitypack
SO上的一些样本
HTML Agility包-解析表
您正在寻找的是文本模式DOM渲染器,该输出器可以输出文本,就像Lynx或其他文本浏览器一样……这比您预期的要难得多。
因为我想转换为带有LF和项目符号的纯文本,所以我在codeproject上找到了这个漂亮的解决方案,其中涵盖了许多转换用例:
将HTML转换为纯文本
是的,看起来很大,但效果很好。
假设您的HTML格式正确,您也可以尝试XSL转换。
这是一个例子:
using System;
using System.IO;
using System.Xml.Linq;
using System.Xml.XPath;
using System.Xml.Xsl;

/// <summary>
/// Extracts the text content of a well-formed (XML-parsable) HTML document by
/// running it through an XSL transform that outputs the string value of the root.
/// </summary>
class Html2TextExample
{
    /// <summary>Returns the text content of <paramref name="source"/> as a string.</summary>
    /// <param name="source">The parsed document to transform.</param>
    public static string Html2Text(XDocument source)
    {
        using (var writer = new StringWriter())
        {
            Html2Text(source, writer);
            return writer.ToString();
        }
    }

    /// <summary>Writes the text content of <paramref name="source"/> to <paramref name="output"/>.</summary>
    /// <param name="source">The parsed document to transform.</param>
    /// <param name="output">Receives the transform result.</param>
    public static void Html2Text(XDocument source, TextWriter output)
    {
        Transformer.Transform(source.CreateReader(), null, output);
    }

    // Cached transform; null until Transformer is first read.
    public static XslCompiledTransform _transformer;

    /// <summary>
    /// The compiled stylesheet, created on first access and reused afterwards.
    /// </summary>
    public static XslCompiledTransform Transformer
    {
        get
        {
            if (_transformer == null)
            {
                // Build and load into a local first so the shared field is never
                // set to a transform whose stylesheet has not been loaded yet.
                var transformer = new XslCompiledTransform();

                // method="text" writes the selected string value verbatim; with
                // method="html" characters such as '&' and '<' would be re-escaped
                // as entities, which is not plain text.
                var xsl = XDocument.Parse(@"<?xml version='1.0'?><xsl:stylesheet version=""1.0"" xmlns:xsl=""http://www.w3.org/1999/XSL/Transform"" exclude-result-prefixes=""xsl""><xsl:output method=""text"" encoding=""UTF-8"" /><xsl:template match=""/""><xsl:value-of select=""."" /></xsl:template></xsl:stylesheet>");
                transformer.Load(xsl.CreateNavigator());

                _transformer = transformer;
            }
            return _transformer;
        }
    }

    static void Main(string[] args)
    {
        var html = XDocument.Parse("<html><body>Hello world!</body></html>");
        var text = Html2Text(html);
        Console.WriteLine(text);
    }
}
您是否尝试过http://www.aaronsw.com/2002/html2text/,它是Python,但是是开放源代码。
最简单的方法可能是标签剥离,再用文本布局元素替换一些标签,例如列表元素(li)的破折号和br和p的换行符。
将其扩展到表应该不难。
我在使用HtmlAgility时遇到一些解码问题,因此不想花时间去研究它。
相反,我使用了Microsoft Team Foundation API中的该实用程序:
1 | /* Passes htmlContent to HtmlFilter.ConvertToPlainText and keeps the result in text. HtmlFilter is not declared in this snippet; the accompanying text attributes it to the Microsoft Team Foundation API. */ var text = HtmlFilter.ConvertToPlainText(htmlContent); |
另一篇文章提出了HTML敏捷包:
This is an agile HTML parser that
builds a read/write DOM and supports
plain XPATH or XSLT (you actually
don't HAVE to understand XPATH nor
XSLT to use it, don't worry...). It is
a .NET code library that allows you to
parse "out of the web" HTML files. The
parser is very tolerant with "real
world" malformed HTML. The object
model is very similar to what proposes
System.Xml, but for HTML documents (or
streams).
我过去曾经使用过Detagger。它在将HTML格式化为文本方面做得非常好,并且不只是标签删除器。
此功能将"在浏览器中看到的内容"转换为带换行符的纯文本。 (如果要在浏览器中查看结果,请使用注释的返回值)
/// <summary>
/// Reads the HTML file at <paramref name="filePath"/>, writes it into a WebBrowser
/// control created for this call, and returns the InnerText of the resulting body.
/// </summary>
/// <param name="filePath">Path of the HTML file to read.</param>
/// <returns>The body's InnerText, or null when the control exposes no document or body.</returns>
public string HtmlFileToText(string filePath)
{
    using (var browser = new WebBrowser())
    {
        string markup = File.ReadAllText(filePath);

        browser.ScriptErrorsSuppressed = true;
        browser.Navigate("about:blank");

        // browser itself cannot be null here; only Document/Body may be.
        browser.Document?.OpenNew(false);
        browser.Document?.Write(markup);

        string visibleText = browser.Document?.Body?.InnerText;
        return visibleText;
        // To view the result in a browser, return this instead:
        //return browser.Document?.Body?.InnerText.Replace(Environment.NewLine,"<br />");
    }
}
我最近在博客上写了一个对我有用的解决方案,方法是使用Markdown XSLT文件转换HTML源。 HTML源当然必须首先是有效的XML
尝试简单易用的方法:只需调用下面的StripHTML方法,并将您的WebBrowser控件作为参数传入:
/// <summary>
/// Selects the whole document, reads the selection's text range, clears the
/// selection and returns the range's text. Returns "" when there is no
/// selection, no text range, or any exception is thrown.
/// </summary>
/// <param name="webp">Not read by this method.</param>
/// <remarks>
/// NOTE(review): <c>doc</c> is not declared in this snippet; presumably it is a
/// field holding the browser's IHTMLDocument2 - confirm against the enclosing class.
/// </remarks>
public string StripHTML(WebBrowser webp)
{
    try
    {
        doc.execCommand("SelectAll", true, null);

        IHTMLSelectionObject selection = doc.selection;
        if (selection == null)
        {
            return "";
        }

        IHTMLTxtRange textRange = selection.createRange() as IHTMLTxtRange;
        if (textRange == null)
        {
            return "";
        }

        selection.empty();
        return textRange.text;
    }
    catch (Exception)
    {
        // Best effort: any failure yields an empty result.
        //MessageBox.Show(ep.Message);
        return "";
    }
}
我不懂C#,但是这里有一个相当小且易于阅读的python html2txt脚本:http://www.aaronsw.com/2002/html2text/
如果您使用的是.NET Framework 4.5,则可以使用System.Net.WebUtility.HtmlDecode(),该方法接受HTML编码的字符串并返回解码的字符串。
在MSDN上记录在:http://msdn.microsoft.com/zh-cn/library/system.net.webutility.htmldecode(v=vs.110).aspx
您也可以在Windows Store应用程序中使用它。
在Genexus中,您可以使用Regex制作
// Regular expression for an HTML tag: '<', one or more characters other than '>', then '>'.
&pattern = '<[^>]+>'
// Replace every match with the empty string so only the text remains.
&TSTRPNOT = &TSTRPNOT.ReplaceRegEx(&pattern, "")
在Genexus中,我们可以用Regex来处理。
您可以使用WebBrowser控件在内存中呈现html内容。在LoadCompleted事件触发后...
1 2 3 | /* Cast the WebBrowser control's Document to the IHTMLDocument2 interface. */ IHTMLDocument2 htmlDoc = (IHTMLDocument2)webBrowser.Document; /* Read the body's innerHTML property. */ string innerHTML = htmlDoc.body.innerHTML; /* Read the body's innerText property. */ string innerText = htmlDoc.body.innerText; |
这是在C#中将HTML转换为Text或RTF的另一种解决方案:
1 2 3 | /* Create the SautinSoft converter (a commercial library, per the accompanying text). */ SautinSoft.HtmlToRtf h = new SautinSoft.HtmlToRtf(); /* Select the TextUnicode output format. */ h.OutputFormat = HtmlToRtf.eOutputFormat.TextUnicode; /* Convert the markup held in htmlString and keep the result in text. */ string text = h.ConvertString(htmlString); |
该库不是免费的,这是商业产品,它是我自己的产品。