/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter walks the input one character at a time. Tags listed in
/// <see cref="_tags"/> are replaced by a fixed string (a newline or a tab),
/// tags listed in <see cref="_ignoreTags"/> are dropped together with their
/// content, and every other tag is simply removed. Conversion state is kept in
/// instance fields, so a single instance must not run two conversions at once.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a lower-case tag name (end tags carry a leading '/') to the text
    // written to the output in its place.
    protected static Dictionary<string, string> _tags;

    // Lower-case names of tags whose inner content is discarded entirely.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as an empty string.</param>
    /// <returns>Resulting plain text, with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty; // null would otherwise fail on the first length check
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            char current = Peek();

            if (current == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(current))
            {
                // Whitespace (treated as a single space unless preformatted)
                _text.Write(_text.Preformatted ? current : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(current);
                MoveAhead();
            }
        }

        // Entities are decoded last, after all markup has been stripped
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns the
    /// tag name.
    /// </summary>
    /// <param name="selfClosing">
    /// Set to true when an unquoted '/' is seen after the tag name.
    /// </param>
    /// <returns>
    /// The lower-case tag name, prefixed with '/' for an end tag, or an empty
    /// string when the current character is not '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }

            // Invariant casing: the lookup tables hold ASCII names, and a
            // culture-sensitive ToLower() can map 'I' to a non-ASCII letter.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    // Quoted attribute values may contain '>' or '/'
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }

            // Step over the closing '>'
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the current tag, up to and including its
    /// matching end tag (or the end of the input when there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the tag whose content is discarded.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // Number of currently open elements named `tag`. Only same-named tags
        // affect nesting; other tags inside the ignored content are skipped,
        // so their end tags can never be mistaken for ours.
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);

                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    /// <summary>
    /// Safely returns the character at the current position, or (char)0 when
    /// the position is at or past the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character, never
    /// moving past the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or the
    /// start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. The value ends at the
    /// matching quote character or at a line break, whichever comes first.
    /// </summary>
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote == '"' || quote == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                // Unterminated value: consume the rest of the input
                _pos = _html.Length;
            }
            else
            {
                MoveAhead(); // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                    {
                        _currLine.Append(' ');
                    }
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // After Trim(), a line holding only whitespace is empty
            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the
                // very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
0
0
相关文章
C# MAUI怎么使用C# Markup编写UI MAUI C# UI教程
c# 如何用C#代码触发一次GC回收 Full GC
c# C#中的幽灵读和不可重复读怎么在代码层面避免
c# solid原则是什么
blazor 是什么 blazor和vue/react对比
本站声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn
热门AI工具
相关专题
在拼多多上赚钱主要可以通过无货源模式一件代发、精细化运营特色店铺、参与官方高流量活动、利用拼团机制社交裂变,以及成为多多进宝推广员这5种方法实现。核心策略在于通过低成本、高效率的供应链管理与营销,利用平台社交电商红利实现盈利。
28
2026.01.26
在Edge浏览器中设置主页,请依次点击右上角“...”图标 > 设置 > 开始、主页和新建标签页。在“Microsoft Edge 启动时”选择“打开以下页面”,点击“添加新页面”并输入网址。若要使用主页按钮,需在“外观”设置中开启“显示主页按钮”并设定网址。
8
2026.01.26
苹果官方查询网站主要通过 checkcoverage.apple.com/cn/zh/ 进行,可用于查询序列号(SN)对应的保修状态、激活日期及技术支持服务。此外,查找丢失设备请使用 iCloud.com/find,购买信息与物流可访问 Apple (中国大陆) 订单状态页面。
31
2026.01.26
NPD(Narcissistic Personality Disorder)即自恋型人格障碍,是一种心理健康问题,特点是极度夸大自我重要性、需要过度赞美与关注,同时极度缺乏共情能力,背后常掩藏着低自尊和不安全感,影响人际关系、工作和生活,通常在青少年时期开始显现,需由专业人士诊断。
3
2026.01.26
关闭Windows安全中心(Windows Defender)可通过系统设置暂时关闭,或使用组策略/注册表永久关闭。最简单的方法是:进入设置 > 隐私和安全性 > Windows安全中心 > 病毒和威胁防护 > 管理设置,将实时保护等选项关闭。
5
2026.01.26
铁路12306提供起售时间查询、起售提醒、购票预填、候补购票及误购限时免费退票五项服务,并强调官方渠道唯一性与信息安全。
35
2026.01.26
以工资薪金所得为例,应纳税额 = 应纳税所得额 × 税率 - 速算扣除数。应纳税所得额 = 月度收入 - 5000 元 - 专项扣除 - 专项附加扣除 - 依法确定的其他扣除。假设某员工月工资 10000 元,专项扣除 1000 元,专项附加扣除 2000 元,当月应纳税所得额为 10000 - 5000 - 1000 - 2000 = 2000 元,对应税率为 3%,速算扣除数为 0,则当月应纳税额为 2000×3% = 60 元。
12
2026.01.26
oppo云服务https://cloud.oppo.com/可以在云端安全存储您的照片、视频、联系人、便签等重要数据。当您的手机数据意外丢失或者需要更换手机时,可以随时将这些存储在云端的数据快速恢复到手机中。
40
2026.01.26
网页端充值步骤:打开浏览器,输入https://www.douyin.com,登录账号;点击右上角头像,选择“钱包”;进入“充值中心”,操作和APP端一致。注意:切勿通过第三方链接、二维码充值,谨防受骗
7
2026.01.26
热门下载
精品课程
最新文章

