HTML文本到纯文本

编程入门 行业动态 更新时间:2024-10-12 03:17:13
本文介绍了HTML文本到纯文本的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧! 问题描述

Hai All, 如何使用C#将HTML文本转换为纯文本 请帮助我... 谢谢进展 Aatif Ali

Hai All, How to Convert HTML text to Plain Text using C# Please Help me ... Thanks Advnce Aatif Ali

推荐答案

你可以尝试下面一个: You can try below one : private string StripHTML(string source) { try { string result; // Remove HTML Development formatting // Replace line breaks with space // because browsers inserts space result = source.Replace("\r", " "); // Replace line breaks with space // because browsers inserts space result = result.Replace("\n", " "); // Remove step-formatting result = result.Replace("\t", string.Empty); // Remove repeating spaces because browsers ignore them result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove the header (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>","<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)","</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all scripts (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>","<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)","</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); //result = System.Text.RegularExpressions.Regex.Replace(result, // @"(<script>)([^(<script>\.</script>)])*(</script>)", // string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all styles (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>","<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)","</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert tabs in spaces of <td> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>","\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line breaks in places of <BR> and <LI> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line paragraphs (double line breaks) in place // if <P>, <DIV> and <TR> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove remaining tags like <a>, links, images, // comments etc - anything that's enclosed inside < > result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // replace special characters: result = System.Text.RegularExpressions.Regex.Replace(result, @" "," ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&bull;"," * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&trade;","(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;","/", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&gt;",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&copy;","(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&reg;","(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove all others. More can be added, see // hotwired.lycos/webmonkey/reference/special_characters/ result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // for testing //System.Text.RegularExpressions.Regex.Replace(result, // this.txtRegex.Text,string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); // make line breaking consistent result = result.Replace("\n", "\r"); // Remove extra line breaks and tabs: // replace over 2 breaks with 2 and over 4 tabs with 4. // Prepare first to remove any whitespaces in between // the escaped characters and remove redundant tabs in between line breaks result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)","\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)","\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multiple tabs following a line break with just one tab result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Initial replacement target string for line breaks string breaks = "\r\r\r"; // Initial replacement target string for tabs string tabs = "\t\t\t\t\t"; for (int index=0; index<result.Length; index++) { result = result.Replace(breaks, "\r\r"); result = result.Replace(tabs, "\t\t\t\t"); breaks = breaks + "\r"; tabs = tabs + "\t"; } // That's it. return result; } catch { MessageBox.Show("Error"); return source; } }

For更多信息选中此项(将HTML转换为纯文本): 将HTML转换为纯文本 [ ^ ] 我希望这会对你有所帮助。

For more info Check this (Convert HTML to Plain Text) : Convert HTML to Plain Text[^] I hope this will help to you.

我得到了解决方案.. public static string StripHTML(string tr) { Regex reg =新的Regex(< [^>] +>,RegexOptions.IgnoreCase); 返回reg.Replace(tr,); } > tr是html文本 >要使用此函数,必须使用System.Text.RegularExpressions来填充命名空间。并使用System.Tex; > jsut称这些函数为 string _plainText = StripHTML(tr); I got solution.. public static string StripHTML(string tr) { Regex reg = new Regex("<[^>]+>", RegexOptions.IgnoreCase); return reg.Replace(tr, ""); } > tr is html text > for using this function ,you must implmnt namespace ' using System.Text.RegularExpressions; and using System.Tex; > jsut call these function like string _plainText=StripHTML(tr);

更多推荐

HTML文本到纯文本

本文发布于:2023-10-12 04:35:51,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1483731.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:文本   HTML

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!