HTML文本到纯文本

编程入门行业动态更新时间:2024-10-12 03:17:13

本文介绍了HTML文本到纯文本的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！问题描述

Hai All，如何使用C＃将HTML文本转换为纯文本请帮助我... 谢谢进展 Aatif Ali

Hai All, How to Convert HTML text to Plain Text using C# Please Help me ... Thanks Advnce Aatif Ali

推荐答案

你可以尝试下面一个： You can try below one : private string StripHTML(string source) { try { string result; // Remove HTML Development formatting // Replace line breaks with space // because browsers inserts space result = source.Replace("\r", " "); // Replace line breaks with space // because browsers inserts space result = result.Replace("\n", " "); // Remove step-formatting result = result.Replace("\t", string.Empty); // Remove repeating spaces because browsers ignore them result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove the header (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>","<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)","</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all scripts (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>","<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)","</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); //result = System.Text.RegularExpressions.Regex.Replace(result, // @"(<script>)([^(<script>\.</script>)])*(</script>)", // string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all styles (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>","<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)","</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert tabs in spaces of <td> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>","\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line breaks in places of <BR> and <LI> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>","\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line paragraphs (double line breaks) in place // if <P>, <DIV> and <TR> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove remaining tags like <a>, links, images, // comments etc - anything that's enclosed inside < > result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // replace special characters: result = System.Text.RegularExpressions.Regex.Replace(result, @" "," ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"•"," * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"™","(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;","/", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<","<", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @">",">", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"©","(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"®","(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove all others. More can be added, see // hotwired.lycos/webmonkey/reference/special_characters/ result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // for testing //System.Text.RegularExpressions.Regex.Replace(result, // this.txtRegex.Text,string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); // make line breaking consistent result = result.Replace("\n", "\r"); // Remove extra line breaks and tabs: // replace over 2 breaks with 2 and over 4 tabs with 4. // Prepare first to remove any whitespaces in between // the escaped characters and remove redundant tabs in between line breaks result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)","\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)","\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)","\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multiple tabs following a line break with just one tab result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+","\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Initial replacement target string for line breaks string breaks = "\r\r\r"; // Initial replacement target string for tabs string tabs = "\t\t\t\t\t"; for (int index=0; index<result.Length; index++) { result = result.Replace(breaks, "\r\r"); result = result.Replace(tabs, "\t\t\t\t"); breaks = breaks + "\r"; tabs = tabs + "\t"; } // That's it. return result; } catch { MessageBox.Show("Error"); return source; } }

For更多信息选中此项（将HTML转换为纯文本）：将HTML转换为纯文本 [ ^ ] 我希望这会对你有所帮助。

For more info Check this (Convert HTML to Plain Text) : Convert HTML to Plain Text[^] I hope this will help to you.

我得到了解决方案.. public static string StripHTML（string tr） { Regex reg =新的Regex（< [^>] +>，RegexOptions.IgnoreCase）; 返回reg.Replace（tr，）; } > tr是html文本 >要使用此函数，必须使用System.Text.RegularExpressions来填充命名空间。并使用System.Tex; > jsut称这些函数为 string _plainText = StripHTML（tr）; I got solution.. public static string StripHTML(string tr) { Regex reg = new Regex("<[^>]+>", RegexOptions.IgnoreCase); return reg.Replace(tr, ""); } > tr is html text > for using this function ,you must implmnt namespace ' using System.Text.RegularExpressions; and using System.Tex; > jsut call these function like string _plainText=StripHTML(tr);

更多推荐

HTML文本到纯文本

本文发布于:2023-10-12 04:35:51，感谢您对本站的认可！

本文链接:https://www.elefans.com/category/jswz/34/1483731.html