.net抓取网页信息

编程入门 行业动态 更新时间:2024-10-23 17:37:32

.net抓取<a href="https://www.elefans.com/category/jswz/34/1771338.html">网页信息</a>

.net抓取网页信息

抓取网页信息

  • 废话
    • 上代码

废话

正则表达式就是个坑,学了不常用就忘光了,可是编码过程中万一遇上就是一个大坑,偷偷喜欢一个姑娘

上代码

主体方法之外的抓取方法和工具方法本可以单独拆成几个类,这里为了省事全部放在了同一个类里。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.Services;
using System.Collections;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using System.Text;/// <summary>
///zzkPaChong 的摘要说明
/// </summary>
[WebService(Namespace = "/")]
[WebServiceBinding(ConformsTo = WsiProfiles.BasicProfile1_1)]
//若要允许使用 ASP.NET AJAX 从脚本中调用此 Web 服务,请取消对下行的注释。 
// [System.Web.Script.Services.ScriptService]
public class zzkPaChong : System.Web.Services.WebService
{
    /// <summary>
    /// Value returned by the download helpers when a page could not be fetched.
    /// It never matches any of the scraping patterns, so callers treat it as "no data".
    /// </summary>
    private const string DownloadFailed = "1";

    // The patterns are built once and shared; the pattern text is unchanged.

    /// <summary>Matches the list table (class="tablelist") on a listing page.</summary>
    private static readonly Regex ListTableRegex =
        new Regex(@"<table[^>]*(class=""tablelist"")[^>]*>[\s\S]*?</table>");

    /// <summary>Matches one table row; group "ww" is the row's inner HTML.</summary>
    private static readonly Regex RowRegex =
        new Regex(@"<tr[^>]*>(?<ww>[\s\S]*?)</tr>");

    /// <summary>Matches one table cell; group "ww" is the cell's inner HTML.</summary>
    private static readonly Regex CellRegex =
        new Regex(@"<td[^>]*>(?<ww>[\s\S]*?)</td>");

    /// <summary>Matches the class="tbh" anchor; group "href" is its link target.</summary>
    private static readonly Regex NumberHrefRegex =
        new Regex(@"(?is)<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*class=""tbh"">");

    /// <summary>Matches the class="tbh" anchor; group "ww" is its inner text.</summary>
    private static readonly Regex NumberTextRegex =
        new Regex(@"(?is)<a[^>]*class=""tbh"">(?<ww>[\s\S]*?)</a>");

    /// <summary>Matches the class="tbbt" anchor; group "ww" is its inner text.</summary>
    private static readonly Regex TitleTextRegex =
        new Regex(@"(?is)<a[^>]*class=""tbbt"">(?<ww>[\s\S]*?)</a>");

    /// <summary>
    /// Matches the div whose class starts with "tb1", using balancing groups so that
    /// nested divs are included up to the matching closing tag.
    /// </summary>
    private static readonly Regex DetailBodyRegex =
        new Regex(@"(?is)<div class=""tb1[^>]*>(?><div[^>]*>(?<o>)|</div>(?<-o>)|(?:(?!</?div\b).)*)*(?(o)(?!))</div>");

    public zzkPaChong()
    {
        // Uncomment the following line if using designed components.
        //InitializeComponent();
    }

    [WebMethod]
    public string HelloWorld()
    {
        return "Hello World";
    }

    /// <summary>
    /// Runs one crawl pass via <see cref="tbt_sps"/> and reports completion.
    /// </summary>
    /// <returns>A fixed JSON string with status 1.</returns>
    [WebMethod]
    public string zzkPaChongTest()
    {
        tbt_sps();
        return "{\"status\":1, \"msg\":\"调用完成!\"}";
    }

    #region Main crawl

    /// <summary>
    /// Crawls the configured listing pages, builds a record for every successfully
    /// parsed row and downloads the row's detail page into the record body.
    /// </summary>
    /// <remarks>
    /// Persistence is disabled in this version: the duplicate check and the insert
    /// are commented out, so the records that are built are not stored.
    /// </remarks>
    public void tbt_sps()
    {
        // Only needed by the disabled duplicate check / insert below.
        BLL.wsn_tbt_sps bwts = new BLL.wsn_tbt_sps();

        // NOTE(review): ".aspx" is not an absolute URL; the real listing URLs appear
        // to have been stripped from this copy of the code - TODO confirm and restore.
        Dictionary<string, string> listPages = new Dictionary<string, string>();
        listPages.Add("WTO/TBT", ".aspx");
        listPages.Add("WTO/SPS", ".aspx");

        foreach (KeyValuePair<string, string> page in listPages)
        {
            foreach (Hashtable row in ListHtml(page.Value))
            {
                // Rows with status "0" carry an error message, not data.
                if (row["status"] == null || row["status"].ToString() != "1")
                {
                    continue;
                }

                // A row without a detail link (e.g. a header row) cannot be followed;
                // skip it instead of failing on a null reference.
                if (row["ziurl"] == null)
                {
                    continue;
                }

                // Duplicate check against the local database is disabled:
                //if (bwts.GetModelList(" TBNumber='" + row["TBNumber"].ToString() + "' ").Count <= 0)

                // NOTE(review): the "/" prefix looks like the remains of a stripped
                // site base URL - TODO confirm.
                string ziUrl = "/" + row["ziurl"].ToString();

                Model.wsn_tbt_sps mwts = BuildRecord(row);
                mwts.TBbody = details(ziUrl);
                mwts.zq_Time = DateTime.Now;
                mwts.state = 0;
                //int bolid = bwts.Add(mwts);
            }
        }
    }

    /// <summary>
    /// Copies the fields parsed from one listing row into a new record.
    /// </summary>
    /// <param name="row">A row produced by <see cref="ListHtml"/> with status "1".</param>
    /// <returns>The record with title, member, date, number and type filled in where present.</returns>
    private static Model.wsn_tbt_sps BuildRecord(Hashtable row)
    {
        Model.wsn_tbt_sps record = new Model.wsn_tbt_sps();

        if (row["TBTitle"] != null)
        {
            record.TBTitle = row["TBTitle"].ToString();
        }

        if (row["TBChengYuan"] != null)
        {
            record.TBChengYuan = row["TBChengYuan"].ToString();
        }

        if (row["TBDate"] != null)
        {
            // An empty or malformed date cell leaves the date unset instead of
            // aborting the whole crawl with a FormatException.
            DateTime parsedDate;
            if (DateTime.TryParse(row["TBDate"].ToString(), out parsedDate))
            {
                record.TBDate = parsedDate;
            }
        }

        if (row["TBNumber"] != null)
        {
            record.TBNumber = row["TBNumber"].ToString();
            // The type is derived from the notification number: anything that does
            // not contain "TBT" is classified as "SPS".
            record.TBType = record.TBNumber.IndexOf("TBT") >= 0 ? "TBT" : "SPS";
        }

        return record;
    }

    #endregion

    #region HTML scraping

    /// <summary>
    /// Downloads a listing page and extracts one entry per table row.
    /// </summary>
    /// <param name="url">Absolute URL of the listing page (including http://).</param>
    /// <returns>
    /// One <see cref="Hashtable"/> per parsed row with "status" = "1" and, where found,
    /// "ziurl", "TBNumber", "TBTitle", "TBChengYuan" and "TBDate". On failure the list
    /// ends with an entry holding "status" = "0" and an error text under "msg".
    /// </returns>
    public List<Hashtable> ListHtml(string url)
    {
        List<Hashtable> rows = new List<Hashtable>();
        try
        {
            string htmlcode = GetHTML(url);

            Match table = ListTableRegex.Match(htmlcode);
            if (!table.Success)
            {
                // Covers both a failed download and a page without the list table.
                rows.Add(FailureRow("No table with class \"tablelist\" found at: " + url));
                return rows;
            }

            foreach (Match tr in RowRegex.Matches(table.Value))
            {
                string rowHtml = tr.Groups["ww"].Value.Trim();
                MatchCollection cells = CellRegex.Matches(rowHtml);
                if (cells.Count == 0)
                {
                    // Rows without <td> cells carry no data.
                    continue;
                }

                rows.Add(ParseRow(cells));
            }
        }
        catch (Exception ex)
        {
            rows.Add(FailureRow(ex.Message));
        }

        return rows;
    }

    /// <summary>
    /// Reads the first four cells of a listing row: number/link, title, member, date.
    /// Only the first matching anchor of a cell is used, so a cell with several
    /// anchors can no longer fail with a duplicate-key error.
    /// </summary>
    /// <param name="cells">The <c>td</c> matches of one row.</param>
    /// <returns>A row table with "status" = "1" and the fields that were found.</returns>
    private static Hashtable ParseRow(MatchCollection cells)
    {
        Hashtable row = new Hashtable();
        row["status"] = "1";

        // Columns after the fourth are ignored.
        int columnCount = Math.Min(cells.Count, 4);
        for (int i = 0; i < columnCount; i++)
        {
            string cellHtml = cells[i].Groups["ww"].Value;
            switch (i)
            {
                case 0:
                {
                    Match href = NumberHrefRegex.Match(cellHtml);
                    if (href.Success)
                    {
                        row["ziurl"] = href.Groups["href"].Value.Trim();
                    }

                    Match number = NumberTextRegex.Match(cellHtml);
                    if (number.Success)
                    {
                        row["TBNumber"] = number.Groups["ww"].Value.Trim();
                    }
                    break;
                }
                case 1:
                {
                    Match title = TitleTextRegex.Match(cellHtml);
                    if (title.Success)
                    {
                        row["TBTitle"] = title.Groups["ww"].Value.Trim();
                    }
                    break;
                }
                case 2:
                    row["TBChengYuan"] = cellHtml.Trim();
                    break;
                case 3:
                    row["TBDate"] = cellHtml.Trim();
                    break;
            }
        }

        return row;
    }

    /// <summary>Builds the error entry that <see cref="ListHtml"/> reports on failure.</summary>
    /// <param name="message">Text stored under "msg".</param>
    /// <returns>A table with "status" = "0" and the given message.</returns>
    private static Hashtable FailureRow(string message)
    {
        Hashtable failure = new Hashtable();
        failure["status"] = "0";
        failure["msg"] = message;
        return failure;
    }

    /// <summary>
    /// Downloads a detail page and extracts the div whose class starts with "tb1".
    /// </summary>
    /// <param name="url">Absolute URL of the detail page (including http://).</param>
    /// <returns>The trimmed HTML of the matched div, or an empty string when nothing matches.</returns>
    public string details(string url)
    {
        string htmlcode = GetHTML(url);
        Match body = DetailBodyRegex.Match(htmlcode);
        return body.Success ? body.Value.Trim() : "";
    }

    #endregion

    #region Utilities

    /// <summary>
    /// Downloads a page and decodes it as UTF-8.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <returns>The page source, or "1" when the download fails.</returns>
    private string GetHTML(string url)
    {
        return DownloadText(url, System.Text.Encoding.GetEncoding("UTF-8"));
    }

    /// <summary>
    /// Downloads a page and decodes it as gb2312.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <returns>The page source, or "1" when the download fails.</returns>
    private string GetHTML2312(string url)
    {
        return DownloadText(url, System.Text.Encoding.GetEncoding("gb2312"));
    }

    /// <summary>
    /// Shared implementation of <see cref="GetHTML"/> and <see cref="GetHTML2312"/>.
    /// The response, stream and reader are disposed even when reading fails.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <param name="encoding">Encoding used to decode the response body.</param>
    /// <returns>The page source, or "1" when anything goes wrong.</returns>
    private static string DownloadText(string url, Encoding encoding)
    {
        try
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort by design: callers rely on the sentinel rather than on an
            // exception. The reason is traced so that failures are not invisible.
            System.Diagnostics.Trace.TraceWarning("Download of {0} failed: {1}", url, ex.Message);
            return DownloadFailed;
        }
    }

    /// <summary>
    /// Alternative downloader that sends a browser user-agent header; may work for
    /// links whose source the other download helpers cannot retrieve.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <returns>The page source decoded as UTF-8.</returns>
    /// <remarks>Unlike the other helpers, download errors propagate to the caller.</remarks>
    public string GetHtmlwc(string url)
    {
        using (WebClient client = new WebClient())
        {
            // Add a user agent header in case the requested URI contains a query.
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

            using (Stream data = client.OpenRead(url))
            using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }

    #endregion
}

更多推荐

.net抓取网页信息

本文发布于:2024-03-14 02:40:00,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1735447.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:网页   信息   net

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!