# Top100猫眼电影爬取案例 (Maoyan Top-100 movie scraping example)
# Pool of User-Agent header values; the spider picks one at random per request.
ua_list = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
]
from urllib import request
import re
import random
import time
import csv


class MaoyanSpider(object):
    """Fetch movie-board pages, extract each movie entry, and append it to a CSV.

    The pipeline is chained: ``get_html`` downloads a page and hands it to
    ``parse_html``, which extracts the entries and hands them to
    ``write_html``, which appends rows to ``maoyan.csv`` in the current
    working directory.

    Attributes:
        url: URL template with one ``{}`` placeholder for the page offset.
        i: Running count of movie rows written so far.
    """

    # Captures, per movie entry: the title attribute, the contents of the
    # "star" paragraph, and the contents of the "releasetime" paragraph.
    # re.S lets ".*?" span the newlines between those elements.
    _MOVIE_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title='
        r'"(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        # NOTE(review): the template reached this file truncated to "={}"
        # (scheme/host/path lost), which urlopen rejects. Restored to the
        # Maoyan Top-100 board URL -- confirm against the original script.
        self.url = "https://maoyan.com/board/4?offset={}"
        # Counter of rows written.
        self.i = 0

    def get_html(self, url):
        """Download ``url`` with a randomly chosen User-Agent and parse it.

        Args:
            url: Fully formatted page URL.
        """
        # Pick a random User-Agent for every request.
        headers = {"User-Agent": random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(req) as res:
            html = res.read().decode()
        # Hand the page straight to the parser.
        self.parse_html(html)

    def parse_html(self, html):
        """Extract ``(title, star, releasetime)`` tuples from ``html`` and save them.

        Args:
            html: Page source as text.
        """
        r_list = self._MOVIE_PATTERN.findall(html)
        # Hand the matches straight to the writer.
        self.write_html(r_list)

    def write_html(self, r_list):
        """Append one CSV row per entry to ``maoyan.csv`` and bump the counter.

        Args:
            r_list: Iterable of 3-item sequences ``(name, star, time)`` of
                strings, as produced by ``parse_html``.
        """
        # Append mode, so rows from successive pages accumulate in one file.
        with open("maoyan.csv", "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            for r in r_list:
                item = {
                    "name": r[0].strip(),
                    "star": r[1].strip(),
                    # Keep characters 5..14 only, i.e. the date that follows
                    # the leading label (presumably a 5-character prefix such
                    # as "上映时间:" -- verify against live page content).
                    "time": r[2].strip()[5:15],
                }
                print(item)
                writer.writerow([item["name"], item["star"], item["time"]])
                self.i += 1

    def run(self):
        """Crawl the ten pages at offsets 0, 10, ..., 90 and report the total."""
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
        print("爬取电影数量:", self.i)


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.run()
    end = time.time()
    print("执行时间:%.2f" % (end - start))
# 更多推荐
# Top100猫眼电影爬取案例
# 发布评论