爬虫一:豆瓣电影关键字爬取 并存入excel
爬虫一:豆瓣电影关键字爬取 并存入excel
爬虫一:豆瓣电影关键字爬取
爬虫一:豆瓣电影关键字爬取 并存入excel
https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%83%82%E7%89%87&start=20
豆瓣电影信息还是比较好拿到的
# Request headers: a browser User-Agent plus a logged-in session Cookie so
# Douban serves the crawler like a normal visitor.
# Fix: the original was missing the comma between the "User-Agent" value and
# the "Cookie" key, which made the dict literal a SyntaxError (the adjacent
# string literals concatenated and were then followed by a stray ':').
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Cookie": 'll="118245"; bid=ARtOOxDRteM; __utma=30149280.1180608368.1569232683.1569232683.1569232683.1; __utmc=30149280; __utmz=30149280.1569232683.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; acw_tc=2760828115692326854383671ef4946949a1588e4e0476180bf57b1e5f944d; ap_v=0,6.0; __utma=223695111.1224307206.1569232715.1569232715.1569232715.1; __utmb=223695111.0.10.1569232715; __utmc=223695111; __utmz=223695111.1569232715.1.1.utmcsr=accounts.douban|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1569232715%2C%22https%3A%2F%2Faccounts.douban%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fmovie.douban%252F%22%5D; _pk_ses.100001.4cf6=*; push_noty_num=0; push_doumail_num=0; __yadk_uid=jOpNK0YqTHKGC26G4EsFha6rAzJzvStK; _vwo_uuid_v2=D2DBA0AE8FBA3AA58155A7BB3CF5E42B0|0fc9726cdfeda577fffd4d9b62d1989e; dbcl2="199182842:8rksCNklcW8"; ck=Taef; __utmv=30149280.19918; _pk_id.100001.4cf6=62084c1c85a029b2.1569232715.1.1569233089.1569232715.; __utmb=30149280.17.10.1569232683'
}
首先定义headers请求头 让网站认为我们是人为的访问
# Fetch one page of the keyword listing (a JSON payload), then request every
# movie's detail page and hand the parsed HTML to save_info().
content = requests.get(url, headers=headers)
content_json = json.loads(content.text)["data"]
for one_info in content_json:
    one_id = one_info["id"]
    print(one_id)
    # Fix: the detail URL lost its host when this snippet was published
    # ("/%s/"); presumably Douban's subject page — confirm before running.
    url2 = "https://movie.douban.com/subject/%s/" % one_id
    html = requests.get(url2, headers=headers)
    if html.status_code == 200:
        content = html.content.decode("utf-8")
        content = etree.HTML(content)
        save_info(content)
    time.sleep(1)  # polite delay so Douban does not rate-limit the crawler
发出请求 得到html数据
# Extract one movie's fields from the parsed detail page (`content` is an
# lxml HTML element) and append them as one row to the global all_list.
# Bare `except:` was narrowed to IndexError — the `[0]` index on an empty
# xpath result is what actually raises here.
info = content.xpath("//div[@id='info']")[0]
try:
    name = str(content.xpath('//*[@id="content"]/h1/span[1]/text()')[0]).replace("'", " ")
except IndexError:
    name = "无"
# Fix: the original wrapped a missing node as str(None), which produced the
# literal string "None" instead of the intended "无" placeholder.
daoyan_nodes = info.xpath("./span[1]/span[2]/a/text()")
daoyan = str(daoyan_nodes[0]).replace("'", " ") if daoyan_nodes else "无"
bianju_nodes = info.xpath("./span[2]/span[2]/a/text()")
bianju = str(bianju_nodes[0]).replace("'", " ") if bianju_nodes else "无"
# '/'.join never raises on an empty node list, so no try needed (same as
# original behaviour: empty result gives "").
zhuyan = '/'.join(info.xpath("./span[3]/span[2]/a/text()")).replace("'", " ")
leixing = '/'.join(info.xpath("./span[@property='v:genre']/text()")).replace("'", " ")
shangyingshijian = '/'.join(info.xpath(".//span[@property='v:initialReleaseDate']/text()")).replace("'", " ")
try:
    shichang = str(info.xpath(".//span[@property='v:runtime']/text()")[0]).replace("'", " ")
except IndexError:
    shichang = "无"
try:
    pingfen = str(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]).replace("'", " ")
except IndexError:
    pingfen = "无"
try:
    jianjie = str(content.xpath('// *[ @ id = "link-report"] / span[1]/text()')[0]).replace("'", " ")
except IndexError:
    jianjie = "无"
try:
    pingjiarenshu = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
except IndexError:
    pingjiarenshu = "无"
print("电影名称:", name)
print("导演:", daoyan)
print("编剧:", bianju)
print("主演:", zhuyan)
print("评分:", pingfen)
print("评价人数:", pingjiarenshu)
print("类型:", leixing)
print("上映时间:", shangyingshijian)
print("时长:", shichang)
print("简介:", jianjie)
one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu,
            leixing, shangyingshijian, shichang, jianjie]
all_list.append(one_info)
通过xpath提取数据
def processing_data(content_list):
    """Persist the scraped movie rows to 电影信息.xls, one sheet row per movie."""
    # Build the workbook (UTF-8 so the Chinese field values survive).
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('My Worksheet')
    # Row index follows the list order; column index follows the field order.
    for row_idx, row in enumerate(content_list):
        for col_idx, cell in enumerate(row):
            sheet.write(row_idx, col_idx, label=cell)
    book.save('电影信息.xls')
将数据存入excel
完整代码
import json
import time

import requests
import xlwt
from lxml import etree


def processing_data(content_list):
    """Persist the scraped movie rows to 电影信息.xls, one sheet row per movie."""
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    for i, content in enumerate(content_list):
        for x, info in enumerate(content):
            worksheet.write(i, x, label=info)
    workbook.save('电影信息.xls')


def save_info(content):
    """Extract one movie's fields from a parsed detail page (lxml element)
    and append them as one row to the global all_list.

    Bare `except:` was narrowed to IndexError — the `[0]` index on an empty
    xpath result is what actually raises here.
    """
    info = content.xpath("//div[@id='info']")[0]
    try:
        name = str(content.xpath('//*[@id="content"]/h1/span[1]/text()')[0]).replace("'", " ")
    except IndexError:
        name = "无"
    # Fix: the original wrapped a missing node as str(None), which produced
    # the literal string "None" instead of the intended "无" placeholder.
    daoyan_nodes = info.xpath("./span[1]/span[2]/a/text()")
    daoyan = str(daoyan_nodes[0]).replace("'", " ") if daoyan_nodes else "无"
    bianju_nodes = info.xpath("./span[2]/span[2]/a/text()")
    bianju = str(bianju_nodes[0]).replace("'", " ") if bianju_nodes else "无"
    zhuyan = '/'.join(info.xpath("./span[3]/span[2]/a/text()")).replace("'", " ")
    leixing = '/'.join(info.xpath("./span[@property='v:genre']/text()")).replace("'", " ")
    shangyingshijian = '/'.join(info.xpath(".//span[@property='v:initialReleaseDate']/text()")).replace("'", " ")
    try:
        shichang = str(info.xpath(".//span[@property='v:runtime']/text()")[0]).replace("'", " ")
    except IndexError:
        shichang = "无"
    try:
        pingfen = str(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]).replace("'", " ")
    except IndexError:
        pingfen = "无"
    try:
        jianjie = str(content.xpath('// *[ @ id = "link-report"] / span[1]/text()')[0]).replace("'", " ")
    except IndexError:
        jianjie = "无"
    try:
        pingjiarenshu = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
    except IndexError:
        pingjiarenshu = "无"
    print("电影名称:", name)
    print("导演:", daoyan)
    print("编剧:", bianju)
    print("主演:", zhuyan)
    print("评分:", pingfen)
    print("评价人数:", pingjiarenshu)
    print("类型:", leixing)
    print("上映时间:", shangyingshijian)
    print("时长:", shichang)
    print("简介:", jianjie)
    one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu,
                leixing, shangyingshijian, shichang, jianjie]
    all_list.append(one_info)


def main():
    """Walk 60 listing pages (20 movies each) and scrape every detail page."""
    # Invariant headers hoisted out of the loop (original rebuilt the dict on
    # every iteration).
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    try:
        for page in range(0, 60):
            # NOTE(review): both URLs lost their host when the code was
            # published ('=T&range=...' and '/%s/'); reconstructed here as
            # Douban's search API and subject page — confirm before running.
            url = ("https://movie.douban.com/j/new_search_subjects?sort"
                   "=T&range=0,10&tags=%E7%83%82%E7%89%87&start=" + str(page * 20))
            content = requests.get(url, headers=headers)
            content_json = json.loads(content.text)["data"]
            for one_info in content_json:
                one_id = one_info["id"]
                print(one_id)
                url2 = "https://movie.douban.com/subject/%s/" % one_id
                html = requests.get(url2, headers=headers)
                if html.status_code == 200:
                    save_info(etree.HTML(html.content.decode("utf-8")))
                time.sleep(1)  # polite delay between detail requests
    except Exception:
        # Original used a bare `except:` to save whatever was collected when
        # anything failed; keep that best-effort behaviour but stop swallowing
        # SystemExit/KeyboardInterrupt.
        processing_data(all_list)


if __name__ == '__main__':
    all_list = []  # module-level accumulator written by save_info()
    main()
    processing_data(all_list)
本人在校学生欢迎来交流python技术
QQ:5834135
更多推荐
爬虫一:豆瓣电影关键字爬取 并存入excel
发布评论