Scraping Toutiao (今日头条) news search results. Toutiao returns its search results via AJAX as JSON, and the payload has a few pitfalls: entries 9 and 19 of the data array are filler rather than real articles, and image and video cells also have to be excluded.
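Before the full script, here is a minimal sketch of that filtering. It assumes filler slots and image/video cells can be recognized by a missing article_url and by the app_info query_type that the script below also checks; the is_article helper itself is illustrative and not part of the original code.

def is_article(item, index):
    """Return True if one entry of the search API's 'data' list looks like a real article cell.

    Assumption (not from the original script): filler slots such as indexes 9 and 19
    and image/video cells lack an 'article_url', while real article cells carry an
    'app_info' block with the query_type checked below.
    """
    if index in (9, 19):                       # known filler positions
        return False
    if 'article_url' not in item:              # image/video/gallery cells
        return False
    query_type = item.get('app_info', {}).get('query_type')
    return query_type == "SearchAggregationInternalQueryType"

# Example: keep only usable entries from one page of results.
# articles = [d for i, d in enumerate(page_json['data']) if is_article(d, i)]

The full script follows.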
# coding=gbk
import requests
import json
import pandas as pd
from lxml import etree
import re
import csv
# Only needed if the JSON response were parsed with eval() instead of json.loads():
# false=""
# true=""
# null=""
def download(url):
    """Fetch the search API and return the parsed JSON response."""
    headers = {
        "authority": "www.toutiao.com",
        "accept": "application/json",
        "cookie": "tt_webid=UM_distide0500427da2a543680c200ee161; _ga=GA1.2.846289002.1555920121; CNZZDATA1259612802=450841153-1555907327-https%253A%252F%252Fwww.baidu.com%252F%7C1556003494; __tasessionId=pvm4sk1561556005960158; s_v_web_id=b3da262bf518fdcf2df06210b0065a55",
        "referer": "https://www.toutiao.com/search/?keyword=%E6%B2%B3%E5%8D%97%20%E6%A0%BE%E5%B7%9D",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
    }
    html = requests.get(url, headers=headers, verify=False, timeout=10).text
    print(html)
    print(type(html))
    # return eval(html)
    return json.loads(html)
def download1(url):
    """Fetch an article page and return its raw HTML text."""
    headers1 = {
        "authority": "www.toutiao.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "cookie": "tt_webid=6682589457788569099;788569099; UM_distinctid=16a438f23b819c-0a360ff16c1a3d-7a1b34-e1000-16a438f23b93f9; csrftoken=ddb7de0500427da2a543680c200ee161; _ga=GA1.2.846289002.1555920121; CNZZDATA1259612802=450841153-1555907327-https%253A%252F%252Fwww.baidu.com%252F%7C1556003494; s_v_web_id=b3da262bf518fdcf2df06210b0065a55; __tasessionId=9xpt12cca1556008616299",
        "referer": "https://www.toutiao.com/search/?keyword=%E6%B2%B3%E5%8D%97%20%E6%A0%BE%E5%B7%9D",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
    }
    html = requests.get(url, headers=headers1, verify=False, timeout=10).text
    return html
    # Alternative (unused): follow the 302 redirect manually and parse with lxml.
    # html = requests.get(url=url, headers=headers1, allow_redirects=False)
    # print(html.status_code)
    # # print(html.headers["location"])
    # if html.status_code == 302:
    #     new_id_url = html.headers["location"]
    #     print(new_id_url)
    #     return new_id_url
    # else:
    #     print("++++++++++++++++")
    #     print(url)
    #     # print(requests.get(url=url, headers=headers1).text)
    #     return etree.HTML(requests.get(url=url).text)
list_all = []
# One search keyword per line; the path below is the author's local file.
key_words = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\redbook.csv", engine='python', header=None).values.tolist()
# for l in range(1):  # debug: single keyword
for l in range(1, len(key_words)):  # row 0 is skipped (treated as a header)
    key_word = key_words[l][0]
    print(len(key_word))
    print(key_word)
    # Record the keyword itself as a marker row in the output
    # (wrapped in a list so csv.writerow treats it as one cell).
    list_all.append([key_word])
    # print(list_all)
    try:
        for i in range(20):
            print("Page {}".format(i + 1))
            key_url = "https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword={}&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab".format(20 * i, key_word)
            key_html = download(key_url)
            # print(key_html['data'])
            # print(key_html['data'][0])
            # print(key_html['data'][1])
            # print(key_html['data'][2])
            print(len(key_html['data']))
            # print(key_html['data'][0][1]['media_name'])
            # Skip the last entry of each page (typically index 19, which is filler).
            for k in range(len(key_html['data']) - 1):
                print(k)
                print(key_html['data'][k])
                if k != 9:  # index 9 is filler as well
                    if key_html['data'][k]['app_info']['query_type'] == "SearchAggregationInternalQueryType":
                        name = key_html['data'][k]['media_name']
                        time = key_html['data'][k]['datetime']
                        title = key_html['data'][k]['title']
                        abstract = key_html['data'][k]['abstract']
                        article_url = key_html['data'][k]['article_url']
                        article_html = download1(article_url)
                        # Keep only the Chinese characters of the article page as its body text.
                        content = ''.join(re.findall('([\u4E00-\u9FA5])', article_html, re.S))
                        list_all.append([key_word, name, time, title, abstract, content, article_url])
                        print([key_word, name, time, title, abstract, content, article_url])
                        print("+++++++++++++++++++++++++++++++++++++++++======")
                    # elif key_html['data'][k]['cell_type'] == 20:
                    #     pass
                    else:
                        pass  # image/video or other non-article cell
                else:
                    pass
    except Exception as e:
        # On any error (missing field, network failure, etc.), dump what has been
        # collected so far and move on to the next keyword.
        with open("头条3.csv", "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f, dialect="excel")
            # Columns: keyword, author, publish time, title, abstract, content, url
            writer.writerow(["关键词", "发表人", "发布时间", "标题", "简介", "内容", "链接"])
            for row in list_all:
                writer.writerow(row)
        print(e)
        print('************')
        pass
with open("头条3.csv", "w", encoding="utf-8", newline="") as f:
k = csv.writer(f, dialect="excel")
k.writerow(["关键词", "发表人","发布时间", "标题", "简介", "内容","链接"])
for list in list_all:
k.writerow(list)
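To sanity-check the results, the output CSV can be read back with pandas. This is just an optional inspection snippet; the file name 头条3.csv and the header row come from the script above.

import pandas as pd

# Read the crawler's output; the first line is the Chinese header row
# written by the script above, so it becomes the column index.
df = pd.read_csv("头条3.csv", encoding="utf-8")
print(df.shape)   # rows collected x columns
print(df.head())  # first few scraped articles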