Scraping data from a Toutiao (今日头条) user page is mostly the same as scraping a channel page, with one small difference: the signature for the user-page API is computed differently. The user id currently being crawled and the paging timestamp have to be passed in together as the argument to the signing call before you get a valid signature. Apart from that one difference, the approach is identical.
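To make that one differing step concrete, here is a minimal sketch of the signature call in isolation. It assumes a Selenium Chrome driver that has already loaded a Toutiao user page, so that window.TAC is defined on the page; the standalone helper get_user_signature and the placeholder URL/id are illustrative only and do not appear in the original crawler.

from selenium import webdriver

def get_user_signature(browser, user_id, max_behot_time):
    # window.TAC.sign is called with the user id concatenated with the paging
    # timestamp, the same call the crawler below makes on every page after the first
    return browser.execute_script(
        "return window.TAC.sign(" + user_id + max_behot_time + ")")

browser = webdriver.Chrome()
browser.get("https://www.toutiao.com/c/user/0000000000/")   # placeholder user page
signature = get_user_signature(browser, "0000000000", "0")  # placeholder id and timestamp

The full crawler code follows: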

import json
import random
import time

def fetch_user_articles(self, user, browser, max_behot_time="0"):
    # self.get_js(), get_proxy_ip() and toutiaodb are the author's own helpers
    # and are not shown in this article
    honey = json.loads(self.get_js())
    signature = honey['_signature']
    _as = honey['as']
    cp = honey['cp']
    if self.user_page > 0:
        # from the second page on, recompute the signature from the
        # user id plus the current paging timestamp
        signature = browser.execute_script(
            "return window.TAC.sign(" + user.user_id + max_behot_time + ")")
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Connection': 'keep-alive',
        'authority': 'www.toutiao.com',
        'referer': user.media_url,
        'method': 'GET',
        'path': "/c/user/article/?page_type=1&user_id={}&max_behot_time={}&count=20&as={}&cp={}&_signature={}".format(
            user.user_id, max_behot_time, _as, cp, signature),
        'scheme': 'https'
    }
    self.s.headers.update(headers)
    req_url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id={}&max_behot_time={}&count=20&as={}&cp={}&_signature={}".format(
        user.user_id, max_behot_time, _as, cp, signature)
    req = self.s.get(req_url, proxies=get_proxy_ip())
    # throttle the request rate with a random delay
    time.sleep(random.random() * 2 + 2)
    data = json.loads(req.text)
    # timestamp of the last article on this page, used to request the next page
    max_behot_time = str(data['next']['max_behot_time'])
    if data['has_more']:
        self.user_page = self.user_page + 1
        self.parse_user_artcle(data['data'], user.user_id, user.media_url)
        # sleep another 2 seconds before fetching the next page
        time.sleep(2)
        self.fetch_user_articles(user, browser, max_behot_time)
    else:
        self.parse_user_artcle(data['data'], user.user_id, user.media_url)
        toutiaodb.save(self.user_artcile_list)
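get_proxy_ip() is one of the author's helpers that is not shown here. For requests it only needs to return a proxies mapping, so a stand-in could be as simple as the sketch below (assuming a single fixed proxy rather than the author's rotating pool; the address is a placeholder):

def get_proxy_ip():
    # requests expects a mapping of scheme -> proxy URL
    return {
        "http": "http://127.0.0.1:8888",
        "https": "http://127.0.0.1:8888",
    }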

def parse_user_artcle(self, items, user_id, url):
    for item in items:
        article = toutiaoitem()  # one record per article in the response
        article.user_id = user_id
        article.source = item['source']
        article.title = item['title']
        article.source_url = 'https:' + item['display_url']
        article.media_url = url
        article.item_id = item['item_id']
        article.abstract = item['abstract']
        article.comments_count = item['comments_count']
        article.behot_time = item['behot_time']
        article.image_url = item['image_url']
        article.image_list = item['image_list']
        article.tag = item['tag']
        article.chinese_tag = item['chinese_tag']
        article.read_count = item['go_detail_count']
        article.article_genre = item['article_genre']
        self.user_artcile_list.append(article)
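The toutiaoitem class used above is never defined in this article; any plain holder exposing the fields written by parse_user_artcle would do. A minimal stand-in might look like this (a sketch, not the author's actual model class):

class toutiaoitem:
    # plain container for one Toutiao article record
    def __init__(self):
        self.user_id = None
        self.source = None
        self.title = None
        self.source_url = None
        self.media_url = None
        self.item_id = None
        self.abstract = None
        self.comments_count = None
        self.behot_time = None
        self.image_url = None
        self.image_list = None
        self.tag = None
        self.chinese_tag = None
        self.read_count = None
        self.article_genre = None

toutiaodb.save() is likewise the author's own persistence helper and is not shown here.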
