用requests+正则表达式+BeautifulSoup爬取今日头条的美图!
import json
import os
import pymongo
import requests
import requests.exceptions
import re
from bs4 import BeautifulSoup
from hashlib import md5
from test.config import *
# Module-level MongoDB handles shared by the functions below.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they are
# supplied by the star import from test.config -- confirm against that module.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
def get_page_index(offset, keyword):
    """Fetch one page of gallery search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery',
    }
    # The host needs its ".com" suffix; "www.toutiao" alone is not the
    # site's host name, so every request against it would fail.
    url = 'https://www.toutiao.com/search_content/'
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            return response.text
        return None
    except requests.exceptions.RequestException:
        print('RequestException:请求错误')
        # Explicit so both failure paths visibly return the same value.
        return None
def parse_page_index(html):
    """Yield the article URLs found in a search-result JSON document.

    Args:
        html: JSON text as returned by the search endpoint. ``None`` or an
            empty string produces no items, so a failed fetch can be passed
            straight through.

    Yields:
        Each non-empty ``article_url`` value from the entries of the
        top-level ``data`` list.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    if not html:
        return
    result = json.loads(html)
    if not isinstance(result, dict):
        return
    # "data" may be missing or null; treat both as an empty result list.
    for item in result.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched; skip them
        # instead of handing None to the caller.
        if article_url:
            yield article_url
def get_page_image(url):
    """Download an article page and return its markup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text on HTTP 200; ``None`` for any other status or
        when the request raises a ``RequestException``.
    """
    try:
        reply = requests.get(url)
        succeeded = reply.status_code == 200
        return reply.text if succeeded else None
    except requests.exceptions.RequestException:
        print('请求错误', url)
        return None
def parse_page_image(html, url):
    """Extract the page title and gallery image URLs from an article page.

    Args:
        html: Page markup as text. ``None`` or an empty string yields
            ``None`` so a failed fetch can be passed straight through.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with the keys ``'title'``, ``'url'`` and ``'images'`` when
        the page embeds a ``gallery: JSON.parse(...)`` payload that can be
        decoded, otherwise ``None``. ``'images'`` is an empty list when the
        payload carries no ``sub_images``.
    """
    if not html:
        return None
    # Raw string: the pattern contains "\(" and "\)", which are invalid
    # escape sequences in an ordinary string literal.
    # NOTE: the non-greedy group stops at the first ")", so a payload that
    # itself contains ")" is truncated and handled as undecodable below.
    match = re.search(r'gallery: JSON.parse\((.*?)\)', html, re.S)
    if not match:
        return None
    try:
        # The argument of JSON.parse is a JSON string literal whose content
        # is itself JSON, hence the two decoding passes.
        data = json.loads(json.loads(match.group(1)))
    except (TypeError, ValueError):
        print('图集数据解析错误', url)
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> element get an empty title instead of raising.
    title = title_tags[0].text if title_tags else ''

    images = []
    if isinstance(data, dict):
        images = [item.get('url') for item in data.get('sub_images') or []]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Insert one parsed-page record into the configured MongoDB collection.

    Args:
        result: Document (dict) to store. The driver adds an ``_id`` key to
            it as part of the insert.

    Returns:
        ``True`` when the insert reported a document id, otherwise ``False``.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is its single-document replacement.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存储到MongoDB成功', result)
        return True
    return False
def download_image(url):
    """Fetch a single image and hand its bytes to ``save_iamges``.

    Args:
        url: Address of the image to download.

    Returns:
        Whatever ``save_iamges`` returns on HTTP 200; ``None`` for any
        other status or when the request raises a ``RequestException``.
    """
    print('正在下载:', url)
    try:
        reply = requests.get(url)
        if reply.status_code != 200:
            return None
        return save_iamges(reply.content)
    except requests.exceptions.RequestException:
        print('图片下载错误', url)
        return None
def save_iamges(content):
    """Write image bytes to ``<cwd>/images/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of ``content``, so identical image
    data maps to the same file and is written only once.

    Args:
        content: Raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    # open() does not create parent directories, so make sure the target
    # directory exists before writing into it.
    os.makedirs(image_dir, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    # os.path.join keeps the path valid on every platform; hard-coded
    # backslashes only act as separators on Windows.
    file_path = os.path.join(image_dir, file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def main():
    """Crawl the first search-result page for '美女' and download its images.

    Each step that can come back empty (index fetch, article fetch, gallery
    parse) is checked, so one failed page is skipped instead of aborting
    the whole run.
    """
    html = get_page_index(0, '美女')
    if not html:
        # Nothing to parse when the index request failed.
        return
    for article_url in parse_page_index(html):
        if not article_url:
            continue
        page_html = get_page_image(article_url)
        if not page_html:
            continue
        result = parse_page_image(page_html, article_url)
        if not result:
            # Page had no gallery payload.
            continue
        for image in result.get('images') or []:
            if image:
                download_image(image)
        # NOTE: MongoDB persistence is disabled; call save_to_mongo(result)
        # here to store each parsed record.
# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
更多推荐
爬取今日头条图片
发布评论