爬取今日头条图片

编程入门 行业动态 更新时间:2024-10-28 14:24:38

用requests+正则表达式+BeautifulSoup爬取今日头条的美图!

import json
import os
import pymongo
import requests
import requests.exceptions
import re
from bs4 import BeautifulSoup
from hashlib import md5
from test.config import *

# Module-level MongoDB handles, created as soon as this module is imported.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they are
# supplied by the star import of test.config -- verify against that module.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def get_page_index(offset,keyword):
    """Request one page of Toutiao gallery search results.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body text when the server replies with HTTP 200;
        None for any other status code or when the request itself fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery',
    }
    # The host previously read 'www.toutiao' (no top-level domain), which can
    # never resolve, so every call ended in the except branch.
    url = 'https://www.toutiao.com/search_content/'
    try:
        response = requests.get(url, params=params)
    except requests.exceptions.RequestException:
        print('RequestException:请求错误')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URLs found in a search-result JSON document.

    Args:
        html: JSON text as returned by get_page_index(), or None when that
            request failed.

    Yields:
        The non-empty ``article_url`` value of every dict entry in the
        top-level ``data`` list. Nothing is yielded when the input is
        missing, is not valid JSON, or has no usable ``data`` list.
    """
    if not html:
        return
    try:
        result = json.loads(html)
    except (TypeError, ValueError):
        # Not JSON at all (e.g. an HTML error page): treat as "no results".
        return
    if not isinstance(result, dict):
        return
    entries = result.get('data')
    if not isinstance(entries, list):
        # Covers both a missing key and an explicit null.
        return
    for item in entries:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a URL would only make the downstream fetch fail.
        if article_url:
            yield article_url

def get_page_image(url):
    """Download an article page and return its text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the status code is 200; None for any other
        status or when requests raises a RequestException (which is
        reported on stdout).
    """
    page_text = None
    try:
        reply = requests.get(url)
        if reply.status_code == 200:
            page_text = reply.text
    except requests.exceptions.RequestException:
        print('请求错误',url)
    return page_text

def parse_page_image(html,url):
    """Extract the title and gallery image URLs from an article page.

    Args:
        html: HTML text of the article page; may be None or empty when the
            download failed.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``'title'``, ``'url'`` and ``'images'`` when the
        page embeds a ``gallery: JSON.parse(...)`` payload containing
        ``sub_images``; otherwise None.
    """
    if not html:
        return None
    # Raw string: '\(' in a normal string literal is an invalid escape.
    match = re.search(r'gallery: JSON.parse\((.*?)\)', html, re.S)
    if not match:
        return None
    try:
        # The argument of JSON.parse is itself a JSON string literal, so it
        # has to be decoded twice: once to a str, once to the actual object.
        data = json.loads(json.loads(match.group(1)))
    except (TypeError, ValueError):
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    soup = BeautifulSoup(html,'lxml')
    title_tags = soup.select('title')
    # A page without <title> should not abort the whole crawl.
    title = title_tags[0].text if title_tags else ''
    images = [item.get('url') for item in data.get('sub_images') or []]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Insert one parsed record into the ``MONGO_TABLE`` collection.

    Args:
        result: Document (dict) to store. As with any PyMongo insert, an
            ``_id`` key is added to it when absent.

    Returns:
        True when the document was inserted, False when there was nothing to
        insert or no id came back.
    """
    if not result:
        return False
    # Collection.insert() is deprecated in PyMongo 3 and removed in PyMongo 4;
    # insert_one() is the supported single-document API.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存储到MongoDB成功',result)
        return True
    return False
def download_image(url):
    """Fetch a single image and pass its bytes to save_iamges().

    Args:
        url: Address of the image.

    Returns:
        Whatever save_iamges() returns when the server answers with HTTP 200;
        None for any other status or when the request raises a
        RequestException (which is reported on stdout).
    """
    print('正在下载:',url)
    try:
        reply = requests.get(url)
        if reply.status_code != 200:
            return None
        return save_iamges(reply.content)
    except requests.exceptions.RequestException:
        print('图片下载错误',url)
        return None
def save_iamges(content):
    """Store image bytes under ``<cwd>/images/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same file; an existing file is left untouched.

    Args:
        content: Raw image bytes.
    """
    # os.path.join instead of hard-coded '\\' so the path is also valid
    # outside Windows.
    image_dir = os.path.join(os.getcwd(), 'images')
    # The directory was never created before, so the first write failed with
    # FileNotFoundError.
    os.makedirs(image_dir, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(image_dir, file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)

def main():
    """Crawl the first search-result page for '美女' and download its images.

    Fetches the index page, follows every article URL it lists, and downloads
    each gallery image found. Articles that cannot be fetched or that contain
    no gallery data are skipped instead of aborting the run.
    """
    html = get_page_index(0,'美女')
    if not html:
        # Index request failed; there is nothing to iterate over.
        return
    # Loop variable renamed from 'str', which shadowed the builtin.
    for article_url in parse_page_index(html):
        if not article_url:
            continue
        page_html = get_page_image(article_url)
        if not page_html:
            continue
        result = parse_page_image(page_html, article_url)
        if not result:
            # parse_page_image returns None for pages without gallery data.
            continue
        for image in result.get('images'):
            download_image(image)
        # MongoDB persistence was left disabled in the original script:
        # save_to_mongo(result)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

更多推荐

爬取今日头条图片

本文发布于:2023-06-14 06:01:00,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1445307.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:头条   今日   图片

发布评论

评论列表 (有 0 条评论)
草根站长

www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!