Scraping Douban Movie Top 250 Information
Method 1: requests + BeautifulSoup, saved to CSV
import requests
from bs4 import BeautifulSoup
from re import findall
import csv

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/108.0.0.0 Safari/537.36'
}


def get_content(he):
    html = requests.get(he, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    all_movies = []
    all_movie = soup.select('ol.grid_view>li')
    for movie in all_movie:
        movies = []
        # ranking number (parsed but not written to the CSV)
        number = movie.select('.pic>em')[0].text
        # Chinese and original titles are separate .title spans; join them
        titles = movie.select('.title')
        title = ''
        for t in titles:
            title += t.text.split('/')[-1].strip()
        movies.append(title)
        # alternative titles
        other = movie.select('.other')[0].text.split('/')
        for o in other:
            movies.append(o.strip())
        # the "year / country / genre" line inside .bd>p
        category = movie.select('.bd>p')[0].text
        re_category = findall(r'\s+(.\d{4}.+)', category.split('\n')[-2])[0]
        for x in re_category.split('/'):
            movies.append(x.strip())
        # rating
        score = movie.select('.star>span')[1].text
        movies.append(score)
        # number of ratings, without the trailing "人评价"
        comment = movie.select('.star>span')[3].text
        movies.append(comment[:-3])
        # one-line quote; a few movies do not have one
        quote = movie.select('.quote>span')
        movies.append(quote[0].text if quote else '')
        all_movies.append(movies)
    return all_movies


def download_content(movie):
    with open('files/电影.csv', 'a', encoding='utf-8', newline='') as f:
        w = csv.writer(f)
        # write the header row only once, when the file is still empty
        if f.tell() == 0:
            w.writerow(['片名', '其它', '类别', '评分', '评论人数', '标语'])
        for m in movie:
            w.writerow(m)


if __name__ == '__main__':
    page = int(input('请输入需爬取的页数(1~10): '))
    for i in range(1, page + 1):
        # start is 0 for the first page, 25 for the second, and so on
        href = 'https://movie.douban.com/top250?start=' + str((i - 1) * 25) + '&filter='
        me = get_content(href)
        download_content(me)
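Douban sometimes rejects rapid or repeated requests, so in practice it helps to add a timeout, a status check, and a short pause between pages. Below is a minimal, hedged sketch of such a fetch helper; the name fetch_html, the retry count, and the 1-second back-off are illustrative assumptions, not part of the original script.

import time
import requests

def fetch_html(url, headers, retries=3):
    # hypothetical helper: retry a few times with a timeout and a polite pause
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # raise on 4xx/5xx responses
            return response.text
        except requests.RequestException:
            time.sleep(1)  # back off briefly before retrying
    return ''

get_content could then call fetch_html(he, headers) instead of requests.get(he, headers=headers).text.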
Method 2: requests + BeautifulSoup, one function per page
import requests
from bs4 import BeautifulSoup
import csv


def get_one_page(page):
    # 1. Send the request and fetch the page source
    url = f'https://movie.douban.com/top250?start={page}&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = response.text

    # 2. Parse the page
    soup = BeautifulSoup(html, 'lxml')
    # every movie sits in its own div with class "item"
    all_film_box = soup.select('.item')
    for div in all_film_box:
        # movie title
        name = div.select_one('.title').text
        # rating
        score = float(div.select_one('.rating_num').text)
        # number of ratings, without the trailing "人评价"
        comment_num = int(div.select_one('.star>span:nth-child(4)').text[:-3])
        # one-line description; some movies do not have one
        describe_tag = div.select_one('.inq')
        if describe_tag:
            describe = describe_tag.text
        else:
            describe = ''
        # the "year / country / genre" info is the last line of the .bd>p block
        message = div.select_one('.bd>p').text
        info = message.strip().split('\n')[-1].strip()   # str.strip() removes leading/trailing whitespace
        result = [x.strip() for x in info.split('/')]
        # release year
        time = result[0]
        # country / region
        country = result[1]
        # genre
        film_type = result[-1]
        # 3. Write the record into the csv file
        w.writerow([name, score, comment_num, time, country, film_type, describe])
    print('写入成功!')


if __name__ == '__main__':
    # 1. Create the writer and write the header row
    w = csv.writer(open('files/电影.csv', 'w', encoding='utf-8', newline=''))
    w.writerow(['电影名称', '评分', '评论人数', '上映时间', '国家', '类型', '描述'])
    # 2. Fetch all ten pages (start = 0, 25, ..., 225)
    for start in range(0, 226, 25):
        get_one_page(start)
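To spot-check the output, the file written above can be read back with the same csv module. A short sketch, assuming the script has already produced files/电影.csv:

import csv

# preview the header row plus the first five records
with open('files/电影.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(row)
        if i >= 5:
            break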
Method 3: Selenium + openpyxl, saved to Excel
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time
import openpyxl

# 1. Prepare the Excel workbook that will hold the data
wb = openpyxl.Workbook()
sheet = wb.create_sheet('豆瓣电影Top250')
sheet.cell(1, 1).value = '电影名称'
sheet.cell(1, 2).value = '评分'
row = 2

# 2. Load the page in a real browser
b = Chrome()
b.get('https://movie.douban.com/top250')
time.sleep(1)

for _ in range(10):
    # names of all movies on the current page
    all_title = b.find_elements(By.CSS_SELECTOR, 'div.hd>a>span:nth-child(1)')
    all_name = []
    for x in all_title:
        all_name.append(x.text)
    # ratings of all movies on the current page
    all_score_tag = b.find_elements(By.CLASS_NAME, 'rating_num')
    all_score = []
    for x in all_score_tag:
        all_score.append(x.text)
    # write this page's names and ratings into the worksheet
    for index in range(len(all_name)):
        sheet.cell(row, 1).value = all_name[index]
        sheet.cell(row, 2).value = all_score[index]
        row += 1
    # click the "next page" control, then wait for the new page to render
    b.find_element(By.CLASS_NAME, 'next').click()
    time.sleep(2)

wb.save('files/电影.xlsx')
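If no visible browser window is needed, Selenium can also drive Chrome headlessly while the rest of the script stays the same. A hedged sketch of the setup; the exact option strings depend on the installed Chrome and Selenium versions:

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')           # run Chrome without opening a window
options.add_argument('--window-size=1920,1080')  # give the page a normal viewport
b = Chrome(options=options)
b.implicitly_wait(5)                             # wait up to 5 seconds for elements to appear
b.get('https://movie.douban.com/top250')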
Downloading League of Legends Skin Images
import requests
import os


def get_all_hero_id():
    # the hero list interface returns a JSON document describing every hero
    response = requests.get('https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js')
    result = response.json()
    return [x['heroId'] for x in result['hero']]


def download(img_url, path):
    # download one image and write its binary content to disk
    response = requests.get(img_url)
    result = response.content
    with open(path, 'wb') as f:
        f.write(result)
    print(f'{path}:下载完成!')


def get_one_hero_skin(hero_id):
    # each hero has its own JSON file listing all of its skins
    url = f'https://game.gtimg.cn/images/lol/act/img/js/hero/{hero_id}.js'
    response = requests.get(url)
    result = response.json()
    for x in result['skins']:
        hero_name = x['heroName']
        skin_name = x['name'].replace('/', '')
        img_url = x['mainImg']
        # chroma skins have no mainImg, so fall back to chromaImg
        if not img_url:
            img_url = x['chromaImg']
        # create one folder per hero
        if not os.path.exists(f'files/{hero_name}'):
            os.mkdir(f'files/{hero_name}')
        download(img_url, f'files/{hero_name}/{skin_name}.png')


if __name__ == '__main__':
    for x in get_all_hero_id():
        get_one_hero_skin(x)
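Downloading every skin one by one is slow, and each hero is independent of the others, so the per-hero work can be parallelized with a thread pool from the standard library. A sketch of an alternative entry point; the worker count of 8 is an arbitrary assumption:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    hero_ids = get_all_hero_id()
    # fetch up to 8 heroes' skin lists and images concurrently
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(get_one_hero_skin, hero_ids)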