import requests
from lxml import etree
# Request headers: spoof a desktop Chrome UA so the site serves normal pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Site root; relative hrefs scraped from the list pages are joined onto this.
# BUG FIX: the original value 'https://www.dy2018' lacked the '.com' TLD,
# so every request failed to resolve.
base_url = 'https://www.dy2018.com'
# Accumulates the absolute URL of every movie detail page found.
detail_urls = []
# 获取每部影片详情页面的url
def get_detail_urls(num):
    """Collect movie-detail page URLs from the first *num* list pages.

    Appends each absolute detail URL to the module-level ``detail_urls``.
    Page 1 lives at ``index.html``; page i (i > 1) at ``index_i.html``.

    :param num: number of list pages to crawl, starting from page 1.
    """
    for page in range(1, num + 1):
        if page == 1:
            url = base_url + '/html/gndy/dyzz/index.html'
        else:
            url = base_url + '/html/gndy/dyzz/index_%d.html' % page
        response = requests.get(url, headers=headers)
        # Pages are GBK-encoded; some contain bytes outside strict GBK, so
        # ignore undecodable bytes instead of crashing mid-crawl.
        result = response.content.decode('gbk', errors='ignore')
        html = etree.HTML(result)
        # Each movie row lives in a <table class="tbspan">; its <a href>
        # is a site-relative link to the detail page.
        hrefs = html.xpath("//table[@class='tbspan']//a/@href")
        for href in hrefs:
            detail_urls.append(base_url + href)
# 提取需要的数据
def parse_detail_page(url, movies):
    """Scrape one movie detail page and append a dict of its data to *movies*.

    Extracts the labelled fields from the ``#Zoom`` info block (name, year,
    country, category, language, douban score, director, actors, intro) and
    the download links from the highlighted table cells.

    :param url: absolute URL of the detail page.
    :param movies: list to which the resulting movie dict is appended.
    """
    response = requests.get(url, headers=headers)
    # Detail pages are GBK-encoded; tolerate the occasional bad byte.
    result = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(result)
    details = html.xpath("//div[@id='Zoom']")
    movie = {}
    for detail in details:
        infos = detail.xpath(".//text()")
        for index, info in enumerate(infos):
            if info.startswith("◎片 名"):
                movie['movie_name'] = info.replace("◎片 名", "").strip()
            elif info.startswith("◎年 代"):
                movie['movie_year'] = info.replace("◎年 代", "").strip()
            elif info.startswith("◎产 地"):
                movie['movie_address'] = info.replace("◎产 地", "").strip()
            elif info.startswith("◎类 别"):
                movie['movie_category'] = info.replace("◎类 别", "").strip()
            elif info.startswith("◎语 言"):
                movie['movie_language'] = info.replace("◎语 言", "").strip()
            elif info.startswith("◎豆瓣评分"):
                movie['douban_score'] = info.replace("◎豆瓣评分", "").strip()
            elif info.startswith("◎导 演"):
                movie['movie_director'] = info.replace("◎导 演", "").strip()
            elif info.startswith("◎主 演"):
                # First actor sits on the label line; the rest occupy the
                # following text nodes up to the 简介 (intro) marker.
                actors = [info.replace("◎主 演", "").strip()]
                for x in range(index + 1, len(infos)):
                    if infos[x].startswith("◎简 介"):
                        break
                    actors.append(infos[x].strip())
                movie['movie_actors'] = actors
            elif info.startswith("◎简 介"):
                # BUG FIX: the original iterated range(index+1, index+3) and
                # kept only the LAST node (overwriting on each pass), and
                # could IndexError near the end of `infos`. Take the first
                # non-empty text node after the marker instead.
                for x in range(index + 1, len(infos)):
                    intro = infos[x].strip()
                    if intro:
                        movie['movie_intro'] = intro
                        break
    download_url = html.xpath(".//td[@bgcolor='#fdfddf']//a/text()")
    # BUG FIX: the original used ``if url.find("电影天堂"):`` — str.find
    # returns -1 (truthy) when NOT found and 0 (falsy) when found at the
    # start, so the condition was inverted; it also deleted from the list
    # while iterating it, skipping entries. Build a filtered list instead,
    # dropping the site-name anchor texts and keeping the real links.
    movie['download_url'] = [link for link in download_url if "电影天堂" not in link]
    movies.append(movie)
if __name__ == '__main__':
    # Ask how many list pages to crawl, gather all detail-page URLs,
    # then scrape each one into `movies`.
    movies = []
    page_count = int(input("请输入需要的爬取的页数:"))
    get_detail_urls(page_count)
    for detail_url in detail_urls:
        parse_detail_page(detail_url, movies)
    # Pretty-print every scraped movie, one "key:value" per line,
    # with a blank gap between movies.
    for item in movies:
        for field, value in item.items():
            print(f"{field}:{value}")
        print()
        print()
# --- Blog-article footer residue from the original paste (kept as comments
# --- so the file remains valid Python):
# 更多推荐
# 爬取电影天堂电影详情和磁力链接
# 发布评论