爬取电影天堂电影详情和磁力链接

编程入门 行业动态 更新时间:2024-10-26 18:17:00
import requests
from lxml import etree

# Browser-like request headers passed to every requests.get call in this script.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# Site root; listing paths and detail-page hrefs are appended to it.
# The host needs its top-level domain, otherwise every request fails to resolve.
base_url = 'https://www.dy2018.com'
# Detail-page URLs collected by get_detail_urls and consumed by the main loop.
detail_urls = []


# Collect the detail-page URL of every movie on the listing pages
def get_detail_urls(num):
    """Append the detail-page URLs found on listing pages 1..num to detail_urls.

    The first listing page is index.html; page n (n > 1) is index_<n>.html.

    Args:
        num: number of listing pages to crawl, starting from page 1.

    Raises:
        requests.RequestException: if a page cannot be fetched within the timeout.
    """
    for page in range(1, num + 1):
        if page == 1:
            path = '/html/gndy/dyzz/index.html'
        else:
            path = '/html/gndy/dyzz/index_%d.html' % page
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(base_url + path, headers=headers, timeout=10)
        # Decode as GBK but drop undecodable bytes, so a single stray byte
        # does not abort the run with UnicodeDecodeError.
        result = response.content.decode('gbk', errors='ignore')
        html = etree.HTML(result)
        # hrefs are concatenated directly onto the site root.
        for href in html.xpath("//table[@class='tbspan']//a/@href"):
            detail_urls.append(base_url + href)

# Extract the needed data
# Markers of the single-line fields on a detail page, paired with the key
# each value is stored under in the movie dict.
_FIELD_KEYS = (
    ("◎片  名", 'movie_name'),
    ("◎年  代", 'movie_year'),
    ("◎产  地", 'movie_address'),
    ("◎类  别", 'movie_category'),
    ("◎语  言", 'movie_language'),
    ("◎豆瓣评分", 'douban_score'),
    ("◎导  演", 'movie_director'),
)
_ACTORS_MARKER = "◎主  演"
_INTRO_MARKER = "◎简  介"


def _collect_actors(infos, index):
    """Return the cast names that start on infos[index] and continue below it."""
    names = [infos[index][len(_ACTORS_MARKER):].strip()]
    for text in infos[index + 1:]:
        # Any "◎" line opens the next field, so the cast list ends there
        # (not only at the intro marker).
        if text.startswith("◎"):
            break
        names.append(text.strip())
    # Whitespace-only text nodes are not actor names.
    return [name for name in names if name]


def parse_detail_page(url, movies):
    """Fetch one movie detail page and append its parsed record to movies.

    The record is a dict that may contain movie_name, movie_year,
    movie_address, movie_category, movie_language, douban_score,
    movie_director, movie_actors (list) and movie_intro, depending on which
    markers appear in the page's Zoom div, plus download_url (list), which is
    always set.

    Args:
        url: absolute URL of the detail page.
        movies: list that receives the record; it is mutated in place.

    Raises:
        requests.RequestException: if the page cannot be fetched within the timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # Drop undecodable bytes instead of failing with UnicodeDecodeError.
    result = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(result)
    movie = {}
    for detail in html.xpath("//div[@id='Zoom']"):
        infos = detail.xpath(".//text()")
        for index, info in enumerate(infos):
            if info.startswith(_ACTORS_MARKER):
                movie['movie_actors'] = _collect_actors(infos, index)
            elif info.startswith(_INTRO_MARKER):
                # The intro is looked for in the two text nodes after the
                # marker. Slicing cannot run past the end of the list, and
                # blank nodes are skipped so they do not replace real text.
                candidates = [text for text in infos[index + 1:index + 3]
                              if text.strip()]
                if candidates:
                    movie['movie_intro'] = candidates[-1]
            else:
                for marker, key in _FIELD_KEYS:
                    if info.startswith(marker):
                        movie[key] = info[len(marker):].strip()
                        break
    links = html.xpath(".//td[@bgcolor='#fdfddf']//a/text()")
    # str.find returns -1 (truthy) when the text is absent, so test membership
    # explicitly, and build a new list rather than deleting while iterating.
    movie['download_url'] = [link for link in links if "电影天堂" not in link]
    movies.append(movie)



if __name__ == '__main__':
    # Ask how many listing pages to crawl, scrape every detail page found on
    # them, then print each movie record field by field.
    movies = []
    num = int(input("请输入需要的爬取的页数:"))
    get_detail_urls(num)

    for detail_url in detail_urls:
        parse_detail_page(detail_url, movies)

    for record in movies:
        for field in record:
            # Prints "<key>:<value>" followed by an empty line.
            print(field, record[field], sep=":")
            print()
        # Extra blank line between movies.
        print()

 

更多推荐

爬取电影天堂电影详情和磁力链接

本文发布于:2023-06-14 08:04:00,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1454273.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:电影   磁力   详情   链接   天堂

发布评论

评论列表 (有 0 条评论)
草根站长

www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!