爬虫练习1
爬虫练习1
"""Crawler practice 1.

Scrapes a movie-listing teaching site (borrowed from teacher Cui's site)
with requests + lxml XPath analysis, saving each movie's detail page as a
pretty-printed UTF-8 JSON file, one file per movie, scraping ten listing
pages in parallel with a process pool.
"""
import json
import multiprocessing
from os import makedirs
from os.path import exists  # kept from original (unused after exist_ok fix)
from urllib.parse import urljoin

import requests
from lxml import etree

# Base URL of the site to scrape. '/' is a placeholder here — set this to
# the real host (e.g. 'https://ssr1.scrape.center/') before running.
url = '/'


def index_url(url):
    """Fetch *url* and return a parsed lxml HTML tree.

    Returns None when the response status is not 200; callers must check.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return etree.HTML(response.text)
    return None  # explicit: a failed fetch yields no tree


def get_url(page):
    """Fetch listing page number *page* and return its parsed tree (or None)."""
    baseurl = f'{url}page/{page}'
    print(baseurl)
    return index_url(baseurl)


def detail_url(html):
    """Yield absolute detail-page URLs found on a parsed listing page."""
    for detail in html.xpath('//img[@class="cover"]/../@href'):
        # hrefs on the listing page are site-relative; the original yielded
        # them unjoined (f'{detail}'), producing unfetchable URLs. Join
        # against the base URL instead.
        details = urljoin(url, detail)
        print('detail urls', details)
        yield details


def detail_html(url):
    """Fetch and parse a single detail page (or None on failure)."""
    return index_url(url)


def scrape_detail(html):
    """Extract the movie fields from a parsed detail-page tree.

    Returns a dict with the original Chinese keys: cover, name, categories,
    country, runtime, release date, synopsis, and score.
    """
    cover = html.xpath('//img/@src')[1]
    name = html.xpath('//h2[@class = "m-b-sm"]/text()')[0]
    categories = html.xpath('//div[@class="categories"]//span/text()')
    country_time_deputy = html.xpath('//div[@class="m-v-sm info"]/span/text()')
    country = country_time_deputy[0]
    time = country_time_deputy[2]
    # The release date span is missing on some pages; fall back to '空'.
    try:
        deputy = country_time_deputy[3]
    except IndexError:
        deputy = '空'
    drama = html.xpath('//div[@class="drama"]/p/text()')[0].strip()
    score = html.xpath('//p[contains(@class,"score")]/text()')[0].strip()
    return {
        '封面': cover,
        '名字': name,
        '类型': categories,
        '地区': country,
        '时长': time,
        '上映时间': deputy,
        '内容简介': drama,
        '分数': score,
    }


# Output directory for the per-movie JSON files.
result_file = 'contxt'
# exist_ok=True replaces the race-prone `exists(dir) or makedirs(dir)` idiom.
makedirs(result_file, exist_ok=True)


def save_data(data):
    """Write one movie dict to <result_file>/<名字>.json as UTF-8 JSON."""
    name = data.get('名字')
    data_path = f'{result_file}/{name}.json'
    # The original passed a bare open() to json.dump and never closed it,
    # leaking the file handle; the with-block guarantees the close.
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(page):
    """Scrape one listing page, saving every movie detail found on it."""
    indexurl = get_url(page)
    if indexurl is None:
        return  # listing fetch failed — skip this page instead of crashing
    for detail in detail_url(indexurl):
        detailhtml = detail_html(detail)
        if detailhtml is None:
            continue  # detail fetch failed — skip this movie
        data = scrape_detail(detailhtml)
        save_data(data)
        print(data)


if __name__ == '__main__':
    # One worker process per listing page, pages 1 through 10.
    pool = multiprocessing.Pool()
    pool.map(main, range(1, 11))
    pool.close()
    pool.join()
更多推荐
爬虫练习1
发布评论