项目之多线程爬取blue网站

编程入门行业动态更新时间:2024-10-11 05:24:53

项目之多线程爬取blue网站

项目之多线程爬取blue网站

始于2020.2.25
以.html为例
该网址可以先从网页源码中提取出一个m3u8的文件
.m3u8
但是没加载出来时只能提取到这个：
var vHLSurl = "https://"+CN1+"/20190205/2s3wcjro/index.m3u8";
也差不多
文件中是如下信息：
#EXTM3U
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=800000,RESOLUTION=720x406
/ppvod/4B8FA7A19F704F55A7AEF68E90B3B854.m3u8
然后，我们从捕获的XHR接口中可以发现这样一个文件
.m3u8
打开文件看，发现就是ts文件的播放列表
摘取部分如下：
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:9
#EXT-X-MEDIA-SEQUENCE:0
#EXTINF:6.039,
/20190205/2s3wcjro/800kb/hls/hDdIBh38332000.ts
#EXTINF:6.34,
/20190205/2s3wcjro/800kb/hls/hDdIBh38332001.ts
其中ts文件就是视频的流文件
.ts
使用时加上
这下我们就可以确定思路：
（1）抓取网页源码，解析出第一个m3u8文件和视频名称，保存下来
（2）从第一个m3u8文件中解析出第二个m3u8文件，把所有的ts信息都提取出来，保存在列表中
（3）先尝试单线程爬虫爬取部分ts数据，看是否有强烈的反爬措施，爬取时最好慢一点
（4）用os.makedirs来一次创建多层文件夹，将文件保存进去
（5）研究该网站反爬措施，然后在单线程爬虫的基础上，改造多线程爬虫
（6）导入某种Python库，将ts视频按m3u8文件整合成mp4，用ffmpeg来合并视频（参考相关博客）

修订&更新：
2020.2.27：
背景：（1）~（5）的已经全部实现，网站未有强烈的反爬措施，但是下载视频时会有延迟，即有些响应会超时，导致一部分线程挂掉，还有就是爬取速度不稳定，导致网断，整个程序得重新再来，这样的程序太脆弱了，我们需要一个stronger程序！
初始方案：
（1）每次爬取时，先检查抓取下的ts链接在对应文件夹中是否已有这个文件，如果有，直接跳过，如果没有，就下载
（2）每次从Queue里取出ts链接时，准确来说不要取出来，就访问头部元素，如果下载成功了再取出来，防止缺少链接
（3）在遇到响应超时或其他error时，抛出错误，并继续执行程序，而不是被搞掉了
（4）将error输出为日志
方案修订：
（1）第（2）点中，链接要取出来，否则多线程会一直爬取头部元素，导致效率低下以致崩溃，改进方法为将其再插入尾部，等待被再次取出
方案实现：
（1）用os方法里的os.path.isfile()来判断是否有该文件
（2）用try和except来实现重复爬取，用BaseException来就够了
（3）超时特判通过在代码中给get方法添加timeout参数来实现

代码：

import requests
import re
import time
import os
import sys
import threading
import logging
import subprocess
from queue import Queue
from bs4 import BeautifulSoup# 设置日志参数，输出到log文件中
# Send WARNING-and-above records to log/spider.log.
# basicConfig(filename=...) opens the log file immediately, so the directory
# has to exist before this call; otherwise a first run fails with
# FileNotFoundError.
os.makedirs('log', exist_ok=True)
logging.basicConfig(
    filename='log/spider.log',
    level=logging.WARNING,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
)

# Shared session object used for every HTTP request in this script.
s = requests.Session()
# Request headers sent with every HTTP call.
# NOTE(review): the 'Host' value looks truncated (no top-level domain) --
# confirm it against the real site before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Host': 'www.52sehua',
}

# Worker threads created by create_ts_thread(); started and joined by main().
download_list = []

# Root directory for all downloaded files; assigned by main().
file_path = ''

# Seconds to pause before retrying a failed page or playlist request.
RETRY_DELAY = 2


def prepare_system():
    """Create the working directories and return the download root.

    Creates ``<cwd>/log`` and ``<parent of cwd>/mp4/ppvod`` (including the
    intermediate ``mp4`` directory) when they do not exist yet.

    Returns:
        str: ``<parent of cwd>/mp4``, written with forward slashes.
    """
    cwd = os.getcwd().replace('\\', '/')
    os.makedirs(cwd + '/log', exist_ok=True)
    root = cwd.rsplit('/', 1)[0] + '/mp4'
    os.makedirs(root + '/ppvod', exist_ok=True)
    return root


class download_ts_thread(threading.Thread):
    """Worker thread that downloads .ts segments taken from a shared queue."""

    # Three digits directly in front of the ".ts" extension.
    _SEQ_PATTERN = re.compile(r'(\d{3})\.ts')

    def __init__(self, name, queue_ts, ols):
        """Store the thread name, the shared queue and the reference length.

        Args:
            name: Display name used in progress messages.
            queue_ts: Queue of segment paths shared by all workers.
            ols: Length of the first segment path; see parse_ts().
        """
        super().__init__()
        self.name = name
        self.queue_ts = queue_ts
        self.ols = ols

    def fn(self, a):
        """Regex substitution callback: prefix the matched text with '0'."""
        return '0' + a.group()

    def parse_ts(self, ts):
        """Return the local file name to use for segment path *ts*.

        Paths whose length equals ``self.ols`` get a '0' inserted in front of
        the three digits preceding ".ts"; every other path is returned
        unchanged.  Presumably this keeps file names sorting in playback
        order once the sequence number grows by a digit -- TODO confirm.
        """
        if len(ts) == self.ols:
            return self._SEQ_PATTERN.sub(self.fn, ts)
        return ts

    def run(self):
        """Download queued segments until the queue is empty.

        Segments whose target file already exists are skipped.  A segment
        whose request fails is logged and put back on the queue so it gets
        retried later.
        """
        # Local import: the file's import block only brings in Queue.
        from queue import Empty

        print('%s启动...' % self.name)
        while True:
            # get_nowait() instead of empty()+get(): with many workers another
            # thread can drain the queue between the two calls, which would
            # leave this thread blocked in get() forever.
            try:
                ts = self.queue_ts.get_nowait()
            except Empty:
                break
            file_ts = self.parse_ts(ts)
            target = file_path + file_ts
            if os.path.isfile(target):
                continue
            print('正在下载%s...' % ts)
            # NOTE(review): the URL prefix is empty in the published source;
            # the site host has to be supplied here.
            ts_url = '' + ts
            try:
                r = s.get(url=ts_url, headers=headers, timeout=80)
            except Exception as e:
                # Exception (not BaseException) so Ctrl-C still stops the run.
                logging.error(e)
                self.queue_ts.put(ts)
                print(e)
                time.sleep(2)
                continue
            with open(target, 'wb') as fp:
                fp.write(r.content)
            print('%s下载完成...' % file_ts)
            time.sleep(2)
        print('%s结束' % self.name)


def get_m3u8(recode, code, page):
    """Locate an m3u8 path inside *code*, fetch it (or reuse a saved copy).

    Args:
        recode: Regular expression whose first match in *code* is the m3u8
            path.
        code: Text to search (page source or a previous playlist).
        page: Ordinal of the playlist, used only in progress messages.

    Returns:
        str: The playlist text.

    Raises:
        IndexError: If *recode* does not match anything in *code*.
    """
    print('正在获取第%d个m3u8文件...' % page)
    # NOTE(review): the URL template holds no host in the published source;
    # the site host has to be supplied here.
    url_m3u8 = '{}'
    matches = re.compile(recode, re.M).findall(code)
    if not matches:
        raise IndexError('no m3u8 path matching %r was found' % recode)
    filename_m3u8 = matches[0]
    local_file = file_path + filename_m3u8

    if os.path.isfile(local_file):
        # Read with the same encoding the file is written with below.
        with open(local_file, 'r', encoding='utf-8') as fg:
            m3u8 = fg.read()
        print('检测到有本文件，结束！')
        return m3u8

    os.makedirs(os.path.split(local_file)[0], exist_ok=True)
    url_m3u8 = url_m3u8.format(filename_m3u8)
    while True:
        try:
            m3u8 = s.get(url=url_m3u8, headers=headers, timeout=30)
            m3u8.encoding = 'utf-8'
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C can break the retry loop.
            logging.error(e)
            print(e)
            print('重新下载m3u8文件')
            time.sleep(RETRY_DELAY)
            continue
        break
    print('第%d个m3u8文件获取完成，正在保存...' % page)
    with open(local_file, 'w', encoding='utf-8') as fp:
        fp.write(m3u8.text)
    print('保存完成！')
    return m3u8.text


def get_ts(code):
    """Return every .ts entry of playlist text *code*, in playlist order.

    Works with LF and CRLF line endings and with a final entry that has no
    trailing newline.  Lines starting with '#' are skipped.
    """
    print('正在获取ts信息...')
    entries = (line.strip() for line in code.splitlines())
    list_ts = [
        entry for entry in entries
        if entry.endswith('.ts') and not entry.startswith('#')
    ]
    print('ts信息获取完成！')
    return list_ts


def creat_queue(list_ts):
    """Return a Queue holding the items of *list_ts* in order."""
    q = Queue()
    for ts in list_ts:
        q.put(ts)
    return q


def create_ts_thread(queue_ts, ols, count=70):
    """Create *count* download workers and append them to download_list.

    Args:
        queue_ts: Queue of segment paths shared by all workers.
        ols: Reference path length passed on to each worker.
        count: Number of workers to create (defaults to 70).
    """
    for number in range(1, count + 1):
        name = '下载' + str(number) + '号'
        download_list.append(download_ts_thread(name, queue_ts, ols))


def create_dir(ts, title):
    """Ensure the directory for segment path *ts* exists and store the title.

    Args:
        ts: Segment path; its directory part is created below file_path.
        title: Video title, written via save_title().

    Returns:
        str: The directory part of *ts*.
    """
    print('检验文件路径...')
    path = os.path.split(ts)[0]
    if not os.path.exists(file_path + path):
        print('没有文件路径，创建文件路径！')
        os.makedirs(file_path + path)
    else:
        print('已存在文件路径！')
    save_title(file_path + path, title)
    return path


def get_title(text):
    """Return the content of the page's ``<meta name="keywords">`` tag.

    Also writes the raw page source to ``htmlcode.txt`` in the current
    directory.
    """
    soup = BeautifulSoup(text, 'html.parser')
    with open('htmlcode.txt', 'w', encoding='utf-8') as fp:
        fp.write(text)
    # The attribute filter is passed as a dict because ``name`` is already
    # the tag-name parameter of find_all(), so name="..." cannot be used.
    title = soup.find_all('meta', {'name': 'keywords'})[0]['content']
    return title


def save_title(path, title):
    """Write *title* into ``<path>/视频名称.txt``, then check for a finished movie."""
    with open(path + '/视频名称.txt', 'w', encoding='utf-8') as fp:
        fp.write(title)
    judge_exists_movie(path)


def combine_ts(title, path):
    """Concatenate the .ts files of a video directory into ``movie.mp4``.

    Segments are joined in sorted file-name order.  Nothing is written when
    ``movie.mp4`` already exists or when the directory holds no .ts file.

    Args:
        title: Unused; kept for interface compatibility.
        path: Directory of the video relative to file_path.
    """
    # Local import: shutil is not part of the file's import block.
    import shutil

    directory = file_path + path
    movie = directory + '/movie.mp4'
    if os.path.isfile(movie):
        print('movie文件已存在！')
        return

    segments = sorted(
        name for name in os.listdir(directory)
        if os.path.splitext(name)[1] == '.ts'
    )
    if not segments:
        print('没有可合并的ts文件！')
        return

    # Joined in Python rather than through a shell command: no hard-coded
    # drive letter, works on any platform, and no shell string is built from
    # paths that originate in downloaded playlist data.
    with open(movie, 'wb') as out:
        for name in segments:
            with open(directory + '/' + name, 'rb') as segment:
                shutil.copyfileobj(segment, out)
    print('mp4文件转化完毕！')


def judge_exists_movie(path):
    """Exit the program when ``<path>/movie.mp4`` already exists.

    Leftover .ts files in *path* are removed before exiting.
    """
    print('检验是否已经下载过...')
    path_movie = path + '/movie.mp4'
    if os.path.isfile(path_movie):
        print("当前movie已下载，无需再次下载！关闭程序！")
        remove_ts(path)
        sys.exit()


def remove_ts(path):
    """Delete every file with a ``.ts`` extension directly inside *path*."""
    print('删除ts文件...')
    for file in os.listdir(path):
        if os.path.splitext(file)[1] == '.ts':
            os.remove(path + '/' + file)
            print('删除%s中...' % file)


def main():
    """Ask for a video number, download all its segments and merge them."""
    global file_path
    file_path = prepare_system()
    number = int(input('请输入即将下载的视频号码：'))
    # NOTE(review): the URL holds no host in the published source; the site
    # host has to be supplied here.
    url = '/{}.html'
    url = url.format(number)
    T1 = time.time()
    print('正在下载页面信息...')
    while True:
        try:
            r = s.get(url=url, headers=headers, timeout=10)
            r.encoding = 'utf-8'
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C can break the retry loop.
            logging.error(e)
            print(e)
            print('重新下载页面信息...')
            time.sleep(RETRY_DELAY)
            continue
        break
    print('页面信息下载完成！')
    title = get_title(r.text)
    print(title + '正在下载...')
    # Alternative pattern for page source in which the host is still the
    # unevaluated script expression "https://"+CN<n>+"...":
    #   '\"https://\"\+CN\d+\+\"(.*)\"'
    m3u8_1 = get_m3u8('\"https://.*?(/.*)\"', r.text, 1)
    m3u8_2 = get_m3u8('.*?.m3u8', m3u8_1, 2)
    list_ts = get_ts(m3u8_2)
    if not list_ts:
        logging.error('no .ts entries found in the second m3u8 file')
        sys.exit('未在m3u8文件中找到ts信息！')
    ordinary_len_ts = len(list_ts[0])
    path = create_dir(list_ts[0], title)
    queue_ts = creat_queue(list_ts)
    print('初始化下载...')
    time.sleep(2)
    create_ts_thread(queue_ts, ordinary_len_ts)
    print('开始下载！！！')
    for tdownload in download_list:
        tdownload.start()
    for tdownload in download_list:
        tdownload.join()
    T2 = time.time() - T1
    print('下载完成！！！')
    print('共用时%.3f秒' % T2)
    print('开始转换成mp4文件...')
    time.sleep(2)
    combine_ts(title, path)


if __name__ == '__main__':
    main()

更多推荐

项目之多线程爬取blue网站

本文发布于:2024-02-26 12:47:03，感谢您对本站的认可！

本文链接:https://www.elefans.com/category/jswz/34/1702522.html