爬虫
Huggingface网页解析和下载爬虫
解析网页:
import requests
from bs4 import BeautifulSoup

# Fetch a web page and print the target of every hyperlink found on it.

# Target page URL.
# NOTE(review): the URL is empty here and appears to have been stripped from
# the original text. The sample output that accompanies this snippet suggests
# it was the Hugging Face file listing for internlm/internlm-20b -- TODO
# confirm and fill in before running.
url = ''

# Send the GET request. The timeout keeps the script from hanging forever on
# an unresponsive server.
response = requests.get(url, timeout=30)

# Check whether the request succeeded.
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup.
    soup = BeautifulSoup(response.text, 'html.parser')
    # Walk every anchor tag and print its link target.
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:  # skip anchors whose href is missing or empty
            print(href)
else:
    print("网页请求失败,状态码:", response.status_code)
/
/models
/datasets
/spaces
/docs
/pricing
/login
/join
/internlm
/internlm/internlm-20b
/models?pipeline_tag=text-generation
/models?library=transformers
/models?library=pytorch
/models?other=internlm
/models?other=feature-extraction
/models?other=custom_code
/models?license=license%3Aapache-2.0
/internlm/internlm-20b
/internlm/internlm-20b/tree/main
/internlm/internlm-20b/discussions
/internlm/internlm-20b/tree/main
/internlm/internlm-20b/commits/main
/internlm/internlm-20b/commits/main
/x54-729
/internlm/internlm-20b/commit/2d83118d863d24565da1f9c6c0fe99d3e882f25c
/internlm/internlm-20b/blob/main/.gitattributes
/internlm/internlm-20b/resolve/main/.gitattributes?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/README.md
/internlm/internlm-20b/resolve/main/README.md?download=true
/internlm/internlm-20b/commit/509b748b2160d0571d067d85f8a21df018cdee29
/internlm/internlm-20b/commit/509b748b2160d0571d067d85f8a21df018cdee29
/internlm/internlm-20b/blob/main/config.json
/internlm/internlm-20b/resolve/main/config.json?download=true
/internlm/internlm-20b/commit/2d83118d863d24565da1f9c6c0fe99d3e882f25c
/internlm/internlm-20b/commit/2d83118d863d24565da1f9c6c0fe99d3e882f25c
/internlm/internlm-20b/blob/main/configuration_internlm.py
/internlm/internlm-20b/resolve/main/configuration_internlm.py?download=true
/internlm/internlm-20b/commit/53d4840ed4326a633e59501ba4ac3342757fed34
/internlm/internlm-20b/commit/53d4840ed4326a633e59501ba4ac3342757fed34
/internlm/internlm-20b/blob/main/generation_config.json
/internlm/internlm-20b/resolve/main/generation_config.json?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/modeling_internlm.py
/internlm/internlm-20b/resolve/main/modeling_internlm.py?download=true
/internlm/internlm-20b/commit/c8f2f9979075c3ccd0399d042823ac719d545840
/internlm/internlm-20b/commit/c8f2f9979075c3ccd0399d042823ac719d545840
/internlm/internlm-20b/blob/main/pytorch_model-00001-of-00005.bin
/docs/hub/security-pickle
/internlm/internlm-20b/resolve/main/pytorch_model-00001-of-00005.bin?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/pytorch_model-00002-of-00005.bin
/docs/hub/security-pickle
/internlm/internlm-20b/resolve/main/pytorch_model-00002-of-00005.bin?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/pytorch_model-00003-of-00005.bin
/docs/hub/security-pickle
/internlm/internlm-20b/resolve/main/pytorch_model-00003-of-00005.bin?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/pytorch_model-00004-of-00005.bin
/docs/hub/security-pickle
/internlm/internlm-20b/resolve/main/pytorch_model-00004-of-00005.bin?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/pytorch_model-00005-of-00005.bin
/docs/hub/security-pickle
/internlm/internlm-20b/resolve/main/pytorch_model-00005-of-00005.bin?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/pytorch_model.bin.index.json
/internlm/internlm-20b/resolve/main/pytorch_model.bin.index.json?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/special_tokens_map.json
/internlm/internlm-20b/resolve/main/special_tokens_map.json?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/tokenization_internlm.py
/internlm/internlm-20b/resolve/main/tokenization_internlm.py?download=true
/internlm/internlm-20b/commit/632df84a18d93aa5b40238a1472a8ffb38e2611c
/internlm/internlm-20b/commit/632df84a18d93aa5b40238a1472a8ffb38e2611c
/internlm/internlm-20b/blob/main/tokenizer.model
/internlm/internlm-20b/resolve/main/tokenizer.model?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/blob/main/tokenizer_config.json
/internlm/internlm-20b/resolve/main/tokenizer_config.json?download=true
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
/internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
下载代码:
import requests
from tqdm.auto import tqdm

# Download one file over HTTP in streaming mode, showing a progress bar.

# NOTE(review): this URL looks truncated (scheme, host and path are missing),
# presumably lost when the text was extracted. Given file_path below it was
# probably the "resolve/main/pytorch_model-00001-of-00005.bin?download=true"
# link of the model repository -- TODO confirm and restore the full URL.
file_url = '.bin?download=true'

# Open the download in streaming mode so the body is never held in memory.
# The context manager releases the connection even if writing fails.
with requests.get(file_url, stream=True, timeout=30) as response:
    # Read the size from the GET response itself: requests.head() does not
    # follow redirects by default, so a separate HEAD request could report the
    # size of a redirect response instead of the size of the file.
    total_size = int(response.headers.get('content-length', 0))

    # Check whether the request succeeded.
    if response.status_code == 200:
        file_path = 'pytorch_model-00001-of-00005.bin'
        # Wrap the file's write() so every chunk written advances the
        # progress bar; opening the file in its own `with` guarantees it is
        # closed when the download ends or fails.
        with open(file_path, "wb") as raw_file, tqdm.wrapattr(
            raw_file, "write", miniters=1, total=total_size, desc=file_path
        ) as fout:
            # 1 MiB chunks keep the Python-level loop overhead low for
            # multi-gigabyte files.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                fout.write(chunk)
        print("文件下载完成")
    else:
        print("下载失败,状态码:", response.status_code)
更多推荐
Huggingface网页解析和下载爬虫
发布评论