1、Method 1
# Method 1: requests combined with lxml
import requests
from lxml import etree

# 1. Collect all the exercise links on the index page and yield the full URLs
def get_html(url):
    # Fetch the page HTML
    html = requests.get(url)
    # Parse the HTML with etree
    seq = etree.HTML(html.text)
    link_list = seq.xpath('//*[@id="content"]/ul/li/a/@href')
    for i in link_list:
        yield "http://www.runoob.com" + i

# 2. Fetch the data from each detail page
def get_html_link(link):
    for i in link:
        # Fetch the page
        link_html = requests.get(i)
        # Parse it
        link_seq = etree.HTML(link_html.content)
        # Extract the title
        title = link_seq.xpath('//*[@id="content"]/h1/text()')[0]
        # Extract the exercise description (the 2nd and 3rd <p> tags)
        subject = link_seq.xpath('//*[@id="content"]/p[position()>1 and position()<4]/text()')
        subject_list = '\n'.join(subject)
        yield (title, subject_list)

# 3. Save the data
def save_subject(title_subject):
    with open("G:/1.txt", 'a+', encoding='utf-8') as f:
        for title, subject_list in title_subject:
            f.write(title + '\n')
            f.write(subject_list + '\n')
            f.write("#" * 50 + '\n')

# 4. Chain the functions together
def funcall(url):
    link = get_html(url)
    title_subject = get_html_link(link)
    save_subject(title_subject)

# 5. Main function
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    funcall(url)

if __name__ == "__main__":
    main()

# for i in get_html('http://www.runoob.com/python/python-100-examples.html'):
#     print(i)
# for i in get_html_link(link):
#     print(i)
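The XPath extraction in get_html() and get_html_link() can be checked without sending any requests. Below is a minimal offline sketch of the same etree.HTML + xpath pattern; the HTML snippet is made up purely for illustration.

# Offline sketch of the XPath pattern used above (the snippet is hypothetical)
from lxml import etree

sample_html = '''
<div id="content">
  <ul>
    <li><a href="/python/python-exercise-example1.html">Example 1</a></li>
    <li><a href="/python/python-exercise-example2.html">Example 2</a></li>
  </ul>
</div>
'''
tree = etree.HTML(sample_html)
# Same XPath as get_html(): the href attribute of every <a> under #content
print(tree.xpath('//*[@id="content"]/ul/li/a/@href'))
# ['/python/python-exercise-example1.html', '/python/python-exercise-example2.html']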
2、Method 2
# Method 2: urllib.request combined with BeautifulSoup
import urllib.request
from bs4 import BeautifulSoup

# 1. Collect all the exercise links on the index page
def get_html(url):
    # Fetch the page's HTML source
    html = urllib.request.urlopen(url).read()
    # Parse the HTML
    soup = BeautifulSoup(html, 'lxml')
    # First find the tag with id='content' and its <ul> children (there are 2),
    # then walk each <ul> for its <li> children,
    # then walk each <li> for its <a> tags,
    # and yield the full URL of every link found
    for i in soup.find(id='content').find_all('ul'):
        for j in i.find_all('li'):
            for k in j.find_all('a'):
                yield 'http://www.runoob.com' + k['href']

# 2. Fetch the data from each detail page
def get_html_link(link):
    # Iterate over all the links
    for i in link:
        # Request the linked page's HTML
        link_list = urllib.request.urlopen(i).read()
        # Parse the HTML
        soup = BeautifulSoup(link_list, 'lxml')
        # Find the tag with id='content'
        content = soup.find(id='content')
        if content:
            # Extract the <h1> text as the title
            title = content.find('h1').string
            # Extract the text of the first 3 <p> tags
            content_list = content.find_all('p', limit=3)
            subject = ''
            for j in content_list:
                subject += j.get_text()
            yield (title, subject)

# 3. Save the data
def save_subject(title_content):
    with open('G:/2.txt', 'w+', encoding='utf-8') as f:
        for title, content in title_content:
            f.write(title + '\n')
            f.write(content + '\n')
            f.write('#' * 80 + '\n')

# 4. Chain the functions together
def fun_call(url):
    link = get_html(url)
    title_content = get_html_link(link)
    save_subject(title_content)

# 5. Main function
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    fun_call(url)

if __name__ == '__main__':
    main()
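To see what find() and find_all() return before running the full crawler, here is a minimal offline sketch of the same BeautifulSoup navigation; the HTML snippet is made up purely for illustration.

# Offline sketch of the BeautifulSoup navigation used above (the snippet is hypothetical)
from bs4 import BeautifulSoup

sample_html = '''
<div id="content">
  <h1>Python Exercise 1</h1>
  <p>Question: ...</p>
  <p>Analysis: ...</p>
  <ul>
    <li><a href="/python/python-exercise-example1.html">Example 1</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(sample_html, 'lxml')
content = soup.find(id='content')
# Same traversal as get_html(): ul -> li -> a -> href
for ul in content.find_all('ul'):
    for li in ul.find_all('li'):
        for a in li.find_all('a'):
            print(a['href'])
# Same extraction as get_html_link(): the <h1> text and the first 3 <p> tags
print(content.find('h1').string)
print([p.get_text() for p in content.find_all('p', limit=3)])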
3、Method 3
# Method 3: requests, re, and BeautifulSoup combined
import requests, re
from bs4 import BeautifulSoup

# 1. Collect the exercise links on the index page
def get_html(url):
    html = requests.get(url)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'lxml')
    # Match only <a> tags whose href starts with /python/python-exercise
    for i in soup.find_all('a', href=re.compile('^/python/python-exercise')):
        yield 'http://www.runoob.com' + i.get('href')

# 2. Fetch the details from each linked page
def get_html_link(link_list):
    for link in link_list:
        html_link = requests.get(link)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        title = soup.find('div', class_="article-intro").h1.string
        con = soup.find('div', class_="article-intro").find_all('p')
        # Collect <p> texts until one of the stop-marker paragraphs is reached
        i = 1
        list1 = []
        while i < len(con):
            if re.match('程序源代码', con[i].text) or re.match(' Python 100例', con[i].text) or re.match('以上实例输出结果为', con[i].text):
                break
            else:
                list1.append(con[i].text)
                i += 1
        yield (title, list1)

# 3. Save the data
def save_data(content_list):
    with open('G:/3.txt', 'w+', encoding='utf-8') as f:
        for title, content in content_list:
            f.write(title + '\n')
            for line in content:
                f.write(line + '\n')
            f.write('#' * 80 + '\n')

# 4. Chain the functions together
def fun_call(url):
    link_list = get_html(url)
    content_list = get_html_link(link_list)
    save_data(content_list)

# 5. Main function
def main():
    url = 'http://www.runoob.com/python/python-100-examples.html'
    fun_call(url)

if __name__ == '__main__':
    main()
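The main difference from method 2 is the href filter: find_all() accepts a compiled regular expression for the href attribute, so only the exercise links are matched. A minimal offline sketch, again with a made-up HTML snippet:

# Offline sketch of the regex href filter used in method 3 (the snippet is hypothetical)
import re
from bs4 import BeautifulSoup

sample_html = '''
<div id="content">
  <a href="/python/python-exercise-example1.html">Example 1</a>
  <a href="/python/python-100-examples.html">Index</a>
</div>
'''
soup = BeautifulSoup(sample_html, 'lxml')
# Only <a> tags whose href starts with /python/python-exercise are returned
for a in soup.find_all('a', href=re.compile('^/python/python-exercise')):
    print('http://www.runoob.com' + a.get('href'))
# prints only http://www.runoob.com/python/python-exercise-example1.html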