爬虫实战

编程入门行业动态更新时间:2024-10-12 18:22:05

爬虫实战

全国最高气温排行榜

前言
分析HTML
HTML源代码获取
解析网页内容
输出结果
完整代码
结果展示

前言

获取2345天气预报-全国最高气温排行榜

分析HTML

打开要爬取的网页链接
按F12进入开发者模式，点击图片的选中区域，之后可以在网页中定位代码位置
经检查发现，该页面主要使用块级标签div进行包裹，与上一篇使用table标签定义表格的页面不同。该页面主要是在div标签中定义class属性，再由css或js控制样式。
如：<div class='j-table'>、<div class='j-tr'>
因此我们的主要任务就是获取 div标签中class_=‘j-tbody’ 标签中的内容

HTML源代码获取

# 获取html源代码。参数是url爬取页面的url
def getHtml(url):
    """Download a page and return its HTML source as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page source, or the sentinel string '访问URL失败' when
        the request fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the
        # one declared in the headers, which avoids garbled Chinese text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Report the error itself; printing e.__traceback__ only shows the
        # repr of a traceback object, which says nothing about the failure.
        print("{}: {}".format(type(e).__name__, e))
        return '访问URL失败'

解析网页内容

使用的是find() 方法，获取div标签中class_=‘j-tbody’ 的内容即获取整个表格
tag = soup.find('div', class_='j-tbody')
循环该大div标签下的子节点，并判断每个子节点是不是bs4的Tag类型（子节点中可能包含换行等非标签内容）：if isinstance(item, bs4.element.Tag):
之后，通过 标签('子标签名') 的调用形式（等价于 find_all）返回子标签的列表：div = item('div')（根据前面的分析，每个子div标签代表一行，其下包含4个子div标签，分别代表4个单元格）

# 解析HTML文档
def getInfo(text):
    """Extract the ranking rows from the page source.

    Args:
        text: HTML source of the ranking page.

    Returns:
        A list of rows. Each row is a list holding the raw text of the
        first four ``div`` elements found inside one child tag of the
        ``div`` whose class is ``j-tbody``. The list is empty when the
        page contains no such container.
    """
    rows = []
    # The built-in 'html.parser' is used, so lxml is not required.
    soup = BeautifulSoup(text, 'html.parser')
    tag = soup.find('div', class_='j-tbody')
    if tag is None:
        # find() returns None when nothing matches (for example when the
        # input is not the expected page); there is nothing to iterate.
        return rows
    for item in tag.children:
        # .children also yields text nodes such as the whitespace between
        # tags; only real tags can be table rows.
        if not isinstance(item, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for find_all(): every div below it.
        cells = item('div')
        if len(cells) < 4:
            # Not a full data row; indexing it would raise IndexError.
            continue
        rows.append([cell.text for cell in cells[:4]])
    return rows

输出结果

输出结果到txt中

# 打印结果
def printResult(list):
    """Write the ranking rows to ``TheRankOfNationalhighestTemperature.txt``.

    The file is overwritten on every call. It starts with a header line,
    followed by one tab-separated line per row, each field centre-aligned
    to a fixed width (``{:^10}`` means: centred in 10 characters).

    Args:
        list: Iterable of rows; each row is indexable and holds at least
            four strings (rank, city, today's temperature, average
            temperature).
    """
    # The scraped cell text is full of spaces and line breaks that would
    # ruin the column layout, so both are removed from every field.
    drop = str.maketrans("", "", " \n")
    header = "{:^10}\t{:^8}\t{:^10}\t{:^2}\n".format("排名", "城市", "今天气温", "平均气温")
    row_format = "{:^10}\t{:^6}\t{:^12}\t{:^8}\n"
    with open("TheRankOfNationalhighestTemperature.txt", "w", encoding="utf-8") as file:
        # write() is the right call for a single string; writelines()
        # would iterate over it character by character.
        file.write(header)
        for item in list:
            index, city, temperature, averageTem = (
                item[i].translate(drop) for i in range(4)
            )
            file.write(row_format.format(index, city, temperature, averageTem))

注意：{:^10} : 10位居中对齐

输出到csv中

'''
导出到 Excel CSV
现在我们要保存获取到的数据了。Excel 逗号分隔格式是一个不错的选择。它可以在 Excel 中打开，这样你就可以看到数据并轻松地处理它。
但是首先，我们必须导入 Python csv 模块和 datetime 模块来获取记录日期。将以下代码插入导入部分。
'''# 定义一个方法将数据写入csv文件中
def exportCsv(list):
    """Write the ranking rows to ``TheRankOfNationalhighestTemperature.csv``.

    The file is overwritten on every call. It starts with the header row
    排名, 城市, 省份, 今天气温, 平均气温, 时间, followed by one record per
    input row. The 时间 column holds ``datetime.now()`` taken when that
    record is written.

    Args:
        list: Iterable of rows; each row is indexable and holds at least
            four strings (rank, combined city text, today's temperature,
            average temperature).
    """
    def clean(raw):
        # Scraped cell text contains stray spaces and line breaks.
        return raw.replace(" ", "").replace("\n", "")

    # newline="" hands line-ending control to the csv writer; without it
    # every record is followed by an empty line on Windows.
    with open("TheRankOfNationalhighestTemperature.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["排名", "城市", "省份", "今天气温", "平均气温", "时间"])
        for item in list:
            index = clean(item[0])
            # jieba segments the combined cell text; the first token goes
            # to the 城市 column and the second to the 省份 column.
            # NOTE(review): presumably the cell holds a city name followed
            # by a province name - verify against the live page.
            citys = jieba.lcut(clean(item[1]))
            city = citys[0] if citys else ""
            # Segmentation may yield a single token; leave 省份 empty
            # instead of raising IndexError.
            province = citys[1] if len(citys) > 1 else ""
            temperature = clean(item[2])
            averageTem = clean(item[3])
            writer.writerow([index, city, province, temperature, averageTem, datetime.now()])

完整代码

# 基于requests、BeautifulSoup爬取气温排行榜
import requests, bs4, csv, jieba
from datetime import datetime
from bs4 import BeautifulSoup# 获取html源代码。参数是url爬取页面的url
def getHtml(url):
    """Download a page and return its HTML source as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page source, or the sentinel string '访问URL失败' when
        the request fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the
        # one declared in the headers, which avoids garbled Chinese text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Report the error itself; printing e.__traceback__ only shows the
        # repr of a traceback object, which says nothing about the failure.
        print("{}: {}".format(type(e).__name__, e))
        return '访问URL失败'


# getInfo parses the fetched HTML document and collects the data we want;
# its ``text`` parameter is the document obtained above.
# The rows are kept in a result list.
# find() locates the element that contains the data; its child tags are then
# looped over and every div inside each child is fetched. The text of the
# four cells of a row is stored as one list and appended to the result list,
# which the function finally returns.
# Parse the HTML document
def getInfo(text):
    """Extract the ranking rows from the page source.

    Args:
        text: HTML source of the ranking page.

    Returns:
        A list of rows. Each row is a list holding the raw text of the
        first four ``div`` elements found inside one child tag of the
        ``div`` whose class is ``j-tbody``. The list is empty when the
        page contains no such container.
    """
    rows = []
    # The built-in 'html.parser' is used, so lxml is not required.
    soup = BeautifulSoup(text, 'html.parser')
    tag = soup.find('div', class_='j-tbody')
    if tag is None:
        # find() returns None when nothing matches (for example when the
        # input is not the expected page); there is nothing to iterate.
        return rows
    for item in tag.children:
        # .children also yields text nodes such as the whitespace between
        # tags; only real tags can be table rows.
        if not isinstance(item, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for find_all(): every div below it.
        cells = item('div')
        if len(cells) < 4:
            # Not a full data row; indexing it would raise IndexError.
            continue
        rows.append([cell.text for cell in cells[:4]])
    return rows


# Print the results
def printResult(list):
    """Write the ranking rows to ``TheRankOfNationalhighestTemperature.txt``.

    The file is overwritten on every call. It starts with a header line,
    followed by one tab-separated line per row, each field centre-aligned
    to a fixed width (``{:^10}`` means: centred in 10 characters).

    Args:
        list: Iterable of rows; each row is indexable and holds at least
            four strings (rank, city, today's temperature, average
            temperature).
    """
    # The scraped cell text is full of spaces and line breaks that would
    # ruin the column layout, so both are removed from every field.
    drop = str.maketrans("", "", " \n")
    header = "{:^10}\t{:^8}\t{:^10}\t{:^2}\n".format("排名", "城市", "今天气温", "平均气温")
    row_format = "{:^10}\t{:^6}\t{:^12}\t{:^8}\n"
    with open("TheRankOfNationalhighestTemperature.txt", "w", encoding="utf-8") as file:
        # write() is the right call for a single string; writelines()
        # would iterate over it character by character.
        file.write(header)
        for item in list:
            index, city, temperature, averageTem = (
                item[i].translate(drop) for i in range(4)
            )
            file.write(row_format.format(index, city, temperature, averageTem))


# Export to Excel CSV
# Now the scraped data has to be saved. The Excel comma-separated format is
# a good choice: the file can be opened in Excel, where the data is easy to
# inspect and process.
# This needs the csv module, plus datetime to record when each row was
# written; both must be imported at the top of the file.
# Define a function that writes the data to a CSV file
def exportCsv(list):
    """Write the ranking rows to ``TheRankOfNationalhighestTemperature.csv``.

    The file is overwritten on every call. It starts with the header row
    排名, 城市, 省份, 今天气温, 平均气温, 时间, followed by one record per
    input row. The 时间 column holds ``datetime.now()`` taken when that
    record is written.

    Args:
        list: Iterable of rows; each row is indexable and holds at least
            four strings (rank, combined city text, today's temperature,
            average temperature).
    """
    def clean(raw):
        # Scraped cell text contains stray spaces and line breaks.
        return raw.replace(" ", "").replace("\n", "")

    # newline="" hands line-ending control to the csv writer; without it
    # every record is followed by an empty line on Windows.
    with open("TheRankOfNationalhighestTemperature.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["排名", "城市", "省份", "今天气温", "平均气温", "时间"])
        for item in list:
            index = clean(item[0])
            # jieba segments the combined cell text; the first token goes
            # to the 城市 column and the second to the 省份 column.
            # NOTE(review): presumably the cell holds a city name followed
            # by a province name - verify against the live page.
            citys = jieba.lcut(clean(item[1]))
            city = citys[0] if citys else ""
            # Segmentation may yield a single token; leave 省份 empty
            # instead of raising IndexError.
            province = citys[1] if len(citys) > 1 else ""
            temperature = clean(item[2])
            averageTem = clean(item[3])
            writer.writerow([index, city, province, temperature, averageTem, datetime.now()])


# Define the main function
def main():
    """Fetch the ranking page, parse it and export the rows to CSV.

    Prints "成功" once the CSV file has been written. Returns early,
    without exporting, when the page could not be fetched.
    """
    # NOTE(review): this URL looks truncated - confirm the full address of
    # the ranking page before running.
    url = ".htm"
    text = getHtml(url)
    if text == '访问URL失败':
        # getHtml has already printed the error; parsing its failure
        # sentinel as HTML cannot yield any rows.
        return
    resultList = getInfo(text)
    exportCsv(resultList)
    print("成功")


# Run the main function only when the file is executed as a script, so that
# importing it does not trigger a network request.
if __name__ == "__main__":
    main()