python3爬取github项目issues

编程入门 行业动态 更新时间:2024-10-09 02:25:33

python3爬取github项目issues

python3爬取github项目issues

1、前言:最近做项目需要调研github上项目存在的相关issues,所以就根据自己需要写了一个爬虫,仅此记录一下👀

2、环境:python3.7、lxml(支持HTML、XML解析)、requests(HTTP库)

3、思路:1、先根据关键词获取相关的项目列表

                2、再获取每个项目的issues列表

                3、再获取每个issue的内容

4、代码:

# -*- coding: utf-8 -*-
# @Time : 2020/09/25
# @Author : loadding...
# @File : reptile_github.py
# @Software : jupyter

# NOTE(review): the scrape fused these imports into the comment line above;
# restored as separate statements.
from lxml import etree   # HTML/XML parsing
import requests          # HTTP client
def get_repos_list(key_words):
    """Return repository link paths (e.g. '/combust/mleap') matching *key_words*.

    Scrapes GitHub search result pages 1..99 and collects the href of every
    repository entry found via xpath.

    NOTE(review): the base URL literal was stripped when this article was
    scraped; presumably 'https://github.com/search?p=' — confirm against the
    original script.
    """
    repos_list = []
    # Walk the paginated search results (default: up to 99 pages).
    for i in range(1, 100):
        url = ('https://github.com/search?p=' + str(i)
               + '&q=' + key_words + '&type=repositories')
        response = requests.get(url)
        # Page source of the search-result page.
        page_source = response.text
        tree = etree.HTML(page_source)
        # Repository hyperlinks, e.g. '/combust/mleap'.
        arr = tree.xpath('//*[@class="f4 text-normal"]/a/@href')
        repos_list += arr
    return repos_list
def get_issues_list(repo_name):
    """Return (issue_count, issue_link_list) for the repository *repo_name*.

    *repo_name* is a path such as '/combust/mleap'; the returned links look
    like '/combust/mleap/issues/716'.

    NOTE(review): the GitHub base URL literal was stripped by the scrape;
    presumably 'https://github.com' — confirm.
    """
    issues_list = []
    url = 'https://github.com' + repo_name + '/issues'
    response = requests.get(url)
    page_source = response.text
    tree = etree.HTML(page_source)
    # Issue counter badge next to the "Issues" tab.
    number = tree.xpath('//*[@id="js-repo-pjax-container"]/div[1]/nav/ul/li[2]/a/span[2]')
    if len(number) == 0:
        number = '0'
    else:
        number = number[0].text
    # Counts above 1k render as e.g. '1.2k' (not all digits);
    # cap the crawl at 1000 issues in that case ("够用了").
    if number.isdigit():
        number = int(number)
    else:
        number = 1000
    print(number)
    # 25 issues per page -> number of result pages to fetch.
    if number % 25 == 0:
        page = int(number / 25)
    else:
        page = int(number / 25) + 1
    for i in range(1, page + 1):
        url = 'https://github.com' + repo_name + '/issues?page=' + str(i)
        response = requests.get(url)
        page_source = response.text
        tree = etree.HTML(page_source)
        # Issue hyperlinks, e.g. '/combust/mleap/issues/716'.
        arr = tree.xpath('//*[@class="d-block d-md-none position-absolute top-0 bottom-0 left-0 right-0"]/@href')
        issues_list += arr
    # Return both the issue count and the collected links.
    return number, issues_list
def get_issue_content(issue_name):
    """Return the body text of a single issue.

    *issue_name* is a path such as '/combust/mleap/issues/716'.

    NOTE(review): the base URL literal was stripped by the scrape;
    presumably 'https://github.com' — confirm.
    """
    url = 'https://github.com' + issue_name
    response = requests.get(url)
    page_source = response.text
    tree = etree.HTML(page_source)
    # First table cell holds the issue's opening post; string(.) flattens it.
    issue_content = tree.xpath('//table//td')[0].xpath('string(.)')
    return issue_content


if __name__ == '__main__':
    # Earlier ad-hoc tests (kept for reference):
    # get_repos_list('ML pipeline')
    # get_issues('/combust/mleap')
    # get_issue_content('/combust/mleap/issues/716')
    with open(r'D:\Jupyter_workspace\result.md', 'w+', encoding='utf-8') as f:
        key_words = input('please input a keyword:')
        # Repository paths, e.g. '/combust/mleap'.
        repos_list = get_repos_list(key_words)
        for repo in repos_list:
            repos_url = 'https://github.com' + repo
            print(repos_url)
            f.write('\n\n')
            f.write(repos_url)
            f.write('\n')
            # Issue count and issue paths for this repository.
            number, issues_list = get_issues_list(repo)
            f.write(str(number))
            f.write('\n')
            # Issue paths, e.g. '/combust/mleap/issues/716'.
            for issue in issues_list:
                issue_url = 'https://github.com' + issue
                content = get_issue_content(issue)
                print(issue_url)
                f.write(issue_url)
                f.write('\n')
                f.write('>' * 100)
                f.write('\n')
                f.write(str(content).strip())
                f.write('\n')
                f.write('<' * 100)
                f.write('\n')
                # Flush per issue so a crash doesn't lose everything.
                f.flush()
    print('The end!')

5、运行结果:

控制台输出结果:

生成的result.md文件内容如下:

完结撒花🌻

 

++++++++++++++++++++++++++++++++++++++++++++++分隔符+++++++++++++++++++++++++++++++++++++++++++++

上文爬取了repositories项目中issues的内容

下面是后续做的工作,直接关键字爬取issues,并添加了筛选条件(issue评论数、项目star数、issues更新时间),代码如下,后续如果有别的爬取需求结合这两个脚本修改应该就可以完成

代码:

# -*- coding: utf-8 -*-
# @Time : 2020/10/18
# @Author : loadding...
# @File : reptile_github_issues.py
# @Software : jupyter

# NOTE(review): the scrape fused these imports into the comment line above;
# restored as separate statements.
from lxml import etree   # HTML/XML parsing
import requests          # HTTP client
import re                # regex: star-count extraction, date validation
def get_issues_list(key_words, comments_num, star_num, need_datetime):
    """Return issue link paths matching *key_words* that pass three filters:

    - more than *comments_num* comments,
    - repository has more than *star_num* stars (see check_star),
    - last updated strictly after *need_datetime* ('yyyy-mm-dd').
    """
    issues_list = []
    # Crawl up to 9 search pages, 10 issues per page.
    for page_no in range(1, 10):
        # NOTE(review): the base URL literal was stripped by the scrape;
        # presumably 'https://github.com/search?l=Rust&p=...' — confirm.
        url = ('https://github.com/search?l=Rust&p=' + str(page_no)
               + '&q=' + key_words + '&type=Issues')
        print("issues_url_list:", url)
        response = requests.get(url)
        page_source = response.text
        tree = etree.HTML(page_source)
        # Links of issues that have at least one comment.
        arr = tree.xpath('//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span/../../*[@class="f4 text-normal"]/a/@href')
        # Issue update timestamps; one per issue on the page (10 elements).
        issues_datetime = tree.xpath('//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span/../div/relative-time/@datetime')
        comments = tree.xpath('//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span')
        j = 0
        for span in comments:
            # Comment count is embedded in the span text at this slice.
            pos = span.text[11:15].strip()
            url = 'https://github.com' + arr[j]
            print("issue url:", url)
            print("issue comments:", pos)
            # Filter 1: comment count.
            if int(pos) > comments_num:
                # Filter 2: repository star count (fetches the issue page).
                flag = check_star(arr[j], star_num)
                if flag:
                    # Filter 3: update date; compare 'yyyy-mm-dd' prefixes
                    # lexicographically (valid for ISO dates).
                    issue_datetime = issues_datetime[j][:10]
                    print("issue datetime:", issue_datetime)
                    print("need_datetime:", need_datetime)
                    if issue_datetime > need_datetime:
                        print("满足条件,爬取!")
                        issues_list.append(arr[j])
            j += 1
    return issues_list
def check_comment(issue_name):
    """Placeholder: the comment-count filter is applied inline in
    get_issues_list() so the page source is only fetched once."""
    pass
def check_star(issues_name, star_num):
    """Return True when the repository of issue *issues_name* has more than
    *star_num* stars, else False.

    Fetches the issue page and extracts the star count with a regex (the
    original xpath approach failed, per the author's note).

    NOTE(review): the base URL literal was stripped by the scrape;
    presumably 'https://github.com' — confirm.
    """
    url = 'https://github.com' + issues_name
    response = requests.get(url)
    page_source = response.text
    # '1 user starred' has no plural 's', hence user[s]?.
    # (Scrape mangled 're.compile' to 'repile'; restored. Raw string added
    # so '\d' is a literal regex escape.)
    pattern = re.compile(r'"(\d+) user[s]? starred this repository"')
    # Assumes the page always carries a star count; no empty-result guard
    # was added, matching the author's stated trade-off.
    star = pattern.findall(page_source)[0]
    print("repositories star:", star)
    if int(star) > star_num:
        return True
    # Explicit falsy return (original fell off the end, returning None).
    return False
def check_time(issues_name):
    """Placeholder: the update-time filter is applied inline in
    get_issues_list() to avoid re-fetching pages."""
    pass


if __name__ == '__main__':
    # Read the filter inputs, falling back to defaults on bad input.
    key_words = input('keyword:')
    comments_num = input('comment number(>):')
    if comments_num.isdigit():
        comments_num = int(comments_num)
    else:
        comments_num = 5
        print("input error! comments default 5!")
    star_num = input('star number(>):')
    if star_num.isdigit():
        star_num = int(star_num)
    else:
        star_num = 1000
        print("input error! star default 1000!")
    need_datetime = input('after datetime(yyyy-mm-dd):')
    # Loose yyyy-mm-dd shape check; does not validate month/day ranges.
    # (Scrape mangled 're.compile' to 'repile'; restored, raw string added.)
    pattern = re.compile(r'^\d{4}-\d{2}-\d{2}$')
    result = pattern.findall(need_datetime)
    if len(result) < 1:
        need_datetime = '2016-09-01'
        print("input error! default 2016-09-01")
    # Crawl and apply all three filters.
    issues_list = get_issues_list(key_words, comments_num, star_num, need_datetime)
    print('The final issues_list is:')
    if len(issues_list) > 0:
        print("共爬取了" + str(len(issues_list)) + "个issues,url如下:")
        for i in issues_list:
            # String concatenation (not print's comma) avoids the inserted space.
            # NOTE(review): base URL stripped by the scrape; presumably
            # 'https://github.com' — confirm.
            print('https://github.com' + i)
    else:
        print("未爬取到issues,请修改筛选条件后重试!")

运行结果:

keyword:unsafe
comment number(>):10
star number(>):30000
after datetime(yyyy-mm-dd):2016-12-12
issues_url_list: =Rust&p=1&q=unsafe&type=Issues
issue url: 
issue comments: 9
issue url: 
issue comments: 2
issue url: 
issue comments: 5
issue url: 
issue comments: 30
repositories star: 48961
issue datetime: 2020-09-13
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 1
issue url: 
issue comments: 2
issues_url_list: =Rust&p=2&q=unsafe&type=Issues
issue url: 
issue comments: 21
repositories star: 48961
issue datetime: 2020-08-03
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 3
issue url: 
issue comments: 37
repositories star: 48961
issue datetime: 2020-07-18
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 5
issues_url_list: =Rust&p=3&q=unsafe&type=Issues
issue url: 
issue comments: 11
repositories star: 48961
issue datetime: 2020-07-31
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 11
repositories star: 48961
issue datetime: 2020-07-01
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 34
repositories star: 48961
issue datetime: 2020-06-30
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 1
issue url: 
issue comments: 1
issue url: 
issue comments: 1
issues_url_list: =Rust&p=4&q=unsafe&type=Issues
issue url: 
issue comments: 2
issue url: 
issue comments: 3
issue url: 
issue comments: 4
issue url: 
issue comments: 1
issue url: 
issue comments: 4
issues_url_list: =Rust&p=5&q=unsafe&type=Issues
issue url: 
issue comments: 1
issue url: 
issue comments: 1
issue url: 
issue comments: 1
issue url: 
issue comments: 7
issue url: 
issue comments: 2
issue url: 
issue comments: 21
repositories star: 48961
issue datetime: 2015-09-02
need_datetime: 2016-12-12
issues_url_list: =Rust&p=6&q=unsafe&type=Issues
issue url: 
issue comments: 3
issue url: 
issue comments: 7
issue url: 
issue comments: 1
issue url: 
issue comments: 24
repositories star: 48961
issue datetime: 2020-04-29
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 1
issues_url_list: =Rust&p=7&q=unsafe&type=Issues
issue url: 
issue comments: 15
repositories star: 1419
issue url: 
issue comments: 7
issue url: 
issue comments: 6
issue url: 
issue comments: 2
issue url: 
issue comments: 4
issues_url_list: =Rust&p=8&q=unsafe&type=Issues
issue url: 
issue comments: 3
issue url: 
issue comments: 1
issue url: 
issue comments: 4
issue url: 
issue comments: 4
issues_url_list: =Rust&p=9&q=unsafe&type=Issues
issue url: 
issue comments: 1
issue url: 
issue comments: 2
issue url: 
issue comments: 40
repositories star: 48961
issue datetime: 2020-07-27
need_datetime: 2016-12-12
满足条件,爬取!
issue url: 
issue comments: 2
issue url: 
issue comments: 1
issue url: 
issue comments: 1
issue url: 
issue comments: 10
issue url: 
issue comments: 3
The final issues_list is:
共爬取了8个issues,url如下:







注:如果不需要多余的输出,减少运行时间,直接修改脚本即可

 

更多推荐

python3爬取github项目issues

本文发布于:2024-03-11 21:28:54,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1729936.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:项目   github   issues

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!