mysql 百度图片爬虫
这一篇我想写写如何爬取百度图片的爬虫,这个爬虫也是:搜搜gif(在线制作功能点我) 的爬虫代码。其实爬虫整体框架还是差不多的,但会涉及到图片的一些处理,还是花费了我不少时间的,所以请阅读本爬虫的孩子认真一些,毕竟程序猿都不容易啊。
附代码:
PS:不会python的孩子赶快去补补吧,先把基础搞清楚再说
# coding:utf-8
"""
Baidu image crawler for sosogif (Python 2).

Created on 2015-9-17
@author: huangxie
"""
import math
import os
import re
import sys
import threading
import time
import uuid
import json
import urllib
import urllib2
import cookielib
from threading import Thread
from Queue import Queue

from bs4 import BeautifulSoup
import MySQLdb as mdb
from MySQLdb.constants.REFRESH import STATUS

import utils
import imitate_browser

# Force UTF-8 as the default codec (Python 2 only; sys.setdefaultencoding is
# hidden after site initialization, hence the reload).
reload(sys)
sys.setdefaultencoding('utf-8')

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Current HTTP proxy for downloads; rotated at runtime from the 'proxy' table.
proxy = {u'http': u'222.39.64.13:8118'}

# NOTE(review): both URL templates lost everything before the first '=' (the
# scheme, host and path) when this file was scraped from the blog -- restore
# them from the original source before running.
TOP_URL = "=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"
KEYWORD_URL = "=utf-8&f=8&tn=baidu&wd={wd}"

# Request headers used for every download; the User-Agent spacing was
# reconstructed (the scraper stripped all spaces from the literal).
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
def GetDateString():
    """Return today's date as 'YYYY-M-D' (month/day NOT zero-padded).

    Used as a per-day folder name by BaiduImage.makeDateFolder.  Plain
    attribute access replaces the original's `__getattribute__` calls;
    the unpadded "%d" formatting is kept so folder names stay identical.
    """
    now = time.localtime(time.time())
    return "%d-%d-%d" % (now.tm_year, now.tm_mon, now.tm_mday)
classBaiduImage(threading.Thread):
def__init__(self):
Thread.__init__(self)
self.browser=imitate_browser.BrowserBase()
self.chance=0
self.chance1=0
self.request_queue=Queue()
self.wait_ana_queue=Queue()
#self.key_word_queue.put((("动态图",0,24)))
self.count=0
self.mutex=threading.RLock()#可重入锁,使单线程可以再次获得已经获得的锁
selfmit_count=0
self.ID=500
self.next_proxy_set=set()
self.dbconn=mdb.connect(DB_HOST,DB_USER,DB_PASS,'sosogif',charset='utf8')
self.dbconn.autocommit(False)
self.dbcurr=self.dbconn.cursor()
self.dbcurr.execute('SETNAMESutf8')
"""
defrun(self):
whileTrue:
self.get_pic()
"""
defwork(self,item):
print"startthread",item
whileTrue:#MAX_REQUEST条以上则等待
self.get_pic()
self.prepare_request()
defformat_keyword_url(self,keyword):
returnKEYWORD_URL.format(wd=keyword).encode('utf-8')
defgenerateSeed(self,url):
html=self.browser.openurl(url).read()
ifhtml:
try:
soup=BeautifulSoup(html)
trs=soup.find('p',id='rs').find('table').find_all('tr')#获得所有行
fortrintrs:
ths=tr.find_all('th')
forthinths:
a=th.find_all('a')[0]
keyword=a.text.strip()
if"动态图"inkeywordor"gif"inkeyword:
print"keyword",keyword
self.dbcurr.execute('selectidfrominfowhereword=%s',(keyword))
y=self.dbcurr.fetchone()
ifnoty:
self.dbcurr.execute('INSERTINTOinfo(word,status,page_num,left_num,how_many)VALUES(%s,0,0,0,0)',(keyword))
self.dbconnmit()
except:
pass
defprepare_request(self):
self.lock()
self.dbcurr.execute('select*frominfowherestatus=0')
result=self.dbcurr.fetchone()
ifresult:
id,word,status,page_num,left_num,how_many=result
self.request_queue.put((id,word,page_num))
ifpage_num==0andleft_num==0andhow_many==0:
url=self.format_keyword_url(word)
self.generateSeed(url)
html=""
try:
url=self.format_top_url(word,page_num,24)
html=self.browser.openurl(url).read()
exceptExceptionaserr:
print"err",err
#pass
ifhtml!="":
how_many=self.how_many(html)
print"how_many",how_many
ifhow_many==None:
how_many=0
t=math.ceil(how_many/24*100)#只要前1/100即可
num=int(t)
foriinxrange(0,num-1):
self.dbcurr.execute('INSERTINTOinfo(word,status,page_num,left_num,how_many)VALUES(%s,%s,%s,%s,%s)',(word,0,i*24,num-i,how_many))
self.dbcurr.execute('updateinfoSETstatus=1WHEREid=%s',(id))#置为已经访问
self.dbconnmit()
self.unlock()
defstart_work(self,req_max):
foriteminxrange(req_max):
t=threading.Thread(target=self.work,args=(item,))
t.setDaemon(True)
t.start()
deflock(self):#加锁
self.mutex.acquire()
defunlock(self):#解锁
self.mutex.release()
defget_para(self,url,key):
values=url.split('?')[-1]
forkey_valueinvalues.split('&'):
value=key_value.split('=')
ifvalue[0]==key:
returnvalue[1]
returnNone
defmakeDateFolder(self,par,child):
#self.lock()
ifos.path.isdir(par):
path=par+'//'+GetDateString()
newFolderName=path+'//'+child
ifnotos.path.isdir(path):
os.mkdir(path)
ifnotos.path.isdir(newFolderName):
os.mkdir(newFolderName)
returnnewFolderName
else:
returnpar
#self.unlock()
defparse_json(self,data):
ipdata=json.loads(data)
try:
ifipdata['imgs']:
forninipdata['imgs']:#data子项
ifn['objURL']:
try:
proxy_support=urllib2.ProxyHandler(proxy)
opener=urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
#print"proxy",proxy
self.lock()
self.dbcurr.execute('selectIDfrompic_infowhereobjURL=%s',(n['objURL']))
y=self.dbcurr.fetchone()
#print"y=",y
ify:
print"databaseexist"
self.unlock()#continue前解锁
continue
else:
real_extension=utils.get_extension(n['objURL'])
req=urllib2.Request(n['objURL'],headers=i_headers)
resp=urllib2.urlopen(req,None,5)
dataimg=resp.read()
name=str(uuid.uuid1())
filename=""
iflen(real_extension)>4:
real_extension=".gif"
real_extension=real_extension.lower()
ifreal_extension==".gif":
filename=self.makeDateFolder("E://sosogif","d"+str(self.count%60))+"//"+name+"-www.sosogif-搜搜gif贡献"+real_extension
self.count+=1
else:
filename=self.makeDateFolder("E://sosogif","o"+str(self.count%20))+"//"+name+"-www.sosogif-搜搜gif贡献"+real_extension
self.count+=1
"""
name=str(uuid.uuid1())
filename=""
iflen(real_extension)>4:
real_extension=".gif"
filename=self.makeDateFolder("E://sosogif","d"+str(self.count%60))+"//"+name+"-www.sosogif-搜搜gif贡献"+real_extension
self.count+=1
"""
try:
ifnotos.path.exists(filename):
file_object=open(filename,'w+b')
file_object.write(dataimg)
file_object.close()
self.anaylis_info(n,filename,real_extension)#入库操作
else:
print"fileexist"
exceptIOError,e1:
print"e1=",e1
pass
self.unlock()
exceptIOError,e2:
#print"e2=",e2
pass
self.chance1+=1
exceptExceptionasparse_error:
print"parse_error",parse_error
pass
deftitle_dealwith(self,title):
#print"title",title
a=title.find("")
temp1=title[0:a]
b=title.find("")
temp2=title[a+8:b]
temp3=title[b+9:len(title)]
return(temp1+temp2+temp3).strip()
defanaylis_info(self,n,filename,real_extension):
print"success."
#ifself.wait_ana_queue.qsize()!=0:
#n,filename,real_extension=self.wait.ana_queue.get()
#self.lock()
objURL=n['objURL']#图片地址
fromURLHost=n['fromURLHost']#来源网站
width=n['width']#宽度
height=n['height']#高度
di=n['di']#用来唯一标识
type=n['type']#格式
fromPageTitle=n['fromPageTitle']#来自网站
keyword=self.title_dealwith(fromPageTitle)
cs=n['cs']#未知
os=n['os']#未知
temp=time.time()
x=time.localtime(float(temp))
acTime=time.strftime("%Y-%m-%d%H:%M:%S",x)#爬取时间
self.dbcurr.execute('selectIDfrompic_infowherecs=%s',(cs))
y=self.dbcurr.fetchone()
ifnoty:
print'addpic',filename
selfmit_count+=1
self.dbcurr.execute('INSERTINTOpic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension))
ifselfmit_count==10:
self.dbconnmit()
selfmit_count=0
#self.unlock()
defformat_top_url(self,word,pn,rn):
url=TOP_URL.format(word=word,pn=pn,rn=rn).encode('utf-8')
returnurl
defhow_many(self,data):
try:
ipdata=json.loads(data)
ifipdata['displayNum']>0:
how_many=ipdata['displayNum']
returnint(how_many)
else:
return0
exceptExceptionase:
pass
defget_pic(self):
"""
word="gif"
pn=0
rn=24
ifself.key_word_queue.qsize()!=0:
word,pn,rn=self.key_word_queue.get()
url=self.format_top_url(word,pn,rn)
globalproxy
ifurl:
try:
html=""
try:
req=urllib2.Request(url,headers=i_headers)
response=urllib2.urlopen(req,None,5)
#print"url",url
html=self.browser.openurl(url).read()
exceptExceptionaserr:
print"err",err
#pass
ifhtml:
how_many=self.how_many(html)
#how_many=10000
print"how_many",how_many
word=self.get_para(url,"word")
rn=int(self.get_para(url,"rn"))
t=math.ceil(how_many/rn)
num=int(t)
foriteminxrange(0,num-1):
"""
try:
globalproxy
print"sizeofqueue",self.request_queue.qsize()
ifself.request_queue.qsize()!=0:
id,word,page_num=self.request_queue.get()
u=self.format_top_url(word,page_num,24)
self.lock()
self.dbcurr.execute('updateinfoSETstatus=1WHEREid=%s',(id))
self.dbconnmit()
ifself.chance>0orself.chance1>1:#任何一个出问题都给换代理
ifself.ID%100==0:
self.dbcurr.execute("selectcount(*)fromproxy")
forrinself.dbcurr:
count=r[0]
ifself.ID>count:
self.ID=50
self.dbcurr.execute("select*fromproxywhereID=%s",(self.ID))
results=self.dbcurr.fetchall()
forrinresults:
protocol=r[1]
ip=r[2]
port=r[3]
pro=(protocol,ip+":"+port)
ifpronotinself.next_proxy_set:
self.next_proxy_set.add(pro)
self.chance=0
self.chance1=0
self.ID+=1
self.unlock()
proxy_support=urllib2.ProxyHandler(proxy)
opener=urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
html=""
try:
req=urllib2.Request(u,headers=i_headers)
#print"u=",u
response=urllib2.urlopen(req,None,5)
html=response.read()
ifhtml:
#print"html",type(html)
self.parse_json(html)
exceptExceptionasex1:
#print"error=",ex1
pass
self.chance+=1
ifself.chance>0orself.chance1>1:
iflen(self.next_proxy_set)>0:
protocol,socket=self.next_proxy_set.pop()
proxy={protocol:socket}
print"changeproxyfinished<
exceptExceptionase:
print"error1",e
pass
if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)  # 80 daemon worker threads
    # app.generateSeed()
    # Keep the main thread alive so the daemon workers keep running; sleep
    # instead of the original `while 1: pass`, which busy-waited and pinned
    # a CPU core doing nothing.
    while 1:
        time.sleep(1)
更多推荐
mysql百度图片爬虫
发布评论