爬虫"/>
Python crawler for Apple App Store apps
#-*- coding: utf-8 -*-
import urllib2
import re
import time
import MySQLdb

#----------- App Store ranking list -----------
class Spider_Model:

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def startWork(self, url, tabName):
        nowtime = int(time.time())
        content = self.GetCon(url)
        oneItems = self.Match(content)  # extract the first-level fields from the ranking page
        time.sleep(1)
        for index, item in enumerate(oneItems):
            content_two = self.GetCon(item[1])       # fetch the app detail page
            twoItems = self.Match_two(content_two)   # release date and compatibility
            oneItems[index].append([twoItems[0], twoItems[1]])
            if oneItems[index][6][0] == '0':
                fabutime = '0'
            else:
                # the detail page shows a Chinese date such as 2014年01月01日; convert it to a unix timestamp
                fabutime = int(time.mktime(time.strptime(oneItems[index][6][0].strip(), '%Y年%m月%d日')))
            sql = "INSERT INTO " + tabName + "(`rank`,`detailurl`,`logo`,`name`,`type`,`appid`,`appstoretime`,`compatible`,`ctime`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)" % (
                '"' + oneItems[index][0] + '"', '"' + oneItems[index][1] + '"', '"' + oneItems[index][2] + '"',
                '"' + oneItems[index][3] + '"', '"' + oneItems[index][4] + '"', '"' + oneItems[index][5] + '"',
                fabutime, '"' + oneItems[index][6][1] + '"', nowtime)
            self.contentDb(sql)
            time.sleep(1)

    def GetCon(self, url):
        myUrl = url
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
        # the site blocks plain crawlers; the headers above make the request look like a normal browser
        req = urllib2.Request(myUrl, headers=headers)
        global myResponse
        try:
            myResponse = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            print e.fp.read()
        # the try/except has to be there; without it the request still comes back 403 even with the browser headers (reason unknown...)
        myPage = myResponse.read()
        # encode() turns a unicode object into a byte string in the given encoding
        # decode() turns a byte string in some encoding into a unicode object
        #unicodePage = myPage.decode('utf-8').encode('gbk','ignore')
        #unicodePage = myPage.decode('utf-8','ignore')
        return myPage

    def Match(self, con):
        # find the div block that holds the ranking list, then pull each app entry out of it
        # re.S makes . match newlines as well
        # NOTE: the HTML literals inside these two patterns were lost when the original post was published;
        # only the capture groups survive, so the markup of the target page has to be filled back in.
        pattenA = re.compile(r'(.*?)', re.U | re.S)
        pattenB = re.compile(r'(.*?).(.*?)(.*?).*?', re.U | re.S)
        match = re.findall(pattenA, con)
        myItems = re.findall(pattenB, match[0])
        items = []
        for item in myItems:
            # rank, detail url, logo url, name, type, appid (parsed out of the detail url)
            items.append([item[0].replace("\n", ""),
                          item[1].replace("\n", ""),
                          item[2].replace("\n", ""),
                          (item[3].replace("\n", "")).split('-')[0],
                          item[4].replace("\n", ""),
                          (item[1].split('id')[1]).split('?')[0]])
        return items

    def Match_two(self, con):
        # release date and compatibility string from the detail page
        # NOTE: as above, the HTML literals in these patterns were lost in the original post
        pattenTwoA = re.compile(r'.*?(.*?)', re.U | re.S)
        pattenTwoB = re.compile(r'.*?(.*?)', re.U | re.S)
        matchTwoA = self.is_empty(re.findall(pattenTwoA, con))
        matchTwoB = self.is_empty(re.findall(pattenTwoB, con))
        itemsTwo = [matchTwoA, matchTwoB]
        return itemsTwo

    def is_empty(self, param):
        # return the first match, or '0' when the pattern matched nothing
        if len(param):
            res = param[0]
        else:
            res = '0'
        return res

    def contentDb(self, sql):
        try:
            conn = MySQLdb.connect(host="host", user="user", passwd="password", db="dbname", charset='utf8')
            cur = conn.cursor()
            result = cur.execute(sql)
            conn.commit()
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])


# the ranking-page URLs were stripped from the original post; "/" is only a placeholder
addArr = [["/", 'cg_jp_free'],
          ["/", 'cg_jp_paid']]
myModel = Spider_Model()
for val in addArr:
    myModel.startWork(val[0], val[1])
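
The INSERT built in startWork() assumes that cg_jp_free and cg_jp_paid already exist with the nine columns it writes. The original post does not include the table definition, so the following is only a sketch of a schema that would accept those rows; the column types and the connection placeholders are assumptions.

# -*- coding: utf-8 -*-
# Hypothetical DDL for the two ranking tables; the column types are guesses
# inferred from the values assembled in startWork(), not taken from the original post.
import MySQLdb

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS %s (
  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
  `rank` VARCHAR(8) NOT NULL,          -- position in the ranking list
  `detailurl` VARCHAR(255) NOT NULL,   -- link to the app detail page
  `logo` VARCHAR(255) NOT NULL,        -- icon URL
  `name` VARCHAR(255) NOT NULL,        -- app name
  `type` VARCHAR(64) NOT NULL,         -- category (text before the '-')
  `appid` VARCHAR(32) NOT NULL,        -- numeric id parsed from detailurl
  `appstoretime` INT NOT NULL,         -- release date as a unix timestamp, 0 if unknown
  `compatible` VARCHAR(255) NOT NULL,  -- compatibility string from the detail page
  `ctime` INT NOT NULL                 -- crawl time as a unix timestamp
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect(host="host", user="user", passwd="password", db="dbname", charset='utf8')
cur = conn.cursor()
for tab in ('cg_jp_free', 'cg_jp_paid'):
    cur.execute(CREATE_SQL % tab)
conn.commit()
conn.close()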
This is my first attempt at Python, so the code is a bit rough; apologies for the sins committed here......
Python version: 2.7.5. Test environments: Linux and Windows.
Criticism from the experts is very welcome — take me along for the ride!