# User-Agent strings rotated between comment requests.  The original author
# noted that sending a single fixed header led to incomplete results, while
# choosing among several allowed the crawl to work normally.
_COMMENT_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
)


def _save_hotel_comments(hotel_id, pages=10):
    """Download comment pages for one hotel and append the text to a file.

    Requests ``pages`` consecutive pages (10 comments per page) from the
    comment endpoint, prints each raw response, and appends every comment's
    text, one per line, to ``jieguo-1.txt``.

    Args:
        hotel_id: Identifier string inserted into the comment URL path.
        pages: Number of comment pages to request (the original code
            always fetched 10).

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a response body is not valid JSON.
        KeyError: If the JSON lacks ``data``/``feedback``/``comment`` keys.
    """
    for page_index in range(pages):
        offset = page_index * 10
        print("正在加载第" + str(page_index + 1) + "页评论")
        # NOTE(review): the host "ihotel.meituan" has no top-level domain and
        # looks truncated (possibly "ihotel.meituan.com") -- confirm before use.
        comment_url = (
            "http://ihotel.meituan/group/v1/poi/comment/" + hotel_id
            + "?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset="
            + str(offset) + "&X-FOR-WITH="
        )
        headers = {
            "User-Agent": random.choice(_COMMENT_USER_AGENTS),
            "Host": "ihotel.meituan",
        }
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(comment_url, headers=headers, timeout=30)
        print(response.text)
        data = json.loads(response.text, strict=False)
        comments = data['data']['feedback']
        # Open the output once per page and let the context manager close it,
        # instead of leaking one file handle per comment.
        with open("jieguo-1.txt", 'a', encoding="utf8") as comment_file:
            for entry in comments:
                comment_file.write(entry['comment'] + "\n")


def get_hotel_info(url):
    """Crawl every listing page at ``url`` and record each hotel's details.

    For each element with class ``info-wrapper`` on a page, the hotel's
    name, rating, sales count and link are printed and appended to
    ``酒店信息.txt``; then the first 10 pages of its comments are fetched
    and appended to ``jieguo-1.txt``.  After a page is processed, the
    paginator's "next" link is clicked to move to the following page.

    Args:
        url: Address of the first hotel listing page to load.

    Returns:
        None.  Results are printed and written to the two text files.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
    # Path to the local PhantomJS executable.
    browser = webdriver.PhantomJS("D:/PhantomJS/phantomjs-2.1.1-windows/bin/phantomjs", desired_capabilities=dcap)
    try:
        browser.get(url)
        # NOTE(review): get_page_num is defined outside this block.  If it is
        # a function rather than a value, int() raises TypeError here and it
        # probably needs to be called -- confirm against its definition.
        total_pages = int(get_page_num)
        page_num = 1
        while page_num <= total_pages:
            # Collect the details of every hotel on the current page.
            for item in browser.find_elements_by_class_name('info-wrapper'):
                title = item.find_element_by_class_name('poi-title')
                hotel_info = {
                    'name': title.text,
                    'star': item.find_element_by_class_name('poi-grade').text,
                    'consumers': item.find_element_by_class_name('poi-buy-num').text,
                    'link': title.get_attribute('href'),
                }
                print("酒店名称:{}".format(hotel_info['name']))
                print("酒店评分:{}".format(hotel_info['star']))
                print("酒店销量:{}".format(hotel_info['consumers']))
                print("酒店链接:{}".format(hotel_info['link']))
                with open("酒店信息.txt", 'a', encoding="utf8") as info_file:
                    info_file.write(hotel_info['name'] + "\n" + hotel_info['star'] + "\n" + hotel_info['consumers'] + "\n" + hotel_info['link'] + "\n")
                # NOTE(review): the fixed slice presumably strips a 25-character
                # URL prefix and a trailing "/" to leave the hotel id -- verify
                # against the actual link format.
                hotel_id = hotel_info['link'][25:-1]
                # Comments are loaded dynamically on the site, so they are
                # fetched directly from the comment endpoint instead.
                _save_hotel_comments(hotel_id)
            # Advance with the "next page" control; skipped after the final
            # page because there is nothing further to load.
            if page_num < total_pages:
                browser.find_element_by_class_name('paginator').find_element_by_class_name('next').find_element_by_tag_name('a').click()
                time.sleep(1)
            page_num += 1
    finally:
        # Always shut down the PhantomJS process, even if the crawl fails.
        browser.quit()
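# NOTE: the lines below are leftover web-page text from the article this
# code was copied from; they are not part of the program.
# 更多推荐
# python爬取酒店信息_python爬取酒店信息练习
# 发布评论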