使用 Colly 实现豆瓣电影 Top250 爬取
package main
import (
	"encoding/csv"
	"log"
	"os"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
)
// Movie is the record built by parseDetail for one movie detail page.
// Every field holds text exactly as it was extracted; nothing is parsed
// into numbers or dates.
type Movie struct {
	// idx:    text of the page's "top250-no" element
	// title:  text of the first span inside the page heading
	// year:   text of the heading's "year" span
	// info:   text of the "info" block, with spaces removed and newlines
	//         replaced by "; "
	// rating: text of the "rating_num" element
	// url:    URL the detail page was requested from
	idx, title, year, info, rating, url string
}
// main crawls the Douban Movie Top 250 list and stores the result as CSV.
//
// It creates douban_movie_top250.csv in the working directory, writes a header
// row, then starts a collector on the first list page. For every movie link
// found on a list page it calls parseDetail, which fetches the detail page and
// appends one row. The "next page" link is followed until none is left.
//
// Failing to create the output file is fatal; every other failure is logged
// and the crawl continues.
func main() {
	// Output file name.
	fName := "douban_movie_top250.csv"
	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf terminates the process, so nothing follows it.
		log.Fatalf("创建文件失败 %q: %s\n", fName, err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Registered after file.Close, so it runs before it: buffered rows are
	// flushed while the file is still open. Flush reports nothing itself;
	// Error returns any error from an earlier Write or Flush.
	defer func() {
		writer.Flush()
		if flushErr := writer.Error(); flushErr != nil {
			log.Printf("flush %q: %s", fName, flushErr)
		}
	}()

	// CSV header row.
	header := []string{"Idx", "Title", "Year", "Info", "Rating", "URL"}
	if err := writer.Write(header); err != nil {
		log.Printf("write CSV header: %s", err)
		return
	}

	// Entry point of the crawl: the first page of the Top 250 list.
	startURL := "https://movie.douban.com/top250"

	// Create the collector with a browser-like user agent.
	collector := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"),
	)

	// Throttle requests to every domain with a random delay.
	limitErr := collector.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 5 * time.Second,
	})
	if limitErr != nil {
		log.Printf("set rate limit: %s", limitErr)
	}

	// Log request failures.
	collector.OnError(func(response *colly.Response, err error) {
		log.Println(err.Error())
	})

	collector.OnRequest(func(request *colly.Request) {
		log.Println("start visit: ", request.URL.String())
	})

	// Parse the movie list.
	collector.OnHTML("ol.grid_view", func(element *colly.HTMLElement) {
		// Walk every <li> entry of the list in order.
		element.DOM.Find("li").Each(func(i int, selection *goquery.Selection) {
			href, found := selection.Find("div.hd > a").Attr("href")
			// Only entries that link to a detail page are processed further.
			if !found {
				return
			}
			parseDetail(collector, href, writer)
			log.Println(href)
		})
	})

	// Follow the "next page" link, if there is one.
	collector.OnHTML("div.paginator > span.next", func(element *colly.HTMLElement) {
		href, found := element.DOM.Find("a").Attr("href")
		if !found {
			return
		}
		next := element.Request.AbsoluteURL(href)
		// Log the returned error as well, so a failed page hop is never dropped.
		if err := element.Request.Visit(next); err != nil {
			log.Printf("visit %q: %s", next, err)
		}
	})

	// Start crawling.
	if err := collector.Visit(startURL); err != nil {
		log.Printf("visit %q: %s", startURL, err)
	}
}
// parseDetail fetches one movie detail page and appends its data to the CSV.
//
// Parameters:
//   - collector: the list-page collector. It is cloned, so the detail page
//     gets its own callbacks and the caller's collector is left unchanged.
//   - url: address of the detail page to fetch.
//   - writer: CSV writer that receives one row
//     (idx, title, year, info, rating, url) per parsed page.
//
// Nothing is returned. Failures to set the rate limit, fetch the page or write
// the row are logged, so a single bad page does not stop the crawl.
func parseDetail(collector *colly.Collector, url string, writer *csv.Writer) {
	detail := collector.Clone()

	limitErr := detail.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 2 * time.Second,
	})
	if limitErr != nil {
		log.Printf("set detail rate limit: %s", limitErr)
	}

	detail.OnRequest(func(request *colly.Request) {
		log.Println("start visit: ", request.URL.String())
	})

	// Extract the movie data from the detail page.
	detail.OnHTML("body", func(element *colly.HTMLElement) {
		selection := element.DOM.Find("div#content")

		// Flatten the multi-line info block onto one line: drop spaces first,
		// then turn each newline into a "; " separator.
		info := selection.Find("div#info").Text()
		info = strings.ReplaceAll(info, " ", "")
		info = strings.ReplaceAll(info, "\n", "; ")

		movie := Movie{
			idx:    selection.Find("div.top250 > span.top250-no").Text(),
			title:  selection.Find("h1 > span").First().Text(),
			year:   selection.Find("h1 > span.year").Text(),
			info:   info,
			rating: selection.Find("strong.rating_num").Text(),
			url:    element.Request.URL.String(),
		}

		row := []string{movie.idx, movie.title, movie.year, movie.info, movie.rating, movie.url}
		if err := writer.Write(row); err != nil {
			log.Printf("write CSV row for %q: %s", movie.url, err)
		}
		log.Printf("%+v", movie)
	})

	// A clone carries no callbacks, so no error handler is registered on it;
	// the error returned by Visit is the only report of a failed fetch.
	if err := detail.Visit(url); err != nil {
		log.Printf("visit %q: %s", url, err)
	}
}
更多推荐
Colly实现豆瓣电影Top250爬取
发布评论