diff --git a/ch6-cloud/ch6-09-crawler.md b/ch6-cloud/ch6-09-crawler.md
index 532c1a4..1418bba 100644
--- a/ch6-cloud/ch6-09-crawler.md
+++ b/ch6-cloud/ch6-09-crawler.md
@@ -8,6 +8,85 @@
 
 ## 基于 colly 的单机爬虫
 
+```go
+package main
+
+import (
+	"fmt"
+	"regexp"
+	"time"
+
+	"github.com/gocolly/colly"
+)
+
+// visited records the absolute URL of every page already queued for crawling.
+var visited = map[string]bool{}
+
+var (
+	// listRegex matches paginated node listings such as
+	// https://www.v2ex.com/go/go?p=2
+	listRegex = regexp.MustCompile(`/go/go\?p=\d+$`)
+
+	// detailRegex matches topic pages such as
+	// https://www.v2ex.com/t/472945#reply3
+	detailRegex = regexp.MustCompile(`/t/\d+#\w+`)
+)
+
+// crawlDelay is the pause inserted before each follow-up request.
+const crawlDelay = time.Second
+
+func main() {
+	// Instantiate the collector, restricted to www.v2ex.com.
+	c := colly.NewCollector(
+		colly.AllowedDomains("www.v2ex.com"),
+		colly.MaxDepth(1),
+	)
+
+	// Run the callback for every <a> element that carries an href attribute.
+	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
+		link := e.Attr("href")
+
+		// Only list pages and topic pages are followed; everything else is
+		// reported and dropped.
+		if !listRegex.MatchString(link) && !detailRegex.MatchString(link) {
+			fmt.Println("not match", link)
+			return
+		}
+
+		// Deduplicate on the resolved URL so that relative and absolute
+		// spellings of the same page are only queued once. An empty result
+		// means the href could not be resolved, so there is nothing to visit.
+		target := e.Request.AbsoluteURL(link)
+		if target == "" || visited[target] {
+			return
+		}
+		visited[target] = true
+
+		// Pause before issuing the follow-up request.
+		time.Sleep(crawlDelay)
+		fmt.Println("match", link)
+
+		if err := c.Visit(target); err != nil {
+			fmt.Println("visit", target, err)
+		}
+	})
+
+	// Hook that runs before each request; the commented-out lines show how
+	// custom headers could be attached.
+	c.OnRequest(func(r *colly.Request) {
+		/*
+			r.Headers.Set("Cookie", "")
+			r.Headers.Set("DNT", "1")
+			r.Headers.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
+			r.Headers.Set("Host", "www.v2ex.com")
+		*/
+	})
+
+	err := c.Visit("https://www.v2ex.com/go/go")
+	if err != nil {
+		fmt.Println(err)
+	}
+}
+
+```
+
 ## 分布式爬虫
 
 想像一下,你们的信息分析系统运行非常之快。获取信息的速度成为了瓶颈,虽然可以用上 Go 语言所有优秀的并发特性,将单机的 CPU 和网络带宽都用满,但还是希望能够加快爬虫的爬取速度。在很多场景下,速度是有意义的: