mirror of
https://github.com/chai2010/advanced-go-programming-book.git
synced 2025-05-24 04:22:22 +00:00
add colly example
This commit is contained in:
parent
625e8494c2
commit
e87c4dfa51
@ -8,6 +8,72 @@
|
||||
|
||||
## 基于 colly 的单机爬虫
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// visited records the raw href values that the crawler has already decided to
// follow, so that the same link is not requested a second time.
var visited = map[string]bool{}
|
||||
|
||||
func main() {
|
||||
// Instantiate default collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains("www.v2ex.com"),
|
||||
colly.MaxDepth(1),
|
||||
)
|
||||
|
||||
detailRegex, _ := regexp.Compile(`/go/go\?p=\d+$`)
|
||||
listRegex, _ := regexp.Compile(`/t/\d+#\w+`)
|
||||
|
||||
// On every a element which has href attribute call callback
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
|
||||
// 已访问过的详情页或列表页,跳过
|
||||
if visited[link] && (detailRegex.Match([]byte(link)) || listRegex.Match([]byte(link))) {
|
||||
return
|
||||
}
|
||||
|
||||
// 匹配下列两种 url 模式的,才去 visit
|
||||
// https://www.v2ex.com/go/go?p=2
|
||||
// https://www.v2ex.com/t/472945#reply3
|
||||
if !detailRegex.Match([]byte(link)) && !listRegex.Match([]byte(link)) {
|
||||
println("not match", link)
|
||||
return
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
println("match", link)
|
||||
|
||||
visited[link] = true
|
||||
|
||||
time.Sleep(time.Millisecond * 2)
|
||||
c.Visit(e.Request.AbsoluteURL(link))
|
||||
})
|
||||
|
||||
// Before making a request
|
||||
c.OnRequest(func(r *colly.Request) {
|
||||
/*
|
||||
r.Headers.Set("Cookie", "")
|
||||
r.Headers.Set("DNT", "1")
|
||||
r.Headers.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
|
||||
r.Headers.Set("Host", "www.v2ex.com")
|
||||
*/
|
||||
})
|
||||
|
||||
err := c.Visit("https://www.v2ex.com/go/go")
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## 分布式爬虫
|
||||
|
||||
想象一下,你们的信息分析系统运行得非常快,获取信息的速度成为了瓶颈。虽然可以用上 Go 语言所有优秀的并发特性,将单机的 CPU 和网络带宽都用满,但还是希望能够进一步加快爬虫的爬取速度。在很多场景下,速度是有意义的:
|
||||
|
Loading…
x
Reference in New Issue
Block a user