mirror of
https://github.com/chai2010/advanced-go-programming-book.git
synced 2025-05-24 20:52:22 +00:00
add colly example
This commit is contained in:
parent
625e8494c2
commit
e87c4dfa51
@ -8,6 +8,72 @@
|
|||||||
|
|
||||||
## 基于 colly 的单机爬虫
|
## 基于 colly 的单机爬虫
|
||||||
|
|
||||||
|
```go
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/gocolly/colly"
|
||||||
|
)
|
||||||
|
|
||||||
|
// visited records the href values of links that main has already followed,
// so that the same link is not visited a second time.
var visited = map[string]bool{}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
// Instantiate default collector
|
||||||
|
c := colly.NewCollector(
|
||||||
|
colly.AllowedDomains("www.v2ex.com"),
|
||||||
|
colly.MaxDepth(1),
|
||||||
|
)
|
||||||
|
|
||||||
|
detailRegex, _ := regexp.Compile(`/go/go\?p=\d+$`)
|
||||||
|
listRegex, _ := regexp.Compile(`/t/\d+#\w+`)
|
||||||
|
|
||||||
|
// On every a element which has href attribute call callback
|
||||||
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||||
|
link := e.Attr("href")
|
||||||
|
|
||||||
|
// 已访问过的详情页或列表页,跳过
|
||||||
|
if visited[link] && (detailRegex.Match([]byte(link)) || listRegex.Match([]byte(link))) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// 匹配下列两种 url 模式的,才去 visit
|
||||||
|
// https://www.v2ex.com/go/go?p=2
|
||||||
|
// https://www.v2ex.com/t/472945#reply3
|
||||||
|
if !detailRegex.Match([]byte(link)) && !listRegex.Match([]byte(link)) {
|
||||||
|
println("not match", link)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
time.Sleep(time.Second)
|
||||||
|
println("match", link)
|
||||||
|
|
||||||
|
visited[link] = true
|
||||||
|
|
||||||
|
time.Sleep(time.Millisecond * 2)
|
||||||
|
c.Visit(e.Request.AbsoluteURL(link))
|
||||||
|
})
|
||||||
|
|
||||||
|
// Before making a request
|
||||||
|
c.OnRequest(func(r *colly.Request) {
|
||||||
|
/*
|
||||||
|
r.Headers.Set("Cookie", "")
|
||||||
|
r.Headers.Set("DNT", "1")
|
||||||
|
r.Headers.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
|
||||||
|
r.Headers.Set("Host", "www.v2ex.com")
|
||||||
|
*/
|
||||||
|
})
|
||||||
|
|
||||||
|
err := c.Visit("https://www.v2ex.com/go/go")
|
||||||
|
if err != nil {
|
||||||
|
fmt.Println(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
## 分布式爬虫
|
## 分布式爬虫
|
||||||
|
|
||||||
想象一下,你们的信息分析系统运行非常之快。获取信息的速度成为了瓶颈,虽然可以用上 Go 语言所有优秀的并发特性,将单机的 CPU 和网络带宽都用满,但还是希望能够加快爬虫的爬取速度。在很多场景下,速度是有意义的:
|
想象一下,你们的信息分析系统运行非常之快。获取信息的速度成为了瓶颈,虽然可以用上 Go 语言所有优秀的并发特性,将单机的 CPU 和网络带宽都用满,但还是希望能够加快爬虫的爬取速度。在很多场景下,速度是有意义的:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user