1. Quick Start
In just three steps, you can deploy a crawler that crawls all the news on GoCN.
First, generate your own personal access token on GitHub: Settings -> Developer settings -> Personal access tokens -> Generate new token.
Then either set it as an environment variable with export GITHUB_TOKEN=(the token generated in the first step), or change the global Token in the code to your own token:
var Token = GetValueFromEnv(“GITHUB_TOKEN”)
Second, install Redis locally and start it on the default port 6379 before starting the program, because the program uses Redis for deduplication by default. For installation instructions, see the Redis installation guide.
Third, clone the code repository, build it, and run the crawler as a background process. Every 6 hours it crawls the day's news and pushes it to GitHub.
# Fetch the crawler source and enter the repository directory.
git clone https://github.com/lubanproj/crawl.git
cd crawl
# Build the binary (-v prints the names of packages as they are compiled).
go build -v
# Start the crawler as a background job of the current shell.
./crawl &
Copy the code
2. Features
- Supports regular daily crawling
- Paging crawl is supported
- Data deduplication is supported
- Github push is supported
3. Results
go_read
4. Source code analysis
(1) Crawl the website
// Crawl all gocn topics
func Crawl(url string) {
pattern := `/topics/\d+`
collector := colly.NewCollector()
collector.OnHTML("a[title]", func(e *colly.HTMLElement) {
// regex match topic
path := e.Attr("href")
topic, ok := regexMatch(path, pattern)
if ok {
e.Request.Visit(fmt.Sprintf("https://gocn.vip%s",topic))
}
})
redisAddr := ": 6379"
conn, err := redis.Dial("tcp",redisAddr)
iferr ! = nil { log.Fatalf("get redis conn error : %v", err)
}
defer conn.Close()
collector.OnRequest(func(r *colly.Request) {
topic, ok := regexMatch(r.URL.Path, pattern)
if ok {
r.Visit(fmt.Sprintf("https://gocn.vip%s",topic))
// fmt.Println("content",r.URL)
}
})
collector.OnResponse(func(r *colly.Response) {
topic := strings.Replace(r.Request.URL.Path,"/topics/"."".- 1)
isExist, err := existTopic(conn, topic)
// the topic has had crawled
if isExist == 1|| err ! = nil {return
}
title, content, ok := parseContent(string(r.Body))
titleAndContent := fmt.Sprintf("<h3>%s</h3>%s<hr>", title, content)
fmt.Println("titleAndContent : ", titleAndContent)
date := getDate(title)
if curDay := time.Now().Format("2006-01-02"); curDay ! = date {// just climb today's data
return
}
ifok && content ! =""&& title ! ="" {
pushToGithub(titleAndContent, Token)
}
saveDB(conn, topic, date)
})
collector.Visit(url)
}
Copy the code
(2) Parsing content with regular expressions
func parseContent(body string) (string.string.bool) {the pattern: = ` < p > GoCN (. | | \ n \ t) * daily news (. *?) </p>` title, _ := regexMatch(body, pattern)if title == "" {
pattern = `<h[09 -] > GoCN (. | | \ n \ t) * daily news (. | | \ n \ t) * < / h [09 -] >? ` title, _ = regexMatch(body, pattern)if title == "" {
return ""."".false} the pattern = ` > (. | | \ n \ t) * daily news (. | | \ n \ t) * (` title, _ = regexMatch (title, the pattern) title = strings. The Replace (title,"<"."".1)
title = strings.Replace(title, ">"."".1)
}
pattern = `<ol>(.|\n|\t)*</ol>`
content, _ := regexMatch(body, pattern)
return title, content, true
}
Copy the code
(3) Push to GitHub
func pushToGithub(data, token string) error {
if data == "" {
return errors.New("params error")
}
ctx := context.Background()
ts := oauth2.StaticTokenSource(
&oauth2.Token{AccessToken: token},
)
tc := oauth2.NewClient(ctx, ts)
client := github.NewClient(tc)
c := "feat: add gocn news, date : " + time.Now().Format("2006-01-02")
sha := ""
content := &github.RepositoryContentFileOptions{
Message: &c,
SHA: &sha,
Committer: &github.CommitAuthor{
Name: github.String("lubanproj"),
Email: github.String("[email protected]"),
Login: github.String("lubanproj"),
},
Author: &github.CommitAuthor{
Name: github.String("lubanproj"),
Email: github.String("[email protected]"),
Login: github.String("lubanproj"),
},
Branch: github.String("master"),
}
op := &github.RepositoryContentGetOptions{}
repo, _, _, er := client.Repositories.GetContents(ctx, "lubanproj"."go_read"."README.md", op)
ifer ! = nil || repo == nil { fmt.Println("get github repositories error, date: ", time.Now())
return er
}
content.SHA = repo.SHA
decodeBytes, err := base64.StdEncoding.DecodeString(*repo.Content)
iferr ! = nil { fmt.Println("decode repo error, ",err)
return err
}
oldContentList := strings.Split(string(decodeBytes), "<br>")
iflen(oldContentList) ! = 2 { fmt.Println("README.md format error")
}
content.Content = []byte(oldContentList[0] + "<br>" + data + oldContentList[1])
_, _, err = client.Repositories.UpdateFile(ctx, "lubanproj"."go_read"."README.md", content)
iferr ! = nil { println(err)return err
}
return nil
}
Copy the code