1. Quick Start

In just three steps, you can deploy a crawler that crawls all the news on goCN

First, you need to generate your own token on GitHub: Settings → Developer settings → Personal access tokens → Generate new token

You can then configure your own environment variable export GITHUB_TOKEN=(the token generated in the first step) or change the global token in your code to your own token

var Token = GetValueFromEnv("GITHUB_TOKEN")

In the second step, you need to install Redis locally, and you need to start local Redis before starting the program, using the default port 6379, because the program uses Redis for deduplication by default. Redis installation can refer to redis installation

The third step: git clone the code repository and run the crawler as a background process. Every 6 hours it crawls the day's news and pushes it to GitHub.

git clone https://github.com/lubanproj/crawl.git
cd crawl
go build -v 
./crawl &
Copy the code

2. Features

  • Supports regular daily crawling
  • Paging crawl is supported
  • Data deduplication is supported
  • Github push is supported

3. Result

go_read

4. Source code analysis

(1) Crawl the website

// Crawl all gocn topics
func Crawl(url string) {

	pattern := `/topics/\d+`

	collector := colly.NewCollector()
	collector.OnHTML("a[title]", func(e *colly.HTMLElement) {
		// regex match topic
		path := e.Attr("href")
		topic, ok := regexMatch(path, pattern)
		if ok {
			e.Request.Visit(fmt.Sprintf("https://gocn.vip%s",topic))
		}
	})

	redisAddr := ": 6379"
	conn, err := redis.Dial("tcp",redisAddr)
	iferr ! = nil { log.Fatalf("get redis conn error : %v", err)
	}
	defer conn.Close()

	collector.OnRequest(func(r *colly.Request) {
		topic, ok := regexMatch(r.URL.Path, pattern)
		if ok {
			r.Visit(fmt.Sprintf("https://gocn.vip%s",topic))
			// fmt.Println("content",r.URL)
		}

	})

	collector.OnResponse(func(r *colly.Response) {

		topic := strings.Replace(r.Request.URL.Path,"/topics/"."".- 1)
		isExist, err := existTopic(conn, topic)

		// the topic has had crawled
		if isExist == 1|| err ! = nil {return
		}

		title, content, ok := parseContent(string(r.Body))
		titleAndContent := fmt.Sprintf("<h3>%s</h3>%s<hr>", title, content)
		fmt.Println("titleAndContent : ", titleAndContent)

		date := getDate(title)
		if curDay := time.Now().Format("2006-01-02"); curDay ! = date {// just climb today's data
			return
		}

		ifok && content ! =""&& title ! ="" {
			pushToGithub(titleAndContent, Token)
		}

		saveDB(conn, topic, date)
	})

	collector.Visit(url)
}
Copy the code

(2) Regular expression parsing content

func parseContent(body string) (string.string.bool) {the pattern: = ` < p > GoCN (. | | \ n \ t) * daily news (. *?) </p>` title, _ := regexMatch(body, pattern)if title == "" {
		pattern = `<h[09 -] > GoCN (. | | \ n \ t) * daily news (. | | \ n \ t) * < / h [09 -] >? ` title, _ = regexMatch(body, pattern)if title == "" {
			return ""."".false} the pattern = ` > (. | | \ n \ t) * daily news (. | | \ n \ t) * (` title, _ = regexMatch (title, the pattern) title = strings. The Replace (title,"<"."".1)
		title = strings.Replace(title, ">"."".1)
	}

	pattern = `<ol>(.|\n|\t)*</ol>`
	content, _ := regexMatch(body, pattern)

	return title, content, true
}
Copy the code

(3) Push Github

func pushToGithub(data, token string) error {
	if data == "" {
		return errors.New("params error")
	}

	ctx := context.Background()
	ts := oauth2.StaticTokenSource(
		&oauth2.Token{AccessToken: token},
	)

	tc := oauth2.NewClient(ctx, ts)
	client := github.NewClient(tc)
	c := "feat: add gocn news, date : " + time.Now().Format("2006-01-02")
	sha := ""
	content := &github.RepositoryContentFileOptions{
		Message: &c,
		SHA:     &sha,
		Committer: &github.CommitAuthor{
			Name:  github.String("lubanproj"),
			Email: github.String("[email protected]"),
			Login: github.String("lubanproj"),
		},
		Author: &github.CommitAuthor{
			Name:  github.String("lubanproj"),
			Email: github.String("[email protected]"),
			Login: github.String("lubanproj"),
		},
		Branch: github.String("master"),
	}
	op := &github.RepositoryContentGetOptions{}

	repo, _, _, er := client.Repositories.GetContents(ctx, "lubanproj"."go_read"."README.md", op)
	ifer ! = nil || repo == nil { fmt.Println("get github repositories error, date: ", time.Now())
		return er
	}

	content.SHA = repo.SHA
	decodeBytes, err := base64.StdEncoding.DecodeString(*repo.Content)
	iferr ! = nil { fmt.Println("decode repo error, ",err)
		return err
	}


	oldContentList := strings.Split(string(decodeBytes), "<br>")
	iflen(oldContentList) ! = 2 { fmt.Println("README.md format error")
	}

	content.Content = []byte(oldContentList[0] + "<br>" + data + oldContentList[1])

	_, _, err = client.Repositories.UpdateFile(ctx, "lubanproj"."go_read"."README.md", content)

	iferr ! = nil { println(err)return err
	}

	return nil
}
Copy the code

For the complete source code, please refer to: crawl