import requests
from lxml import etree, html
header = {
    # Request headers that mimic a desktop Chrome browser; the cookie is a
    # session-specific value copied from the browser's developer tools
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.212 Safari/537.36',
    'cookie': '_octo=GH1.1.1603785005.1622016794; logged_in=no; _gh_sess=Y9sNKguvJvVZBhj5Hy8%2BYEjMct2okitWOD%2BH8LLc%2FcFRnwLnTC%2BbZa9%2BxHAq5l1n%2FJ4uQrx4Vk2vS8JxzbE%2F6%2FeAIGKgr9ty%2Byz%2FRaD1SFH1YdqKh23FyR8gorjxXDjG2Z6U8kmW9iG61c0P8arKwSSylKpCV8aN6U1ApjCqSURVjV9ic9pSVucAVUw%2FoFesTuKQQqmNh3RlOYXEkrBecHFJj2vYXx%2B768Sxo%2FM6sxJ0pnavDSDIDWWHIIh%2FNeWosGcMAgd3BivBWhIfgbIDDw%3D%3D--1BqbR%2BZukQlv2cZf--byyWOrkNOr5SxXtdt%2BdUvw%3D%3D; tz=Asia%2FShanghai'
}
def get_data():
    # Clear the output file, then fetch 29 pages of issues and write them out
    with open("./1.txt", "w") as f:
        f.truncate()
    data = []
    for num in range(1, 30):
        print('Fetching page ' + str(num) + '...')
        data_list = get_data_from_url(num)
        data.extend(data_list)
    print('Fetching done, writing to file...')
    for index in range(len(data)):
        item = data[index]
        with open("./1.txt", "a", encoding="utf-8") as f:
            f.write('(' + str(index + 1) + ') <' + item['id'] + '> [' + item['href'] + '] : ' + item['text'] + '\r\n')
    print('Writing file complete...')
def get_data_from_url(page):
    url = get_url(page)
    res = requests.get(url, headers=header)
    res.encoding = 'utf-8'
    tree = html.fromstring(res.content)
    # Issue title links on the GitHub issues list page
    tags_a = tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[4]/div[2]/div/div/div/div[2]/a')
    tags_a_len = len(tags_a)
    data_list = []
    for index in range(1, tags_a_len):
        detail = tags_a[index].attrib
        data_list.append({
            'text': tags_a[index].text,
            'id': detail['id'],
            'href': 'https://github.com' + detail['href']
        })
    return data_list
def get_url(page):
    # Build the issues URL with the page number as a query parameter
    url = 'https://github.com/NervJS/taro/issues?'
    param = {
        'page': page
    }
    for i in param:
        string = i + '=' + str(param[i]) + '&'
        url = url + string
    return url
get_data()