import requests
from lxml import etree, html
header = {
    # Request headers that mimic a desktop Chrome browser; the cookie is a
    # session-specific value copied from the browser's developer tools
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.212 Safari/537.36',
    'cookie': '_octo=GH1.1.1603785005.1622016794; logged_in=no; _gh_sess=Y9sNKguvJvVZBhj5Hy8%2BYEjMct2okitWOD%2BH8LLc%2FcFRnwLnTC%2BbZa9%2BxHAq5l1n%2FJ4uQrx4Vk2vS8JxzbE%2F6%2FeAIGKgr9ty%2Byz%2FRaD1SFH1YdqKh23FyR8gorjxXDjG2Z6U8kmW9iG61c0P8arKwSSylKpCV8aN6U1ApjCqSURVjV9ic9pSVucAVUw%2FoFesTuKQQqmNh3RlOYXEkrBecHFJj2vYXx%2B768Sxo%2FM6sxJ0pnavDSDIDWWHIIh%2FNeWosGcMAgd3BivBWhIfgbIDDw%3D%3D--1BqbR%2BZukQlv2cZf--byyWOrkNOr5SxXtdt%2BdUvw%3D%3D; tz=Asia%2FShanghai'
}
def get_data():
    # Clear the output file, then fetch 29 pages of issues and write them out
    with open("./1.txt", "w") as f:
        f.truncate()
    data = []
    for num in range(1, 30):
        print('Fetching page ' + str(num) + '...')
        data_list = get_data_from_url(num)
        data.extend(data_list)
    print('Fetching done, writing to file...')
    for index in range(len(data)):
        item = data[index]
        with open("./1.txt", "a", encoding="utf-8") as f:
            f.write('(' + str(index + 1) + ') <' + item['id'] + '> [' + item['href'] + '] : ' + item['text'] + '\r\n')
    print('Writing file complete...')
def get_data_from_url(page):
    url = get_url(page)
    res = requests.get(url, headers=header)
    res.encoding = 'utf-8'
    tree = html.fromstring(res.content)
    # Issue title links on the GitHub issues list page
    tags_a = tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[4]/div[2]/div/div/div/div[2]/a')
    tags_a_len = len(tags_a)
    data_list = []
    for index in range(1, tags_a_len):
        detail = tags_a[index].attrib
        data_list.append({
            'text': tags_a[index].text,
            'id': detail['id'],
            'href': 'https://github.com' + detail['href']
        })
    return data_list
def get_url(page):
    # Build the issues URL with the page number as a query parameter
    url = 'https://github.com/NervJS/taro/issues?'
    param = {
        'page': page
    }
    for i in param:
        string = i + '=' + str(param[i]) + '&'
        url = url + string
    return url
get_data()