import requests
from lxml import html
from urllib.parse import urlencode

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.212 Safari/537.36',
    'cookie': '_octo=GH1.1.1603785005.1622016794; logged_in=no; _gh_sess=Y9sNKguvJvVZBhj5Hy8%2BYEjMct2okitWOD%2BH8LLc%2FcFRnwLnTC%2BbZa9%2BxHAq5l1n%2FJ4uQrx4Vk2vS8JxzbE%2F6%2FeAIGKgr9ty%2Byz%2FRaD1SFH1YdqKh23FyR8gorjxXDjG2Z6U8kmW9iG61c0P8arKwSSylKpCV8aN6U1ApjCqSURVjV9ic9pSVucAVUw%2FoFesTuKQQqmNh3RlOYXEkrBecHFJj2vYXx%2B768Sxo%2FM6sxJ0pnavDSDIDWWHIIh%2FNeWosGcMAgd3BivBWhIfgbIDDw%3D%3D--1BqbR%2BZukQlv2cZf--byyWOrkNOr5SxXtdt%2BdUvw%3D%3D; tz=Asia%2FShanghai'
}
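# Note: the cookie above is tied to one browser session and will expire;
# the public issues list renders without a login, so it can usually be
# refreshed or dropped entirely if requests start failing.
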
def get_data():
    # Collect issue links from list pages 1-29, then write them all out.
    data = []
    for num in range(1, 30):
        print('Page ' + str(num) + '.....')
        data_list = get_data_from_url(num)
        data.extend(data_list)
    print('Fetching done, writing to file...')
    # Mode "w" already truncates, so no separate truncate() call is needed,
    # and opening the file once beats re-opening it for every item.
    with open("./1.txt", "w", encoding="utf-8") as f:
        for index, item in enumerate(data, start=1):
            # Text mode translates '\n' per platform; the original '\r\n'
            # would come out as '\r\r\n' on Windows.
            f.write('(' + str(index) + ') <' + item['id'] + '>[' + item['href'] + ']: ' + item['text'] + '\n')
    print('Writing done...')

def get_data_from_url(page):
    url = get_url(page)
    res = requests.get(url, headers=header)
    # html.fromstring() is handed the raw bytes and reads the encoding from
    # the document itself; setting res.encoding would only affect res.text.
    tree = html.fromstring(res.content)
    # This XPath is tied to GitHub's current issue-list markup and will break
    # whenever the page layout changes.
    tags_a = tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[4]/div[2]/div/div/div/div[2]/a')
    data_list = []
    # Iterate over every match; the original range(1, len(tags_a)) skipped
    # the first issue on each page.
    for tag in tags_a:
        detail = tag.attrib
        data_list.append({
            'text': tag.text,
            'id': detail['id'],
            'href': 'https://github.com' + detail['href']
        })
    return data_list

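# Alternative (not called above): GitHub exposes the same data through its
# REST API, which is far more stable than XPath scraping of the HTML page.
# A minimal sketch, assuming the unauthenticated rate limit (60 requests/hour)
# is acceptable; note the endpoint also returns pull requests, and the 'id'
# field here is synthesized from the issue number rather than read from the DOM.
def get_data_from_api(page):
    res = requests.get(
        'https://api.github.com/repos/NervJS/taro/issues',
        params={'page': page},
        headers={'accept': 'application/vnd.github+json'},
    )
    return [
        {
            'text': issue['title'],
            'id': 'issue_' + str(issue['number']),
            'href': issue['html_url'],
        }
        for issue in res.json()
    ]
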
def get_url(page):
    # urlencode handles escaping and avoids the trailing '&' that the
    # original manual string concatenation left behind.
    param = {
        'page': page
    }
    return 'https://github.com/NervJS/taro/issues?' + urlencode(param)

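# Note: requests can also build the query string itself, which would make
# get_url unnecessary, e.g.:
#   requests.get('https://github.com/NervJS/taro/issues', headers=header, params={'page': page})
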
if __name__ == '__main__':
    get_data()