To make it easier to look up problems that come up in Taro development, I wrote a simple crawler that scrapes everyone's issues into a file so they can be searched.

import requests
from lxml import html


# Request headers captured from a browser session. The cookie is the author's
# own session cookie; public issue pages generally load without it.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.212 Safari/537.36',
    'cookie': '_octo=GH1.1.1603785005.1622016794; logged_in=no; _gh_sess=Y9sNKguvJvVZBhj5Hy8%2BYEjMct2okitWOD%2BH8LLc%2FcFRnwLnTC%2BbZa9%2BxHAq5l1n%2FJ4uQrx4Vk2vS8JxzbE%2F6%2FeAIGKgr9ty%2Byz%2FRaD1SFH1YdqKh23FyR8gorjxXDjG2Z6U8kmW9iG61c0P8arKwSSylKpCV8aN6U1ApjCqSURVjV9ic9pSVucAVUw%2FoFesTuKQQqmNh3RlOYXEkrBecHFJj2vYXx%2B768Sxo%2FM6sxJ0pnavDSDIDWWHIIh%2FNeWosGcMAgd3BivBWhIfgbIDDw%3D%3D--1BqbR%2BZukQlv2cZf--byyWOrkNOr5SxXtdt%2BdUvw%3D%3D; tz=Asia%2FShanghai'
}


def get_data():
    data = []
    for num in range(1, 30):
        print('Fetching page ' + str(num) + '...')
        data_list = get_data_from_url(num)
        data.extend(data_list)
    print('Fetching done, writing to file...')
    # Open the output file once; mode "w" truncates anything left over
    # from a previous run.
    with open("./1.txt", "w", encoding="utf-8") as f:
        for index, item in enumerate(data):
            f.write('(' + str(index + 1) + ') <' + item['id'] + '>[' + item['href'] + ']: ' + item['text'] + '\r\n')
    print('Finished writing file.')


def get_data_from_url(page):
    url = get_url(page)
    res = requests.get(url, headers=header)
    # html.fromstring() parses the raw bytes and detects the encoding itself.
    tree = html.fromstring(res.content)
    # This XPath is written against GitHub's issue-list markup at the time of
    # writing and will break whenever GitHub changes the page structure.
    tags_a = tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[4]/div[2]/div/div/div/div[2]/a')
    data_list = []
    # Skip the first matched anchor; the rest are issue title links.
    for tag in tags_a[1:]:
        detail = tag.attrib
        data_list.append({
            'text': tag.text or '',  # .text can be None for anchors without direct text
            'id': detail['id'],
            'href': 'https://github.com' + detail['href']
        })
    return data_list


def get_url(page):
    url = 'https://github.com/NervJS/taro/issues?'
    param = {
        'page': page
    }
    # Build the query string by hand; the trailing '&' is harmless.
    for key, value in param.items():
        url += key + '=' + str(value) + '&'
    return url


if __name__ == '__main__':
    get_data()
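
The XPath above is coupled to GitHub's page markup, so the scraper breaks whenever the layout changes. A more robust route is GitHub's public REST API, which serves the same issue list as JSON. Below is a minimal sketch of that alternative; get_issues_via_api is a hypothetical helper name, unauthenticated requests are rate-limited to roughly 60 per hour, and the issues endpoint also returns pull requests, which carry a pull_request key:

import requests


def get_issues_via_api(pages=29, per_page=100):
    """Fetch Taro issues via the GitHub REST API instead of scraping HTML."""
    data = []
    for page in range(1, pages + 1):
        res = requests.get(
            'https://api.github.com/repos/NervJS/taro/issues',
            params={'state': 'all', 'page': page, 'per_page': per_page},
            headers={'accept': 'application/vnd.github+json'},
        )
        res.raise_for_status()
        for issue in res.json():
            # The issues endpoint also lists pull requests; skip those.
            if 'pull_request' in issue:
                continue
            data.append({
                'text': issue['title'],
                'id': str(issue['number']),
                'href': issue['html_url'],
            })
    return data

Either way, the resulting 1.txt is plain text, so an editor search or a command-line grep over it is enough to find an issue.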