Python爬虫16--哔哩哔哩追番热度信息

147 阅读1分钟

 看看哔哩哔哩追番热度信息,做个小练习。

import requests
import json
from queue import Queue
import threading
from lxml import etree
import re


class Drama:
    """Crawl bilibili anime ("bangumi") popularity info and append it to a text file.

    Pipeline (producer/consumer, one queue between each stage):
        index API -> detail-page URLs -> raw HTML -> extracted item dicts -> file.
    Each stage after the first runs in its own daemon thread; run() joins the
    queues so the program exits once every queued item has been processed.
    """

    def __init__(self):
        # Index endpoint: one huge page (pagesize=4000) listing all anime seasons.
        self.start_url = 'https://api.bilibili.com/pgc/season/index/result?season_version=-1&page=1&season_type=1&pagesize=4000&type=1'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
        self.detail_list_queue = Queue()      # detail-page URLs to fetch
        self.detail_response_queue = Queue()  # fetched detail-page HTML strings
        self.response_data_queue = Queue()    # extracted item dicts to persist

    def get_start_data(self):
        """Fetch the index API once and enqueue every anime's detail-page URL."""
        # timeout so a stalled connection cannot hang the whole pipeline at startup
        response = requests.get(self.start_url, headers=self.headers, timeout=30)
        content = response.json()  # same as json.loads(response.content.decode())
        for entry in content['data']['list']:
            self.detail_list_queue.put(entry['link'])

    def parse_detail_url(self):
        """Worker: fetch each queued detail page and enqueue its decoded HTML."""
        while True:
            url = self.detail_list_queue.get()
            print(url)
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                self.detail_response_queue.put(response.content.decode())
            except requests.RequestException as exc:
                # Skip a failed page rather than letting the exception kill the
                # thread -- an un-acked item would leave queue.join() blocked forever.
                print('request failed: {} ({})'.format(url, exc))
            finally:
                self.detail_list_queue.task_done()

    def analysis_detail(self):
        """Worker: extract fields from a detail page's HTML and enqueue them.

        Values are kept as lists (raw xpath/findall results); save_data formats
        them for output.
        """
        while True:
            content = self.detail_response_queue.get()
            html = etree.HTML(content)
            item = {}
            item['番剧名称'] = html.xpath('//a[@class="media-title"]/text()')
            item['番剧描述'] = html.xpath('//span[@class="absolute"]/text()')
            item['番剧状态'] = html.xpath('//span[@class="pub-info"]/text()')
            # Some fields are easier to grab from the raw HTML with regexes
            # (they live in embedded JSON / meta tags, not regular elements).
            item['番剧链接'] = re.findall(r'property="og:url" content="(.*?)">', content)
            item['追番人数'] = re.findall(r'"favorites":(.*?),"', content)
            item['番剧海报'] = re.findall(r'<meta property="og:image" content="(.*?)"><meta name="spm_prefix"', content)
            self.response_data_queue.put(item)
            self.detail_response_queue.task_done()

    def save_data(self):
        """Worker: append every extracted item to 番剧预览.txt."""
        # Open the file ONCE instead of reopening it for every single item.
        with open('番剧预览.txt', 'a', encoding='utf-8') as f:
            while True:
                item = self.response_data_queue.get()
                f.write('番剧名称:{}\n'.format(item['番剧名称']))
                f.write('追番人数:{}\n'.format(item['追番人数']))
                f.write('番剧状态:{}\n'.format(item['番剧状态']))
                f.write('番剧链接:{}\n'.format(item['番剧链接']))
                f.write('番剧海报:{}\n'.format(item['番剧海报']))
                # Guard: the xpath list may be empty on an unexpected page layout;
                # the bare [0] here used to raise IndexError and kill this thread,
                # leaving response_data_queue.join() blocked forever.
                desc = item['番剧描述'][0] if item['番剧描述'] else ''
                f.write('番剧描述:\n{}\n\n'.format(desc))
                # Flush per item: daemon threads die abruptly when main exits,
                # so unbuffered-on-disk is the only durability guarantee we have.
                f.flush()
                self.response_data_queue.task_done()

    def run(self):
        """Seed the URL queue, start the worker threads, and wait for all queues."""
        # Seed the first queue synchronously before the workers start.
        self.get_start_data()
        workers = [
            threading.Thread(target=self.parse_detail_url),  # fetch detail pages
            threading.Thread(target=self.analysis_detail),   # extract fields
            threading.Thread(target=self.save_data),         # persist results
        ]
        for t in workers:
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
        # Block until every stage has drained; daemon threads then die with main.
        self.detail_list_queue.join()
        self.detail_response_queue.join()
        self.response_data_queue.join()


# Entry point: build the crawler and run the whole pipeline to completion.
if __name__ == '__main__':
    crawler = Drama()
    crawler.run()