Let's take a look at Bilibili's anime-following (追番) popularity data as a small exercise. The crawler below pulls the series list from Bilibili's pgc index API, then runs three queue-connected worker threads that download each detail page, extract the fields we want, and append them to a text file.
import requests
import json
from queue import Queue
import threading
from lxml import etree
import re
class Drama:
    def __init__(self):
        # Index API that returns the anime catalogue as JSON (pagesize=4000 grabs it in one request)
        self.start_url = 'https://api.bilibili.com/pgc/season/index/result?season_version=-1&page=1&season_type=1&pagesize=4000&type=1'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
        # Three queues chain the workers into a pipeline: URLs -> raw HTML -> parsed items
        self.detail_list_queue = Queue()
        self.detail_response_queue = Queue()
        self.response_data_queue = Queue()
    def get_start_data(self):
        # Fetch the index API once and queue every detail-page link
        response = requests.get(self.start_url, headers=self.headers)
        content = json.loads(response.content.decode())
        for i in content['data']['list']:
            self.detail_list_queue.put(i['link'])
    def parse_detail_url(self):
        # Worker: download each detail page and hand its HTML to the parser
        while True:
            url = self.detail_list_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.detail_response_queue.put(response.content.decode())
            self.detail_list_queue.task_done()
    def analysis_detail(self):
        # Worker: pull the fields out of each detail page with XPath and regex
        while True:
            content = self.detail_response_queue.get()
            html = etree.HTML(content)
            item = {}
            item['番剧名称'] = html.xpath('//a[@class="media-title"]/text()')
            item['番剧描述'] = html.xpath('//span[@class="absolute"]/text()')
            item['番剧状态'] = html.xpath('//span[@class="pub-info"]/text()')
            item['番剧链接'] = re.findall(r'property="og:url" content="(.*?)">', content)
            item['追番人数'] = re.findall(r'"favorites":(.*?),"', content)
            item['番剧海报'] = re.findall(r'<meta property="og:image" content="(.*?)"><meta name="spm_prefix"', content)
            self.response_data_queue.put(item)
            self.detail_response_queue.task_done()
    def save_data(self):
        # Worker: append each parsed item to a text file
        while True:
            item = self.response_data_queue.get()
            with open('番剧预览.txt', 'a', encoding='utf-8') as f:
                f.write('番剧名称:{}\n'.format(item['番剧名称']))
                f.write('追番人数:{}\n'.format(item['追番人数']))
                f.write('番剧状态:{}\n'.format(item['番剧状态']))
                f.write('番剧链接:{}\n'.format(item['番剧链接']))
                f.write('番剧海报:{}\n'.format(item['番剧海报']))
                # Guard against pages where the description span is missing
                desc = item['番剧描述'][0] if item['番剧描述'] else ''
                f.write('番剧描述:\n{}\n\n'.format(desc))
            self.response_data_queue.task_done()
    def run(self):
        self.get_start_data()
        # One worker thread per pipeline stage
        t1 = threading.Thread(target=self.parse_detail_url)
        t2 = threading.Thread(target=self.analysis_detail)
        t3 = threading.Thread(target=self.save_data)
        t_ls = [t1, t2, t3]
        for t in t_ls:
            t.daemon = True  # daemon threads exit with the main thread; setDaemon() is deprecated
            t.start()
        # Block until every queue has been fully drained, then let the daemons die
        self.detail_list_queue.join()
        self.detail_response_queue.join()
        self.response_data_queue.join()
if __name__ == '__main__':
    a = Drama()
    a.run()
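The three queues turn the crawler into a producer-consumer pipeline (URLs -> HTML -> parsed items), and the queue.join() calls in run() keep the main thread alive until every task_done() has fired, after which the daemon workers are simply dropped. Before launching the full crawl, it can help to hit the index API once and confirm the JSON layout. The snippet below is a minimal sketch of that check, assuming the data -> list -> link structure used above; the pagesize=20 value and the title key are illustrative assumptions, not something the program above confirms.

import json
import requests

# Minimal sanity check of the index API before running the full crawler.
# The 'data' -> 'list' -> 'link' layout matches what Drama.get_start_data expects;
# 'pagesize=20' and the 'title' key are assumptions for illustration only.
url = ('https://api.bilibili.com/pgc/season/index/result'
       '?season_version=-1&page=1&season_type=1&pagesize=20&type=1')
headers = {'user-agent': 'Mozilla/5.0'}

data = json.loads(requests.get(url, headers=headers).content.decode())
for entry in data['data']['list'][:5]:
    # each entry's 'link' is the detail-page URL consumed by Drama.parse_detail_url
    print(entry.get('title'), entry['link'])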