import requests
from lxml import etree
import json
import threading
from queue import Queue
import time
class XiuShi(object):
    """Multi-threaded scraper for qiushibaike.com (糗事百科).

    Pipeline of daemon-thread stages connected by queues:
        url_list -> get_data -> parse_page -> save_data
    Each stage pulls from one queue and pushes to the next; ``run``
    starts the workers and blocks until every queue is drained.
    """

    def __init__(self):
        # Template for the 13 list pages; filled in by url_list().
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        # Explicit utf-8: records contain Chinese text and the platform
        # default encoding may not handle it. Held open for the object's
        # lifetime; closed in __del__.
        self.file = open('qiushi.json', 'w', encoding='utf-8')
        self.url_list_queue = Queue()    # URLs waiting to be fetched
        self.get_data_queue = Queue()    # raw HTML bytes waiting to be parsed
        self.parse_page_queue = Queue()  # parsed record lists waiting to be saved

    def url_list(self):
        """Producer: enqueue the URLs of list pages 1..13."""
        print('正在生成url列表')
        for page in range(1, 14):
            self.url_list_queue.put(self.url.format(page))

    def get_data(self):
        """Worker: fetch a URL; requeue it on HTTP 503, else pass bytes on."""
        while True:
            print('正在发送请求')
            url = self.url_list_queue.get()
            response = requests.get(url, headers=self.headers)
            if response.status_code == 503:
                # Throttled by the server -- put the URL back for a retry.
                self.url_list_queue.put(url)
            else:
                self.get_data_queue.put(response.content)
            # Always balance the get(); otherwise a 503 retry would leave
            # url_list_queue.join() in run() blocked forever. The re-put URL
            # re-enters the queue as a fresh work item.
            self.url_list_queue.task_done()

    def parse_page(self):
        """Worker: parse one page of HTML into a list of record dicts."""
        while True:
            print('正在解析数据')
            data = self.get_data_queue.get()
            html = etree.HTML(data)
            # Every post lives in an element whose id starts with "qiushi_tag_".
            node_list = html.xpath('//*[contains(@id,"qiushi_tag_")]')
            qiushi_list = []
            for node in node_list:
                qiu_dict = dict()
                try:
                    qiu_dict['user'] = node.xpath('./div[1]/a[2]/h2/text()')[0].strip()
                    qiu_dict['age'] = node.xpath('./div[1]/div/text()')[0]
                    qiu_dict['url'] = 'https://www.qiushibaike.com' + node.xpath('./div[1]/a[1]/@href')[0]
                    qiu_dict['gender'] = node.xpath('./div[1]/div/@class')[0].split(' ')[-1]
                except IndexError:
                    # Anonymous posts lack the author sub-tree, so the [0]
                    # index on an empty xpath result raises IndexError.
                    # (Narrowed from a bare except, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    qiu_dict['user'] = '匿名用户'
                    qiu_dict['age'] = None
                    qiu_dict['url'] = None
                    qiu_dict['gender'] = None
                qiu_dict['content'] = ''.join(node.xpath('./a/div/span/text()')).strip()
                qiushi_list.append(qiu_dict)
            self.parse_page_queue.put(qiushi_list)
            self.get_data_queue.task_done()

    def save_data(self):
        """Worker: serialize each record as one JSON line into qiushi.json."""
        while True:
            print('正在保存数据')
            qiushi_list = self.parse_page_queue.get()
            for qiushi in qiushi_list:
                # ensure_ascii=False keeps the Chinese text readable.
                json_data = json.dumps(qiushi, ensure_ascii=False) + ',\n'
                print(json_data)
                self.file.write(json_data)
            self.parse_page_queue.task_done()

    def __del__(self):
        """关闭文件 (close the output file when the object is destroyed)."""
        self.file.close()

    def run(self):
        """Start all pipeline threads and wait for every queue to drain."""
        threading_list = []
        threading_list.append(threading.Thread(target=self.url_list))
        # Three fetchers and three parsers run concurrently.
        for _ in range(3):
            threading_list.append(threading.Thread(target=self.get_data))
        for _ in range(3):
            threading_list.append(threading.Thread(target=self.parse_page))
        threading_list.append(threading.Thread(target=self.save_data))
        for t in threading_list:
            # Daemon threads die with the main thread once the queues join.
            # (attribute assignment replaces the deprecated setDaemon()).
            t.daemon = True
            t.start()
        for q in (self.url_list_queue, self.get_data_queue, self.parse_page_queue):
            q.join()
# Script entry point: build the scraper and run the full pipeline.
if __name__ == '__main__':
    spider = XiuShi()
    spider.run()