东方财富升级之生产者与消费者

602 阅读 · 3 分钟

生产者与消费者 (学习使用,请勿用于商业行为)

前言:具体分析过程请移步到我另外一篇文章:juejin.cn/post/715271…

# Standard library
import logging
import threading
from queue import Empty, Queue

# Third-party
import parsel
import requests
from jsonpath import jsonpath
# Raise the root logger to INFO so the stage-progress messages below are shown.
logging.getLogger().setLevel(logging.INFO)


# 定义通过股票代码获取对应QuoteID的生产者
# Producer: resolves each raw stock code into a QuoteID via the search API.
class GetQuoteID(threading.Thread):
    def __init__(self, local_queue, QuoteID_queue):
        """Worker thread that drains *local_queue* and fills *QuoteID_queue*.

        :param local_queue: queue of raw stock code strings (input)
        :param QuoteID_queue: queue receiving resolved QuoteID strings (output)
        """
        threading.Thread.__init__(self)
        # Queue holding the local stock codes to resolve.
        self.local_queue = local_queue
        # Queue receiving QuoteIDs fetched from the search API.
        self.QuoteID_queue = QuoteID_queue

    def run(self) -> None:
        # Drain the input queue; exit when it is empty.
        while True:
            try:
                # get_nowait() closes the race between a separate empty()
                # check and get() when several workers share one queue.
                code = self.local_queue.get_nowait()
            except Empty:
                break
            try:
                self.QuoteID_queue.put(self.init_request(code))
            except Exception:
                # The original bare `except: pass` hid every failure
                # (network errors, schema changes, even KeyboardInterrupt).
                logging.exception('failed to resolve QuoteID for code %s', code)

    def init_request(self, code):
        """Query the search API and return the QuoteID for *code*.

        :raises KeyError, IndexError: when the JSON lacks the expected data.
        :raises requests.RequestException: on network failure or timeout.
        """
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Referer': 'https://www.eastmoney.com/',
            'Sec-Fetch-Dest': 'script',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }
        params = {
            'input': code,
            'type': '14',
        }
        # timeout keeps a hung connection from stalling the worker forever.
        response = requests.get('https://searchapi.eastmoney.com/***/get', params=params, cookies=None,
                                headers=headers, timeout=10).json()
        QuoteID = response['QuotationCodeTable']['Data'][0]['QuoteID']
        print(f'QuoteID-----------------------{QuoteID}')
        return QuoteID

# 定义消费者 GetDetail
# Consumer/producer: scrapes each QuoteID's quote page for the detail code
# used by the company-overview API.
class GetDetail(threading.Thread):
    def __init__(self, QuoteID_queue, detail_queue):
        """Worker thread that drains *QuoteID_queue* and fills *detail_queue*.

        :param QuoteID_queue: queue of QuoteID strings (input)
        :param detail_queue: queue receiving detail-code strings (output)
        """
        threading.Thread.__init__(self)
        self.QuoteID_queue = QuoteID_queue
        self.detail_queue = detail_queue

    def run(self) -> None:
        # Drain the input queue; exit when it is empty.
        while True:
            try:
                # get_nowait() avoids the empty()/get() race between workers.
                QuoteID = self.QuoteID_queue.get_nowait()
            except Empty:
                break
            try:
                self.get_detail_code(QuoteID)
            except Exception:
                # Log instead of silently dropping the item (original had
                # a bare `except: pass`).
                logging.exception('failed to extract detail code for %s', QuoteID)

    def get_detail_code(self, QuoteID):
        """Fetch the quote page, extract the overview code, enqueue it.

        :raises ValueError: when the expected link is missing from the page.
        """
        response = requests.get(f'http://quote.eastmoney.com/***/{QuoteID}', timeout=10)
        # Let requests guess the real encoding (pages are typically GBK).
        response.encoding = response.apparent_encoding
        selector = parsel.Selector(response.text)
        overview_url = selector.xpath('//a[text()="公司概况"]/@href').get()
        if overview_url is None:
            # xpath miss (layout change or blocked request) would otherwise
            # surface as an opaque AttributeError on .split().
            raise ValueError(f'no company-overview link found for {QuoteID}')
        end_code = overview_url.split('=')[-1]
        print(f'end_code--------------------{end_code}')
        self.detail_queue.put(end_code)

# 消费者 GetRequest
# Consumer/producer: fetches the company-overview JSON for each detail code.
class GetRequest(threading.Thread):
    def __init__(self, detail_queue, parse_queue):
        """Worker thread that drains *detail_queue* and fills *parse_queue*.

        :param detail_queue: queue of detail-code strings (input)
        :param parse_queue: queue receiving decoded JSON responses (output)
        """
        threading.Thread.__init__(self)
        self.detail_queue = detail_queue
        self.parse_queue = parse_queue

    def run(self) -> None:
        # Drain the input queue; exit when it is empty.
        while True:
            try:
                # get_nowait() avoids the empty()/get() race between workers.
                detail_code = self.detail_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_queue.put(self.request_url(detail_code))
            except Exception:
                # logging.exception records the traceback, unlike the
                # original print-and-pass.
                logging.exception('request failed for code %s', detail_code)

    def request_url(self, code):
        """GET the PageAjax endpoint and return the decoded JSON body.

        :raises requests.RequestException: on network failure or timeout.
        :raises ValueError: when the body is not valid JSON.
        """
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }
        params = {
            'code': f'{code}',
        }
        # NOTE(review): verify=False disables TLS certificate checks; moot for
        # this plain-HTTP URL but flagged in case the endpoint moves to HTTPS.
        response = requests.get('http://emweb.securities.eastmoney.com/***/PageAjax', params=params,
                                headers=headers, verify=False, timeout=10).json()
        print('response--------------------------------')
        return response

# 消费者 Parsel_data
# Final consumer: extracts company-profile fields from each JSON response.
class Parsel_data(threading.Thread):
    def __init__(self, parse_queue):
        """Worker thread that drains *parse_queue* and prints parsed items.

        :param parse_queue: queue of decoded JSON responses (input)
        """
        threading.Thread.__init__(self)
        self.parse_queue = parse_queue

    def run(self) -> None:
        # Drain the input queue; exit when it is empty.
        while True:
            try:
                # get_nowait() avoids the empty()/get() race between workers.
                response = self.parse_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_response(response)
            except Exception:
                # Record the traceback instead of the original print-and-pass.
                logging.exception('Parsel_data failed to parse a response')

    @staticmethod
    def _first(response, expr):
        """Return the first jsonpath match for *expr*, or None on a miss.

        jsonpath() returns False (not an empty list) when nothing matches,
        so the original unconditional ``[0]`` raised TypeError and silently
        dropped the whole item; one missing field no longer loses the rest.
        """
        matches = jsonpath(response, expr)
        return matches[0] if matches else None

    def parse_response(self, response):
        """Build the company-profile item from *response* and print it."""
        item = {
            'companyName': self._first(response, "$..ORG_NAME"),            # company name
            'numberOfEmployees': self._first(response, "$..EMP_NUM"),       # employee count
            'companyProfile': self._first(response, "$..ORG_PROFILE"),      # profile text
            'sfcIndustry': self._first(response, "$..INDUSTRYCSRC1"),       # CSRC industry
            'dateOfEstablishment': self._first(response, "$..FOUND_DATE"),  # founding date
        }
        print(f'item---------------------------{item}')


if __name__ == '__main__':
    code_list = [
        "300***",
        "688***",
        "300***",
        "603***",
        "300***",
        "688***",
        "300***",
        "300***",
        "600***",
        "002***",
        "300***",
        "601***",
        "836***",
        "688***",
        "301***", ]
    # 创建初始url队列
    local_queue = Queue()
    # 初始化响应接口获取QuoteID的队列
    QuoteID_queue = Queue()
    # 创建需要下载的url队列
    detail_queue = Queue()
    # 创建标题队列
    parse_queue = Queue()
    for code in code_list:
        local_queue.put(code)


    # 创建一个列表去控制爬虫线程
    logging.info('正在执行GetQuoteID任务!!!')
    get_quote_id = []
    for i in range(0, 5):
        get_quote = GetQuoteID(local_queue, QuoteID_queue)
        get_quote_id.append(get_quote)
        get_quote.start()  # 启动

    for i in get_quote_id:
        i.join()  # join就是线程同步,即主线程任务结束之后,进入阻塞状态,一直等待其他的子线程执行结束之后,主线程在终止

    logging.info('正在执行GetDetail任务!!!')
    get_detail_list = []
    for i in range(0, 5):
        parse = GetDetail(QuoteID_queue, detail_queue)
        get_detail_list.append(parse)
        parse.start()

    for i in get_detail_list:
        i.join()
    # print(detail_queue.qsize())

    logging.info('正在执行GetRequest任务!!!')
    request_list = []
    # 开启五个线程
    for i in range(0, 5):
        get_request = GetRequest(detail_queue, parse_queue)
        request_list.append(get_request)
        get_request.start()

    for i in request_list:
        i.join()

    print(f'*****************************************:{parse_queue.qsize()}')
    logging.info('正在执行Parsel_data任务!!!')
    parse_list = []
    # 开启五个线程
    for i in range(0, 5):
        parse = Parsel_data(parse_queue)
        parse_list.append(parse)
        parse.start()

    for i in parse_list:
        i.join()

以上均为学习分享,可能存在不足或者还有其他更优雅的方法,欢迎评论区留言交流!