# 生产者与消费者(学习使用,请勿用于商业行为)
import logging
import threading
from queue import Empty, Queue

import parsel
import requests
from jsonpath import jsonpath
# Lower the root logger's threshold to INFO so that the progress messages
# emitted through logging.info() are not filtered out.
logging.root.setLevel(logging.INFO)
class GetQuoteID(threading.Thread):
    """Worker thread that resolves stock codes into QuoteID values.

    Codes are taken from ``local_queue``; each one is passed to
    :meth:`init_request` and the returned QuoteID is put on
    ``QuoteID_queue``.  The thread exits as soon as ``local_queue`` has no
    item immediately available.
    """

    # Passed as the ``timeout`` of every HTTP request so that a stalled
    # server cannot block the worker forever.
    REQUEST_TIMEOUT = 10

    # Request headers sent with every search call.
    HEADERS = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': 'https://www.eastmoney.com/',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def __init__(self, local_queue, QuoteID_queue):
        """Store the input queue of codes and the output queue of QuoteIDs."""
        super().__init__()
        self.local_queue = local_queue
        self.QuoteID_queue = QuoteID_queue

    def run(self) -> None:
        """Drain ``local_queue``, publishing one QuoteID per resolved code.

        A code whose lookup fails is logged and skipped; it does not stop
        the worker.
        """
        while True:
            # get_nowait() replaces the empty()/get() pair: with several
            # workers on one queue, another thread may take the last item
            # between the two calls and leave this one blocked in get().
            try:
                code = self.local_queue.get_nowait()
            except Empty:
                break
            try:
                quote_id = self.init_request(code)
            except Exception:
                # Best effort: record the failure instead of hiding it.
                logging.exception('GetQuoteID failed for code %r', code)
                continue
            self.QuoteID_queue.put(quote_id)

    def init_request(self, code):
        """Query the search API for ``code`` and return the first QuoteID.

        Args:
            code: Value sent as the ``input`` query parameter.

        Returns:
            The ``QuoteID`` of the first entry under
            ``QuotationCodeTable -> Data`` in the JSON response.

        Raises:
            Any exception raised by ``requests`` or by JSON decoding, and
            KeyError / IndexError / TypeError when the response does not
            have the expected structure.
        """
        params = {
            'input': code,
            'type': '14',
        }
        response = requests.get(
            'https://searchapi.eastmoney.com/***/get',
            params=params,
            cookies=None,
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        ).json()
        QuoteID = response['QuotationCodeTable']['Data'][0]['QuoteID']
        print(f'QuoteID-----------------------{QuoteID}')
        return QuoteID
class GetDetail(threading.Thread):
    """Worker thread that turns QuoteIDs into company-detail codes.

    QuoteIDs are taken from ``QuoteID_queue``; for each one the quote page
    is fetched and the code extracted from its "公司概况" link is put on
    ``detail_queue``.  The thread exits as soon as ``QuoteID_queue`` has no
    item immediately available.
    """

    # Passed as the ``timeout`` of every HTTP request so that a stalled
    # server cannot block the worker forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, QuoteID_queue, detail_queue):
        """Store the input queue of QuoteIDs and the output queue of codes."""
        super().__init__()
        self.QuoteID_queue = QuoteID_queue
        self.detail_queue = detail_queue

    def run(self) -> None:
        """Drain ``QuoteID_queue``, calling :meth:`get_detail_code` per item.

        A QuoteID whose processing fails is logged and skipped; it does not
        stop the worker.
        """
        while True:
            # get_nowait() replaces the empty()/get() pair: with several
            # workers on one queue, another thread may take the last item
            # between the two calls and leave this one blocked in get().
            try:
                QuoteID = self.QuoteID_queue.get_nowait()
            except Empty:
                break
            try:
                self.get_detail_code(QuoteID)
            except Exception:
                # Best effort: record the failure instead of hiding it.
                logging.exception('GetDetail failed for QuoteID %r', QuoteID)

    def get_detail_code(self, QuoteID):
        """Fetch the quote page for ``QuoteID`` and enqueue its detail code.

        The code is whatever follows the last ``=`` in the ``href`` of the
        anchor whose text is "公司概况".  When the page has no such anchor a
        warning is logged and nothing is enqueued.

        Args:
            QuoteID: Value interpolated into the quote-page URL.
        """
        response = requests.get(
            f'http://quote.eastmoney.com/***/{QuoteID}',
            timeout=self.REQUEST_TIMEOUT,
        )
        # Decode the body using the encoding detected from its content.
        response.encoding = response.apparent_encoding
        selector = parsel.Selector(response.text)
        overview_url = selector.xpath('//a[text()="公司概况"]/@href').get()
        if overview_url is None:
            # .get() returns None when the XPath matches nothing; report it
            # explicitly rather than failing on None.split().
            logging.warning('No company-overview link found for QuoteID %r', QuoteID)
            return
        end_code = overview_url.split('=')[-1]
        print(f'end_code--------------------{end_code}')
        self.detail_queue.put(end_code)
class GetRequest(threading.Thread):
    """Worker thread that downloads the company JSON for each detail code.

    Codes are taken from ``detail_queue``; the decoded JSON response for
    each one is put on ``parse_queue``.  The thread exits as soon as
    ``detail_queue`` has no item immediately available.
    """

    # Passed as the ``timeout`` of every HTTP request so that a stalled
    # server cannot block the worker forever.
    REQUEST_TIMEOUT = 10

    # Request headers sent with every PageAjax call.
    HEADERS = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    def __init__(self, detail_queue, parse_queue):
        """Store the input queue of codes and the output queue of responses."""
        super().__init__()
        self.detail_queue = detail_queue
        self.parse_queue = parse_queue

    def run(self) -> None:
        """Drain ``detail_queue``, publishing one JSON response per code.

        A code whose request fails is reported on stdout and skipped.
        """
        while True:
            # get_nowait() replaces the empty()/get() pair: with several
            # workers on one queue, another thread may take the last item
            # between the two calls and leave this one blocked in get().
            try:
                detail_code = self.detail_queue.get_nowait()
            except Empty:
                break
            try:
                response = self.request_url(detail_code)
            except Exception as e:
                print(f'!!!!!!!!!!!!!{e}!!!!!!!!!!!!!!!!!!!!')
                continue
            self.parse_queue.put(response)

    def request_url(self, code):
        """Request the PageAjax endpoint for ``code`` and return its JSON.

        Args:
            code: Value sent (as a string) in the ``code`` query parameter.

        Returns:
            The decoded JSON body of the response.

        Raises:
            Any exception raised by ``requests`` or by JSON decoding.
        """
        params = {
            'code': str(code),
        }
        # NOTE(review): verify=False turns off TLS certificate checks. The
        # URL here is plain http, so it only matters if the request ends up
        # on https (e.g. via a redirect) - confirm it is intended.
        response = requests.get(
            'http://emweb.securities.eastmoney.com/***/PageAjax',
            params=params,
            headers=self.HEADERS,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        ).json()
        print('response--------------------------------')
        return response
class Parsel_data(threading.Thread):
    """Worker thread that extracts company fields from JSON responses.

    Responses are taken from ``parse_queue`` and handed to
    :meth:`parse_response`.  The thread exits as soon as ``parse_queue``
    has no item immediately available.
    """

    # Output key -> JSONPath expression evaluated against the response.
    FIELD_PATHS = {
        'companyName': "$..ORG_NAME",
        'numberOfEmployees': "$..EMP_NUM",
        'companyProfile': "$..ORG_PROFILE",
        'sfcIndustry': "$..INDUSTRYCSRC1",
        'dateOfEstablishment': "$..FOUND_DATE",
    }

    def __init__(self, parse_queue):
        """Store the queue of responses to parse."""
        super().__init__()
        self.parse_queue = parse_queue

    def run(self) -> None:
        """Drain ``parse_queue``, parsing each response.

        A response whose parsing fails is reported on stdout and skipped.
        """
        while True:
            # get_nowait() replaces the empty()/get() pair: with several
            # workers on one queue, another thread may take the last item
            # between the two calls and leave this one blocked in get().
            try:
                response = self.parse_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_response(response)
            except Exception as e:
                print(f'Parsel_data-------{e}--------!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    def parse_response(self, response):
        """Build the item dict for ``response`` and print it.

        Each field takes the first match of its JSONPath expression.  A
        field with no match is stored as ``None`` and a warning is logged,
        so one missing field no longer discards the whole record.

        Args:
            response: Decoded JSON object to search.
        """
        item = {}
        for key, path in self.FIELD_PATHS.items():
            matches = jsonpath(response, path)
            # jsonpath() yields False (not an empty list) when nothing
            # matches, so indexing the result unguarded would fail.
            if not matches:
                logging.warning('Field %s (%s) not found in response', key, path)
                item[key] = None
            else:
                item[key] = matches[0]
        print(f'item---------------------------{item}')
def run_stage(worker_cls, *queues, worker_count=5):
    """Run one pipeline stage to completion.

    Creates and starts ``worker_count`` threads of ``worker_cls``, each
    constructed with ``queues`` as positional arguments, then blocks until
    every one of them has finished.

    Args:
        worker_cls: ``threading.Thread`` subclass to instantiate.
        *queues: Positional arguments forwarded to ``worker_cls``.
        worker_count: Number of threads to start.
    """
    workers = []
    for _ in range(worker_count):
        worker = worker_cls(*queues)
        workers.append(worker)
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    code_list = [
        "300***",
        "688***",
        "300***",
        "603***",
        "300***",
        "688***",
        "300***",
        "300***",
        "600***",
        "002***",
        "300***",
        "601***",
        "836***",
        "688***",
        "301***",
    ]

    # One queue between each pair of consecutive stages.
    local_queue = Queue()
    QuoteID_queue = Queue()
    detail_queue = Queue()
    parse_queue = Queue()

    for code in code_list:
        local_queue.put(code)

    # The stages run one after another: each stage's input queue is
    # completely filled before its workers start, which is why the workers
    # can stop as soon as they find their queue empty.
    logging.info('正在执行GetQuoteID任务!!!')
    run_stage(GetQuoteID, local_queue, QuoteID_queue)

    logging.info('正在执行GetDetail任务!!!')
    run_stage(GetDetail, QuoteID_queue, detail_queue)

    logging.info('正在执行GetRequest任务!!!')
    run_stage(GetRequest, detail_queue, parse_queue)

    print(f'*****************************************:{parse_queue.qsize()}')

    logging.info('正在执行Parsel_data任务!!!')
    run_stage(Parsel_data, parse_queue)
# 以上均为学习分享,可能存在不足或者还有其他更优雅的方法,欢迎评论区留言交流!