# 东方财富之队列与线程池搭配(学习使用,请勿用于商业行为)
import concurrent.futures
import logging
import threading
import time
import requests
import parsel
import queue
from jsonpath import jsonpath
# Set the root logger's level to INFO.
logging.getLogger().setLevel(logging.INFO)
# Create a module-level lock; run() holds it while printing its result.
lock = threading.Lock()
def init_request(code, timeout=10):
    """Resolve a stock code to the code suffix used by the company-overview link.

    Sends ``code`` to the search API, takes the ``QuoteID`` of the first hit,
    downloads the matching quote page and returns the text following the last
    ``=`` in the href of the "公司概况" link.

    Args:
        code: Stock code sent as the ``input`` query parameter.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The substring after the final ``=`` of the overview link's href.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the search yields no hit or the quote page contains
            no "公司概况" link.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': 'https://www.eastmoney.com/',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    params = {
        'input': code,
        'type': '14',
    }

    # Step 1: look the code up in the search API.
    search_resp = requests.get(
        'https://searchapi.eastmoney.com/*****/get',
        params=params,
        headers=headers,
        timeout=timeout,  # without a timeout a stalled server blocks the worker forever
    )
    search_resp.raise_for_status()
    search_result = search_resp.json()

    hits = (search_result.get('QuotationCodeTable') or {}).get('Data') or []
    if not hits:
        raise ValueError(f'No search result for code {code!r}')
    quote_id = hits[0]['QuoteID']

    # Step 2: fetch the quote page and pull the overview link out of its HTML.
    page_resp = requests.get(
        f'http://quote.eastmoney.com/****/{quote_id}',
        timeout=timeout,
    )
    page_resp.raise_for_status()
    selector = parsel.Selector(page_resp.text)
    overview_url = selector.xpath('//a[text()="公司概况"]/@href').get()
    if overview_url is None:
        raise ValueError(f'No company-overview link found for code {code!r}')

    # The wanted code is whatever follows the last "=" in the href.
    return overview_url.split('=')[-1]
def request_url(code, timeout=10):
    """Fetch the company-overview JSON for ``code``.

    Args:
        code: Value sent as the ``code`` query parameter (converted to ``str``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    params = {
        'code': str(code),
    }
    response = requests.get(
        'http://emweb.securities.eastmoney.com/******/PageAjax',
        params=params,
        headers=headers,
        # NOTE(review): verify=False disables TLS certificate checks should the
        # request ever be redirected to https; kept as in the original — confirm
        # whether it is still needed.
        verify=False,
        timeout=timeout,  # without a timeout a stalled server blocks the worker forever
    )
    response.raise_for_status()
    return response.json()
def parse_response(response):
    """Extract the company fields of interest from the overview JSON.

    Each field is looked up anywhere in ``response`` with a recursive
    JSONPath expression and the first match is kept.

    Args:
        response: Decoded JSON object returned by the overview endpoint.

    Returns:
        A dict with the keys ``companyName``, ``numberOfEmployees``,
        ``companyProfile``, ``sfcIndustry`` and ``dateOfEstablishment``.
        A field that is absent from ``response`` is stored as ``None``.
    """
    field_paths = {
        'companyName': '$..ORG_NAME',               # company name
        'numberOfEmployees': '$..EMP_NUM',          # number of employees
        'companyProfile': '$..ORG_PROFILE',         # company profile
        'sfcIndustry': '$..INDUSTRYCSRC1',          # CSRC industry classification
        'dateOfEstablishment': '$..FOUND_DATE',     # date of establishment
    }
    item = {}
    for key, path in field_paths.items():
        matches = jsonpath(response, path)
        # NOTE(review): the jsonpath package is understood to return False
        # (not an empty list) when nothing matches — confirm for the installed
        # version. Indexing that result would raise TypeError, so fall back
        # to None instead of aborting the whole record.
        item[key] = matches[0] if matches else None
    return item
def run(code):
    """Run the full pipeline for one stock code and print the result.

    Resolves ``code`` via ``init_request``, downloads the overview JSON with
    ``request_url``, extracts the fields with ``parse_response``, adds the
    resolved code under ``end_code`` and prints the resulting dict.

    Args:
        code: Stock code to process.

    Returns:
        None. The item is written to stdout.
    """
    end_code = init_request(code)
    response = request_url(end_code)
    item = parse_response(response)
    item['end_code'] = end_code
    # Hold the module-level lock while printing; the context manager releases
    # it even if print() raises, which a bare acquire()/release() pair would not.
    with lock:
        print(item)
if __name__ == '__main__':
    # Script entry point: load the stock codes into a bounded queue, drain the
    # queue into a thread pool, report failed tasks, and print the elapsed time.
    code_list = [
        "300***",
        "688***",
        "300***",
        "603***",
        "300***",
        "688***",
        "300***",
        "300***",
        "600***",
        "002***",
        "300***",
        "601***",
        "836***",
        "688***",
        "301***",
    ]
    start = time.time()

    # maxsize=30: the queue holds at most 30 items; omit the argument for an
    # unbounded queue.
    task_queue = queue.Queue(30)
    for position, code in enumerate(code_list):
        if task_queue.full():
            # Make the truncation visible instead of dropping codes silently.
            logging.warning(
                'Task queue is full; skipping %d remaining code(s)',
                len(code_list) - position,
            )
            break
        task_queue.put(code)

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Remember which code each future belongs to so failures can be reported.
        futures = {}
        while not task_queue.empty():
            code = task_queue.get()
            futures[executor.submit(run, code)] = code

        # An exception raised inside run() is stored on its future; unless it
        # is inspected here, the failure would go completely unnoticed.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                logging.error('Task for code %s failed: %r', futures[future], error)

    end = time.time()
    print(f'总用时:{end-start}!')
# 以上内容仅为学习分享,可能存在不足,也可能有更优雅的写法,欢迎在评论区留言交流!