爬取知道问答策略
采用广度优先策略
采用leek分布式爬取框架
安装依赖包
pip install requests
pip install bs4
pip install py-log
pip install retrying
pip install leek
示例代码
import re
import time
import traceback
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
from leek import get_consumer
from py_log import get_logger
from retrying import retry
# Module-level logger shared by every function in this spider.
logger = get_logger('baidu_zhidao_spider')
def get_list_data(url):
    """Scrape one Baidu Zhidao search-result list page and log each record.

    Args:
        url: Full search-result page URL (one page holds up to 10 questions).

    Side effects:
        Logs one dict per question (answer time, answering user, answer
        count, detail URL, crawl timestamp in ms). Any failure is logged
        with a traceback instead of propagating, so the leek consumer
        thread that invoked us keeps running.
    """
    try:
        wb_data = get_list_source_html(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('a.ti')
        answer_times = soup.select('dd.dd.explain.f-light > span:nth-of-type(1)')
        answer_users = soup.select('dd.dd.explain.f-light > span:nth-of-type(2) > a')
        answers = soup.select('dd.dd.explain.f-light > span:nth-of-type(3) > a')
        # zip() truncates to the shortest selection, so rows missing any
        # of the four fields are silently skipped.
        for title, answer_time, answer_user, answer in zip(titles, answer_times, answer_users, answers):
            data = {'answer_time': answer_time.get_text(),
                    'answer_user': answer_user.get_text(),
                    'answers': answer.get_text().replace('个回答', ''),
                    'answer_detail_url': title['href'],
                    'create_time': int(time.time() * 1000)}
            logger.info(data)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; everything else is logged and swallowed.
        logger.error(traceback.format_exc())
def push_list_task(keywords=('景德镇',)):
    """Crawl every result page for each search keyword.

    Args:
        keywords: Iterable of search keywords. The default is now an
            immutable tuple — the original mutable-list default was a
            shared-state hazard. An empty/falsy value is a no-op, as before.

    Behavior:
        For each keyword, fetches the total result count, then walks all
        pages (10 results per page) through get_list_data(). On any
        failure the first-page URL (when already built) is re-published
        to the consumer queue for retry and the traceback is logged.
    """
    if not keywords:
        return
    for keyword in keywords:
        url_first = None  # pre-bind so the except-branch never hits an unbound name
        try:
            encoded = quote(keyword)
            url_first = f'https://zhidao.baidu.com/search?lm=0&rn=10&pn=10&fr=search&ie=gbk&word={encoded}'
            # `pn` is the result offset; total_count / 10 pages of 10 results.
            pagesize = int(get_totalcount_by_keyword(url_first) / 10)
            for page in range(0, pagesize + 1):
                url = f'https://zhidao.baidu.com/search?lm=0&rn=10&pn={page * 10}&fr=search&ie=gbk&word={encoded}'
                get_list_data(url)
        except Exception:
            # Narrowed from a bare `except:`. Re-queue the task only when
            # the URL was actually built (original code raised NameError here
            # if the failure happened before url_first was assigned).
            if url_first is not None:
                list_consumer.task_publisher.pub(url_first)
            logger.error(traceback.format_exc())
@retry(stop_max_attempt_number=5)
def get_totalcount_by_keyword(url):
    """Return the total result count for a search-result URL.

    Parses the "last page" pager link on the page and extracts its ``pn``
    offset, which Baidu uses as the total number of results.

    Raises (each triggers @retry, up to 5 attempts):
        IndexError: no ``a.pager-last`` link found (e.g. anti-bot page).
        AttributeError: the pager href carries no ``pn`` parameter.
    """
    wb_data = get_list_source_html(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    last_url = soup.select('a.pager-last')[0]['href']
    # Raw string fixes the invalid-escape-sequence warning for \d.
    total_count = re.match(r".*pn=(\d+)", last_url).group(1)
    # Log label corrected: the value logged is the URL, not the keyword.
    logger.info(f'url:{url},total_count:{total_count}')
    return int(total_count)
def get_list_source_html(url):
    """Download one Baidu Zhidao page and return the raw response.

    The site serves GBK-encoded pages, so the response encoding is forced
    before callers read ``.text``.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The ``requests.Response``, with ``encoding`` set to ``'gbk'``.
    """
    logger.info(url)
    # NOTE(review): the Cookie below is a captured browser session and will
    # eventually expire — refresh it if Baidu starts serving anti-bot pages.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Cookie': 'BAIDUID=18581A93B7660CCE2EF45F5C9AF78D5D:FG=1; __yjs_duid=1_5a656be8e909a5ab1dc8e6c51a171ff51620382781035; BIDUPSID=18581A93B7660CCE2EF45F5C9AF78D5D; PSTM=1620390429; BDSFRCVID_BFESS=a1POJeC62mInKaverYnGo6QQ_RiR2Y7TH6ao_I6yuPDV6U_ZuVjhEG0PVf8g0KA-hILjogKK3gOTH4PF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=JbPtoD_yfIvafJjYhKTMbJt3-xrHKI62aKDsQI3gBhcqEIL45fJa3-tpbq5xt-juJTQr2prSypT8fxbSj4Qo2x4EQUbyQT3Q5KoaafjSBp5nhMJ_3j7JDMP0qfAjhlcy523iob3vQpPMVhQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0D6OLDGA8t6ks-DTbB4oHK--_qnTz-4L_5-_e-xQyetJyaR3EWhRvWJ5TMCoz-UoUjqD8K4JmKUuJMeJrBhRgQxKbShPC-tPaj5LFK4oehx7n365EBIO13l02Vbc9e-t2yU_VKHjut4RMW20e0h7mWIbmsxA45J7cM4IseboJLfT-0bc4KKJxbnLWeIJEjjCKj6OyjHLjJTPj-6TKWDP8Kb7VbPTuyUnkbfJBDl5QyMbltKTHKKQXBCb08hR2bPoAeM47yajK25o72TcdK43JbJ7is-8wy5rpQT8rbtFOK5OibCrWBPOTab3vOIOTXpO1jh8zBN5thURB2DkO-4bCWJ5TMl5jDh3Mb6ksD-Ftqj_OtJ-foDKbf-OKJt5kq4bohjPw5mc9BtQmJJu80JRgHRrqqRbT5xFM2RFBDlQNa-0fQg-q3R79fR6-SfoLbJba-JFT-n370x-jLnbOVn0MW-5DVp3gMtnJyUnybPnnBT3T3H8HL4nv2JcJbM5m3x6qLTKkQN3T-PKO5bRu_CF5JIPMbD0menJb5ICq5f7-etJXfKCDbp7F5l8-hR6VhxRH54LXWfTx255ILGONBx_-yC3xOKQphpnzjRF00-OptqQdyjTpBJnN3KJmSRL9bT3vLDuJytvB2-biWb7M2MbdJUJP_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJhhCGe4bK-TryDHtfJfK; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=34099_31254_33848_33773_33607; delPer=0; BAIDUID_BFESS=EC0B8CB2E5DB2C252FE4F1A4011CDAD8:FG=1; PSINO=2; BA_HECTOR=8ka520a1812g25a0u31gc1pgs0r; ZD_ENTRY=empty; Hm_lvt_6859ce5aaf00fb00387e6434e4fcc925=1621172136,1623254974; Hm_lpvt_6859ce5aaf00fb00387e6434e4fcc925=1623254974; shitong_key_id=2; ab_sr=1.0.1_OTBiZTEzY2RmMWM3MWM1MWNkMDE4MzIwNjllOTJkYmY2M2FiN2UyZGFmYzJjMzllY2FhY2MwMTBjZDI5NzAzZDAxYzVlMzQ2Y2I2YTQyMTkyMWIxNmRhOWUxMTg3ZWU5MGY4Zjc2MmYyOGU0YjIxZmM1ZWMwNzY0ZWYzYzk0MDc1YjFmMTI3ODUyZjg2NmYyNjQ2MDU3MjRjYjg1NjU2Mg==; shitong_data=7d3434b10335de90105aee880b6976138ba670eb5ab5dfec1ec7f6fdead89d6526d65dacb8a6ec84de87e8a3bf8d823c50a8175b16a3745ff826feb07592236f5b646786f633ff511100475de3f136c051560bb904a45f5b5fab2e3376f81eee20967d70e060f03de9793416fa71e0f37a7b315109d2dbfad098fad82b5313db; shitong_sign=86830a0e'
    }
    # requests.get is the idiomatic shorthand for requests.request("GET", ...).
    response = requests.get(url, headers=request_headers, timeout=10)
    response.encoding = 'gbk'
    return response
# Distributed consumer: 20 threads pull list-page URLs from the
# 'zhidao:questions:list' queue and feed them to get_list_data,
# throttled to 15 requests per second.
list_consumer = get_consumer('zhidao:questions:list', consuming_function=get_list_data, threads_num=20, qps=15)
if __name__ == '__main__':
    # Crawl the default keyword(s) first, then start consuming any URLs
    # that were re-published to the queue for retry.
    push_list_task()
    list_consumer.start()
框架参数说明
本框架默认使用redis作为中间件(若未安装redis,可设置middleware='sqlite'参数,使用sqlite作为中间件)
get_consumer('zhidao:questions:list', consuming_function=get_list_data, threads_num=20, qps=15)
1.zhidao:questions:list 发布消费任务使用到的队列名称
2.threads_num=20 并发爬取设置的线程数
3.qps=15 设置的每秒最大并发请求数,防止请求过大,导致爬取站点宕机
依赖分布式框架项目参数说明文档
leek使用文档