会员购 Project Interview Question Walkthrough: Efficient Data Scraping and Exception Handling


The 会员购 Project

Highlights

  • Logging of key runtime information
  • Coroutine-based asynchronous scraping, which greatly speeds up data collection (a stripped-down sketch of the pattern follows this list)
  • Exception handling with a retry mechanism
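Before the full source, here is a stripped-down sketch of the pattern those three bullets describe: an asyncio.Semaphore caps concurrency, each request is retried with exponential backoff on failure, and everything is logged. The URLs and constants below are illustrative placeholders, not the real 会员购 endpoints used in the source that follows.

 import asyncio
 import logging

 import aiohttp

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')

 CONCURRENCY = 4   # at most 4 requests in flight at once
 RETRY_LIMIT = 3   # give up on a URL after 3 attempts
 URLS = [f"https://httpbin.org/get?page={i}" for i in range(1, 9)]  # placeholder URLs


 async def fetch_json(session, semaphore, url):
     # The semaphore caps how many coroutines may issue a request at the same time
     async with semaphore:
         for attempt in range(RETRY_LIMIT):
             try:
                 async with session.get(url) as response:
                     return await response.json()
             except aiohttp.ClientError:
                 logging.error(f"attempt {attempt + 1} failed for {url}", exc_info=True)
                 await asyncio.sleep(2 ** attempt)  # exponential backoff before the next try
         return None  # every attempt failed


 async def main():
     semaphore = asyncio.Semaphore(CONCURRENCY)
     async with aiohttp.ClientSession() as session:
         results = await asyncio.gather(*(fetch_json(session, semaphore, url) for url in URLS))
     logging.info(f"fetched {sum(r is not None for r in results)} of {len(URLS)} pages")


 if __name__ == '__main__':
     asyncio.run(main())

The same three pieces (Semaphore, retry loop with backoff, asyncio.gather) reappear in the full spider below, just wrapped in a class and pointed at the Bilibili API.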

Source Code

 import logging
 import time

 import requests
 import asyncio
 import aiohttp
 from aiohttp import ContentTypeError
 import csv

 # Configure logging: timestamp and level for every message
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')


 # Parse the raw result list and yield one flat record per event
 def parse_data(data):
     if data:
         for meeting in data:
             project_id = meeting['project_id']
             project_name = meeting['project_name']
             start_time = meeting['start_time']
             venue_name = meeting['venue_name']
             price_low = meeting['price_low'] / 100  # API prices appear to be in cents (分); convert to yuan
             price_high = meeting['price_high'] / 100
             yield {
                 'project_id': project_id,
                 'project_name': project_name,
                 'start_time': start_time,
                 'venue_name': venue_name,
                 'price_low': price_low,
                 'price_high': price_high
             }


 # Append one record to the CSV file named after the city id
 def save_file(city_info, city_id):
     if city_info:
         with open(f'{city_id}.csv', 'a+', newline='', encoding='utf-8') as f:
             writer = csv.writer(f)
             writer.writerow([f'{city_info["project_id"]}', f'{city_info["project_name"]}', f'{city_info["start_time"]}',
                              f'{city_info["venue_name"]}', f'{city_info["price_low"]}', f'{city_info["price_high"]}'])


 class Myspider(object):
     types_list = ['演出', '展览', '本地生活']  # categories: performances, exhibitions, local life
     cities_id_list = []
     failed_urls = []

     CONCURRENCY = 4
     RETRY_LIMIT = 3

     def __init__(self):
         self.session = None
         self.semaphore = asyncio.Semaphore(Myspider.CONCURRENCY)

     # Fetch the city IDs and store them on the class attribute
     @staticmethod
     def set_cities_id():
         headers = {
             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
         cities_data = requests.get("https://show.bilibili.com/api/ticket/city/list?channel=4", headers=headers).json()[
             'data']
         developed_cities_id = [city['id'] for city in cities_data['list']]
         developing_cities_id = [city['id'] for part in cities_data['more'] for city in part['list']]
         Myspider.cities_id_list = developed_cities_id + developing_cities_id
         return None
 ​
     # Scrape a single listing page, retrying on failure
     async def get_every_page_info(self, url):
         async with self.semaphore:
             logging.info(f"scraping {url}")
             for attempt in range(Myspider.RETRY_LIMIT):
                 try:
                     async with self.session.get(url) as response:
                         data = await response.json()
                         return data["data"]["result"]
                 except ContentTypeError:
                     # Non-JSON response: log it and retry immediately
                     logging.error(f"error occurred when scraping {url}", exc_info=True)
                 except aiohttp.ServerDisconnectedError:
                     # Handled before ClientError, since it is a ClientError subclass
                     logging.error(f"Server disconnected: {url}", exc_info=True)
                     if attempt < Myspider.RETRY_LIMIT - 1:
                         await asyncio.sleep(2 ** attempt)
                         continue
                 except aiohttp.ClientError as e:
                     logging.error(f"ClientError on {url}: {e}", exc_info=True)
                     if attempt < Myspider.RETRY_LIMIT - 1:
                         await asyncio.sleep(2 ** attempt)  # exponential backoff
                         continue
             Myspider.failed_urls.append(url)
             return None  # give up after all retry attempts fail

     # Get the max page count for this category in this city (synchronous requests call)
     def get_max_page(self, url):
         headers = {
             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
         response = requests.get(url, headers=headers)
         data = response.json()
         return data["data"]["numPages"]

     # Main method: build the task list and scrape with up to 4 concurrent coroutines
     async def main(self):
         headers = {
             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
         # Initialize the session (headers go here; proxies, cookies, etc. could also be attached)
         async with aiohttp.ClientSession(headers=headers) as session:
             self.session = session
             for type in Myspider.types_list:
                 for city_id in Myspider.cities_id_list:
                     begin_url = "https://show.bilibili.com/api/ticket/project/listV2?version=134&page=1&pagesize=16&area={}&filter=&platform=web&p_type={}".format(
                         city_id, type)
                     max_page = self.get_max_page(begin_url)
                     # Build the list of page-scraping tasks
                     scrapy_tasks = [self.get_every_page_info(
                         "https://show.bilibili.com/api/ticket/project/listV2?version=134&page={}&pagesize=16&area={}&filter=&platform=web&p_type={}".format(
                             page, city_id, type)) for page in range(1, max_page + 1)]
                     # Run the tasks concurrently and collect the results
                     scrapy_results = await asyncio.gather(*scrapy_tasks)
                     # Parse the result data
                     for result in scrapy_results:
                         data = parse_data(result)
                         for city_info in data:
                             print(city_info)
                             save_file(city_info, city_id)
             # Close the session explicitly (the async with block would also close it on exit)
             await self.session.close()


 if __name__ == '__main__':
     # Record the start time
     start_time = time.time()
     # Fetch the city IDs and set the class attribute cities_id_list
     Myspider.set_cities_id()
     # Initialize the spider
     spider = Myspider()
     # Run the main coroutine (asyncio.run replaces the deprecated get_event_loop pattern)
     asyncio.run(spider.main())
     # Record the end time and log the total duration
     end_time = time.time()
     logging.info(f"total_time: {end_time - start_time}")

     # print(spider.get_max_page('https://show.bilibili.com/api/ticket/project/listV2?version=134&page=1&pagesize=16&area=110100&filter=&platform=web&p_type=%E5%85%A8%E9%83%A8%E7%B1%BB%E5%9E%8B'))
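One thing the script above does not do is revisit the pages it gave up on: Myspider.failed_urls keeps every URL that exhausted its retries, but is never read again. A possible follow-up pass is sketched below (an illustrative addition, not part of the original script); it reuses get_every_page_info and recovers the city id from the URL's area query parameter, which is how the listV2 URLs above are built.

 from urllib.parse import urlparse, parse_qs


 async def retry_failed(spider):
     # Re-scrape the URLs that failed during the first run; anything that fails
     # again simply re-accumulates in Myspider.failed_urls for a later attempt.
     pending = list(Myspider.failed_urls)
     Myspider.failed_urls.clear()
     headers = {'user-agent': 'Mozilla/5.0'}  # placeholder UA; the original uses a full browser string
     async with aiohttp.ClientSession(headers=headers) as session:
         spider.session = session
         for url in pending:
             result = await spider.get_every_page_info(url)
             if result is None:
                 continue
             city_id = parse_qs(urlparse(url).query)['area'][0]
             for city_info in parse_data(result):
                 save_file(city_info, city_id)


 # Example: asyncio.run(retry_failed(spider)) after the first pass completes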
 ​

For more polished content, follow the WeChat official account: [CodeRealm]
