aiohttp+asyncio: 一种提升大量HTTP请求速度的方法 | Python

959 阅读2分钟

使用 aiohttp 和 asyncio 协程的方式,轻量地进行高并发请求

模板代码

from typing import Coroutine, List
from aiohttp import ClientSession
import asyncio

# Create one shared event loop and register it as the current loop
LOOP = asyncio.new_event_loop()
asyncio.set_event_loop(LOOP)

async def schedule(coros: List[Coroutine]):
    """
    Run all given coroutines concurrently and return their results.

    Each coroutine is wrapped in a future so it is scheduled on the running
    event loop, then the futures are awaited together. The returned list
    holds one result per coroutine, in the order they were passed in.
    """
    futures = []
    for coro in coros:
        futures.append(asyncio.ensure_future(coro))
    results = await asyncio.gather(*futures)
    return results

def execute(job: Coroutine):
    """
    Run a coroutine to completion on the shared event loop ``LOOP``.

    Args:
        job: The coroutine to run.

    Returns:
        The value returned by ``job``. An exception raised inside ``job``
        propagates to the caller.
    """
    future = asyncio.ensure_future(job, loop=LOOP)
    # Hand the result back to the caller instead of silently dropping it.
    return LOOP.run_until_complete(future)
    
# Main download coroutine
async def download():
    """
    Open one HTTP session and fetch every URL concurrently through it.
    """
    async with ClientSession() as session:
        urls = []  # get urls from somewhere
        pending = []
        for url in urls:
            # One sub_download coroutine per URL, all sharing the session
            pending.append(sub_download(url, session))
        results = await schedule(pending)
        # do something with results

# Per-URL download coroutine: the unit of concurrent work
async def sub_download(url: str, session: ClientSession):
    """
    GET ``url`` through the shared session and return the response body.
    """
    request = session.get(url)
    async with request as resp:
        body = await resp.read()
        # do something with the body
        return body


if __name__ == "__main__":
    # Script entry point: build the top-level coroutine, then run it to completion
    main_job = download()
    execute(main_job)
函数说明
asyncio.ensure_future(coro): 把一个协程对象包装成 future 对象(Task),并排入事件循环等待调度
asyncio.gather(*futures): 返回一个聚合了 futures 结果的 future。任务会在事件循环中并发调度,完成的先后顺序不一定与传入顺序一致,但返回的结果列表顺序与传入顺序相同。
LOOP.run_until_complete(future)执行 event loop 直到 future 完成,返回 future 的结果或抛出异常

关键在于让协程对象(sub_download)们同时在事件循环中调度执行(schedule),这样会(近乎)同时发出 HTTP Request,(近乎)同时等待 HTTP Server 响应和接收 HTTP Response,因此就节省了大量的时间。

案例—下载1621个英雄联盟皮肤壁纸

以下载英雄联盟所有皮肤壁纸为例

image.png

image.png

from typing import Coroutine, List
import os
from aiohttp import ClientSession
import asyncio
import json
import logging
import time
	

# Log line layout: timestamp, level name, then the message
LOGGER_FORMAT = '%(asctime)s %(levelname)s - %(message)s'
logging.basicConfig(format=LOGGER_FORMAT, level='INFO')

# Endpoint for the hero list; its JSON payload is read via the 'hero' key
HERO_LIST_URL = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
# Per-hero endpoint; the '{}' placeholder is filled with the hero id
SKINS_URL = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'

# Create one shared event loop and register it as the current loop
LOOP = asyncio.new_event_loop()
asyncio.set_event_loop(LOOP)

async def schedule(coros: List[Coroutine]):
    """
    Schedule every coroutine on the running event loop and wait for all of them.

    Returns a list with one result per coroutine, ordered like the input.
    """
    wrapped = list(map(asyncio.ensure_future, coros))
    gathered = await asyncio.gather(*wrapped)
    return gathered

def execute(job: Coroutine):
    """
    Run a coroutine to completion on the shared event loop ``LOOP``.

    Args:
        job: The coroutine to run.

    Returns:
        The value returned by ``job``. An exception raised inside ``job``
        propagates to the caller.
    """
    future = asyncio.ensure_future(job, loop=LOOP)
    # Hand the result back to the caller instead of silently dropping it.
    return LOOP.run_until_complete(future)

# Maps hero title -> number of skins downloaded; filled in by download_save_hero_skins
COUNTER = {}

async def async_download_skins():
    """
    Fetch the hero list, then download the skins of every hero concurrently.

    One HTTP session is shared by all requests. When every hero has been
    processed, the totals collected in ``COUNTER`` are logged.
    """
    async with ClientSession() as session:
        async with session.get(HERO_LIST_URL) as response:
            payload = await response.read()
            heroes = json.loads(payload)['hero']

        # One coroutine per hero; each one fans out into per-skin downloads
        jobs = []
        for hero in heroes:
            jobs.append(download_save_hero_skins(hero['heroId'], session))
        await schedule(jobs)

        hero_total = len(COUNTER.keys())
        skin_total = sum(COUNTER.values())
        logging.info("Hero total: %s, skins total: %s", hero_total, skin_total)

async def download_save_hero_skins(hero_id: int, session: ClientSession):
    """
    Download all skin images of one hero into a directory named after the hero.

    Fetches the hero's JSON document from ``SKINS_URL``, creates a directory
    called "<name> <title>" under the current directory, downloads every skin
    whose ``mainImg`` is non-empty concurrently, and finally records the
    number of downloaded skins in ``COUNTER`` under the hero's title.

    Args:
        hero_id: Identifier substituted into ``SKINS_URL``.
        session: Open HTTP session shared by all requests.
    """
    url = SKINS_URL.format(hero_id)
    async with session.get(url) as response:
        hero = json.loads(await response.read())

    hero_name = hero['hero']['name'] + ' ' + hero['hero']['title']
    # Strip '/' from the name components only, never from the joined path:
    # on POSIX '/' is the path separator, so stripping it from the full path
    # would collapse "./<hero>/<skin>.jpg" into a single file name and the
    # image would be written outside the hero's directory.
    path = os.path.join(os.curdir, hero_name.replace('/', ''))
    os.makedirs(path, exist_ok=True)
    tasks = [
        download_save_skin(
            skin['mainImg'],
            os.path.join(path, skin['name'].replace('/', '') + ".jpg"),
            session,
        )
        for skin in hero['skins']
        if skin['mainImg']  # skip skins that have no main image URL
    ]
    await schedule(tasks)
    COUNTER[hero['hero']['title']] = len(tasks)
    logging.info("Download OK: %s, %s skins", hero_name, len(tasks))

async def download_save_skin(url: str, file_path: str, session: ClientSession):
    """
    Download ``url`` through the shared session and write the bytes to ``file_path``.
    """
    request = session.get(url)
    async with request as resp:
        image_bytes = await resp.read()
        # Written while the response is still open, in binary mode
        with open(file_path, mode='wb') as out_file:
            out_file.write(image_bytes)
    
if __name__ == "__main__":
    # Work inside a dedicated output directory so the per-hero folders land there
    output_dir = "lol-skins"
    os.makedirs(output_dir, exist_ok=True)
    os.chdir(output_dir)

    # Time the whole download run and log the elapsed seconds
    started_at = time.time()
    execute(async_download_skins())
    elapsed = round(time.time() - started_at, 2)
    logging.info("Speed time: %ss", elapsed)

image.png

HTTP Request 数量: 1784 (1+162+1621)
总用时: 40.54s
平均每 Request 用时: 0.02s

发现当图片请求量大时,响应耗时会增加