aiohttp+asyncio: 一种提升大量HTTP请求速度的方法 | Python 使用 aiohttp 和 asyncio 进行高并发请求

使用 aiohttp 和 asyncio 协程的方式，轻量地进行高并发请求

模板代码

from typing import Coroutine, List
from aiohttp import ClientSession
import asyncio

# Shared event loop: create a fresh loop and install it as the current loop.
# execute() schedules and runs coroutines on this loop.
LOOP = asyncio.new_event_loop()
asyncio.set_event_loop(LOOP)

async def schedule(coros: List[Coroutine]):
    """
    Run every coroutine in ``coros`` concurrently and collect their results.

    Each coroutine is first wrapped with ``asyncio.ensure_future`` so that all
    of them are scheduled on the running event loop, then the whole batch is
    awaited through ``asyncio.gather``.

    Returns the list produced by ``asyncio.gather``; its entries follow the
    order of ``coros``. An empty ``coros`` yields an empty list.
    """
    pending = []
    for coro in coros:
        pending.append(asyncio.ensure_future(coro))
    results = await asyncio.gather(*pending)
    return results

def execute(job: Coroutine):
    """
    Run the coroutine ``job`` to completion on the shared ``LOOP``.

    Blocks the caller until ``job`` has finished. Returns whatever ``job``
    returned; an exception raised inside ``job`` propagates to the caller.
    """
    # Bind the coroutine to LOOP explicitly instead of relying on whichever
    # loop happens to be current.
    future = asyncio.ensure_future(job, loop=LOOP)
    # Hand the result back to the caller rather than discarding it.
    return LOOP.run_until_complete(future)
    
# Main download coroutine
async def download():
    """
    Template entry coroutine: open one ClientSession and fetch every URL
    concurrently through ``sub_download``.

    The URL list is a placeholder to be filled in by the user, as is the
    handling of the collected results.
    """
    urls = []  # get urls from somewhere
    async with ClientSession() as session:
        # One coroutine per URL, all sharing the same session.
        jobs = [sub_download(url, session) for url in urls]
        results = await schedule(jobs)
        # do something with results

# Secondary download coroutine: the unit of concurrent work
async def sub_download(url: str, session: ClientSession):
    """
    GET ``url`` through ``session`` and return the full response body.

    The body is read with ``response.read()`` and returned as-is.
    """
    async with session.get(url) as resp:
        payload = await resp.read()
        # do something with payload
        return payload


# Script entry point: run the top-level download coroutine to completion.
if __name__ == '__main__':
    execute(download())

函数	说明
`asyncio.ensure_future(coro)`	把一个协程对象包装成 Task（Future 的子类），并安排它在事件循环中执行
`asyncio.gather(*futures)`	返回一个聚合了 futures 结果的 future。各任务在事件循环中并发调度，完成的先后顺序不确定，但结果列表的顺序与传入的顺序一致。
`LOOP.run_until_complete(future)`	执行 event loop 直到 future 完成，返回 future 的结果或抛出异常

关键在于让协程对象（sub_download）们同时在事件循环中调度执行（schedule），这样会（近乎）同时发出 HTTP Request，（近乎）同时等待 HTTP Server 响应和接收 HTTP Response，因此就节省了大量的时间。

案例—下载1621个英雄联盟皮肤壁纸

以下载英雄联盟所有皮肤壁纸为例

from typing import Coroutine, List
import os
from aiohttp import ClientSession
import asyncio
import json
import logging
import time
	

# Log line layout: timestamp, level name, then the message.
LOGGER_FORMAT = '%(asctime)s %(levelname)s - %(message)s'
logging.basicConfig(format=LOGGER_FORMAT, level='INFO')

# Fetched first; its JSON body carries the hero list under the 'hero' key.
HERO_LIST_URL = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
# Per-hero endpoint template; '{}' is filled with a hero id.
SKINS_URL = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'

# Shared event loop, installed as the current loop; execute() runs on it.
LOOP = asyncio.new_event_loop()
asyncio.set_event_loop(LOOP)

async def schedule(coros: List[Coroutine]):
    """
    Run every coroutine in ``coros`` concurrently and collect their results.

    Each coroutine is first wrapped with ``asyncio.ensure_future`` so that all
    of them are scheduled on the running event loop, then the whole batch is
    awaited through ``asyncio.gather``.

    Returns the list produced by ``asyncio.gather``; its entries follow the
    order of ``coros``. An empty ``coros`` yields an empty list.
    """
    pending = []
    for coro in coros:
        pending.append(asyncio.ensure_future(coro))
    results = await asyncio.gather(*pending)
    return results

def execute(job: Coroutine):
    """
    Run the coroutine ``job`` to completion on the shared ``LOOP``.

    Blocks the caller until ``job`` has finished. Returns whatever ``job``
    returned; an exception raised inside ``job`` propagates to the caller.
    """
    # Bind the coroutine to LOOP explicitly instead of relying on whichever
    # loop happens to be current.
    future = asyncio.ensure_future(job, loop=LOOP)
    # Hand the result back to the caller rather than discarding it.
    return LOOP.run_until_complete(future)

# Per-hero tally: hero title -> number of skin downloads scheduled for that hero.
COUNTER = {}

async def async_download_skins():
    """
    Fetch the hero list, then download the skins of every hero concurrently.

    A single ClientSession is shared by all requests. After all per-hero
    downloads finish, the totals accumulated in ``COUNTER`` are logged.
    """
    async with ClientSession() as session:
        async with session.get(HERO_LIST_URL) as response:
            payload = await response.read()
        heroes = json.loads(payload)['hero']

        # One coroutine per hero; each one fans out into per-skin downloads.
        jobs = []
        for hero in heroes:
            jobs.append(download_save_hero_skins(hero['heroId'], session))
        await schedule(jobs)

        hero_total = len(COUNTER)
        skin_total = sum(COUNTER.values())
        logging.info("Hero total: %s, skins total: %s", hero_total, skin_total)

async def download_save_hero_skins(hero_id: int, session: ClientSession):
    """
    Download all skin images of one hero into a directory named after the hero.

    Fetches the hero's JSON from ``SKINS_URL``, creates the directory
    ``<name> <title>`` under the current working directory, downloads every
    skin that has a non-empty ``mainImg`` concurrently, and records the number
    of downloads in ``COUNTER`` under the hero's title.

    hero_id: value substituted into ``SKINS_URL``.
    session: aiohttp session shared by all requests.
    """
    url = SKINS_URL.format(hero_id)
    async with session.get(url) as response:
        hero = json.loads(await response.read())

    hero_name = hero['hero']['name'] + ' ' + hero['hero']['title']
    path = os.path.join(os.curdir, hero_name)
    os.makedirs(path, exist_ok=True)
    tasks = []
    for skin in hero['skins']:
        # Skins without a main image have nothing to download.
        if skin['mainImg']:
            # Strip '/' from the skin name only, before joining. Stripping it
            # from the joined path would also remove the directory separators
            # on POSIX and drop the file outside the hero directory.
            file_name = skin['name'].replace('/', '') + ".jpg"
            file_path = os.path.join(path, file_name)
            task = download_save_skin(skin['mainImg'], file_path, session)
            tasks.append(task)
    await schedule(tasks)
    COUNTER[hero['hero']['title']] = len(tasks)
    logging.info("Download OK: %s, %s skins", hero_name, len(tasks))

async def download_save_skin(url: str, file_path: str, session: ClientSession):
    """
    Download one image from ``url`` and write its bytes to ``file_path``.

    The file is opened in binary write mode, so an existing file at
    ``file_path`` is overwritten.
    """
    async with session.get(url) as resp:
        image_bytes = await resp.read()
        with open(file_path, 'wb') as out_file:
            out_file.write(image_bytes)
    
if __name__ == '__main__':

    # Collect everything under ./lol-skins; the per-hero directories are
    # created relative to the current working directory.
    path = "lol-skins"
    os.makedirs(path, exist_ok=True)
    os.chdir(path)

    # perf_counter() is monotonic, so the measured duration is not distorted
    # by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    execute(async_download_skins())
    elapsed = time.perf_counter() - start
    logging.info("Elapsed time: %ss", round(elapsed, 2))

HTTP Request 数量: 1784 (1+162+1621)
总用时: 40.54s
平均每 Request 用时: 0.02s

发现当图片请求量大时，响应耗时会增加