Python Async Crawlers in Practice: High-Concurrency Data Collection with aiohttp + asyncio


Abstract: When you need to crawl tens of thousands of pages, a synchronous crawler is painfully slow. This article builds an async crawler with aiohttp + asyncio that comfortably reaches hundreds of requests per second, and works through the practical problems of rate limiting, error handling, and data persistence.

Sync vs. Async: How Big Is the Gap?

Let's start with a direct comparison: fetching 100 pages.

# Synchronous: sequential requests, ~30s for 100 pages
import requests
import time

urls = ['https://httpbin.org/delay/0.3' for _ in range(100)]

start = time.time()
for url in urls:
    requests.get(url)
print(f'Sync elapsed: {time.time() - start:.1f}s')  # ~30s

# Asynchronous: concurrent requests, ~1s for 100 pages
import aiohttp
import asyncio
import time

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    urls = ['https://httpbin.org/delay/0.3' for _ in range(100)]
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        results = await asyncio.gather(*tasks)
    return results

start = time.time()
asyncio.run(main())
print(f'Async elapsed: {time.time() - start:.1f}s')  # ~1s

A roughly 30x speedup. Both versions spend about 0.3s waiting on each response; the async one simply overlaps those waits on a single thread instead of paying for them one after another. That is the power of async I/O.

aiohttp Basics

Installation:

pip install aiohttp

aiohttp's ClientSession is the counterpart of requests' Session: it manages the connection pool and keeps cookies across requests:

import aiohttp
import asyncio

async def main():
    # Custom connection pool
    connector = aiohttp.TCPConnector(
        limit=100,          # max total connections
        limit_per_host=10,  # max connections per host
        ttl_dns_cache=300,  # DNS cache TTL (seconds)
    )

    timeout = aiohttp.ClientTimeout(
        total=30,       # total timeout
        connect=5,      # connect timeout
        sock_read=10,   # socket read timeout
    )
    
    async with aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={'User-Agent': 'Mozilla/5.0'},
    ) as session:
        async with session.get('https://example.com') as resp:
            print(resp.status)
            html = await resp.text()

asyncio.run(main())
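
Beyond plain GETs, the same session handles query parameters and JSON payloads, and any cookies set by the server are stored in its cookie jar and sent automatically on later requests. A minimal sketch (the httpbin.org endpoints are used purely for illustration):

import aiohttp
import asyncio

async def demo():
    async with aiohttp.ClientSession() as session:
        # GET with query parameters, parsing the JSON body
        async with session.get('https://httpbin.org/get', params={'page': 1}) as resp:
            data = await resp.json()
            print(data['args'])

        # POST with a JSON payload
        async with session.post('https://httpbin.org/post', json={'title': 'hello'}) as resp:
            print(resp.status)

        # Cookies returned by the server live in the session's cookie jar
        print(len(session.cookie_jar))

asyncio.run(demo())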

Controlling Concurrency with Semaphore

Firing off unlimited concurrent requests can overwhelm the target server and is a quick way to trigger anti-scraping defenses. Use asyncio.Semaphore to cap concurrency:

import aiohttp
import asyncio

class AsyncCrawler:
    def __init__(self, concurrency=20):
        self.semaphore = asyncio.Semaphore(concurrency)
        self.session = None
    
    async def fetch(self, url):
        async with self.semaphore:  # cap the number of in-flight requests
            try:
                async with self.session.get(url) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    else:
                        print(f'[{resp.status}] {url}')
                        return None
            except Exception as e:
                print(f'[Error] {url}: {e}')
                return None
    
    async def run(self, urls):
        connector = aiohttp.TCPConnector(limit=50)
        async with aiohttp.ClientSession(connector=connector) as session:
            self.session = session
            tasks = [self.fetch(url) for url in urls]
            return await asyncio.gather(*tasks)

# Usage: at most 20 requests in flight at any time
urls = [f'https://httpbin.org/delay/0.3?i={i}' for i in range(100)]  # sample URL list
crawler = AsyncCrawler(concurrency=20)
results = asyncio.run(crawler.run(urls))

Requests with Retries

Network requests never succeed 100% of the time, so add a retry mechanism:

async def fetch_with_retry(self, url, max_retries=3):
    for attempt in range(max_retries):
        try:
            async with self.semaphore:
                async with self.session.get(url) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    elif resp.status == 429:  # rate limited by the server
                        wait = 2 ** attempt
                        print(f'Rate limited, waiting {wait}s...')
                        await asyncio.sleep(wait)
                        continue
                    else:
                        return None
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            if attempt < max_retries - 1:
                await asyncio.sleep(1)
                continue
            print(f'Failed after {max_retries} retries: {url}')
            return None
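
fetch_with_retry is meant to live on the AsyncCrawler class above (it relies on self.semaphore and self.session). Wiring it in only changes the task list in run(); a minimal sketch:

async def run(self, urls):
    connector = aiohttp.TCPConnector(limit=50)
    async with aiohttp.ClientSession(connector=connector) as session:
        self.session = session
        # schedule the retrying variant instead of the plain fetch()
        tasks = [self.fetch_with_retry(url) for url in urls]
        return await asyncio.gather(*tasks)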

In Practice: Batch-Crawling an Article List

A complete example: crawling a tech blog's article list.

import aiohttp
import asyncio
import json
from datetime import datetime

class ArticleCrawler:
    def __init__(self, concurrency=10):
        self.sem = asyncio.Semaphore(concurrency)
        self.results = []
    
    async def fetch_page(self, session, page):
        url = f'https://api.example.com/articles?page={page}&size=20'
        async with self.sem:
            try:
                async with session.get(url) as resp:
                    data = await resp.json()
                    articles = data.get('items', [])
                    self.results.extend(articles)
                    print(f'Page {page}: got {len(articles)} articles')
            except Exception as e:
                print(f'Page {page} failed: {e}')
    
    async def run(self, total_pages):
        connector = aiohttp.TCPConnector(limit=20)
        timeout = aiohttp.ClientTimeout(total=30)
        
        async with aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
        ) as session:
            tasks = [
                self.fetch_page(session, page)
                for page in range(1, total_pages + 1)
            ]
            await asyncio.gather(*tasks)
        
        # Save the results to disk
        output = {
            'crawl_time': datetime.now().isoformat(),
            'total': len(self.results),
            'articles': self.results,
        }
        with open('articles.json', 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)
        
        print(f'Done! Total: {len(self.results)} articles')

# Crawl 100 pages
crawler = ArticleCrawler(concurrency=10)
asyncio.run(crawler.run(100))

Going Further: the Producer-Consumer Pattern

For more complex scenarios (for example, crawl list pages first and then detail pages), use asyncio.Queue to build a producer-consumer pipeline:

import aiohttp
import asyncio

async def producer(queue, session):
    """Producer: crawl the list pages and push detail URLs onto the queue."""
    for page in range(1, 101):
        async with session.get(f'https://api.example.com/list?page={page}') as resp:
            data = await resp.json()
            for item in data['items']:
                await queue.put(item['detail_url'])

    # Send one shutdown sentinel per consumer
    for _ in range(5):  # 5 consumers
        await queue.put(None)

async def consumer(queue, session, results):
    """Consumer: pull URLs off the queue and crawl the detail pages."""
    while True:
        url = await queue.get()
        if url is None:  # shutdown sentinel
            break
        try:
            async with session.get(url) as resp:
                data = await resp.json()
                results.append(data)
        except Exception as e:
            print(f'Error: {e}')
        queue.task_done()

async def main():
    queue = asyncio.Queue(maxsize=100)
    results = []
    
    async with aiohttp.ClientSession() as session:
        consumers = [
            asyncio.create_task(consumer(queue, session, results))
            for _ in range(5)
        ]
        await producer(queue, session)
        await asyncio.gather(*consumers)
    
    print(f'Total: {len(results)}')

asyncio.run(main())
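
An alternative to the None sentinels is to have main() wait on queue.join() and then cancel the consumers, mirroring the example in the asyncio Queue documentation. A sketch of that variant (producer and consumer unchanged, except the producer no longer puts the sentinels):

async def main():
    queue = asyncio.Queue(maxsize=100)
    results = []

    async with aiohttp.ClientSession() as session:
        consumers = [
            asyncio.create_task(consumer(queue, session, results))
            for _ in range(5)
        ]
        await producer(queue, session)  # producer without the sentinel loop
        await queue.join()              # block until every queued URL is marked done
        for task in consumers:
            task.cancel()               # consumers are idle now; cancel them
        await asyncio.gather(*consumers, return_exceptions=True)

    print(f'Total: {len(results)}')

asyncio.run(main())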

Common Pitfalls

  1. Never call time.sleep() inside a coroutine; it blocks the whole event loop. Use await asyncio.sleep() instead
  2. Don't create a ClientSession outside a coroutine; it has to be initialized inside a running event loop
  3. Always close the session; the simplest way is async with
  4. On Windows, asyncio may raise errors; add asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) (see the template after this list)
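
A small template that bakes in pitfalls 1 through 4: the session is created inside the coroutine and closed by async with, sleeping goes through asyncio.sleep, and the Windows event-loop policy is set before asyncio.run():

import sys
import asyncio
import aiohttp

async def main():
    # Pitfalls 2/3: create the session inside the event loop, close it via async with
    async with aiohttp.ClientSession() as session:
        async with session.get('https://example.com') as resp:
            print(resp.status)
        await asyncio.sleep(1)  # Pitfall 1: asyncio.sleep, never time.sleep

if __name__ == '__main__':
    # Pitfall 4: some aiohttp setups need the selector event loop on Windows
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())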

Summary

  • An async crawler is 10-50x faster than a synchronous one, which makes it a good fit for large-scale data collection
  • Use a Semaphore to cap concurrency and avoid overwhelming the target server
  • Add a retry mechanism to absorb network flakiness
  • Use the producer-consumer pattern for more complex scenarios
  • Tune aiohttp's connection pool and timeout settings to match your actual workload

Async programming has a steeper learning curve than synchronous code, but once you get the hang of it, the efficiency gain is a qualitative leap.