Abstract: When you need to crawl tens of thousands of pages, a synchronous crawler's speed is maddening. This article builds an asynchronous crawler with aiohttp + asyncio that comfortably reaches hundreds of requests per second, and works through the practical problems along the way: rate limiting, error handling, and data persistence.
Synchronous vs. Asynchronous: How Big Is the Gap?
Let's start with a direct comparison: fetching 100 pages.
# Synchronous: serial requests, 100 pages take ~30s
import requests
import time

urls = ['https://httpbin.org/delay/0.3' for _ in range(100)]

start = time.time()
for url in urls:
    requests.get(url)
print(f'Sync elapsed: {time.time() - start:.1f}s')  # ~30s
# Asynchronous: concurrent requests, 100 pages take ~1s
import aiohttp
import asyncio
import time

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    urls = ['https://httpbin.org/delay/0.3' for _ in range(100)]
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)

start = time.time()
asyncio.run(main())
print(f'Async elapsed: {time.time() - start:.1f}s')  # ~1s
A 30x gap. Each request spends ~0.3s waiting on the server, so 100 sequential requests add up to ~30s, while 100 concurrent requests overlap their waits and finish in roughly one round trip. That is the power of async.
aiohttp Basics
Install it with:
pip install aiohttp
aiohttp's ClientSession plays the same role as requests' Session: it manages the connection pool and keeps cookies across requests:
import aiohttp
import asyncio

async def main():
    # Customize the connection pool
    connector = aiohttp.TCPConnector(
        limit=100,           # total connection limit
        limit_per_host=10,   # connection limit per host
        ttl_dns_cache=300,   # DNS cache TTL in seconds
    )
    timeout = aiohttp.ClientTimeout(
        total=30,      # total timeout
        connect=5,     # connection timeout
        sock_read=10,  # socket read timeout
    )
    async with aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={'User-Agent': 'Mozilla/5.0'},
    ) as session:
        async with session.get('https://example.com') as resp:
            print(resp.status)
            html = await resp.text()

asyncio.run(main())
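ClientResponse offers more than resp.text(): resp.json() parses a JSON body, and resp.content.iter_chunked() streams a large download without buffering it all in memory. A quick sketch of both (the httpbin URLs and output filename are just stand-ins):

import aiohttp
import asyncio

async def demo():
    async with aiohttp.ClientSession() as session:
        # JSON APIs: parse the body directly
        async with session.get('https://httpbin.org/json') as resp:
            data = await resp.json()
            print(data)
        # Large downloads: write chunk by chunk instead of loading it all
        async with session.get('https://httpbin.org/bytes/4096') as resp:
            with open('payload.bin', 'wb') as f:
                async for chunk in resp.content.iter_chunked(1024):
                    f.write(chunk)

asyncio.run(demo())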
Controlling Concurrency: Semaphore
Firing off unlimited concurrent requests can knock over the target server, and it easily triggers anti-bot defenses. Use asyncio.Semaphore to cap concurrency:
import aiohttp
import asyncio

class AsyncCrawler:
    def __init__(self, concurrency=20):
        self.semaphore = asyncio.Semaphore(concurrency)
        self.session = None

    async def fetch(self, url):
        async with self.semaphore:  # cap the number of in-flight requests
            try:
                async with self.session.get(url) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    else:
                        print(f'[{resp.status}] {url}')
                        return None
            except Exception as e:
                print(f'[Error] {url}: {e}')
                return None

    async def run(self, urls):
        connector = aiohttp.TCPConnector(limit=50)
        async with aiohttp.ClientSession(connector=connector) as session:
            self.session = session
            tasks = [self.fetch(url) for url in urls]
            return await asyncio.gather(*tasks)

# Usage: at most 20 requests in flight at any moment
crawler = AsyncCrawler(concurrency=20)
results = asyncio.run(crawler.run(urls))
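Note that a Semaphore caps how many requests are in flight, not how many start per second. If you also need a requests-per-second ceiling, one option (a minimal sketch, not part of the crawler above) is a tiny limiter that spaces out acquisitions:

import asyncio
import time

class RateLimiter:
    """Sketch: allow at most `rate` acquisitions per second."""
    def __init__(self, rate):
        self.interval = 1.0 / rate
        self._lock = asyncio.Lock()
        self._next_time = 0.0

    async def acquire(self):
        # Reserve the next send slot under the lock, then sleep outside it
        async with self._lock:
            now = time.monotonic()
            wait = self._next_time - now
            self._next_time = max(now, self._next_time) + self.interval
        if wait > 0:
            await asyncio.sleep(wait)

# Hypothetical usage: await limiter.acquire() right before session.get()
# in fetch(), with limiter = RateLimiter(10) for ~10 requests per second.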
Requests with Retries
Network requests never succeed 100% of the time, so add a retry mechanism:
async def fetch_with_retry(self, url, max_retries=3):
    for attempt in range(max_retries):
        try:
            async with self.semaphore:
                async with self.session.get(url) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    elif resp.status == 429:  # rate limited by the server
                        wait = 2 ** attempt   # exponential backoff: 1s, 2s, 4s
                        print(f'Rate limited, waiting {wait}s...')
                        # note: this sleep still holds a semaphore slot
                        await asyncio.sleep(wait)
                        continue
                    else:
                        return None
        except (aiohttp.ClientError, asyncio.TimeoutError):
            if attempt < max_retries - 1:
                await asyncio.sleep(1)
                continue
    print(f'Failed after {max_retries} retries: {url}')
    return None
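To wire this into the AsyncCrawler above, add fetch_with_retry as a method next to fetch() and swap it into the task list that run() builds:

# Inside AsyncCrawler.run(), with fetch_with_retry added as a method:
tasks = [self.fetch_with_retry(url) for url in urls]
return await asyncio.gather(*tasks)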
In Practice: Crawling an Article List in Bulk
A complete example: crawling a tech blog's article list.
import aiohttp
import asyncio
import json
from datetime import datetime

class ArticleCrawler:
    def __init__(self, concurrency=10):
        self.sem = asyncio.Semaphore(concurrency)
        self.results = []

    async def fetch_page(self, session, page):
        url = f'https://api.example.com/articles?page={page}&size=20'
        async with self.sem:
            try:
                async with session.get(url) as resp:
                    data = await resp.json()
                    articles = data.get('items', [])
                    self.results.extend(articles)
                    print(f'Page {page}: got {len(articles)} articles')
            except Exception as e:
                print(f'Page {page} failed: {e}')

    async def run(self, total_pages):
        connector = aiohttp.TCPConnector(limit=20)
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
        ) as session:
            tasks = [
                self.fetch_page(session, page)
                for page in range(1, total_pages + 1)
            ]
            await asyncio.gather(*tasks)

        # Save the results
        output = {
            'crawl_time': datetime.now().isoformat(),
            'total': len(self.results),
            'articles': self.results,
        }
        with open('articles.json', 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)
        print(f'Done! Total: {len(self.results)} articles')

# Crawl 100 pages
crawler = ArticleCrawler(concurrency=10)
asyncio.run(crawler.run(100))
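One caveat: this example writes everything in a single dump at the end, so a crash mid-crawl loses all collected data. For long runs, one alternative (a sketch, not part of the class above) is to append each batch to a JSON Lines file as it arrives:

import asyncio
import json

def _append_jsonl(articles, path):
    # Plain blocking write; kept off the event loop via to_thread below
    with open(path, 'a', encoding='utf-8') as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + '\n')

async def save_batch(articles, path='articles.jsonl'):
    # Offload the file write so the loop keeps serving other tasks (3.9+)
    await asyncio.to_thread(_append_jsonl, articles, path)

fetch_page() would then call await save_batch(articles) instead of only accumulating into self.results.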
Going Further: the Producer-Consumer Pattern
For more complex scenarios (say, crawling list pages first and then detail pages), use a Queue to implement producer-consumer:
import aiohttp
import asyncio

async def producer(queue, session):
    """Producer: crawl the list pages and push detail-page URLs onto the queue."""
    for page in range(1, 101):
        async with session.get(f'https://api.example.com/list?page={page}') as resp:
            data = await resp.json()
            for item in data['items']:
                await queue.put(item['detail_url'])
    # Send one shutdown signal per consumer
    for _ in range(5):  # 5 consumers
        await queue.put(None)

async def consumer(queue, session, results):
    """Consumer: pull URLs off the queue and crawl the detail pages."""
    while True:
        url = await queue.get()
        if url is None:
            break
        try:
            async with session.get(url) as resp:
                data = await resp.json()
                results.append(data)
        except Exception as e:
            print(f'Error: {e}')
        queue.task_done()

async def main():
    queue = asyncio.Queue(maxsize=100)
    results = []
    async with aiohttp.ClientSession() as session:
        consumers = [
            asyncio.create_task(consumer(queue, session, results))
            for _ in range(5)
        ]
        await producer(queue, session)
        await asyncio.gather(*consumers)
    print(f'Total: {len(results)}')

asyncio.run(main())
Common Pitfalls
- Don't call time.sleep() inside a coroutine: it blocks the entire event loop. Use await asyncio.sleep() instead.
- Don't create a ClientSession outside a coroutine; it must be initialized inside a running event loop.
- Remember to close the session; the safest way is async with.
- On Windows, asyncio can throw errors with aiohttp; set the selector event loop policy first (see the sketch after this list):
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
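The policy line has to run before asyncio.run(). A minimal sketch, guarded so the same script still runs on other platforms:

import sys
import asyncio

if sys.platform == 'win32':
    # The default proactor loop on Windows can misbehave with aiohttp;
    # the selector loop is the usual workaround
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

asyncio.run(main())  # main() as in any of the examples above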
Summary
- An async crawler is 10-50x faster than a synchronous one, well suited to large-scale data collection
- Use a Semaphore to cap concurrency and avoid knocking over the target server
- Add retries to ride out network hiccups
- Use the producer-consumer pattern for complex multi-stage crawls
- Tune aiohttp's connection pool and timeouts to your actual workload
Async programming has a steeper learning curve than synchronous code, but once you have it down, the efficiency gain is a step change.