一、分布式爬虫架构设计
1.1 架构核心与演进优势
传统单机爬虫将调度、下载与解析集中在单一进程,存在明显性能瓶颈;分布式爬虫通过分工明确的核心组件解决这一问题:
- Master节点:负责任务调度、节点管理、负载均衡与故障恢复,是分布式爬虫的“大脑”
- Worker节点:执行页面下载、数据解析等采集任务,可水平扩展,是“执行单元”
- URL去重服务:识别已采集URL,避免重复请求,降低开销
- 分布式队列:实现任务可靠传递、有序调度,保障不丢失、不重复
- 代理IP池:提供高质量代理,规避IP封禁风险,保障采集连续
1.2 技术选型建议
Python生态中分布式爬虫有三种主流路径,需结合业务需求选型:
- Scrapy + Redis:解耦性强、扩展性好,需自行实现调度逻辑,适用于中等复杂度场景
- Scrapy-Redis:基于Scrapy扩展,内置调度、去重及持久化功能,开发成本低,适用于快速落地场景
- 自研框架:基于Celery等构建,定制化强,但开发与维护成本高,适用于高需求核心场景
本文以Scrapy-Redis为基础,整合亿牛云代理,构建兼顾效率与稳定性的分布式爬虫方案,代码可直接复用部署。
二、核心组件实现(可直接复用)
2.1 Redis分布式队列实现
Redis凭借高性能数据结构,是分布式队列的最优选择,完整可复用代码如下:
import redis
from urllib.parse import urlparse
import hashlib
import json
from datetime import datetime
class DistributedQueue:
    """Distributed crawl queue with priority scheduling, URL dedup and task status.

    Redis data layout:
      - crawler:task_queue   sorted set; score = priority, lowest popped first
      - crawler:visited_urls set of fingerprints of URLs already enqueued
      - crawler:task_status  hash: fingerprint -> pending / processing_<id> /
        completed / failed
    """

    def __init__(self, redis_host='localhost', redis_port=6379, redis_db=0):
        self.redis_client = redis.StrictRedis(
            host=redis_host, port=redis_port, db=redis_db, decode_responses=True
        )
        self.task_queue = 'crawler:task_queue'      # priority task queue (sorted set)
        self.visited_set = 'crawler:visited_urls'   # URL dedup set
        self.status_hash = 'crawler:task_status'    # per-task status store

    def _get_url_fingerprint(self, url):
        """Return a stable MD5 fingerprint for *url*.

        Normalization: scheme + lower-cased host + path; tracking parameters
        (utm_source, ref) are stripped and the remaining query parameters are
        sorted so parameter order does not change the fingerprint.

        Fix: the original kept only parameter *names* (``p.split('=')[0]``),
        so URLs differing only in parameter values collapsed to a single
        fingerprint and were wrongly deduplicated.
        """
        parsed = urlparse(url)
        normalized = f"{parsed.scheme}://{parsed.netloc.lower()}{parsed.path}"
        if parsed.query:
            kept = [p for p in parsed.query.split('&')
                    if p.split('=')[0] not in ('utm_source', 'ref')]
            if kept:
                normalized += '?' + '&'.join(sorted(kept))
        return hashlib.md5(normalized.encode('utf-8')).hexdigest()

    def add_task(self, url, priority=0, callback='parse', meta=None):
        """Enqueue *url*; return False (skip) if it was already seen."""
        fp = self._get_url_fingerprint(url)
        if self.redis_client.sismember(self.visited_set, fp):
            return False
        task_data = {
            'url': url,
            'callback': callback,
            'meta': meta or {},
            'create_time': datetime.now().isoformat(),
        }
        # Pipeline groups the three writes into one round trip.
        pipe = self.redis_client.pipeline()
        pipe.sadd(self.visited_set, fp)
        pipe.zadd(self.task_queue, {json.dumps(task_data): priority})
        pipe.hset(self.status_hash, fp, 'pending')
        pipe.execute()
        return True

    def get_task(self, worker_id):
        """Pop the highest-priority task and mark it as processing.

        Fix: uses atomic ZPOPMIN instead of ZRANGE followed by ZREM; the
        original read-then-delete raced, letting two workers receive the
        same task.
        """
        popped = self.redis_client.zpopmin(self.task_queue, 1)
        if not popped:
            return None
        task_json, _priority = popped[0]
        task_data = json.loads(task_json)
        fp = self._get_url_fingerprint(task_data['url'])
        self.redis_client.hset(self.status_hash, fp, f'processing_{worker_id}')
        return task_data

    def complete_task(self, url, success=True):
        """Record the final state of a finished task."""
        fp = self._get_url_fingerprint(url)
        self.redis_client.hset(self.status_hash, fp, 'completed' if success else 'failed')

    def get_stats(self):
        """Return queue statistics used by monitoring and scheduling."""
        pending = self.redis_client.zcard(self.task_queue)
        visited = self.redis_client.scard(self.visited_set)
        status = self.redis_client.hgetall(self.status_hash)
        return {
            'pending_tasks': pending,
            'visited_urls': visited,
            'processing': sum(1 for v in status.values() if v.startswith('processing_')),
            'completed': sum(1 for v in status.values() if v == 'completed'),
            'failed': sum(1 for v in status.values() if v == 'failed'),
        }
2.2 URL去重算法优化(BloomFilter)
大规模爬虫中,BloomFilter可大幅降低内存占用,支持持久化,适配重启恢复需求,完整实现:
import mmh3
from bitarray import bitarray
import pickle
import os
class PersistentBloomFilter:
    """Bloom filter with disk persistence for large-scale URL deduplication.

    Keeps memory usage low compared with an exact set; `contains` may report
    false positives but never false negatives. State survives crawler
    restarts via pickle-based save/load.
    """

    def __init__(self, size=10000000, hash_count=7, filepath='bloomfilter.bin'):
        self.size = size
        self.hash_count = hash_count
        self.filepath = filepath
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)
        # Restore any previously persisted state (overrides the fresh array).
        self._load()

    def _get_hashes(self, item):
        """Map *item* to `hash_count` bit positions via seeded murmur hashes."""
        encoded = item.encode('utf-8')
        positions = []
        for seed in range(self.hash_count):
            positions.append(mmh3.hash(encoded, seed) % self.size)
        return positions

    def add(self, item):
        """Record *item* by setting all of its hash positions."""
        for pos in self._get_hashes(item):
            self.bit_array[pos] = 1

    def contains(self, item):
        """Return True if *item* may have been added (small false-positive rate)."""
        bits = self.bit_array
        return all(bits[pos] for pos in self._get_hashes(item))

    def _load(self):
        """Restore persisted state so dedup history survives restarts."""
        if not os.path.exists(self.filepath):
            return
        # NOTE(review): pickle.load on an attacker-writable file is unsafe —
        # keep the filter file on trusted storage only.
        with open(self.filepath, 'rb') as fh:
            state = pickle.load(fh)
        self.bit_array = state['bit_array']
        self.size = state['size']
        self.hash_count = state['hash_count']

    def save(self):
        """Persist the current filter state to disk."""
        state = {
            'bit_array': self.bit_array,
            'size': self.size,
            'hash_count': self.hash_count,
        }
        with open(self.filepath, 'wb') as fh:
            pickle.dump(state, fh)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Auto-save when used as a context manager.
        self.save()
2.3 亿牛云代理IP池管理
集成亿牛云代理可简化管理,实现代理自动获取、连通性检测,适配生产环境,代码如下(请将配置中的API地址替换为服务商提供的稳定接口地址):
import requests
import random
import time
from datetime import datetime, timedelta
import yaml
class ProxyPool:
    """Yiniuyun proxy-pool manager: caching, connectivity tests, whitelist updates.

    Reads credentials and API endpoints from a YAML config file whose
    top-level 'yiniuyun' mapping provides api_endpoint, whitelist_api,
    username and password.
    """

    def __init__(self, config_path='proxy_config.yaml'):
        # Fix: open the config with an explicit encoding so non-ASCII values
        # do not depend on the platform default.
        with open(config_path, 'r', encoding='utf-8') as f:
            self.proxy_config = yaml.safe_load(f)['yiniuyun']
        self.api_endpoint = self.proxy_config['api_endpoint']
        self.whitelist_api = self.proxy_config['whitelist_api']
        self.username = self.proxy_config['username']
        self.password = self.proxy_config['password']
        self.proxy_list = []          # cached "ip:port" strings
        self.last_fetch_time = None   # when the cache was last refreshed
        self.cache_ttl = timedelta(minutes=5)
        self._update_whitelist()

    def _update_whitelist(self):
        """Append this host's public IP to the Yiniuyun whitelist (best effort)."""
        try:
            current_ip = requests.get('https://api.ipify.org?format=json', timeout=5).json()['ip']
            response = requests.get(
                f"{self.whitelist_api}&limitip=append&newip={current_ip}",
                auth=(self.username, self.password), timeout=10
            )
            print(f"白名单更新{'成功' if response.status_code == 200 else '失败'}:{current_ip}")
        except Exception as e:
            # Deliberately best-effort: a failed whitelist update must not
            # abort pool construction.
            print(f"白名单更新异常:{str(e)[:50]}")

    def fetch_proxies(self, count=10):
        """Return up to *count* proxies, preferring the local cache within TTL."""
        if self.proxy_list and self.last_fetch_time and datetime.now() - self.last_fetch_time < self.cache_ttl:
            return random.sample(self.proxy_list, min(count, len(self.proxy_list)))
        try:
            response = requests.get(
                f"{self.api_endpoint}&count={count}&format=json",
                auth=(self.username, self.password), timeout=15
            )
            if response.status_code == 200:
                # assumes the API returns a JSON list of {'ip', 'port'} dicts —
                # TODO confirm against the provider's response format
                self.proxy_list = [f"{item['ip']}:{item['port']}" for item in response.json()]
                self.last_fetch_time = datetime.now()
                return self.proxy_list[:count]
            elif response.status_code == 429:
                # Rate limited: fall back to whatever is cached.
                print("亿牛云API请求超限,使用缓存代理")
                return self.proxy_list[:count] if self.proxy_list else []
            else:
                print(f"获取代理失败:HTTP{response.status_code}")
                return self.proxy_list[:count] if self.proxy_list else []
        except Exception as e:
            print(f"获取代理异常:{str(e)[:50]}")
            return self.proxy_list[:count] if self.proxy_list else []

    def get_proxy(self, max_retries=3):
        """Return a verified proxy dict with 'http', 'https', 'proxy_address'.

        Each candidate proxy is probed up to *max_retries* times against a
        public echo endpoint.

        Raises:
            RuntimeError: when no candidate responds (still an Exception
                subclass, so existing callers catching Exception keep working).
        """
        proxies = self.fetch_proxies(count=5)
        test_url = "https://api.ipify.org?format=json"  # hoisted loop invariant
        for proxy in proxies:
            for _ in range(max_retries):
                try:
                    response = requests.get(
                        test_url,
                        proxies={'http': f"http://{proxy}", 'https': f"http://{proxy}"},
                        timeout=10
                    )
                    if response.status_code == 200:
                        return {
                            'http': f"http://{proxy}",
                            'https': f"http://{proxy}",
                            'proxy_address': proxy
                        }
                except Exception:
                    # Fix: drop the unused exception binding; brief back-off
                    # before retrying the same proxy.
                    time.sleep(1)
        raise RuntimeError("无法获取可用代理,检查亿牛云配置或网络")
2.4 Scrapy-Redis分布式爬虫完整实现
整合上述组件,构建可直接部署的分布式爬虫,支持并发控制、代理集成与故障重试:
import scrapy
import time
from scrapy_redis.spiders import RedisSpider
from scrapy.http import Request
class DistributedSpider(RedisSpider):
    """Scrapy-Redis based distributed spider with Yiniuyun proxy integration."""
    name = 'distributed_spider'
    # Redis key the spider pops start URLs from.
    redis_key = 'distributed_spider:start_urls'
    redis_batch_size = 16
    redis_encoding = 'utf-8'
    custom_settings = {
        # Redis-backed dedup + scheduler so multiple nodes share one queue.
        'DUPEFILTER_CLASS': 'scrapy_redis.dupefilter.RFPDupeFilter',
        'SCHEDULER': 'scrapy_redis.scheduler.Scheduler',
        'SCHEDULER_PERSIST': True,  # keep queue/dupefilter across restarts
        'DOWNLOAD_DELAY': 0.5,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
        },
        'CONCURRENT_REQUESTS': 32,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): ProxyPool (section 2.3) must be defined/importable in
        # this module, or construction fails with NameError.
        self.proxy_pool = ProxyPool()

    def make_requests_from_url(self, url):
        """Build a Request for *url*, attaching a verified proxy when available.

        NOTE(review): make_requests_from_url was deprecated and later removed
        in Scrapy 2.x; with scrapy-redis the supported hook is
        make_request_from_data — confirm against the installed versions.
        """
        try:
            proxy = self.proxy_pool.get_proxy()
            return Request(
                url=url, callback=self.parse, dont_filter=True,
                meta={'proxy': proxy['http'], 'download_timeout': 15}
            )
        except Exception as e:
            # Fall back to a direct (proxy-less) request rather than dropping the URL.
            self.logger.error(f"获取代理失败:{str(e)}")
            return Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Example parse callback: extract h2 titles; adjust selectors per site."""
        titles = response.css('h2.title::text').getall()
        for title in titles:
            yield {
                'title': title.strip(),
                'url': response.url,
                'crawl_time': time.time()
            }
        # Follow pagination links automatically.
        next_pages = response.css('a.next-page::attr(href)').getall()
        for next_url in next_pages:
            yield response.follow(next_url, callback=self.parse)
2.5 监控与调度系统
生产环境需配套监控调度机制,实现节点管理与故障恢复,核心实现如下:
import time
import psutil
import redis
from prometheus_client import Counter, Histogram, Gauge, start_http_server
# Monitoring metric definitions exported to Prometheus.
crawl_requests = Counter('crawl_requests_total', '总请求数', ['spider', 'status'])
crawl_duration = Histogram('crawl_duration_seconds', '请求耗时')
active_workers = Gauge('active_workers', '活跃Worker节点数')
queue_size = Gauge('queue_size', '待执行任务数')
# Fix: these two gauges are set by MonitorService.update_metrics but were
# never defined, which raised NameError on the first update cycle.
system_cpu = Gauge('system_cpu_percent', '系统CPU使用率')
system_memory = Gauge('system_memory_percent', '系统内存使用率')
class MonitorService:
    """Expose crawler health metrics over HTTP for Prometheus scraping."""

    def __init__(self, redis_host='localhost', redis_port=6379, metrics_port=8000):
        # The metrics HTTP endpoint starts immediately on construction.
        self.redis_client = redis.StrictRedis(host=redis_host, port=redis_port, db=0)
        start_http_server(metrics_port)

    def update_metrics(self):
        """Push current queue depth and host load into the exported gauges."""
        pending = self.redis_client.zcard('crawler:task_queue')
        active_workers.set(1)  # single-node sample value; extend for multi-node
        queue_size.set(pending)
        system_cpu.set(psutil.cpu_percent())
        system_memory.set(psutil.virtual_memory().percent)

    def record_request(self, spider_name, status):
        """Count one request outcome so success rate can be derived."""
        crawl_requests.labels(spider=spider_name, status=status).inc()

    def start_monitor(self, interval=5):
        """Block forever, refreshing metrics every *interval* seconds."""
        while True:
            self.update_metrics()
            time.sleep(interval)
class SchedulerService:
    """Task scheduler: starts worker nodes and prints queue-based scale hints."""

    def __init__(self, worker_configs):
        self.workers = worker_configs  # list of worker-node config dicts
        self.monitor = MonitorService()

    def start_workers(self):
        """Launch every configured worker node (use subprocess in production)."""
        for cfg in self.workers:
            print(f"启动Worker节点 {cfg['id']},关联爬虫:{cfg['spider']}")

    def scale_worker(self):
        """Poll queue depth once a minute and print scaling advice."""
        while True:
            backlog = self.monitor.redis_client.zcard('crawler:task_queue')
            if backlog > 1000:
                print("任务积压,建议增加Worker节点")
            elif backlog < 10:
                print("任务不足,可减少Worker节点")
            time.sleep(60)
三、性能优化与最佳实践
3.1 核心优化策略
针对分布式爬虫性能瓶颈,从三方面优化,提升采集效率与稳定性:
- 网络优化:复用HTTP连接池,减少TCP连接开销;针对不同错误码设计智能重试,429错误延迟重试,504错误更换代理;优化DNS解析,使用公共DNS提升解析速度。
- 解析优化:提前编译正则表达式,采用“lxml定位+正则提取”组合,避免解析过程中复杂计算,耗时操作异步处理。
- 存储优化:批量写入数据,降低IO开销;按数据特征分库分表;用消息队列解耦爬取与存储,避免IO阻塞。
3.2 生产环境部署
3.2.1 容器化部署(Docker + Docker Compose)
容器化部署简化环境配置,实现快速扩缩容,完整配置如下:
# Dockerfile(精简版)
FROM python:3.9-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["scrapy", "crawl", "distributed_spider"]
# docker-compose.yml
version: '3.8'
services:
redis:
image: redis:7-alpine
ports:
- "6379:6379"
volumes:
- redis_data:/data
master:
build: .
command: python master.py
depends_on:
- redis
environment:
- REDIS_HOST=redis
worker:
build: .
command: scrapy crawl distributed_spider
depends_on:
- redis
environment:
- REDIS_HOST=redis
deploy:
replicas: 5
volumes:
redis_data:
3.2.2 监控告警与成本优化
生产部署后,需配套监控与成本控制策略:
- 监控告警:用Prometheus+Grafana可视化指标,ELK集中分析日志;设置告警规则,请求成功率低于80%、队列积压超10000条时及时告警。
- 成本优化:动态调整Worker节点数量,避免资源闲置;亿牛云按需选择套餐,测试用免费订单,生产用包量套餐;核心任务用企业级代理,非核心任务可搭配低成本方案。