摘要:爬虫和反爬是一场永恒的攻防战。本文从实战角度总结8种常见反爬机制及其应对方案,包括请求头检测、频率限制、Cookie验证、JS渲染、字体反爬等,帮你在合法合规的前提下高效采集数据。
声明
本文仅用于技术学习和研究目的。爬取数据时请遵守目标网站的robots.txt协议和相关法律法规,不要对目标服务器造成过大压力。
反爬机制1:User-Agent检测
最基础的反爬,检查请求头中的UA:
import requests
from fake_useragent import UserAgent
ua = UserAgent()

# Bad: a bare request advertises the library's default User-Agent
# (e.g. "python-requests/2.31.0"), which immediately identifies a script.
# A timeout is passed on every call so a stalled server cannot hang the script.
requests.get('https://example.com', timeout=10)

# Better: send a random real-browser User-Agent.
headers = {'User-Agent': ua.random}
requests.get('https://example.com', headers=headers, timeout=10)

# Best: a fuller, browser-like header set.
# 'Accept-Encoding' is deliberately NOT set here. requests fills it in itself
# and only advertises encodings it can actually decode; hard-coding "br"
# without a Brotli package installed can yield undecodable response bodies.
headers = {
    'User-Agent': ua.chrome,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
反爬机制2:IP频率限制
# 方案1:请求间隔 + 随机延迟
import time
import random
def polite_request(url, session, min_delay=1, max_delay=3, **kwargs):
    """Fetch *url* through *session* after a random pause.

    Args:
        url: Address to request.
        session: Object exposing ``get(url, **kwargs)`` (e.g. a
            ``requests.Session``).
        min_delay: Lower bound of the pause, in seconds.
        max_delay: Upper bound of the pause, in seconds.
        **kwargs: Extra options forwarded to ``session.get``.

    Returns:
        Whatever ``session.get`` returns.
    """
    # Randomised spacing keeps the request rhythm from looking machine-made.
    time.sleep(random.uniform(min_delay, max_delay))
    # Without a timeout a stalled server would block the caller indefinitely.
    kwargs.setdefault('timeout', 10)
    return session.get(url, **kwargs)
# Option 2: a pool of proxies (the addresses below are placeholders).
proxies_pool = [
    'http://ip1:port',
    'http://ip2:port',
    'http://ip3:port',
]


def get_with_proxy(url):
    """Request *url* through one proxy chosen at random from ``proxies_pool``.

    The same proxy is used for both http and https traffic, and the request
    is given a 10 second timeout.
    """
    chosen = random.choice(proxies_pool)
    routing = dict.fromkeys(('http', 'https'), chosen)
    return requests.get(url, proxies=routing, timeout=10)
# 方案3:带自动重试的请求(可与上面的代理池配合使用)
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# A session whose transport adapters retry on the listed HTTP status codes.
session = requests.Session()
retry = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503],
)
# Each URL scheme gets its own adapter carrying the same retry policy.
for scheme in ('http://', 'https://'):
    session.mount(scheme, HTTPAdapter(max_retries=retry))
反爬机制3:Cookie/Session验证
很多网站需要先访问首页获取Cookie,才能请求API:
from bs4 import BeautifulSoup

session = requests.Session()

# Step 1: visit the home page so the server can set its cookies.
session.get('https://example.com')

# Step 2: call the data endpoint; the session sends the stored cookies along.
resp = session.get('https://example.com/api/data')

# If a login is required, the CSRF token must be read from the login form
# BEFORE the credentials are posted; a placeholder token would be rejected.
page = session.get('https://example.com/login')
soup = BeautifulSoup(page.text, 'html.parser')
token_field = soup.find('input', {'name': '_token'})
if token_field is None or not token_field.get('value'):
    # Fail with a clear message rather than an opaque TypeError on None.
    raise ValueError("login page has no '_token' input to read the CSRF token from")
token = token_field['value']

login_data = {
    'username': 'user',
    'password': 'pass',
    '_token': token,  # CSRF token extracted from the form above
}
session.post('https://example.com/login', data=login_data)
反爬机制4:JS渲染页面
页面内容由JavaScript动态生成,requests拿到的是空壳:
# 方案1:Playwright(推荐)
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_page()
        page.goto('https://example.com')
        # Wait for the element that the page's JavaScript fills with data.
        page.wait_for_selector('.data-list')
        content = page.content()
    finally:
        # Close the browser even when navigation or the wait raises.
        browser.close()
# 方案2:找到真实的API接口(更高效)
# 打开浏览器开发者工具 → Network → XHR
# 找到数据接口直接请求,跳过JS渲染
# Option 3: Splash, a lightweight JS-rendering service.
# docker run -p 8050:8050 scrapinghub/splash
# The target page and the 'wait' value are passed as query parameters.
splash_query = {'url': 'https://example.com', 'wait': 2}
resp = requests.get('http://localhost:8050/render.html', params=splash_query)
反爬机制5:请求参数加密
网站在请求中加入加密签名参数:
# 常见模式:timestamp + sign
import hashlib
import time
# Current Unix time in whole seconds, as a string.
timestamp = str(int(time.time()))
# Placeholder: the real key has to be recovered from the site's JavaScript.
secret = 'app_secret_key'

# A common signing scheme: MD5 hex digest of timestamp followed by secret.
digest = hashlib.md5()
digest.update((timestamp + secret).encode())
sign = digest.hexdigest()

params = dict(keyword='搜索词', timestamp=timestamp, sign=sign)

# Reverse-engineering workflow:
# 1. Search the browser dev tools for the parameter name (e.g. "sign")
# 2. Set a breakpoint in the Sources panel
# 3. Work out how the JS builds the signature
# 4. Reproduce it in Python
反爬机制6:字体反爬
网站用自定义字体映射,页面源码中的数字/文字和显示的不一样:
from fontTools.ttLib import TTFont
import requests
# 1. Download the font file.
font_url = 'https://example.com/fonts/custom.woff2'
font_resp = requests.get(font_url, timeout=10)
# Stop on an HTTP error instead of saving an error page as the "font" and
# failing later inside the font parser with a confusing message.
font_resp.raise_for_status()
font_data = font_resp.content
with open('custom.woff2', 'wb') as f:
    f.write(font_data)

# 2. Parse the font's character map.
# NOTE(review): reading WOFF2 with fontTools is believed to need the optional
# `brotli` package - confirm in the target environment.
font = TTFont('custom.woff2')
cmap = font.getBestCmap()

# 3. Build the mapping.
# cmap: {unicode_code: glyph_name}
# Which real character each glyph_name stands for is site-specific and has
# to be worked out by inspecting the font.
# 4. 替换页面中的编码字符
def decode_font(text, mapping):
    """Replace font-obfuscated characters in *text* with their real values.

    Args:
        text: String taken from the page source.
        mapping: Dict from Unicode code point (int) to the replacement string.

    Returns:
        A new string in which every character whose code point is a key of
        *mapping* is replaced by the mapped value; all other characters are
        kept unchanged.
    """
    # join() builds the result in one pass instead of repeated string `+=`.
    return ''.join(mapping.get(ord(char), char) for char in text)
反爬机制7:Honeypot陷阱
隐藏的链接,正常用户看不到但爬虫会访问,触发后IP被封:
from bs4 import BeautifulSoup
def _is_css_zero(value):
    """Return True when a CSS value such as ``0``, ``0px`` or ``0.0`` is zero."""
    number = value.rstrip('abcdefghijklmnopqrstuvwxyz%')  # drop the unit
    try:
        return float(number) == 0
    except ValueError:
        return False


def is_honeypot(element):
    """Return True when *element* looks like a link hidden from human visitors.

    Args:
        element: Object exposing ``get(name, default)`` for its attributes,
            such as a BeautifulSoup tag.

    Returns:
        True if the element has the ``hidden`` attribute, a ``hidden``/``hide``
        class, or an inline style that hides it; False otherwise.
    """
    # The bare HTML `hidden` attribute hides an element without any CSS.
    if element.get('hidden') is not None:
        return True

    classes = element.get('class') or []
    if isinstance(classes, str):
        # Split so that 'hide' does not match inside e.g. 'hideous'.
        classes = classes.split()
    if 'hidden' in classes or 'hide' in classes:
        return True

    # Parse the inline style into property/value pairs instead of substring
    # matching, so 'opacity:0.5' or 'line-height:0' are not flagged and
    # spacing/case variants such as 'Display : None' are still caught.
    style = element.get('style') or ''
    for declaration in style.lower().split(';'):
        prop, sep, value = declaration.partition(':')
        if not sep:
            continue
        prop = prop.strip()
        value = value.split('!')[0].strip()  # ignore a trailing !important

        if prop == 'display' and value == 'none':
            return True
        if prop == 'visibility' and value == 'hidden':
            return True
        if prop == 'opacity' and _is_css_zero(value):
            return True
        # Deliberately conservative: absolute positioning alone is treated as
        # suspicious, so some legitimate links are skipped as well.
        if prop == 'position' and value == 'absolute':
            return True
        # Off-screen placement, e.g. 'left' or 'margin-left' at -9999px.
        if prop.endswith('left') and value == '-9999px':
            return True
        if prop in ('height', 'width', 'max-height', 'max-width') and _is_css_zero(value):
            return True
    return False
# Filter out honeypot links.
soup = BeautifulSoup(html, 'html.parser')
# href=True skips anchors that have no href attribute (e.g. <a name="top">),
# which would otherwise raise KeyError on a['href'].
safe_links = [
    a['href']
    for a in soup.find_all('a', href=True)
    if not is_honeypot(a)
]
反爬机制8:TLS指纹检测
高级反爬会检测TLS握手的指纹(如JA3)。Python的requests库的TLS指纹是固定的,且与主流浏览器不同,因此很容易被识别:
# 方案1:curl_cffi(模拟浏览器TLS指纹)
from curl_cffi import requests as cffi_requests
target_url = 'https://example.com'
browser_profile = 'chrome110'  # impersonate the TLS fingerprint of Chrome 110
resp = cffi_requests.get(target_url, impersonate=browser_profile)

# Option 2: drive a real browser (Playwright/Selenium).
# This is the most thorough approach, but also the slowest.
完整爬虫框架
把上面的技巧整合成一个可复用的框架:
import requests
import random
import time
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
class SmartCrawler:
    """HTTP client combining random delays, rotating User-Agents, optional
    proxies, automatic retries and periodic cookie resets.

    Args:
        proxy_list: Optional list of proxy URLs; one is picked at random per
            request. With no proxies, requests go out directly.
        min_delay: Lower bound, in seconds, of the pause before each request.
        max_delay: Upper bound, in seconds, of the pause before each request.
        cookie_reset_every: Clear the session's cookies every this many
            requests. ``0`` or ``None`` disables the reset.

    Can be used as a context manager so the underlying session is closed.
    """

    def __init__(self, proxy_list=None, min_delay=1, max_delay=3,
                 cookie_reset_every=50):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.proxy_list = proxy_list or []
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.cookie_reset_every = cookie_reset_every
        self.request_count = 0
        # Retry automatically on the listed HTTP status codes.
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
        self.session.mount('http://', HTTPAdapter(max_retries=retry))
        self.session.mount('https://', HTTPAdapter(max_retries=retry))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying session."""
        self.session.close()

    def _get_headers(self):
        """Return browser-like headers with a random User-Agent.

        'Accept-Encoding' is left to the HTTP library: hard-coding "br" can
        produce bodies that cannot be decoded when no Brotli package is
        installed.
        """
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }

    def _get_proxy(self):
        """Return a proxies dict for one random proxy, or None if none are configured."""
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            return {'http': proxy, 'https': proxy}
        return None

    def get(self, url, **kwargs):
        """Sleep a random delay, then GET *url* through the session.

        Caller-supplied ``headers``, ``timeout`` and ``proxies`` take
        precedence over the defaults chosen here.

        Returns:
            The response returned by the session's ``get``.
        """
        # Random delay between requests.
        time.sleep(random.uniform(self.min_delay, self.max_delay))
        # Only build default headers when the caller did not pass any.
        if 'headers' not in kwargs:
            kwargs['headers'] = self._get_headers()
        kwargs.setdefault('timeout', 15)
        proxy = self._get_proxy()
        if proxy:
            kwargs.setdefault('proxies', proxy)
        self.request_count += 1
        # Periodically drop all cookies (the session object itself is kept).
        if self.cookie_reset_every and self.request_count % self.cookie_reset_every == 0:
            self.session.cookies.clear()
        return self.session.get(url, **kwargs)
# Usage: pause a random 2-5 seconds before each request.
crawler = SmartCrawler(min_delay=2, max_delay=5)
resp = crawler.get('https://example.com')
总结
反爬对抗的核心思路:
- 低级反爬(UA/Referer检测)→ 伪装请求头
- 中级反爬(频率/IP限制)→ 代理池 + 随机延迟
- 高级反爬(JS加密/TLS指纹)→ 逆向分析或使用真实浏览器
最重要的原则:先找API接口,能不渲染JS就不渲染,能不用浏览器就不用浏览器。效率差距是数量级的。