Python爬虫反反爬实战:突破常见反爬机制的8种方法

3 阅读5分钟

摘要:爬虫和反爬是一场永恒的攻防战。本文从实战角度总结8种常见反爬机制及其应对方案,包括请求头检测、频率限制、Cookie验证、JS渲染、字体反爬等,帮你在合法合规的前提下高效采集数据。

声明

本文仅用于技术学习和研究目的。爬取数据时请遵守目标网站的robots.txt协议和相关法律法规,不要对目标服务器造成过大压力。

反爬机制1:User-Agent检测

最基础的反爬,检查请求头中的UA:

import requests
from fake_useragent import UserAgent

# Source of User-Agent strings used by the examples below
ua = UserAgent()

# ❌ The default UA gives away that the client is a crawler
requests.get('https://example.com')
# User-Agent: python-requests/2.31.0

# ✅ Random UA
headers = {'User-Agent': ua.random}
requests.get('https://example.com', headers=headers)

# ✅ A more complete set of browser-like request headers
# (this rebinds `headers`; no request in this snippet sends the fuller dict)
headers = {
    'User-Agent': ua.chrome,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

反爬机制2:IP频率限制

# 方案1:请求间隔 + 随机延迟
import time
import random

def polite_request(url, session, min_delay=1, max_delay=3, timeout=10):
    """Pause for a random interval, then GET *url* through *session*.

    Args:
        url: Address to fetch.
        session: Object exposing a requests-style ``get(url, timeout=...)``.
        min_delay: Lower bound of the random pause, in seconds.
        max_delay: Upper bound of the random pause, in seconds.
        timeout: Seconds to wait for the server; without one a stalled
            connection would block the crawler indefinitely.

    Returns:
        Whatever ``session.get`` returns.
    """
    # Randomised spacing between requests (1-3 seconds by default)
    time.sleep(random.uniform(min_delay, max_delay))
    return session.get(url, timeout=timeout)

# 方案2:代理IP池
# Placeholder proxy endpoints; substitute real "scheme://host:port" entries.
proxies_pool = [
    'http://ip1:port',
    'http://ip2:port',
    'http://ip3:port',
]


def get_with_proxy(url):
    """Fetch *url* through one proxy picked at random from ``proxies_pool``.

    The chosen proxy handles both http and https traffic, and the request
    is given a 10-second timeout.
    """
    chosen = random.choice(proxies_pool)
    routing = dict.fromkeys(('http', 'https'), chosen)
    return requests.get(url, timeout=10, proxies=routing)

# 方案3:带自动重试的代理请求
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Session whose http and https adapters share one retry policy:
# at most 3 retries, backoff factor 1, retrying on 429/500/502/503.
session = requests.Session()
retry = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503],
)
http_adapter = HTTPAdapter(max_retries=retry)
https_adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', http_adapter)
session.mount('https://', https_adapter)

反爬机制3:Cookie/Session验证

很多网站需要先访问首页获取Cookie,才能请求API:

from bs4 import BeautifulSoup

session = requests.Session()

# Step 1: visit the home page so the server's cookies land in the session
session.get('https://example.com')

# Step 2: request the data API; the session sends the stored cookies
resp = session.get('https://example.com/api/data')

# If a login is required, the CSRF token must be read from the login page
# BEFORE the form is posted, so fetch and parse that page first.
page = session.get('https://example.com/login')
soup = BeautifulSoup(page.text, 'html.parser')
token_input = soup.find('input', {'name': '_token'})
if token_input is None or not token_input.get('value'):
    # Fail with a clear message instead of "'NoneType' is not subscriptable"
    raise ValueError('CSRF token field "_token" not found on the login page')
token = token_input['value']

# Post the credentials together with the token that was just extracted
login_data = {
    'username': 'user',
    'password': 'pass',
    '_token': token,
}
session.post('https://example.com/login', data=login_data)

反爬机制4:JS渲染页面

页面内容由JavaScript动态生成,requests拿到的是空壳:

# 方案1:Playwright(推荐)
from playwright.sync_api import sync_playwright

# Render the page in headless Chromium and capture the resulting HTML
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')
    page.wait_for_selector('.data-list')  # wait for the data to load
    # HTML of the page as it stands after the wait above
    content = page.content()
    browser.close()

# 方案2:找到真实的API接口(更高效)
# 打开浏览器开发者工具 → Network → XHR
# 找到数据接口直接请求,跳过JS渲染

# 方案3:Splash(轻量级JS渲染服务)
# docker run -p 8050:8050 scrapinghub/splash
# Ask the Splash service on localhost:8050 to render the target page;
# 'url' and 'wait' are sent to it as query parameters.
resp = requests.get('http://localhost:8050/render.html', params={
    'url': 'https://example.com',
    'wait': 2,
})

反爬机制5:请求参数加密

网站在请求中加入加密签名参数:

# 常见模式:timestamp + sign
import hashlib
import time

# Current Unix time in whole seconds, as a decimal string
timestamp = f'{int(time.time())}'
# Placeholder: the real secret has to be recovered from the site's JS
secret = 'app_secret_key'

# Signature used in this example: hex MD5 of timestamp followed by secret
sign = hashlib.md5((timestamp + secret).encode('utf-8')).hexdigest()

# Query parameters carrying the timestamp and its matching signature
params = dict(keyword='搜索词', timestamp=timestamp, sign=sign)

# 逆向思路:
# 1. 浏览器开发者工具搜索参数名(如"sign")
# 2. 在Sources面板打断点
# 3. 分析JS中的签名生成逻辑
# 4. 用Python复现

反爬机制6:字体反爬

网站用自定义字体映射,页面源码中的数字/文字和显示的不一样:

from fontTools.ttLib import TTFont
import requests

# 1. Download the font file
font_url = 'https://example.com/fonts/custom.woff2'
font_resp = requests.get(font_url, timeout=10)
# Stop on a 4xx/5xx here; otherwise an HTML error page would be saved as
# the "font" and only fail later with a confusing parse error.
font_resp.raise_for_status()
font_data = font_resp.content
with open('custom.woff2', 'wb') as f:
    f.write(font_data)

# 2. Parse the font's character map
# NOTE: fontTools needs the optional `brotli` package to read WOFF2 files.
font = TTFont('custom.woff2')
cmap = font.getBestCmap()

# 3. 建立映射关系
# cmap: {unicode_code: glyph_name}
# 需要根据具体网站分析glyph_name和实际字符的对应关系

# 4. 替换页面中的编码字符
def decode_font(text, mapping):
    """Return *text* with obfuscated characters replaced via *mapping*.

    Args:
        text: String taken from the page source.
        mapping: Dict from Unicode code point (int) to the replacement
            string for that character.

    Returns:
        A new string. Characters whose code point is not a key of
        *mapping* are kept unchanged.
    """
    # One join over a generator instead of growing a string with += per char
    return ''.join(mapping.get(ord(char), char) for char in text)

反爬机制7:Honeypot陷阱

隐藏的链接,正常用户看不到但爬虫会访问,触发后IP被封:

from bs4 import BeautifulSoup

def _is_css_zero(value):
    """Return True if CSS *value* is a numeric zero, with or without a unit."""
    number = value.rstrip('abcdefghijklmnopqrstuvwxyz%')
    try:
        return float(number) == 0
    except ValueError:
        return False


def is_honeypot(element):
    """Return True if *element* looks hidden from human visitors.

    The inline ``style`` attribute is split into ``property: value``
    declarations and compared per property, so spacing and letter case do
    not matter and values such as ``opacity:0.5`` or ``line-height:0`` are
    not mistaken for hidden elements.

    Args:
        element: Object with a dict-style ``get(name)`` for HTML attributes,
            such as a BeautifulSoup tag.

    Returns:
        True when the inline style or the class list marks the element as
        hidden, otherwise False.
    """
    style = element.get('style') or ''
    classes = element.get('class') or []
    if isinstance(classes, str):
        # A plain string would be substring-matched; compare whole names.
        classes = classes.split()

    # property -> value that hides the element. 'position: absolute' is a
    # coarse signal kept from the original heuristic.
    hidden_values = {
        'display': 'none',
        'visibility': 'hidden',
        'position': 'absolute',
        'left': '-9999px',
    }
    # Properties that hide the element when set to zero (0, 0px, 0.0, ...)
    zero_properties = {'opacity', 'height', 'width', 'max-height', 'max-width'}

    for declaration in style.lower().split(';'):
        prop, colon, value = declaration.partition(':')
        if not colon:
            continue
        prop = prop.strip()
        value = value.replace('!important', '').strip()
        if hidden_values.get(prop) == value:
            return True
        if prop in zero_properties and _is_css_zero(value):
            return True

    return 'hidden' in classes or 'hide' in classes

# Filter out honeypot links. href=True skips <a> tags that have no href
# attribute, which would otherwise raise KeyError on a['href'].
# NOTE(review): assumes `html` holds page source fetched elsewhere - confirm.
soup = BeautifulSoup(html, 'html.parser')
safe_links = [a['href'] for a in soup.find_all('a', href=True) if not is_honeypot(a)]

反爬机制8:TLS指纹检测

高级反爬会检测TLS握手的指纹(JA3)。Python的requests库的TLS指纹是固定的,且与真实浏览器不同,因此即使伪装了请求头也容易被识别:

# 方案1:curl_cffi(模拟浏览器TLS指纹)
from curl_cffi import requests as cffi_requests

# GET the page through curl_cffi with a browser impersonation target
resp = cffi_requests.get(
    'https://example.com',
    impersonate='chrome110',  # mimic the TLS fingerprint of Chrome 110
)

# 方案2:使用真实浏览器(Playwright/Selenium)
# 这是最彻底的方案,但性能最差

完整爬虫框架

把上面的技巧整合成一个可复用的框架:

import requests
import random
import time
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class SmartCrawler:
    """GET helper built on one ``requests.Session``.

    Each request is preceded by a random pause and, unless the caller
    supplies its own, gets a random User-Agent, a 15-second timeout and a
    randomly chosen proxy. The session retries failed requests, and its
    cookies are cleared at a fixed request interval.

    Instances can be used as context managers to close the session.
    """

    def __init__(self, proxy_list=None, min_delay=1, max_delay=3,
                 cookie_reset_every=50):
        """Set up the session, retry policy and crawl settings.

        Args:
            proxy_list: Proxy URLs to rotate through; empty or None means
                no proxy is set by the crawler.
            min_delay: Lower bound of the pause before each request, seconds.
            max_delay: Upper bound of the pause before each request, seconds.
            cookie_reset_every: Clear the session cookies every this many
                requests; 0 or None disables the reset.
        """
        self.session = requests.Session()
        self.ua = UserAgent()
        self.proxy_list = proxy_list or []
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.cookie_reset_every = cookie_reset_every
        self.request_count = 0

        # Automatic retries for both schemes
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
        self.session.mount('http://', HTTPAdapter(max_retries=retry))
        self.session.mount('https://', HTTPAdapter(max_retries=retry))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying session and its pooled connections."""
        self.session.close()

    def _get_headers(self):
        """Return browser-like headers with a freshly picked random UA."""
        # Accept-Encoding is deliberately left to requests, which advertises
        # only encodings it can decode. Hard-coding 'br' makes servers send
        # Brotli bodies that stay undecoded unless the optional brotli
        # package is installed.
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }

    def _get_proxy(self):
        """Return a requests-style proxies dict, or None without proxies."""
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            return {'http': proxy, 'https': proxy}
        return None

    def get(self, url, **kwargs):
        """Pause, then GET *url* through the session.

        Args:
            url: Address to fetch.
            **kwargs: Passed on to ``Session.get``. Caller-supplied
                ``headers``, ``timeout`` and ``proxies`` are kept as given.

        Returns:
            The value returned by ``Session.get``.
        """
        # Random delay before every request
        time.sleep(random.uniform(self.min_delay, self.max_delay))

        # Build default headers only when the caller gave none
        if 'headers' not in kwargs:
            kwargs['headers'] = self._get_headers()
        kwargs.setdefault('timeout', 15)

        proxy = self._get_proxy()
        if proxy:
            kwargs.setdefault('proxies', proxy)

        self.request_count += 1

        # Periodically drop cookies. The session and its connections are
        # kept; the reset happens before the request that hits the interval.
        if self.cookie_reset_every and self.request_count % self.cookie_reset_every == 0:
            self.session.cookies.clear()

        return self.session.get(url, **kwargs)

# Usage: no proxy list, random pause of 2-5 seconds before each request
crawler = SmartCrawler(min_delay=2, max_delay=5)
resp = crawler.get('https://example.com')

总结

反爬对抗的核心思路:

  • 低级反爬(UA/Referer检测)→ 伪装请求头
  • 中级反爬(频率/IP限制)→ 代理池 + 随机延迟
  • 高级反爬(JS加密/TLS指纹)→ 逆向分析、模拟浏览器TLS指纹(如curl_cffi)或使用真实浏览器

最重要的原则:先找API接口,能不渲染JS就不渲染,能不用浏览器就不用浏览器。效率差距是数量级的。