[网络安全] 爬虫开发+app逆向大神班 爬虫超级逆向7、8、9、10期|路飞|

62 阅读6分钟

Python爬虫开发实战:从基础到企业级应用的完整指南

本文将全面介绍Python爬虫开发的各个方面,从基础的HTTP请求到高级的反反爬策略,通过大量实用代码示例帮助读者构建完整的爬虫知识体系。

一、爬虫基础与HTTP协议

1. HTTP请求基础

import requests

# --- Basic GET request ---
response = requests.get('https://www.example.com')
print(f"状态码: {response.status_code}")
# BUG FIX: len(response.text) counts decoded *characters*, not bytes, so the
# "字节" label was wrong; response.content is the raw byte payload.
print(f"响应内容长度: {len(response.content)}字节")

# --- GET request with query parameters (encoded into the URL) ---
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
print(f"请求URL: {response.url}")

# --- POST request with a form-encoded body ---
data = {'username': 'admin', 'password': '123456'}
response = requests.post('https://httpbin.org/post', data=data)
print(f"响应JSON: {response.json()}")

2. 请求头与Session管理

# Headers applied to every request made through the session below.
base_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Referer': 'https://www.google.com/'
}

# requests.Session keeps cookies and pooled connections across calls.
with requests.Session() as sess:
    sess.headers.update(base_headers)
    # Initial request — any cookies the server sets are stored on the session.
    sess.get('https://www.example.com/login')
    # Follow-up request automatically carries those cookies.
    resp = sess.get('https://www.example.com/dashboard')
    print(f"登录后页面: {resp.status_code}")

二、数据解析技术

1. BeautifulSoup解析HTML

from bs4 import BeautifulSoup
import requests

url = 'https://books.toscrape.com/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Each book on the page lives in an <article class="product_pod"> element;
# collect title / price / stock label for every one of them.
books = [
    {
        'title': pod.h3.a['title'],
        'price': pod.select_one('p.price_color').text,
        'stock': pod.select_one('p.instock').text.strip(),
    }
    for pod in soup.select('article.product_pod')
]

print(f"提取到{len(books)}本书籍")

2. 正则表达式提取数据

import re

# Extract e-mail addresses from free-form text.
text = """
联系我们: service@example.com
销售部门: sales@company.com
技术支持: support@test.org
"""

# BUG FIX: the original TLD class was [A-Z|a-z] — inside a character class
# '|' is a literal pipe, not alternation, so "x@y.a|b" would have matched.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print(f"提取到的邮箱: {emails}")

# Extract a price with two decimal places from an HTML fragment.
html = '<span class="price">¥129.00</span>'
price_pattern = r'¥(\d+\.\d{2})'
match = re.search(price_pattern, html)
if match:
    print(f"商品价格: {match.group(1)}")

三、动态内容处理

1. Selenium自动化浏览器

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

# Chrome options: headless (no visible window), no GPU, custom user-agent.
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--disable-gpu', 'user-agent=Mozilla/5.0'):
    chrome_options.add_argument(flag)

# webdriver_manager downloads a chromedriver matching the local Chrome.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)

try:
    driver.get('https://www.taobao.com')

    # Type the query into the search box and submit the form.
    search_box = driver.find_element(By.ID, 'q')
    search_box.send_keys('手机')
    search_box.submit()

    # Crude fixed wait for results to render (WebDriverWait would be better).
    time.sleep(3)

    # Print title and price for the first five result cards.
    cards = driver.find_elements(By.CSS_SELECTOR, '.item.J_MouserOnverReq')
    for card in cards[:5]:
        name = card.find_element(By.CSS_SELECTOR, '.title').text
        cost = card.find_element(By.CSS_SELECTOR, '.price').text
        print(f"{name} - {cost}")

finally:
    # Always shut the browser down, even if scraping failed.
    driver.quit()

2. 逆向分析AJAX接口

import requests
import json

# JD product comments come from a separate AJAX endpoint, discovered by
# watching the browser's network panel on a product page.
product_id = '100003395476'
url = (
    'https://club.jd.com/comment/productPageComments.action'
    f'?productId={product_id}&score=0&sortType=5'
)

headers = {
    'Referer': f'https://item.jd.com/{product_id}.html',
    'User-Agent': 'Mozilla/5.0'
}

data = requests.get(url, headers=headers).json()

# Show the first three comments: author, star rating, body text.
for comment in data['comments'][:3]:
    print(f"用户: {comment['nickname']}")
    print(f"评分: {'★' * comment['score']}")
    print(f"内容: {comment['content']}\n")

四、爬虫高级技术

1. 并发爬取实现

import concurrent.futures
import requests
from bs4 import BeautifulSoup

def fetch_page(url):
    """Download one page; return (url, html, error) — html or error is None."""
    try:
        resp = requests.get(url, timeout=10)
    except Exception as exc:
        return url, None, str(exc)
    return url, resp.text, None

def parse_page(url, html):
    """Parse an HTML document and return a dict with its URL and <title> text."""
    title_tag = BeautifulSoup(html, 'html.parser').title
    if title_tag:
        return {'url': url, 'title': title_tag.string}
    return {'url': url, 'title': '无标题'}

def crawl_urls(urls, max_workers=5):
    """Fetch and parse every URL concurrently; return the parse results."""
    parsed = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each in-flight future back to the URL it is fetching.
        pending = {pool.submit(fetch_page, u): u for u in urls}

        for done in concurrent.futures.as_completed(pending):
            target = pending[done]
            try:
                target, html, error = done.result()
                if not html:
                    print(f"获取 {target} 失败: {error}")
                else:
                    parsed.append(parse_page(target, html))
            except Exception as exc:
                print(f"处理 {target} 时出错: {exc}")

    return parsed

# Demo: crawl a handful of well-known sites and print their titles.
urls = [
    'https://www.python.org',
    'https://www.baidu.com',
    'https://www.qq.com',
    'https://www.jd.com',
    'https://www.taobao.com'
]

for page in crawl_urls(urls):
    print(f"{page['url']}: {page['title']}")

2. 代理IP池实现

import requests
from concurrent.futures import ThreadPoolExecutor
import random

class ProxyPool:
    """Maintain a pool of free proxy IPs, validated for real connectivity."""

    def __init__(self):
        self.proxies = []        # raw "ip:port" strings scraped from the web
        self.valid_proxies = []  # subset that passed the liveness check

    def fetch_proxies(self):
        """Scrape candidate proxy addresses from a free proxy listing site."""
        url = 'https://www.free-proxy-list.net/'
        response = requests.get(url)

        # Simplified extraction: grab every "ip:port"-looking string.
        # (Relies on the file-level `import re`.)
        proxies = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', response.text)
        self.proxies = list(set(proxies))  # de-duplicate

    def validate_proxy(self, proxy):
        """Probe *proxy* against httpbin; record it if it answers with 200."""
        try:
            response = requests.get(
                'https://httpbin.org/ip',
                proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                timeout=5
            )
            if response.status_code == 200:
                # list.append is atomic in CPython — safe from worker threads.
                self.valid_proxies.append(proxy)
                print(f"有效代理: {proxy}")
        except requests.RequestException:
            # Dead/slow proxies are expected; skip them.  (The original bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.)
            pass

    def build_pool(self):
        """Fetch candidates, then validate them concurrently."""
        print("开始获取代理列表...")
        self.fetch_proxies()
        print(f"获取到{len(self.proxies)}个代理")

        print("开始验证代理...")
        with ThreadPoolExecutor(max_workers=20) as executor:
            executor.map(self.validate_proxy, self.proxies)

        print(f"验证完成,有效代理数: {len(self.valid_proxies)}")

    def get_random_proxy(self):
        """Return a random validated proxy, building the pool lazily.

        Raises RuntimeError when no proxy survives validation (the original
        let random.choice raise an opaque IndexError on the empty list).
        """
        if not self.valid_proxies:
            self.build_pool()
        if not self.valid_proxies:
            raise RuntimeError('代理池为空: 没有可用的有效代理')
        return random.choice(self.valid_proxies)

# Demo: build the pool lazily and pick one working proxy at random.
pool = ProxyPool()
chosen = pool.get_random_proxy()
print(f"随机代理: {chosen}")

五、反爬策略与应对

1. 常见反爬手段与破解

| 反爬类型 | 特征 | 应对策略 |
| --- | --- | --- |
| User-Agent检测 | 返回403或验证码 | 轮换User-Agent |
| IP限制 | 封禁频繁访问的IP | 使用代理IP池 |
| 验证码 | 出现图形/滑动验证码 | 使用打码平台或OCR识别 |
| 动态参数 | 请求需要token/sign | 分析JavaScript生成逻辑 |
| 行为分析 | 检测非人类操作模式 | 模拟人类操作间隔 |

2. 高级反反爬实现

import time
import random
from fake_useragent import UserAgent

class AdvancedCrawler:
    """Crawler with basic anti-anti-scraping measures: random User-Agent,
    random delays between requests, rotating proxies, and retry with backoff."""

    def __init__(self):
        self.ua = UserAgent()           # random User-Agent generator
        self.proxy_pool = ProxyPool()   # validated rotating proxies
        self.request_count = 0          # requests issued so far

    def get_random_delay(self):
        """Return a random inter-request delay in seconds (1–3)."""
        return random.uniform(1, 3)

    def get_random_headers(self):
        """Build a fresh header set with a randomized User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/',
            'Connection': 'keep-alive'
        }

    def make_request(self, url, max_retries=3):
        """GET *url* with anti-blocking measures; return the page text.

        Retries up to *max_retries* times, re-raising the last error; sleeps
        5s between failed attempts.
        """
        for attempt in range(max_retries):
            try:
                # Throttle: random pause between consecutive requests.
                if self.request_count > 0:
                    time.sleep(self.get_random_delay())

                headers = self.get_random_headers()
                # BUG FIX: the original set only the 'http' key, so HTTPS
                # URLs bypassed the proxy entirely. Route both schemes.
                proxy_addr = self.proxy_pool.get_random_proxy()
                proxies = {
                    'http': f'http://{proxy_addr}',
                    'https': f'http://{proxy_addr}',
                }

                response = requests.get(
                    url,
                    headers=headers,
                    proxies=proxies,
                    timeout=10
                )

                # Treat an explicit 403 as "blocked" so the retry loop runs.
                if response.status_code == 403:
                    raise Exception('被服务器拒绝访问')

                self.request_count += 1
                return response.text

            except Exception as e:
                print(f"请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(5)  # back off longer after a failure

        return None

# Demo: fetch one page through the hardened crawler.
advanced = AdvancedCrawler()
page_html = advanced.make_request('https://www.taobao.com')
if page_html:
    print("成功获取页面内容")

六、数据存储方案

1. 数据库存储

import sqlite3
import pandas as pd
from contextlib import closing

class DataStorage:
    """SQLite-backed storage for crawled product records."""

    def __init__(self, db_file='crawler_data.db'):
        # Path of the SQLite database file; the table is created eagerly.
        self.db_file = db_file
        self._init_db()

    def _init_db(self):
        """Create the products table if it does not already exist."""
        with closing(sqlite3.connect(self.db_file)) as conn:
            conn.execute("""
            CREATE TABLE IF NOT EXISTS products (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                price REAL,
                url TEXT UNIQUE,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """)
            conn.commit()

    def save_data(self, data):
        """Append rows (a list of dicts) to the products table."""
        with closing(sqlite3.connect(self.db_file)) as conn:
            try:
                pd.DataFrame(data).to_sql(
                    'products', conn, if_exists='append', index=False
                )
                print(f"成功保存{len(data)}条数据")
            except Exception as e:
                # Best-effort: report and carry on (e.g. UNIQUE url clashes).
                print(f"保存数据失败: {e}")

    def query_data(self, sql="SELECT * FROM products LIMIT 5"):
        """Run *sql* and return the result as a pandas DataFrame."""
        with closing(sqlite3.connect(self.db_file)) as conn:
            return pd.read_sql(sql, conn)

# Rows shaped the way a crawl would produce them.
sample_data = [
    {'title': 'iPhone 13', 'price': 5999.0, 'url': 'https://example.com/iphone13'},
    {'title': '华为P50', 'price': 4488.0, 'url': 'https://example.com/p50'}
]

# Persist them and read the first few rows back.
storage = DataStorage()
storage.save_data(sample_data)
print(storage.query_data())

2. 分布式存储方案

import pymongo
from pymongo import MongoClient

class MongoDBStorage:
    """MongoDB-backed storage, suitable for distributed crawlers."""

    def __init__(self, db_name='crawler', collection_name='products'):
        # Assumes a local mongod on the default port — TODO confirm in prod.
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

    def save_data(self, data):
        """Insert one document (dict) or a batch (list of dicts)."""
        if isinstance(data, list):
            result = self.collection.insert_many(data)
            print(f"插入{len(result.inserted_ids)}条文档")
        else:
            result = self.collection.insert_one(data)
            print(f"插入文档ID: {result.inserted_id}")

    def query_data(self, filter=None, limit=5):
        """Return up to *limit* documents matching *filter* (default: all).

        BUG FIX: the original used a mutable default argument (filter={}),
        a dict shared across all calls; None is the safe sentinel.  The
        parameter name (which shadows the builtin) is kept for callers
        passing it by keyword.
        """
        query = {} if filter is None else filter
        return list(self.collection.find(query).limit(limit))

# Demo: mirror the same sample rows into MongoDB and read them back.
mongo = MongoDBStorage()
mongo.save_data(sample_data)
print(mongo.query_data())

七、企业级爬虫架构

1. Scrapy框架实战

import scrapy
from scrapy.crawler import CrawlerProcess

class BookSpider(scrapy.Spider):
    """Scrapy spider that walks books.toscrape.com and yields book records."""

    name = 'book_spider'
    start_urls = ['https://books.toscrape.com/']

    # Per-spider settings: polite single-connection crawl, JSON feed output.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0',
        'DOWNLOAD_DELAY': 2,
        'CONCURRENT_REQUESTS': 1,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'books.json'
    }

    def parse(self, response):
        for pod in response.css('article.product_pod'):
            record = {
                'title': pod.css('h3 a::attr(title)').get(),
                'price': pod.css('p.price_color::text').get(),
                # second text node of p.instock carries the availability label
                'stock': pod.css('p.instock::text').getall()[1].strip(),
            }
            yield record

        # Follow the "next" pagination link, if the page has one.
        next_href = response.css('li.next a::attr(href)').get()
        if next_href:
            yield response.follow(next_href, callback=self.parse)

# Launch the spider in-process; start() blocks until the crawl finishes.
crawler_process = CrawlerProcess()
crawler_process.crawl(BookSpider)
crawler_process.start()

2. 分布式爬虫架构

# 使用Scrapy-Redis实现分布式爬虫
from scrapy_redis.spiders import RedisSpider

class DistributedSpider(RedisSpider):
    """Spider whose seed URLs come from a shared Redis queue, so multiple
    worker processes can cooperate on a single crawl."""

    name = 'distributed_spider'
    redis_key = 'myspider:start_urls'  # Redis key holding the seed URLs

    def parse(self, response):
        # Emit a minimal record for the current page.
        yield {'url': response.url, 'title': response.css('title::text').get()}

        # Enqueue every link found on the page for further crawling.
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)

# 配置文件settings.py需要添加:
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# REDIS_URL = 'redis://localhost:6379/0'

八、爬虫伦理与法律

1. 合法爬虫实践

  1. 遵守robots.txt协议

    from urllib.robotparser import RobotFileParser
    
    def check_robots_permission(url, user_agent='*'):
        """Return True if robots.txt allows *user_agent* to fetch *url*.

        BUG FIX: despite its name, the original only worked when passed an
        already-parsed URL object (it read url.scheme/netloc/path and would
        fail with AttributeError on a plain string).  Now accepts either a
        URL string or a pre-parsed result, keeping old callers working.
        """
        from urllib.parse import urlparse
        parts = urlparse(url) if isinstance(url, str) else url
        rp = RobotFileParser()
        rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
        rp.read()  # fetches and parses the robots.txt over the network
        return rp.can_fetch(user_agent, parts.path or '/')
    
  2. 控制爬取频率

    import time
    
    class PoliteCrawler:
        """Crawler that enforces a minimum interval between requests."""

        def __init__(self, delay=5):
            self.delay = delay        # minimum seconds between requests
            self.last_request = 0     # timestamp of the previous request

        def request(self, url):
            """GET *url*, sleeping first if the previous call was too recent."""
            elapsed = time.time() - self.last_request
            if elapsed < self.delay:
                time.sleep(self.delay - elapsed)
            self.last_request = time.time()
            return requests.get(url)
    
  3. 尊重数据版权

  4. 不爬取敏感信息

  5. 遵守网站服务条款

结语

Python爬虫技术是获取网络数据的强大工具,但也伴随着技术挑战和法律风险。本文从基础到高级全面介绍了爬虫开发的各个方面,并提供了大量实用的代码示例。希望读者在掌握这些技术的同时,也能遵守网络道德和法律法规,合理合法地使用爬虫技术。