Python爬虫开发实战:从基础到企业级应用的完整指南
本文将全面介绍Python爬虫开发的各个方面,从基础的HTTP请求到高级的反反爬策略,通过大量实用代码示例帮助读者构建完整的爬虫知识体系。
一、爬虫基础与HTTP协议
1. HTTP请求基础
import requests

# --- Basic GET request ---
response = requests.get('https://www.example.com')
print(f"状态码: {response.status_code}")
# Fixed label: len(response.text) counts *characters* of the decoded body,
# not bytes (use len(response.content) for a byte count).
print(f"响应内容长度: {len(response.text)}字符")

# --- GET with query-string parameters (requests builds the final URL) ---
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
print(f"请求URL: {response.url}")

# --- POST with form-encoded data ---
data = {'username': 'admin', 'password': '123456'}
response = requests.post('https://httpbin.org/post', data=data)
print(f"响应JSON: {response.json()}")
2. 请求头与Session管理
# Browser-like default headers so the server treats us as a normal visitor.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Referer': 'https://www.google.com/',
}

# A Session reuses one connection pool and carries cookies between requests;
# the `with` block guarantees the underlying connections are closed.
with requests.Session() as session:
    session.headers.update(headers)
    # First request: the server may set cookies on this session.
    session.get('https://www.example.com/login')
    # Second request: cookies from the first response are sent automatically.
    response = session.get('https://www.example.com/dashboard')
    print(f"登录后页面: {response.status_code}")
二、数据解析技术
1. BeautifulSoup解析HTML
from bs4 import BeautifulSoup
import requests

# Scrape the demo bookstore and build one record per product card.
url = 'https://books.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

books = [
    {
        'title': card.h3.a['title'],                          # title attribute of the link
        'price': card.select_one('p.price_color').text,       # e.g. "£51.77"
        'stock': card.select_one('p.instock').text.strip(),   # availability text
    }
    for card in soup.select('article.product_pod')
]
print(f"提取到{len(books)}本书籍")
2. 正则表达式提取数据
import re

# --- Extract e-mail addresses from free-form text ---
text = """
联系我们: service@example.com
销售部门: sales@company.com
技术支持: support@test.org
"""
# Fixed pattern: the original wrote [A-Z|a-z] for the TLD class, where '|'
# is a *literal pipe* inside a character class (not alternation), so a bogus
# TLD containing '|' would also have matched.  [A-Za-z] is the intended set.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print(f"提取到的邮箱: {emails}")

# --- Extract a price from an HTML fragment ---
html = '<span class="price">¥129.00</span>'
price_pattern = r'¥(\d+\.\d{2})'  # capture digits with exactly two decimals
match = re.search(price_pattern, html)
if match:
    print(f"商品价格: {match.group(1)}")
三、动态内容处理
1. Selenium自动化浏览器
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

# Browser options: headless, no GPU, spoofed User-Agent.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('user-agent=Mozilla/5.0')

# webdriver_manager downloads a matching chromedriver automatically.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
try:
    driver.get('https://www.taobao.com')
    # Type a query into the search box and submit the form.
    search_input = driver.find_element(By.ID, 'q')
    search_input.send_keys('手机')
    search_input.submit()
    # Fixed: replaced the fixed time.sleep(3) with an explicit wait — it
    # returns as soon as results appear and fails loudly (TimeoutException)
    # instead of silently scraping a half-loaded page on slow networks.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.item.J_MouserOnverReq'))
    )
    # Extract the first few result cards.
    items = driver.find_elements(By.CSS_SELECTOR, '.item.J_MouserOnverReq')
    for item in items[:5]:
        title = item.find_element(By.CSS_SELECTOR, '.title').text
        price = item.find_element(By.CSS_SELECTOR, '.price').text
        print(f"{title} - {price}")
finally:
    # Always release the browser process, even on error.
    driver.quit()
2. 逆向分析AJAX接口
import requests
import json

# JD.com product-review endpoint (discovered by watching the page's XHR
# traffic in browser devtools).  The Referer header is required or the
# server rejects the request as hot-linking.
product_id = '100003395476'
url = f'https://club.jd.com/comment/productPageComments.action?productId={product_id}&score=0&sortType=5'
headers = {
    'Referer': f'https://item.jd.com/{product_id}.html',
    'User-Agent': 'Mozilla/5.0'
}
response = requests.get(url, headers=headers)

# Robustness fix: when rate-limited the endpoint returns an empty or
# non-JSON body, so the original `response.json()` raised ValueError and
# `data['comments']` raised KeyError.  Guard both and degrade to no output.
try:
    data = response.json()
except ValueError:
    data = {}
comments = data.get('comments', [])
for comment in comments[:3]:
    print(f"用户: {comment['nickname']}")
    print(f"评分: {'★' * comment['score']}")
    print(f"内容: {comment['content']}\n")
四、爬虫高级技术
1. 并发爬取实现
import concurrent.futures
import requests
from bs4 import BeautifulSoup
def fetch_page(url):
    """Download one page.

    Returns a (url, html, error) triple where exactly one of html/error
    is None, so callers can branch without try/except of their own.
    """
    try:
        resp = requests.get(url, timeout=10)
        return url, resp.text, None
    except Exception as exc:
        # Any failure (DNS, timeout, bad scheme, ...) becomes an error string.
        return url, None, str(exc)
def parse_page(url, html):
    """Parse a downloaded page and return its URL plus <title> text."""
    doc = BeautifulSoup(html, 'html.parser')
    # Fall back to a placeholder when the document has no <title> tag.
    page_title = doc.title.string if doc.title else '无标题'
    return {'url': url, 'title': page_title}
def crawl_urls(urls, max_workers=5):
    """Fetch every URL concurrently with a thread pool and parse the ones
    that succeed; failures are reported to stdout and skipped."""
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each in-flight future back to the URL it was submitted for.
        pending = {pool.submit(fetch_page, url): url for url in urls}
        for future in concurrent.futures.as_completed(pending):
            url = pending[future]
            try:
                url, html, error = future.result()
                if html:
                    results.append(parse_page(url, html))
                else:
                    print(f"获取 {url} 失败: {error}")
            except Exception as e:
                # Covers parse errors as well as unexpected worker failures.
                print(f"处理 {url} 时出错: {e}")
    return results
# Demo run against a handful of well-known sites.
urls = [
    'https://www.python.org',
    'https://www.baidu.com',
    'https://www.qq.com',
    'https://www.jd.com',
    'https://www.taobao.com',
]
results = crawl_urls(urls)
for result in results:
    print(f"{result['url']}: {result['title']}")
2. 代理IP池实现
import requests
from concurrent.futures import ThreadPoolExecutor
import random
class ProxyPool:
    """Scrape free proxies, validate them concurrently, and serve random
    working ones."""

    def __init__(self):
        self.proxies = []        # raw "ip:port" candidates scraped from the web
        self.valid_proxies = []  # candidates that relayed a test request

    def fetch_proxies(self):
        """Scrape candidate proxies from a free proxy-listing site.

        Simplified: grabs anything shaped like ip:port from the raw HTML;
        a production version would parse the listing table properly.
        Relies on the module-level `import re` earlier in this file.
        """
        url = 'https://www.free-proxy-list.net/'
        response = requests.get(url)
        proxies = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', response.text)
        self.proxies = list(set(proxies))  # de-duplicate

    def validate_proxy(self, proxy):
        """Record `proxy` as valid if it can relay a test request in 5s."""
        try:
            response = requests.get(
                'https://httpbin.org/ip',
                proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                timeout=5
            )
            if response.status_code == 200:
                # list.append is thread-safe under the GIL, so concurrent
                # validators may share this list.
                self.valid_proxies.append(proxy)
                print(f"有效代理: {proxy}")
        except requests.RequestException:
            # Fixed: was a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit; only network/HTTP errors
            # mean "this proxy is dead".
            pass

    def build_pool(self):
        """Fetch candidates and validate them with a 20-thread pool."""
        print("开始获取代理列表...")
        self.fetch_proxies()
        print(f"获取到{len(self.proxies)}个代理")
        print("开始验证代理...")
        with ThreadPoolExecutor(max_workers=20) as executor:
            executor.map(self.validate_proxy, self.proxies)
        print(f"验证完成,有效代理数: {len(self.valid_proxies)}")

    def get_random_proxy(self):
        """Return a random validated proxy, building the pool on first use.

        Raises RuntimeError when no proxy survives validation — the
        original crashed with a bare IndexError from random.choice([]).
        """
        if not self.valid_proxies:
            self.build_pool()
        if not self.valid_proxies:
            raise RuntimeError('代理池为空: 没有可用的有效代理')
        return random.choice(self.valid_proxies)
# Usage: the pool is built lazily on the first draw.
proxy_pool = ProxyPool()
proxy = proxy_pool.get_random_proxy()
print(f"随机代理: {proxy}")
五、反爬策略与应对
1. 常见反爬手段与破解
| 反爬类型 | 特征 | 应对策略 |
|---|---|---|
| User-Agent检测 | 返回403或验证码 | 轮换User-Agent |
| IP限制 | 封禁频繁访问的IP | 使用代理IP池 |
| 验证码 | 出现图形/滑动验证码 | 使用打码平台或OCR识别 |
| 动态参数 | 请求需要token/sign | 分析JavaScript生成逻辑 |
| 行为分析 | 检测非人类操作模式 | 模拟人类操作间隔 |
2. 高级反反爬实现
import time
import random
from fake_useragent import UserAgent
class AdvancedCrawler:
    """Crawler with basic anti-anti-spider measures: randomized User-Agent,
    human-like delays, rotating proxies, and bounded retries."""

    def __init__(self):
        self.ua = UserAgent()          # random User-Agent generator (fake_useragent)
        self.proxy_pool = ProxyPool()  # defined earlier in this file
        self.request_count = 0         # requests issued so far

    def get_random_delay(self):
        """Random pause (seconds) between requests to mimic a human."""
        return random.uniform(1, 3)

    def get_random_headers(self):
        """Browser-like headers with a freshly randomized User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/',
            'Connection': 'keep-alive'
        }

    def make_request(self, url, max_retries=3):
        """GET `url` with rotating headers/proxies.

        Retries up to `max_retries` times with a 5s back-off, re-raising
        the last error once retries are exhausted; returns the response
        body text on success.
        """
        for attempt in range(max_retries):
            try:
                # Throttle every request after the first one.
                if self.request_count > 0:
                    time.sleep(self.get_random_delay())
                headers = self.get_random_headers()
                # Fixed: the original mapping only covered 'http', so HTTPS
                # URLs silently bypassed the proxy and exposed the real IP.
                proxy_addr = self.proxy_pool.get_random_proxy()
                proxy = {
                    'http': f'http://{proxy_addr}',
                    'https': f'http://{proxy_addr}',
                }
                response = requests.get(
                    url,
                    headers=headers,
                    proxies=proxy,
                    timeout=10
                )
                # A 403 usually means the anti-bot layer caught us; treat
                # it as a retryable failure.
                if response.status_code == 403:
                    raise Exception('被服务器拒绝访问')
                self.request_count += 1
                return response.text
            except Exception as e:
                print(f"请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(5)  # back off longer after a failure
        return None
# Usage: one throttled, proxied request.
crawler = AdvancedCrawler()
html = crawler.make_request('https://www.taobao.com')
if html:
    print("成功获取页面内容")
六、数据存储方案
1. 数据库存储
import sqlite3
import pandas as pd
from contextlib import closing
class DataStorage:
    """SQLite-backed storage for scraped product records.

    Opens a fresh connection per operation (simple and safe for a
    single-process crawler); `closing` guarantees it is released.
    """

    def __init__(self, db_file='crawler_data.db'):
        self.db_file = db_file
        self._init_db()

    def _init_db(self):
        """Create the products table if it does not exist (idempotent)."""
        with closing(sqlite3.connect(self.db_file)) as conn:
            cursor = conn.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    price REAL,
                    url TEXT UNIQUE,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()

    def save_data(self, data):
        """Append `data` (a list of dicts) to the products table via pandas."""
        with closing(sqlite3.connect(self.db_file)) as conn:
            try:
                frame = pd.DataFrame(data)
                frame.to_sql('products', conn, if_exists='append', index=False)
                print(f"成功保存{len(data)}条数据")
            except Exception as e:
                # e.g. UNIQUE(url) violations surface here.
                print(f"保存数据失败: {e}")

    def query_data(self, sql="SELECT * FROM products LIMIT 5"):
        """Run `sql` and return the result set as a DataFrame."""
        with closing(sqlite3.connect(self.db_file)) as conn:
            return pd.read_sql(sql, conn)
# Usage: persist a couple of scraped records, then read them back.
storage = DataStorage()
# Pretend these rows came out of a crawl.
sample_data = [
    {'title': 'iPhone 13', 'price': 5999.0, 'url': 'https://example.com/iphone13'},
    {'title': '华为P50', 'price': 4488.0, 'url': 'https://example.com/p50'}
]
storage.save_data(sample_data)
print(storage.query_data())
2. 分布式存储方案
import pymongo
from pymongo import MongoClient
class MongoDBStorage:
    """MongoDB-backed storage for scraped documents (local mongod)."""

    def __init__(self, db_name='crawler', collection_name='products'):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

    def save_data(self, data):
        """Insert one document (dict) or many (list of dicts)."""
        if isinstance(data, list):
            result = self.collection.insert_many(data)
            print(f"插入{len(result.inserted_ids)}条文档")
        else:
            result = self.collection.insert_one(data)
            print(f"插入文档ID: {result.inserted_id}")

    def query_data(self, filter=None, limit=5):
        """Return up to `limit` documents matching `filter` (default: all).

        Fixed: the original used a mutable default argument (filter={});
        that single shared dict could be mutated by a caller and leak
        state into every later call.  None is the safe sentinel.
        (The parameter name `filter` shadows the builtin but is kept for
        backward compatibility with keyword callers.)
        """
        return list(self.collection.find(filter if filter is not None else {}).limit(limit))
# Usage: write the same sample records into MongoDB and read them back.
mongo_storage = MongoDBStorage()
mongo_storage.save_data(sample_data)
print(mongo_storage.query_data())
七、企业级爬虫架构
1. Scrapy框架实战
import scrapy
from scrapy.crawler import CrawlerProcess
class BookSpider(scrapy.Spider):
    """Crawls books.toscrape.com, exporting title/price/stock to books.json."""
    name = 'book_spider'
    start_urls = ['https://books.toscrape.com/']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0',
        'DOWNLOAD_DELAY': 2,       # be polite: 2 seconds between requests
        'CONCURRENT_REQUESTS': 1,  # one request in flight at a time
        'FEED_FORMAT': 'json',
        'FEED_URI': 'books.json',
    }

    def parse(self, response):
        # Emit one item per product card on the page.
        for card in response.css('article.product_pod'):
            yield {
                'title': card.css('h3 a::attr(title)').get(),
                'price': card.css('p.price_color::text').get(),
                # The second text node of p.instock holds the availability text.
                'stock': card.css('p.instock::text').getall()[1].strip(),
            }
        # Follow the pagination link until the last page.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
# Launch the spider in-process; start() blocks until the crawl finishes.
process = CrawlerProcess()
process.crawl(BookSpider)
process.start()
2. 分布式爬虫架构
# Distributed crawling with Scrapy-Redis: every worker pops start URLs from
# a shared Redis queue, so many machines can share one crawl frontier.
from scrapy_redis.spiders import RedisSpider


class DistributedSpider(RedisSpider):
    """Spider fed from Redis; discovered links are pushed back as new jobs."""
    name = 'distributed_spider'
    redis_key = 'myspider:start_urls'  # Redis key holding the seed URLs

    def parse(self, response):
        # Emit one record per page.
        yield {'url': response.url, 'title': response.css('title::text').get()}
        # Enqueue every outgoing link for further crawling.
        for next_page in response.css('a::attr(href)').getall():
            yield response.follow(next_page, callback=self.parse)


# Required additions to settings.py:
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# REDIS_URL = 'redis://localhost:6379/0'
八、爬虫伦理与法律
1. 合法爬虫实践
-
遵守robots.txt协议
from urllib.robotparser import RobotFileParser


def check_robots_permission(url, user_agent='*'):
    """Return True if the site's robots.txt allows `user_agent` to fetch `url`.

    NOTE(review): `url` is read via .scheme/.netloc/.path, so it is expected
    to be an already-parsed urllib.parse.urlsplit/urlparse result, not a
    plain string — confirm against callers.
    """
    rp = RobotFileParser()
    robots_url = f"{url.scheme}://{url.netloc}/robots.txt"
    rp.set_url(robots_url)
    rp.read()  # fetches and parses the live robots.txt
    return rp.can_fetch(user_agent, url.path)
- 控制爬取频率
import time


class PoliteCrawler:
    """Rate-limited fetcher: guarantees at least `delay` seconds between
    consecutive requests."""

    def __init__(self, delay=5):
        self.delay = delay     # minimum gap between requests, seconds
        self.last_request = 0  # wall-clock time of the previous request

    def request(self, url):
        """GET `url`, sleeping first if the previous request was too recent."""
        now = time.time()
        elapsed = now - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request = time.time()
        return requests.get(url)
- 尊重数据版权
- 不爬取敏感信息
- 遵守网站服务条款
结语
Python爬虫技术是获取网络数据的强大工具,但也伴随着技术挑战和法律风险。本文从基础到高级全面介绍了爬虫开发的各个方面,并提供了大量实用的代码示例。希望读者在掌握这些技术的同时,也能遵守网络道德和法律法规,合理合法地使用爬虫技术。