Python Concurrency: Multithreading, Multiprocessing, and Async IO in Depth

Introduction: A Panorama of Modern Python Concurrency

Python offers several concurrency models, each with its own strengths and appropriate use cases. Understanding how they differ, and when to reach for each, is essential for writing efficient Python programs.

1. Multithreading: The First Choice for I/O-Bound Tasks

1.1 Basic Thread Operations

import threading
import time
from concurrent.futures import ThreadPoolExecutor

def worker(name, delay):
    """A simple thread task."""
    print(f"Thread {name} starting work")
    time.sleep(delay)  # simulate an I/O operation
    print(f"Thread {name} done after {delay}s")
    return f"{name}_result"

# Create and start a thread
thread = threading.Thread(target=worker, args=("Thread-1", 2))
thread.start()
thread.join()

# Using ThreadPoolExecutor (recommended)
def thread_pool_example():
    with ThreadPoolExecutor(max_workers=3) as executor:
        # Submit the tasks
        future1 = executor.submit(worker, "A", 1)
        future2 = executor.submit(worker, "B", 2)
        future3 = executor.submit(worker, "C", 3)
        
        # Collect the results
        results = [future1.result(), future2.result(), future3.result()]
        print(f"All results: {results}")

thread_pool_example()

1.2 Thread Synchronization and Locking

import threading
from typing import List

class ThreadSafeCounter:
    """A thread-safe counter."""
    def __init__(self):
        self._value = 0
        self._lock = threading.Lock()
    
    def increment(self):
        """Increment the count safely."""
        with self._lock:  # acquires and releases the lock automatically
            self._value += 1
            return self._value
    
    @property
    def value(self):
        with self._lock:
            return self._value

def test_concurrent_counter():
    counter = ThreadSafeCounter()
    
    def increment_multiple_times():
        for _ in range(1000):
            counter.increment()
    
    # Spawn several threads that increment the counter concurrently
    threads: List[threading.Thread] = []
    for i in range(10):
        thread = threading.Thread(target=increment_multiple_times, name=f"Worker-{i}")
        threads.append(thread)
        thread.start()
    
    # Wait for all threads to finish
    for thread in threads:
        thread.join()
    
    print(f"Final count: {counter.value} (expected: 10000)")
    return counter.value == 10000

# Run the test
assert test_concurrent_counter(), "The counter is not thread-safe!"
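To see why the lock matters, here is a minimal counterexample sketch (the UnsafeCounter class and the time.sleep(0) yield are illustrative additions, not part of the original article): `self._value += 1` is a read-modify-write, so without a lock two threads can read the same value and one increment gets overwritten.

import threading
import time

class UnsafeCounter:
    """A counter without a lock; its read-modify-write can interleave."""
    def __init__(self):
        self.value = 0

    def increment(self):
        tmp = self.value      # read
        time.sleep(0)         # yield the GIL to widen the race window (demo only)
        self.value = tmp + 1  # write back; clobbers any increment made in between

def hammer(counter, n=1000):
    for _ in range(n):
        counter.increment()

unsafe = UnsafeCounter()
threads = [threading.Thread(target=hammer, args=(unsafe,)) for _ in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Almost certainly prints far less than 10000
print(f"Unsafe final count: {unsafe.value} (expected 10000)")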

1.3 The Producer-Consumer Pattern

import threading
import queue
import random
import time

class ProducerConsumer:
    """Producer-consumer pattern example."""
    
    def __init__(self, max_size=5):
        self.queue = queue.Queue(maxsize=max_size)
        self.producers = []
        self.consumers = []
        
    def producer(self, producer_id: int):
        """Producer function."""
        for i in range(5):
            item = f"item_{producer_id}_{i}"
            self.queue.put(item)  # blocks if the queue is full
            print(f"Producer {producer_id} produced: {item}")
            time.sleep(random.uniform(0.1, 0.5))
        print(f"Producer {producer_id} done")
    
    def consumer(self, consumer_id: int):
        """Consumer function."""
        while True:
            try:
                # Use a timeout to avoid blocking forever
                item = self.queue.get(timeout=3)
                print(f"Consumer {consumer_id} consumed: {item}")
                self.queue.task_done()
                time.sleep(random.uniform(0.2, 0.8))
            except queue.Empty:
                print(f"Consumer {consumer_id} timed out, exiting")
                break
    
    def run(self, num_producers=2, num_consumers=3):
        """Run the producer-consumer model."""
        # Create the producer threads
        for i in range(num_producers):
            thread = threading.Thread(
                target=self.producer,
                args=(i,),
                name=f"Producer-{i}"
            )
            self.producers.append(thread)
        
        # Create the consumer threads
        for i in range(num_consumers):
            thread = threading.Thread(
                target=self.consumer,
                args=(i,),
                name=f"Consumer-{i}"
            )
            self.consumers.append(thread)
        
        # Start all threads
        for thread in self.producers + self.consumers:
            thread.start()
        
        # Wait for the producers to finish
        for producer in self.producers:
            producer.join()
        
        # Wait for the queue to drain
        self.queue.join()
        
        # The consumers exit on their own (thanks to the timeout)
        for consumer in self.consumers:
            consumer.join()
        
        print("All tasks complete")

# Run the example
pc = ProducerConsumer(max_size=3)
pc.run(num_producers=2, num_consumers=3)

2. Multiprocessing: The Solution for CPU-Bound Tasks

2.1 Basic Multiprocessing

import multiprocessing
import time
import os
from concurrent.futures import ProcessPoolExecutor

def cpu_intensive_task(n):
    """CPU-bound task: sum of squares."""
    print(f"Process {os.getpid()} handling task {n}")
    result = sum(i*i for i in range(n))
    return f"task {n}: sum of squares = {result}"

def basic_multiprocessing():
    """Basic multiprocessing example."""
    processes = []
    
    # Create the processes
    for i in range(3):
        p = multiprocessing.Process(
            target=cpu_intensive_task,
            args=(1000000,),
            name=f"Process-{i}"
        )
        processes.append(p)
        p.start()
    
    # Wait for them to finish
    for p in processes:
        p.join()
    
    print("All processes complete")

def process_pool_example():
    """Using a process pool (recommended)."""
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        # Submit several tasks
        futures = [executor.submit(cpu_intensive_task, 1000000 * (i+1)) 
                  for i in range(4)]
        
        # Collect the results
        results = [future.result() for future in futures]
        print("All task results:")
        for result in results:
            print(f"  {result}")

# Performance comparison
import math

def is_prime(n):
    """Primality check (CPU-bound). Module-level so ProcessPoolExecutor can pickle it."""
    if n < 2:
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

def performance_comparison():
    """Compare single-process vs. multi-process performance."""
    numbers = [112272535095293] * 8  # 8 identical primality checks
    
    # Single-process version
    start = time.time()
    results_single = [is_prime(n) for n in numbers]
    single_time = time.time() - start
    
    # Multi-process version
    start = time.time()
    with ProcessPoolExecutor() as executor:
        results_multi = list(executor.map(is_prime, numbers))
    multi_time = time.time() - start
    
    print(f"Single process: {single_time:.2f}s")
    print(f"Multi-process: {multi_time:.2f}s")
    print(f"Speedup: {single_time/multi_time:.2f}x")

# Guard the entry point: required on platforms that spawn workers (Windows/macOS)
if __name__ == "__main__":
    performance_comparison()

2.2 Inter-Process Communication

import multiprocessing
import time
from multiprocessing import Queue, Pipe, Manager

def process_communication_demo():
    """The various ways processes can communicate.
    
    Note: the worker functions below are nested, which works under the 'fork'
    start method (the Linux default). On Windows/macOS (which use 'spawn'),
    Process targets must be defined at module level.
    """
    
    # 1. Communicating through a Queue
    def producer(queue: Queue):
        """Producer process."""
        for i in range(5):
            item = f"message_{i}"
            queue.put(item)
            print(f"Producer sent: {item}")
            time.sleep(0.5)
        queue.put("END")  # sentinel value
    
    def consumer(queue: Queue, consumer_id: int):
        """Consumer process."""
        while True:
            item = queue.get()
            if item == "END":
                print(f"Consumer {consumer_id} received the end signal")
                queue.put("END")  # pass it on to the other consumers
                break
            print(f"Consumer {consumer_id} received: {item}")
            time.sleep(0.3)
    
    print("=== Queue example ===")
    q = Queue()
    
    # Create the processes
    p1 = multiprocessing.Process(target=producer, args=(q,))
    c1 = multiprocessing.Process(target=consumer, args=(q, 1))
    c2 = multiprocessing.Process(target=consumer, args=(q, 2))
    
    # Start them
    p1.start()
    c1.start()
    c2.start()
    
    # Wait for completion
    p1.join()
    c1.join()
    c2.join()
    
    # 2. Bidirectional communication over a Pipe
    print("\n=== Pipe example ===")
    
    def pipe_worker(conn, worker_id):
        """Pipe worker."""
        conn.send(f"message from worker {worker_id}")
        msg = conn.recv()
        print(f"Worker {worker_id} received: {msg}")
        conn.close()
    
    parent_conn, child_conn = Pipe()
    worker_process = multiprocessing.Process(
        target=pipe_worker, 
        args=(child_conn, 1)
    )
    worker_process.start()
    
    # The main process receives and replies
    print(f"Main process received: {parent_conn.recv()}")
    parent_conn.send("reply from the main process")
    
    worker_process.join()
    
    # 3. Sharing state through a Manager
    print("\n=== Manager shared-state example ===")
    
    def dict_worker(shared_dict, worker_id):
        """Worker that writes to a shared dict."""
        shared_dict[f"worker_{worker_id}"] = f"data_{worker_id}"
        print(f"Worker {worker_id} finished writing")
    
    with Manager() as manager:
        shared_dict = manager.dict()
        
        processes = []
        for i in range(3):
            p = multiprocessing.Process(
                target=dict_worker,
                args=(shared_dict, i)
            )
            processes.append(p)
            p.start()
        
        for p in processes:
            p.join()
        
        print(f"Final shared dict: {dict(shared_dict)}")

if __name__ == "__main__":
    process_communication_demo()

3. Async IO: The Core of High-Concurrency Network Applications

3.1 Basic async/await

import asyncio
import aiohttp
import time
from typing import List

async def basic_coroutine():
    """A basic coroutine."""
    print("Coroutine starting")
    await asyncio.sleep(1)  # simulates an I/O operation
    print("Coroutine done")
    return "result"

async def multiple_coroutines():
    """Run several coroutines concurrently."""
    # Create the tasks
    tasks = [
        asyncio.create_task(basic_coroutine(), name=f"Task-{i}")
        for i in range(3)
    ]
    
    # Wait for them all to finish
    results = await asyncio.gather(*tasks)
    print(f"All task results: {results}")

async def async_http_client():
    """Async HTTP client example."""
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
    ]
    
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i, url in enumerate(urls):
            task = asyncio.create_task(
                fetch_url(session, url, i),
                name=f"Fetch-{i}"
            )
            tasks.append(task)
        
        results = await asyncio.gather(*tasks, return_exceptions=True)
        
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"Request {i} failed: {result}")
            else:
                print(f"Request {i} succeeded, status: {result}")

async def fetch_url(session, url, request_id):
    """Fetch a single URL."""
    print(f"Starting request {request_id}: {url}")
    async with session.get(url) as response:
        await response.read()  # read the response body
        print(f"Finished request {request_id}")
        return response.status

async def rate_limiter_example():
    """Async tasks behind a rate limiter."""
    
    class RateLimiter:
        """A simple token-bucket rate limiter."""
        def __init__(self, rate=2):
            self.rate = rate  # tokens per second
            self.tokens = rate
            self.updated_at = asyncio.get_running_loop().time()
        
        async def acquire(self):
            """Take one token, sleeping until one is available."""
            now = asyncio.get_running_loop().time()
            elapsed = now - self.updated_at
            self.tokens = min(self.rate, self.tokens + elapsed * self.rate)
            self.updated_at = now
            
            if self.tokens < 1:
                delay = (1 - self.tokens) / self.rate
                await asyncio.sleep(delay)
                # Credit the tokens that accrued while we slept
                self.tokens = min(self.rate, self.tokens + delay * self.rate)
                self.updated_at = asyncio.get_running_loop().time()
            self.tokens -= 1
    
    limiter = RateLimiter(rate=2)  # 2 requests per second
    
    async def limited_task(task_id):
        """A rate-limited task."""
        await limiter.acquire()
        print(f"Task {task_id} started at {time.strftime('%H:%M:%S')}")
        await asyncio.sleep(0.5)  # simulate work
        return f"task_{task_id}_result"
    
    # Run 10 tasks concurrently, throttled to 2 per second
    tasks = [limited_task(i) for i in range(10)]
    results = await asyncio.gather(*tasks)
    print(f"All tasks done: {results}")

# Run the examples
async def run_all_async_examples():
    print("=== Basic coroutine ===")
    await basic_coroutine()
    
    print("\n=== Concurrent coroutines ===")
    await multiple_coroutines()
    
    print("\n=== Async HTTP client ===")
    await async_http_client()
    
    print("\n=== Rate limiting ===")
    await rate_limiter_example()

# Run inside an event loop
asyncio.run(run_all_async_examples())

3.2 Advanced Async Patterns

import asyncio
import random
from asyncio import Queue, Event, Lock
from typing import Optional

class AsyncProducerConsumer:
    """Async producer-consumer pattern."""
    
    def __init__(self, maxsize: int = 10):
        self.queue = Queue(maxsize=maxsize)
        self.stop_event = Event()
        
    async def producer(self, producer_id: int):
        """Async producer."""
        try:
            for i in range(5):
                item = f"item_{producer_id}_{i}"
                await self.queue.put(item)  # put asynchronously
                print(f"Producer {producer_id} produced: {item}")
                await asyncio.sleep(random.uniform(0.1, 0.5))
        finally:
            print(f"Producer {producer_id} done")
    
    async def consumer(self, consumer_id: int):
        """Async consumer."""
        while not self.stop_event.is_set() or not self.queue.empty():
            try:
                # Wait up to a second, then loop so we can notice the stop event
                item = await asyncio.wait_for(
                    self.queue.get(), 
                    timeout=1.0
                )
                print(f"Consumer {consumer_id} consumed: {item}")
                self.queue.task_done()
                await asyncio.sleep(random.uniform(0.2, 0.8))
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break
        print(f"Consumer {consumer_id} exiting")
    
    async def run(self, num_producers=2, num_consumers=3):
        """Run the async producer-consumer model."""
        # Producer tasks
        producer_tasks = [
            asyncio.create_task(self.producer(i), name=f"Producer-{i}")
            for i in range(num_producers)
        ]
        
        # Consumer tasks
        consumer_tasks = [
            asyncio.create_task(self.consumer(i), name=f"Consumer-{i}")
            for i in range(num_consumers)
        ]
        
        # Wait for the producers to finish
        await asyncio.gather(*producer_tasks)
        
        # Tell the consumers they can stop
        self.stop_event.set()
        
        # Wait for the queue to drain
        await self.queue.join()
        
        # Cancel the consumer tasks
        for task in consumer_tasks:
            task.cancel()
        
        # Let the consumers exit gracefully
        await asyncio.gather(*consumer_tasks, return_exceptions=True)
        
        print("Async producer-consumer run complete")

async def async_semaphore_example():
    """Bounding concurrency with a semaphore."""
    
    class AsyncConnectionPool:
        """Async connection pool."""
        def __init__(self, pool_size: int):
            self.semaphore = asyncio.Semaphore(pool_size)
            self.active_connections = 0
            self.lock = Lock()
        
        async def get_connection(self, client_id: int):
            """Acquire a connection (with a concurrency cap)."""
            # Hold the semaphore for the whole connection lifetime;
            # releasing it before the work would defeat the cap
            async with self.semaphore:  # cap the concurrency
                async with self.lock:  # protect the shared counter
                    self.active_connections += 1
                    current = self.active_connections
                
                print(f"Client {client_id} got a connection, active: {current}")
                
                try:
                    # Simulate using the connection
                    await asyncio.sleep(random.uniform(0.5, 1.5))
                    return f"connection_{client_id}_result"
                finally:
                    async with self.lock:
                        self.active_connections -= 1
                    print(f"Client {client_id} released the connection")
    
    pool = AsyncConnectionPool(pool_size=3)  # at most 3 concurrent connections
    
    async def client_task(client_id: int):
        """Client task."""
        result = await pool.get_connection(client_id)
        return result
    
    # 10 client tasks, but only 3 may hold a connection at once
    tasks = [client_task(i) for i in range(10)]
    results = await asyncio.gather(*tasks)
    
    print(f"All tasks done, result count: {len(results)}")

async def async_broadcast_pattern():
    """Async broadcast pattern."""
    
    class AsyncBroadcaster:
        """Async event broadcaster."""
        def __init__(self):
            self.listeners = []
            self.lock = Lock()
        
        async def subscribe(self):
            """Subscribe to events."""
            # Bounded, so put_nowait below can actually raise QueueFull
            queue = Queue(maxsize=10)
            async with self.lock:
                self.listeners.append(queue)
            return queue
        
        async def unsubscribe(self, queue):
            """Unsubscribe."""
            async with self.lock:
                self.listeners.remove(queue)
        
        async def broadcast(self, message):
            """Broadcast a message to every subscriber."""
            async with self.lock:
                listeners = self.listeners.copy()
            
            for queue in listeners:
                try:
                    queue.put_nowait(message)
                except asyncio.QueueFull:
                    # Skip subscribers whose queues are full
                    pass
        
        async def run_broadcaster(self):
            """Run the broadcaster."""
            for i in range(5):
                message = f"broadcast_message_{i}"
                print(f"Broadcasting: {message}")
                await self.broadcast(message)
                await asyncio.sleep(1)
            
            # Send the end signal
            await self.broadcast("END")
    
    broadcaster = AsyncBroadcaster()
    
    async def listener(listener_id: int):
        """Listener."""
        queue = await broadcaster.subscribe()
        
        while True:
            message = await queue.get()
            if message == "END":
                print(f"Listener {listener_id} received the end signal")
                break
            print(f"Listener {listener_id} received: {message}")
    
    # Listener tasks
    listener_tasks = [
        asyncio.create_task(listener(i), name=f"Listener-{i}")
        for i in range(3)
    ]
    
    # Run the broadcaster
    broadcaster_task = asyncio.create_task(broadcaster.run_broadcaster())
    
    # Wait for everything to finish
    await asyncio.gather(broadcaster_task, *listener_tasks)
    
    print("Broadcast pattern example complete")

# Run the advanced async examples
async def run_advanced_async():
    print("=== Async producer-consumer ===")
    apc = AsyncProducerConsumer(maxsize=5)
    await apc.run(num_producers=2, num_consumers=3)
    
    print("\n=== Async semaphore ===")
    await async_semaphore_example()
    
    print("\n=== Async broadcast ===")
    await async_broadcast_pattern()

asyncio.run(run_advanced_async())

4. How to Choose the Right Concurrency Model

4.1 Selection Guide and Performance Comparison

import time
import concurrent.futures
import asyncio
import multiprocessing
from typing import Callable, List, Any

def io_bound_task(task_id: int):
    """Simulated I/O-bound task."""
    time.sleep(0.1)  # simulate waiting on I/O
    return f"io_task_{task_id}"

def cpu_bound_task(n: int):
    """Simulated CPU-bound task."""
    return sum(i * i for i in range(n))

async def async_io_task(task_id: int):
    """Async I/O-bound task."""
    await asyncio.sleep(0.1)
    return f"async_io_task_{task_id}"

def performance_benchmark():
    """Compare the performance of the concurrency models.
    
    The task functions above are module-level (not nested) because
    ProcessPoolExecutor must pickle them to ship to worker processes.
    """
    
    def run_sync(tasks: int):
        """Synchronous baseline."""
        start = time.time()
        results = [io_bound_task(i) for i in range(tasks)]
        return time.time() - start, results
    
    def run_threads(tasks: int, workers: int = 10):
        """Multithreaded run."""
        start = time.time()
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            results = list(executor.map(io_bound_task, range(tasks)))
        return time.time() - start, results
    
    def run_processes(tasks: int, workers: int = 4):
        """Multiprocess run."""
        start = time.time()
        with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
            results = list(executor.map(cpu_bound_task, [10000] * tasks))
        return time.time() - start, results
    
    async def run_async(tasks: int):
        """Async run."""
        start = time.time()
        tasks_list = [async_io_task(i) for i in range(tasks)]
        results = await asyncio.gather(*tasks_list)
        return time.time() - start, results
    
    # Benchmark
    test_cases = 100
    
    print("Performance comparison (100 tasks):")
    print("-" * 50)
    
    # Synchronous baseline
    sync_time, _ = run_sync(test_cases)
    print(f"Synchronous: {sync_time:.3f}s")
    
    # Multithreading
    thread_time, _ = run_threads(test_cases)
    print(f"Multithreaded: {thread_time:.3f}s (speedup: {sync_time/thread_time:.1f}x)")
    
    # Multiprocessing (measured against a CPU-bound baseline)
    process_time, _ = run_processes(test_cases)
    cpu_sync_time = time.time()
    for _ in range(test_cases):
        cpu_bound_task(10000)
    cpu_sync_time = time.time() - cpu_sync_time
    print(f"Multiprocess: {process_time:.3f}s (speedup: {cpu_sync_time/process_time:.1f}x)")
    
    # Async
    async_time, _ = asyncio.run(run_async(test_cases))
    print(f"Async: {async_time:.3f}s (speedup: {sync_time/async_time:.1f}x)")

if __name__ == "__main__":
    performance_benchmark()

print("\n" + "="*60)
print("并发模型选择指南:")
print("="*60)
print("""
1. 多线程 (threading) 适合:
   - I/O密集型任务(网络请求、文件操作)
   - GUI应用程序(保持界面响应)
   - 需要共享内存状态的情况
   
2. 多进程 (multiprocessing) 适合:
   - CPU密集型任务(数学计算、图像处理)
   - 需要利用多核CPU的情况
   - 任务之间相互独立,不需要频繁通信
   
3. 异步IO (asyncio) 适合:
   - 高并发网络应用(Web服务器、爬虫)
   - 大量I/O操作且需要高吞吐量
   - 需要精细控制并发流程的情况
   
4. 混合模式:
   - 进程池内部使用线程池或异步
   - 异步事件循环中运行CPU密集型任务(使用run_in_executor)
   - 根据任务类型动态选择执行策略
""")

4.2 A Real-World Example: A Web Crawler

import asyncio
import aiohttp
from urllib.parse import urljoin, urlparse
from typing import Set, List, Optional
import time
from concurrent.futures import ThreadPoolExecutor
import hashlib
import json

class AdvancedWebCrawler:
    """An advanced web crawler combining several concurrency techniques."""
    
    def __init__(self, max_concurrent: int = 10, max_depth: int = 3):
        self.max_concurrent = max_concurrent
        self.max_depth = max_depth
        self.visited_urls: Set[str] = set()
        self.url_lock = asyncio.Lock()
        self.session: Optional[aiohttp.ClientSession] = None
        self.results = []
        self.semaphore = asyncio.Semaphore(max_concurrent)
        
        # Thread pool for the hashing work (hashlib releases the GIL on
        # large inputs, so threads genuinely run in parallel here)
        self.thread_pool = ThreadPoolExecutor(max_workers=4)
    
    def compute_content_hash(self, content: str) -> str:
        """Hash the page content (CPU-heavy; runs in the thread pool)."""
        return hashlib.md5(content.encode()).hexdigest()
    
    async def process_content(self, url: str, html: str):
        """Process a page (mixing asyncio with the thread pool)."""
        # Extract the links
        links = await self.extract_links(html, url)
        
        # Compute the hash in the thread pool
        content_hash = await asyncio.get_running_loop().run_in_executor(
            self.thread_pool,
            self.compute_content_hash,
            html
        )
        
        # Record the result
        result = {
            'url': url,
            'hash': content_hash,
            'links': links,
            'timestamp': time.time()
        }
        
        async with self.url_lock:
            self.results.append(result)
        
        return links
    
    async def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract the links from a page (simplified)."""
        # Simplified; a real project would use BeautifulSoup or similar
        import re
        links = re.findall(r'href="([^"]+)"', html)
        
        # Convert to absolute URLs
        absolute_links = []
        for link in links:
            absolute_link = urljoin(base_url, link)
            if self.is_valid_url(absolute_link):
                absolute_links.append(absolute_link)
        
        return absolute_links[:10]  # cap the links per page
    
    def is_valid_url(self, url: str) -> bool:
        """Check whether a URL is usable."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and parsed.scheme in ['http', 'https']
    
    async def fetch_page(self, url: str, depth: int):
        """Fetch a single page."""
        if depth > self.max_depth:
            return
        
        # Check and mark the URL atomically so no page is fetched twice
        async with self.url_lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)
        
        html = None
        # Hold a semaphore slot only for the HTTP request itself; recursing
        # while holding it could deadlock once every slot is held by a
        # parent waiting on its children
        async with self.semaphore:
            try:
                timeout = aiohttp.ClientTimeout(total=10)
                async with self.session.get(url, timeout=timeout) as response:
                    if response.status == 200:
                        html = await response.text()
                        print(f"[depth {depth}] fetched: {url}")
                    else:
                        print(f"[depth {depth}] failed with {response.status}: {url}")
            except Exception as e:
                print(f"[depth {depth}] error on {url}: {e}")
        
        if html is None:
            return
        
        # Process the content
        links = await self.process_content(url, html)
        
        # Recursively crawl the links
        if depth < self.max_depth:
            tasks = [
                self.fetch_page(link, depth + 1)
                for link in links
                if link not in self.visited_urls
            ]
            if tasks:
                await asyncio.gather(*tasks)
    
    async def crawl(self, start_urls: List[str]):
        """Start crawling."""
        self.session = aiohttp.ClientSession()
        
        try:
            # Seed the initial tasks
            tasks = []
            for url in start_urls:
                if self.is_valid_url(url):
                    tasks.append(self.fetch_page(url, 1))
            
            # Run them all
            await asyncio.gather(*tasks)
            
            print("\nCrawl complete!")
            print(f"Pages processed: {len(self.results)}")
            
            # Persist the results
            with open('crawl_results.json', 'w', encoding='utf-8') as f:
                json.dump(self.results, f, ensure_ascii=False, indent=2)
            
        finally:
            await self.session.close()
            self.thread_pool.shutdown()
    
    def run(self, start_urls: List[str]):
        """Run the crawler."""
        asyncio.run(self.crawl(start_urls))

# Usage example
if __name__ == "__main__":
    print("Starting the advanced web crawler...")
    
    start_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/links/10/0",
    ]
    
    crawler = AdvancedWebCrawler(max_concurrent=5, max_depth=2)
    
    # Time the run
    start_time = time.time()
    crawler.run(start_urls)
    end_time = time.time()
    
    print(f"\nTotal runtime: {end_time - start_time:.2f}s")
    print("Results saved to crawl_results.json")

Conclusion

Python offers a rich set of concurrency tools, each with its own sweet spot:

  1. Multithreading: suits I/O-bound tasks; constrained by the GIL, but effective at overlapping I/O waits
  2. Multiprocessing: suits CPU-bound tasks and makes full use of multiple cores
  3. Async IO: suits high-concurrency network applications and delivers the best I/O throughput

Real projects often mix these techniques:

  • Use multiprocessing for CPU-bound computation
  • Use threads within each process for I/O
  • Use async IO for high-concurrency network requests
  • Use executors to run blocking operations from async code (see the sketch below)
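As a closing illustration of that last point, here is a minimal sketch (the blocking_read function is a hypothetical stand-in) using asyncio.to_thread, which hands a blocking call to the default thread-pool executor so it does not stall the event loop (available since Python 3.9).

import asyncio
import time

def blocking_read(path: str) -> str:
    """A stand-in for any blocking call (file I/O, a legacy SDK, etc.)."""
    time.sleep(1)  # simulate a blocking operation
    return f"contents of {path}"

async def main():
    # to_thread ships the blocking call off to a worker thread;
    # the event loop stays free to run other coroutines meanwhile
    data, _ = await asyncio.gather(
        asyncio.to_thread(blocking_read, "config.txt"),
        asyncio.sleep(0.1),  # another coroutine making progress concurrently
    )
    print(data)

asyncio.run(main())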