Introduction: A Panorama of Modern Python Concurrency
Python offers several concurrency models, each with its own strengths and appropriate use cases. Understanding how they differ, and when to reach for each one, is essential for writing efficient Python programs.
1. Multithreading: The First Choice for I/O-Bound Tasks
1.1 Basic Thread Operations
import threading
import time
from concurrent.futures import ThreadPoolExecutor

def worker(name, delay):
    """A simple thread task."""
    print(f"Thread {name} starting")
    time.sleep(delay)  # Simulate an I/O operation
    print(f"Thread {name} finished after {delay}s")
    return f"{name}_result"

# Create and start a single thread
thread = threading.Thread(target=worker, args=("Thread-1", 2))
thread.start()
thread.join()

# Using ThreadPoolExecutor (recommended)
def thread_pool_example():
    with ThreadPoolExecutor(max_workers=3) as executor:
        # Submit the tasks
        future1 = executor.submit(worker, "A", 1)
        future2 = executor.submit(worker, "B", 2)
        future3 = executor.submit(worker, "C", 3)
        # Collect the results
        results = [future1.result(), future2.result(), future3.result()]
        print(f"All results: {results}")

thread_pool_example()
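One thing the submit()/result() pairing hides: calling result() in submission order blocks on the slowest early task even when later ones finish first. When completion order matters, concurrent.futures.as_completed is the usual companion. A minimal sketch (slow_task is illustrative, not part of the example above):

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_task(name: str, delay: float) -> str:
    time.sleep(delay)  # Simulate I/O
    return f"{name}_result"

def as_completed_example():
    with ThreadPoolExecutor(max_workers=3) as executor:
        # Map each Future back to the name it was submitted with
        futures = {executor.submit(slow_task, name, d): name
                   for name, d in [("A", 3), ("B", 1), ("C", 2)]}
        # as_completed yields futures in completion order, not submission
        # order, so "B" (1s) is typically reported first
        for future in as_completed(futures):
            print(f"{futures[future]} -> {future.result()}")

as_completed_example()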
1.2 Thread Synchronization and Locks
import threading
from typing import List

class ThreadSafeCounter:
    """A thread-safe counter."""

    def __init__(self):
        self._value = 0
        self._lock = threading.Lock()

    def increment(self):
        """Increment the count safely."""
        with self._lock:  # Acquires and releases the lock automatically
            self._value += 1
            return self._value

    @property
    def value(self):
        with self._lock:
            return self._value

def test_concurrent_counter():
    counter = ThreadSafeCounter()

    def increment_multiple_times():
        for _ in range(1000):
            counter.increment()

    # Start several threads that increment the counter concurrently
    threads: List[threading.Thread] = []
    for i in range(10):
        thread = threading.Thread(target=increment_multiple_times, name=f"Worker-{i}")
        threads.append(thread)
        thread.start()

    # Wait for all threads to finish
    for thread in threads:
        thread.join()

    print(f"Final count: {counter.value} (expected: 10000)")
    return counter.value == 10000

# Run the test
assert test_concurrent_counter(), "The counter is not thread-safe!"
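To see what the Lock is actually buying, here is a hedged counter-example (UnsafeCounter is illustrative, not from the original): the GIL serializes bytecodes, not whole statements, so the read-modify-write inside += can interleave across threads and drop increments.

import threading

class UnsafeCounter:
    """The same counter without a lock -- for demonstration only."""
    def __init__(self):
        self._value = 0

    def increment(self):
        # Read, add, write back: a thread switch between the read and the
        # write lets two threads store the same value, losing an update.
        self._value += 1

def demo_race_condition():
    counter = UnsafeCounter()

    def hammer():
        for _ in range(100_000):
            counter.increment()

    threads = [threading.Thread(target=hammer) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # On CPython this often prints less than 1,000,000, though how easily
    # the race triggers varies with the interpreter version
    print(f"Unsafe final count: {counter._value} (expected 1000000)")

demo_race_condition()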
1.3 The Producer-Consumer Pattern
import threading
import queue
import random
import time

class ProducerConsumer:
    """Producer-consumer pattern example."""

    def __init__(self, max_size=5):
        self.queue = queue.Queue(maxsize=max_size)
        self.producers = []
        self.consumers = []

    def producer(self, producer_id: int):
        """Producer function."""
        for i in range(5):
            item = f"item_{producer_id}_{i}"
            self.queue.put(item)  # Blocks if the queue is full
            print(f"Producer {producer_id} produced: {item}")
            time.sleep(random.uniform(0.1, 0.5))
        print(f"Producer {producer_id} done")

    def consumer(self, consumer_id: int):
        """Consumer function."""
        while True:
            try:
                # Use a timeout so the consumer never blocks forever
                item = self.queue.get(timeout=3)
                print(f"Consumer {consumer_id} consumed: {item}")
                self.queue.task_done()
                time.sleep(random.uniform(0.2, 0.8))
            except queue.Empty:
                print(f"Consumer {consumer_id} timed out, exiting")
                break

    def run(self, num_producers=2, num_consumers=3):
        """Run the producer-consumer model."""
        # Create the producer threads
        for i in range(num_producers):
            thread = threading.Thread(
                target=self.producer,
                args=(i,),
                name=f"Producer-{i}"
            )
            self.producers.append(thread)
        # Create the consumer threads
        for i in range(num_consumers):
            thread = threading.Thread(
                target=self.consumer,
                args=(i,),
                name=f"Consumer-{i}"
            )
            self.consumers.append(thread)
        # Start all threads
        for thread in self.producers + self.consumers:
            thread.start()
        # Wait for the producers to finish
        for producer in self.producers:
            producer.join()
        # Wait for the queue to drain
        self.queue.join()
        # The consumers exit on their own (via the get() timeout)
        for consumer in self.consumers:
            consumer.join()
        print("All tasks done")

# Run the example
pc = ProducerConsumer(max_size=3)
pc.run(num_producers=2, num_consumers=3)
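The timeout-based exit above is simple, but idle consumers always wait out the full timeout before shutting down. A common alternative is one sentinel value per consumer, pushed once the producers are done; a minimal hedged sketch (sentinel_consumer and sentinel_demo are illustrative names):

import queue
import threading

def sentinel_consumer(q: queue.Queue, consumer_id: int):
    """Consumer that exits as soon as it receives the sentinel."""
    while True:
        item = q.get()
        if item is None:  # Sentinel: shut down immediately, no timeout needed
            q.task_done()
            break
        print(f"Consumer {consumer_id} consumed: {item}")
        q.task_done()

def sentinel_demo():
    q = queue.Queue()
    consumers = [threading.Thread(target=sentinel_consumer, args=(q, i))
                 for i in range(3)]
    for c in consumers:
        c.start()
    for i in range(10):     # "Producer": enqueue some work
        q.put(f"item_{i}")
    for _ in consumers:     # One sentinel per consumer
        q.put(None)
    for c in consumers:
        c.join()

sentinel_demo()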
2. Multiprocessing: The Solution for CPU-Bound Tasks
2.1 Basic Multiprocessing
import multiprocessing
import math
import time
import os
from concurrent.futures import ProcessPoolExecutor

def cpu_intensive_task(n):
    """CPU-bound task: sum of squares."""
    print(f"Process {os.getpid()} handling task {n}")
    result = sum(i * i for i in range(n))
    return f"task {n}: sum of squares = {result}"

def is_prime(n):
    """Primality test (CPU-bound). Defined at module level so it can be
    pickled when the 'spawn' start method is used (Windows/macOS)."""
    if n < 2:
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

def basic_multiprocessing():
    """Basic multiprocessing example."""
    processes = []
    # Create the processes
    for i in range(3):
        p = multiprocessing.Process(
            target=cpu_intensive_task,
            args=(1000000,),
            name=f"Process-{i}"
        )
        processes.append(p)
        p.start()
    # Wait for them to finish
    for p in processes:
        p.join()
    print("All processes done")

def process_pool_example():
    """Using a process pool (recommended)."""
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        # Submit several tasks
        futures = [executor.submit(cpu_intensive_task, 1000000 * (i + 1))
                   for i in range(4)]
        # Collect the results
        results = [future.result() for future in futures]
        print("All task results:")
        for result in results:
            print(f"  {result}")

def performance_comparison():
    """Compare single-process vs multi-process performance."""
    numbers = [112272535095293] * 8  # 8 identical primality checks

    # Single-process version
    start = time.time()
    results_single = [is_prime(n) for n in numbers]
    single_time = time.time() - start

    # Multi-process version
    start = time.time()
    with ProcessPoolExecutor() as executor:
        results_multi = list(executor.map(is_prime, numbers))
    multi_time = time.time() - start

    print(f"Single process: {single_time:.2f}s")
    print(f"Multiple processes: {multi_time:.2f}s")
    print(f"Speedup: {single_time / multi_time:.2f}x")

# The guard is required with multiprocessing: under the 'spawn' start method
# each child re-imports this module, and unguarded top-level code would
# recursively launch new processes.
if __name__ == "__main__":
    performance_comparison()
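A portability note on the guard above: the default start method is fork on Linux but spawn on Windows (and on macOS since Python 3.8), and spawn requires worker functions to be importable from the module and picklable. A minimal sketch of pinning the start method explicitly, with an illustrative show_pid worker:

import multiprocessing
import os

def show_pid(tag):
    print(f"{tag}: pid={os.getpid()}")

if __name__ == "__main__":
    # Force one start method for reproducible behavior across platforms;
    # set_start_method may only be called once per program.
    multiprocessing.set_start_method("spawn", force=True)
    p = multiprocessing.Process(target=show_pid, args=("child",))
    p.start()
    p.join()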
2.2 Inter-Process Communication
import multiprocessing
import time
from multiprocessing import Queue, Pipe, Manager

# The worker functions live at module level so they can be pickled under the
# 'spawn' start method; nested functions cannot be passed to Process there.

def producer(queue: Queue):
    """Producer process."""
    for i in range(5):
        item = f"message_{i}"
        queue.put(item)
        print(f"Producer sent: {item}")
        time.sleep(0.5)
    queue.put("END")  # Termination signal

def consumer(queue: Queue, consumer_id: int):
    """Consumer process."""
    while True:
        item = queue.get()
        if item == "END":
            print(f"Consumer {consumer_id} received the end signal")
            queue.put("END")  # Pass it on to the other consumers
            break
        print(f"Consumer {consumer_id} received: {item}")
        time.sleep(0.3)

def pipe_worker(conn, worker_id):
    """Pipe worker."""
    conn.send(f"message from worker {worker_id}")
    msg = conn.recv()
    print(f"Worker {worker_id} received: {msg}")
    conn.close()

def dict_worker(shared_dict, worker_id):
    """Worker that writes to a shared dict."""
    shared_dict[f"worker_{worker_id}"] = f"data_{worker_id}"
    print(f"Worker {worker_id} finished writing")

def process_communication_demo():
    """The main ways processes can communicate."""
    # 1. Communicating via a Queue
    print("=== Queue example ===")
    q = Queue()
    # Create the processes
    p1 = multiprocessing.Process(target=producer, args=(q,))
    c1 = multiprocessing.Process(target=consumer, args=(q, 1))
    c2 = multiprocessing.Process(target=consumer, args=(q, 2))
    # Start them
    p1.start()
    c1.start()
    c2.start()
    # Wait for completion
    p1.join()
    c1.join()
    c2.join()

    # 2. Bidirectional communication via a Pipe
    print("\n=== Pipe example ===")
    parent_conn, child_conn = Pipe()
    worker_process = multiprocessing.Process(
        target=pipe_worker,
        args=(child_conn, 1)
    )
    worker_process.start()
    # The main process receives and replies
    print(f"Main process received: {parent_conn.recv()}")
    parent_conn.send("reply from the main process")
    worker_process.join()

    # 3. Shared state via a Manager
    print("\n=== Manager shared-state example ===")
    with Manager() as manager:
        shared_dict = manager.dict()
        processes = []
        for i in range(3):
            p = multiprocessing.Process(
                target=dict_worker,
                args=(shared_dict, i)
            )
            processes.append(p)
            p.start()
        for p in processes:
            p.join()
        print(f"Final shared dict: {dict(shared_dict)}")

if __name__ == "__main__":
    process_communication_demo()
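Beyond Queue, Pipe, and Manager, multiprocessing also provides the lighter shared-memory primitives Value and Array for plain ctypes data; they avoid the Manager's proxy overhead but only hold fixed C types. A minimal hedged sketch (the "i" typecode means a C int; add_to_shared is illustrative):

import multiprocessing
from multiprocessing import Value, Array

def add_to_shared(counter, arr, worker_id):
    # get_lock() returns the lock that guards the shared value
    with counter.get_lock():
        counter.value += 1
    arr[worker_id] = worker_id * 10

if __name__ == "__main__":
    counter = Value("i", 0)   # shared C int, starts at 0
    arr = Array("i", 4)       # shared array of 4 C ints
    procs = [multiprocessing.Process(target=add_to_shared, args=(counter, arr, i))
             for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(counter.value, list(arr))  # 4 [0, 10, 20, 30]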
3. Async I/O: The Core of High-Concurrency Network Applications
3.1 Basic async/await
import asyncio
import aiohttp
import time

async def basic_coroutine():
    """A basic coroutine."""
    print("Coroutine starting")
    await asyncio.sleep(1)  # Simulate an I/O operation
    print("Coroutine done")
    return "result"

async def multiple_coroutines():
    """Run several coroutines concurrently."""
    # Create the tasks
    tasks = [
        asyncio.create_task(basic_coroutine(), name=f"Task-{i}")
        for i in range(3)
    ]
    # Wait for all of them to finish
    results = await asyncio.gather(*tasks)
    print(f"All task results: {results}")

async def async_http_client():
    """Async HTTP client example."""
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
    ]
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i, url in enumerate(urls):
            task = asyncio.create_task(
                fetch_url(session, url, i),
                name=f"Fetch-{i}"
            )
            tasks.append(task)
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"Request {i} failed: {result}")
            else:
                print(f"Request {i} succeeded with status: {result}")

async def fetch_url(session, url, request_id):
    """Fetch a single URL."""
    print(f"Starting request {request_id}: {url}")
    async with session.get(url) as response:
        await response.read()  # Read the response body
        print(f"Finished request {request_id}")
        return response.status

async def rate_limiter_example():
    """Async tasks under a rate limit."""

    class RateLimiter:
        """A simple token-bucket rate limiter. Approximate under heavy
        concurrency: waiters that sleep simultaneously may briefly burst,
        but the average rate still holds."""

        def __init__(self, rate=2):
            self.rate = rate  # tokens per second
            self.tokens = rate
            self.updated_at = asyncio.get_running_loop().time()

        async def acquire(self):
            """Take one token, sleeping if the bucket is empty."""
            now = asyncio.get_running_loop().time()
            elapsed = now - self.updated_at
            self.tokens = min(self.rate, self.tokens + elapsed * self.rate)
            self.updated_at = now
            if self.tokens < 1:
                delay = (1 - self.tokens) / self.rate
                await asyncio.sleep(delay)
                # The sleep refilled exactly the missing fraction of a token,
                # which we consume here
                self.tokens = 0
                self.updated_at = asyncio.get_running_loop().time()
            else:
                self.tokens -= 1

    limiter = RateLimiter(rate=2)  # 2 requests per second

    async def limited_task(task_id):
        """A rate-limited task."""
        await limiter.acquire()
        print(f"Task {task_id} started at {time.strftime('%H:%M:%S')}")
        await asyncio.sleep(0.5)  # Simulate work
        return f"task_{task_id}_result"

    # Run 10 tasks concurrently, throttled to 2 per second
    tasks = [limited_task(i) for i in range(10)]
    results = await asyncio.gather(*tasks)
    print(f"All tasks done: {results}")

# Run the examples
async def run_all_async_examples():
    print("=== Basic coroutine ===")
    await basic_coroutine()
    print("\n=== Concurrent coroutines ===")
    await multiple_coroutines()
    print("\n=== Async HTTP client ===")
    await async_http_client()
    print("\n=== Rate limiting ===")
    await rate_limiter_example()

# Run inside an event loop (the HTTP example needs network access)
asyncio.run(run_all_async_examples())
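Worth knowing alongside gather: Python 3.11 added asyncio.TaskGroup, a structured alternative where tasks are tied to a scope and a failing task cancels its siblings automatically. A minimal sketch, assuming Python 3.11+ (work and taskgroup_demo are illustrative):

import asyncio

async def work(task_id: int) -> str:
    await asyncio.sleep(0.1 * task_id)  # Simulated I/O
    return f"task_{task_id}_done"

async def taskgroup_demo():
    # The async with block does not exit until every task in the group
    # has finished; if one raises, the others are cancelled.
    async with asyncio.TaskGroup() as tg:
        tasks = [tg.create_task(work(i)) for i in range(3)]
    print([t.result() for t in tasks])

asyncio.run(taskgroup_demo())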
3.2 Advanced Async Patterns
import asyncio
import random
from asyncio import Queue, Event, Lock

class AsyncProducerConsumer:
    """Async producer-consumer pattern."""

    def __init__(self, maxsize: int = 10):
        self.queue = Queue(maxsize=maxsize)
        self.stop_event = Event()

    async def producer(self, producer_id: int):
        """Async producer."""
        try:
            for i in range(5):
                item = f"item_{producer_id}_{i}"
                await self.queue.put(item)  # Asynchronously enqueue
                print(f"Producer {producer_id} produced: {item}")
                await asyncio.sleep(random.uniform(0.1, 0.5))
        finally:
            print(f"Producer {producer_id} done")

    async def consumer(self, consumer_id: int):
        """Async consumer."""
        while not self.stop_event.is_set() or not self.queue.empty():
            try:
                # Wait up to 1 second; on timeout, re-check the stop flag
                item = await asyncio.wait_for(
                    self.queue.get(),
                    timeout=1.0
                )
                print(f"Consumer {consumer_id} consumed: {item}")
                self.queue.task_done()
                await asyncio.sleep(random.uniform(0.2, 0.8))
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break
        print(f"Consumer {consumer_id} exiting")

    async def run(self, num_producers=2, num_consumers=3):
        """Run the async producer-consumer model."""
        # Create the producer tasks
        producer_tasks = [
            asyncio.create_task(self.producer(i), name=f"Producer-{i}")
            for i in range(num_producers)
        ]
        # Create the consumer tasks
        consumer_tasks = [
            asyncio.create_task(self.consumer(i), name=f"Consumer-{i}")
            for i in range(num_consumers)
        ]
        # Wait for all producers to finish
        await asyncio.gather(*producer_tasks)
        # Tell the consumers they may stop
        self.stop_event.set()
        # Wait for the queue to drain
        await self.queue.join()
        # Cancel any consumers still waiting
        for task in consumer_tasks:
            task.cancel()
        # Let the consumers exit gracefully
        await asyncio.gather(*consumer_tasks, return_exceptions=True)
        print("Async producer-consumer pattern done")

async def async_semaphore_example():
    """Limit concurrency with a semaphore."""

    class AsyncConnectionPool:
        """Async connection pool."""

        def __init__(self, pool_size: int):
            self.semaphore = asyncio.Semaphore(pool_size)
            self.active_connections = 0
            self.lock = Lock()

        async def get_connection(self, client_id: int):
            """Acquire a connection (with a concurrency cap)."""
            async with self.semaphore:  # Cap the number of concurrent holders
                async with self.lock:  # Protect the shared counter
                    self.active_connections += 1
                    current = self.active_connections
                print(f"Client {client_id} got a connection, active: {current}")
                try:
                    # Simulate using the connection
                    await asyncio.sleep(random.uniform(0.5, 1.5))
                    return f"connection_{client_id}_result"
                finally:
                    async with self.lock:
                        self.active_connections -= 1
                    print(f"Client {client_id} released its connection")

    pool = AsyncConnectionPool(pool_size=3)  # At most 3 concurrent connections

    async def client_task(client_id: int):
        """Client task."""
        result = await pool.get_connection(client_id)
        return result

    # 10 client tasks, but at most 3 run inside the pool at once
    tasks = [client_task(i) for i in range(10)]
    results = await asyncio.gather(*tasks)
    print(f"All tasks done, result count: {len(results)}")

async def async_broadcast_pattern():
    """Async broadcast pattern."""

    class AsyncBroadcaster:
        """Async event broadcaster."""

        def __init__(self):
            self.listeners = []
            self.lock = Lock()

        async def subscribe(self):
            """Subscribe to events."""
            queue = Queue()
            async with self.lock:
                self.listeners.append(queue)
            return queue

        async def unsubscribe(self, queue):
            """Unsubscribe."""
            async with self.lock:
                self.listeners.remove(queue)

        async def broadcast(self, message):
            """Broadcast a message to every subscriber."""
            async with self.lock:
                listeners = self.listeners.copy()
            for queue in listeners:
                try:
                    queue.put_nowait(message)
                except asyncio.QueueFull:
                    # Skip subscribers whose queue is full (defensive:
                    # the queues here are unbounded)
                    pass

        async def run_broadcaster(self):
            """Run the broadcaster."""
            for i in range(5):
                message = f"broadcast_message_{i}"
                print(f"Broadcasting: {message}")
                await self.broadcast(message)
                await asyncio.sleep(1)
            # Send the termination signal
            await self.broadcast("END")

    broadcaster = AsyncBroadcaster()

    async def listener(listener_id: int):
        """Listener."""
        queue = await broadcaster.subscribe()
        while True:
            message = await queue.get()
            if message == "END":
                print(f"Listener {listener_id} received the end signal")
                break
            print(f"Listener {listener_id} received: {message}")

    # Create the listener tasks
    listener_tasks = [
        asyncio.create_task(listener(i), name=f"Listener-{i}")
        for i in range(3)
    ]
    # Yield once so every listener can subscribe before broadcasting starts
    await asyncio.sleep(0)
    # Run the broadcaster
    broadcaster_task = asyncio.create_task(broadcaster.run_broadcaster())
    # Wait for everything to finish
    await asyncio.gather(broadcaster_task, *listener_tasks)
    print("Broadcast pattern example done")

# Run the advanced async examples
async def run_advanced_async():
    print("=== Async producer-consumer ===")
    apc = AsyncProducerConsumer(maxsize=5)
    await apc.run(num_producers=2, num_consumers=3)
    print("\n=== Async semaphore ===")
    await async_semaphore_example()
    print("\n=== Async broadcast ===")
    await async_broadcast_pattern()

asyncio.run(run_advanced_async())
4. Choosing the Right Concurrency Model
4.1 Selection Guide and Performance Comparison
import time
import concurrent.futures
import asyncio

# Defined at module level so ProcessPoolExecutor can pickle them under 'spawn'

def io_bound_task(task_id: int):
    """Simulated I/O-bound task."""
    time.sleep(0.1)  # Simulate waiting on I/O
    return f"io_task_{task_id}"

def cpu_bound_task(n: int):
    """Simulated CPU-bound task."""
    return sum(i * i for i in range(n))

async def async_io_task(task_id: int):
    """Async I/O task."""
    await asyncio.sleep(0.1)
    return f"async_io_task_{task_id}"

def performance_benchmark():
    """Compare the performance of the different concurrency models."""

    def run_sync(tasks: int):
        """Synchronous baseline."""
        start = time.time()
        results = [io_bound_task(i) for i in range(tasks)]
        return time.time() - start, results

    def run_threads(tasks: int, workers: int = 10):
        """Multithreaded run."""
        start = time.time()
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            results = list(executor.map(io_bound_task, range(tasks)))
        return time.time() - start, results

    def run_processes(tasks: int, workers: int = 4):
        """Multiprocess run."""
        start = time.time()
        with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
            results = list(executor.map(cpu_bound_task, [10000] * tasks))
        return time.time() - start, results

    async def run_async(tasks: int):
        """Async run."""
        start = time.time()
        tasks_list = [async_io_task(i) for i in range(tasks)]
        results = await asyncio.gather(*tasks_list)
        return time.time() - start, results

    # The benchmark itself
    test_cases = 100
    print("Performance comparison (100 tasks):")
    print("-" * 50)

    # Synchronous baseline
    sync_time, _ = run_sync(test_cases)
    print(f"Synchronous: {sync_time:.3f}s")

    # Multithreading
    thread_time, _ = run_threads(test_cases)
    print(f"Multithreaded: {thread_time:.3f}s (speedup: {sync_time/thread_time:.1f}x)")

    # Multiprocessing (CPU-bound), compared against its own synchronous baseline
    process_time, _ = run_processes(test_cases)
    cpu_sync_start = time.time()
    for _ in range(test_cases):
        cpu_bound_task(10000)
    cpu_sync_time = time.time() - cpu_sync_start
    print(f"Multiprocess: {process_time:.3f}s (speedup: {cpu_sync_time/process_time:.1f}x)")

    # Async
    async_time, _ = asyncio.run(run_async(test_cases))
    print(f"Async: {async_time:.3f}s (speedup: {sync_time/async_time:.1f}x)")

if __name__ == "__main__":
    performance_benchmark()

    print("\n" + "=" * 60)
    print("Concurrency model selection guide:")
    print("=" * 60)
    print("""
1. Multithreading (threading) suits:
   - I/O-bound tasks (network requests, file operations)
   - GUI applications (keeping the interface responsive)
   - Cases that need shared in-memory state

2. Multiprocessing (multiprocessing) suits:
   - CPU-bound tasks (numeric computation, image processing)
   - Workloads that must use multiple CPU cores
   - Independent tasks that rarely need to communicate

3. Async I/O (asyncio) suits:
   - High-concurrency network applications (web servers, crawlers)
   - Heavy I/O with high-throughput requirements
   - Fine-grained control over concurrent control flow

4. Hybrid approaches (see the sketch below):
   - Thread pools or async code inside a process pool
   - Running CPU-bound work from the event loop via run_in_executor
   - Choosing the execution strategy dynamically per task type
""")
4.2 A Practical Project Example: A Web Crawler
import asyncio
import aiohttp
from urllib.parse import urljoin, urlparse
from typing import Set, List, Optional
import time
from concurrent.futures import ThreadPoolExecutor
import hashlib
import json
import re

class AdvancedWebCrawler:
    """An advanced web crawler combining several concurrency techniques."""

    def __init__(self, max_concurrent: int = 10, max_depth: int = 3):
        self.max_concurrent = max_concurrent
        self.max_depth = max_depth
        self.visited_urls: Set[str] = set()
        self.url_lock = asyncio.Lock()
        self.session: Optional[aiohttp.ClientSession] = None
        self.results = []
        self.semaphore = asyncio.Semaphore(max_concurrent)
        # Thread pool for CPU-bound work
        self.thread_pool = ThreadPoolExecutor(max_workers=4)

    def compute_content_hash(self, content: str) -> str:
        """Compute a content hash (CPU-bound, runs in the thread pool)."""
        return hashlib.md5(content.encode()).hexdigest()

    async def process_content(self, url: str, html: str):
        """Process a page (mixing async code and the thread pool)."""
        # Extract the links asynchronously
        links = await self.extract_links(html, url)
        # Hash the content in the thread pool (CPU-bound)
        content_hash = await asyncio.get_running_loop().run_in_executor(
            self.thread_pool,
            self.compute_content_hash,
            html
        )
        # Record the result
        result = {
            'url': url,
            'hash': content_hash,
            'links': links,
            'timestamp': time.time()
        }
        async with self.url_lock:
            self.results.append(result)
        return links

    async def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract the links on a page (simplified)."""
        # Simplified here; a real project would use BeautifulSoup or similar
        links = re.findall(r'href="([^"]+)"', html)
        # Convert to absolute URLs
        absolute_links = []
        for link in links:
            absolute_link = urljoin(base_url, link)
            if self.is_valid_url(absolute_link):
                absolute_links.append(absolute_link)
        return absolute_links[:10]  # Cap the links taken from each page

    def is_valid_url(self, url: str) -> bool:
        """Check whether a URL is valid."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and parsed.scheme in ['http', 'https']

    async def fetch_page(self, url: str, depth: int):
        """Fetch a single page."""
        if depth > self.max_depth:
            return
        # Mark the URL as visited before fetching, so concurrent tasks
        # cannot crawl the same page twice
        async with self.url_lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)
        links: List[str] = []
        async with self.semaphore:  # Cap the number of concurrent fetches
            try:
                timeout = aiohttp.ClientTimeout(total=10)
                async with self.session.get(url, timeout=timeout) as response:
                    if response.status == 200:
                        html = await response.text()
                        print(f"[depth {depth}] fetched: {url}")
                        # Process the content
                        links = await self.process_content(url, html)
                    else:
                        print(f"[depth {depth}] failed with {response.status}: {url}")
            except Exception as e:
                print(f"[depth {depth}] error on {url}: {e}")
        # Recurse outside the semaphore block so a parent never holds a slot
        # while waiting for its children (which could starve the pool)
        if links and depth < self.max_depth:
            await asyncio.gather(*(self.fetch_page(link, depth + 1) for link in links))

    async def crawl(self, start_urls: List[str]):
        """Start crawling."""
        self.session = aiohttp.ClientSession()
        try:
            # Create the initial tasks
            tasks = []
            for url in start_urls:
                if self.is_valid_url(url):
                    tasks.append(self.fetch_page(url, 1))
            # Run all of them
            await asyncio.gather(*tasks)
            print("\nCrawl finished!")
            print(f"Pages processed: {len(self.results)}")
            # Save the results
            with open('crawl_results.json', 'w', encoding='utf-8') as f:
                json.dump(self.results, f, ensure_ascii=False, indent=2)
        finally:
            await self.session.close()
            self.thread_pool.shutdown()

    def run(self, start_urls: List[str]):
        """Run the crawler."""
        asyncio.run(self.crawl(start_urls))

# Usage example
if __name__ == "__main__":
    print("Starting the advanced web crawler...")
    start_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/links/10/0",
    ]
    crawler = AdvancedWebCrawler(max_concurrent=5, max_depth=2)
    # Time the run
    start_time = time.time()
    crawler.run(start_urls)
    end_time = time.time()
    print(f"\nTotal run time: {end_time - start_time:.2f}s")
    print("Results saved to crawl_results.json")
Summary
Python offers a rich toolbox for concurrent programming, and each tool has its place:
- Multithreading: suited to I/O-bound tasks; constrained by the GIL, but effective at overlapping I/O waits
- Multiprocessing: suited to CPU-bound tasks; makes full use of multiple CPU cores
- Async I/O: suited to high-concurrency network applications; typically the highest I/O concurrency for the lowest per-task overhead
Real projects often mix these techniques:
- Use multiprocessing for CPU-bound computation
- Use threads inside each process for I/O
- Use async I/O for high-concurrency network requests
- Use a thread-pool executor to run blocking calls from async code (see the sketch below)
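As a closing illustration of that last point, a minimal hedged sketch using asyncio.to_thread (Python 3.9+), which hands a blocking call to the default thread-pool executor so the event loop keeps running (blocking_io is illustrative):

import asyncio
import time

def blocking_io(task_id: int) -> str:
    """A blocking call (e.g. a legacy client library or file I/O)."""
    time.sleep(1)
    return f"blocking_task_{task_id}_done"

async def main():
    # The three blocking calls overlap in worker threads instead of
    # serializing the event loop: total time is about 1s, not 3s
    results = await asyncio.gather(
        *(asyncio.to_thread(blocking_io, i) for i in range(3))
    )
    print(results)

asyncio.run(main())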