Introduction
In modern computing environments, program performance and responsiveness are key factors in user experience. With the spread of multi-core processors and the growing complexity of applications, traditional single-threaded, sequential programs struggle to keep up with ever-increasing performance demands. Concurrent programming is an important paradigm that lets a program make progress on multiple tasks at once, significantly improving execution efficiency and system resource utilization.
Python, a widely used programming language, offers rich support for concurrency: from the built-in threading and multiprocessing modules to the modern asyncio framework for asynchronous programming, Python gives developers several ways to write concurrent code.
In this chapter we explore the foundations and practical techniques of concurrent programming in Python. We start with the basic concepts, then work through multithreading, multiprocessing, and asynchronous programming. Through runnable code examples and a hands-on project, you will learn how to write efficient concurrent programs in Python.
Learning Objectives
After completing this chapter, you will be able to:
- Understand the basic concepts of concurrent programming and why it matters
- Use Python's multithreading facilities and know when to apply them
- Use Python's multiprocessing facilities and understand their advantages
- Understand how the GIL (Global Interpreter Lock) affects concurrency in Python
- Use the asyncio framework for asynchronous programming
- Optimize concurrent programs with thread pools and process pools
- Build a complete concurrent application
- Apply best practices and avoid common pitfalls in concurrent programming
Core Concepts
1. Fundamentals of Concurrent Programming
Concurrent programming means structuring a program so that it can make progress on several tasks at once. A firm grasp of the basic vocabulary is essential before diving into the techniques.
Concurrency and parallelism
- Concurrency: multiple tasks make progress within the same time period by interleaving; on a single core, only one task is actually running at any given instant
- Parallelism: multiple tasks genuinely execute at the same moment, which requires multiple processor cores
Processes and threads
- Process: the operating system's basic unit of resource allocation, with its own independent memory space
- Thread: a unit of execution inside a process; threads in the same process share its memory space
Synchronization and mutual exclusion
- Synchronization: coordinating the order in which multiple threads or processes execute
- Mutual exclusion: guaranteeing that only one thread accesses a shared resource at any one time (see the sketch below)
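To make mutual exclusion concrete, here is a minimal sketch of the race condition that can appear when two threads update a shared counter with no lock. (This is an illustration on CPython; the amount lost varies from run to run, and on some interpreter versions the loss may be small or absent.)
```python
import threading

counter = 0  # shared state, deliberately unprotected

def unsafe_increment(iterations):
    """Read-modify-write without a lock: updates can be lost."""
    global counter
    for _ in range(iterations):
        counter += 1  # three steps under the hood: load, add, store

threads = [threading.Thread(target=unsafe_increment, args=(1_000_000,)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Often prints less than 2000000: interleaved load/add/store
# steps overwrite each other's updates.
print(f"Counter: {counter} (expected 2000000)")
```
The lock-protected version of this program appears in the thread synchronization section below.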
2. Multithreading
Python's threading module supports multithreaded programming. Threads are lightweight units of execution and are well suited to I/O-bound tasks.
Creating and starting threads
```python
import threading
import time

def worker(name, duration):
    """Worker function."""
    print(f"Thread {name} starting work")
    time.sleep(duration)
    print(f"Thread {name} finished")

# Create threads
thread1 = threading.Thread(target=worker, args=("Worker-1", 2))
thread2 = threading.Thread(target=worker, args=("Worker-2", 3))

# Start threads
thread1.start()
thread2.start()

# Wait for the threads to finish
thread1.join()
thread2.join()
print("All threads finished")
```
Subclassing Thread
```python
import threading
import time

class WorkerThread(threading.Thread):
    def __init__(self, name, duration):
        super().__init__(name=name)  # let Thread manage its own name attribute
        self.duration = duration

    def run(self):
        """Main method executed by the thread."""
        print(f"Thread {self.name} starting work")
        time.sleep(self.duration)
        print(f"Thread {self.name} finished")

# Create and start the threads
worker1 = WorkerThread("CustomWorker-1", 2)
worker2 = WorkerThread("CustomWorker-2", 3)
worker1.start()
worker2.start()
worker1.join()
worker2.join()
print("Custom threads finished")
```
Thread synchronization
```python
import threading
import time

# Shared resource
counter = 0
lock = threading.Lock()

def increment_counter(name, iterations):
    """Increment the shared counter."""
    global counter
    for i in range(iterations):
        # Acquire the lock; without it, the read-modify-write below would race
        with lock:
            temp = counter
            time.sleep(0.0001)  # simulate some processing time
            counter = temp + 1
            print(f"{name}: counter = {counter}")

# Create several threads
threads = []
for i in range(3):
    thread = threading.Thread(target=increment_counter, args=(f"Thread-{i+1}", 5))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()

print(f"Final counter value: {counter}")
```
Condition variables and events
```python
import threading
import time
import random

# Condition variable example
condition = threading.Condition()
items = []

def consumer():
    """Consumer thread."""
    with condition:
        while len(items) == 0:
            print("Consumer: waiting for an item...")
            condition.wait()  # wait until notified and the condition holds
        item = items.pop(0)
        print(f"Consumer: consumed item {item}")

def producer():
    """Producer thread."""
    with condition:
        item = random.randint(1, 100)
        items.append(item)
        print(f"Producer: produced item {item}")
        condition.notify()  # wake up a waiting thread

# Event example
event = threading.Event()

def waiter():
    """Thread that waits for the event."""
    print("Waiter: waiting for the event to be set...")
    event.wait()  # block until the event is set
    print("Waiter: event was set, continuing")

def setter():
    """Thread that sets the event."""
    time.sleep(2)
    print("Setter: setting the event")
    event.set()  # set the event

# Run the examples
print("=== Condition variable example ===")
consumer_thread = threading.Thread(target=consumer)
producer_thread = threading.Thread(target=producer)
consumer_thread.start()
time.sleep(1)  # make sure the consumer is waiting first
producer_thread.start()
consumer_thread.join()
producer_thread.join()

print("\n=== Event example ===")
waiter_thread = threading.Thread(target=waiter)
setter_thread = threading.Thread(target=setter)
waiter_thread.start()
setter_thread.start()
waiter_thread.join()
setter_thread.join()
```
3. Multiprocessing
Python's multiprocessing module supports multiprocess programming. Each process has its own independent memory space, which makes processes a good fit for CPU-bound tasks.
Creating and managing processes
```python
import multiprocessing
import time
import os

def worker(name, duration):
    """Worker function."""
    print(f"Process {name} (PID: {os.getpid()}) starting work")
    time.sleep(duration)
    print(f"Process {name} finished")

if __name__ == "__main__":
    # Create processes
    process1 = multiprocessing.Process(target=worker, args=("Process-1", 2))
    process2 = multiprocessing.Process(target=worker, args=("Process-2", 3))

    # Start the processes
    process1.start()
    process2.start()

    # Wait for the processes to finish
    process1.join()
    process2.join()
    print("All processes finished")
```
Inter-process communication
```python
import multiprocessing
import time

def square_worker(queue, numbers):
    """Worker process that computes squares."""
    for num in numbers:
        result = num * num
        queue.put((num, result))
        print(f"Process {multiprocessing.current_process().name}: {num}² = {result}")
        time.sleep(0.1)

def result_collector(queue, expected_count):
    """Process that collects the results."""
    results = []
    for _ in range(expected_count):
        num, square = queue.get()
        results.append((num, square))
        print(f"Collected result: {num}² = {square}")
    # Note: this return value is discarded because the function runs in a
    # separate process; a real program would put results on another queue.
    return results

if __name__ == "__main__":
    # Create a queue for inter-process communication
    queue = multiprocessing.Queue()

    # Numbers to process
    numbers = [1, 2, 3, 4, 5]

    # Create the worker process
    worker_process = multiprocessing.Process(
        target=square_worker,
        args=(queue, numbers)
    )

    # Create the result-collector process
    collector_process = multiprocessing.Process(
        target=result_collector,
        args=(queue, len(numbers))
    )

    # Start the processes
    worker_process.start()
    collector_process.start()

    # Wait for the processes to finish
    worker_process.join()
    collector_process.join()
    print("Multiprocess computation finished")
```
Shared memory
```python
import multiprocessing
import time

def increment_counter(shared_array, index, iterations):
    """Increment one slot of the shared array."""
    for _ in range(iterations):
        shared_array[index] += 1  # safe here: each process writes only its own slot
        time.sleep(0.001)

def shared_memory_demo():
    """Shared-memory demonstration."""
    # Create a shared array ('i' means C int)
    shared_array = multiprocessing.Array('i', [0, 0, 0])

    # Create several processes
    processes = []
    for i in range(3):
        process = multiprocessing.Process(
            target=increment_counter,
            args=(shared_array, i, 100)
        )
        processes.append(process)
        process.start()

    # Wait for all processes to finish
    for process in processes:
        process.join()

    print(f"Final shared array values: {shared_array[:]}")

if __name__ == "__main__":
    shared_memory_demo()
```
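The demo above sidesteps races by giving each process its own array slot. When several processes must update the same value, use the lock that the synchronized wrapper already carries. A minimal sketch with multiprocessing.Value and its get_lock() method:
```python
import multiprocessing

def add_100(shared_counter):
    """Increment a truly shared counter 100 times under its lock."""
    for _ in range(100):
        with shared_counter.get_lock():  # += on a Value is not atomic
            shared_counter.value += 1

if __name__ == "__main__":
    counter = multiprocessing.Value('i', 0)
    processes = [multiprocessing.Process(target=add_100, args=(counter,)) for _ in range(3)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print(f"Final counter value: {counter.value}")  # expected: 300
```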
4. The GIL (Global Interpreter Lock)
The GIL is a mutex in the CPython interpreter that ensures only one thread executes Python bytecode at a time. As a consequence, threads cannot speed up CPU-bound pure-Python code, although the GIL is released during blocking I/O, so threads still help with I/O-bound work. Running the comparison below on a multi-core machine typically shows the multiprocessing version finishing noticeably faster.
The impact of the GIL
```python
import threading
import multiprocessing
import time

def cpu_bound_task(n):
    """CPU-bound task."""
    result = 0
    for i in range(n):
        result += i * i
    return result

def threading_cpu_test():
    """Run the CPU-bound task on multiple threads."""
    start_time = time.time()
    threads = []
    for _ in range(4):
        thread = threading.Thread(target=cpu_bound_task, args=(1000000,))
        threads.append(thread)
        thread.start()
    # Wait for the threads to finish (return values are discarded; only timing matters here)
    for thread in threads:
        thread.join()
    end_time = time.time()
    print(f"Multithreaded CPU task took: {end_time - start_time:.2f} s")

def multiprocessing_cpu_test():
    """Run the CPU-bound task on multiple processes."""
    start_time = time.time()
    processes = []
    for _ in range(4):
        process = multiprocessing.Process(target=cpu_bound_task, args=(1000000,))
        processes.append(process)
        process.start()
    # Wait for the processes to finish
    for process in processes:
        process.join()
    end_time = time.time()
    print(f"Multiprocess CPU task took: {end_time - start_time:.2f} s")

if __name__ == "__main__":
    print("=== CPU-bound task performance comparison ===")
    threading_cpu_test()
    multiprocessing_cpu_test()
```
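The comparison above shows where threads lose; to see where they win, here is a minimal sketch of an I/O-bound workload. It uses time.sleep as a stand-in for blocking I/O, during which CPython releases the GIL, so the four simulated downloads overlap:
```python
import threading
import time

def fake_download(name):
    """Simulated I/O: sleep releases the GIL, so threads overlap."""
    time.sleep(1)
    print(f"{name} done")

start = time.time()
threads = [threading.Thread(target=fake_download, args=(f"download-{i}",)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# Close to 1 s rather than 4 s, despite the GIL
print(f"4 simulated downloads took {time.time() - start:.2f} s")
```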
5. Asynchronous Programming (asyncio)
asyncio is Python's asynchronous I/O framework, introduced in Python 3.4; the examples below use asyncio.run() and asyncio.create_task(), which require Python 3.7 or later. It is well suited to handling large numbers of I/O-bound tasks on a single thread.
Basic asynchronous programming
```python
import asyncio

async def async_worker(name, duration):
    """Asynchronous worker function."""
    print(f"Coroutine {name} starting work")
    await asyncio.sleep(duration)  # asynchronous wait
    print(f"Coroutine {name} finished")
    return f"{name} done"

async def main():
    """Main coroutine."""
    # Create several coroutine tasks
    task1 = asyncio.create_task(async_worker("Worker-1", 2))
    task2 = asyncio.create_task(async_worker("Worker-2", 3))

    # Wait for the tasks to finish
    result1 = await task1
    result2 = await task2
    print(f"Results: {result1}, {result2}")

# Run the async program (call this from synchronous code, never from inside a running event loop)
# asyncio.run(main())
```
Running coroutines concurrently
The example below uses aiohttp, a third-party asynchronous HTTP client (install it with pip install aiohttp):
```python
import asyncio
import aiohttp
import time

async def fetch_url(session, url):
    """Fetch a URL asynchronously."""
    try:
        async with session.get(url) as response:
            content = await response.text()
            return f"URL: {url}, status: {response.status}, content length: {len(content)}"
    except Exception as e:
        return f"URL: {url}, error: {e}"

async def fetch_multiple_urls():
    """Fetch several URLs concurrently."""
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/3"
    ]
    async with aiohttp.ClientSession() as session:
        # Run all requests concurrently
        start_time = time.time()
        results = await asyncio.gather(*[fetch_url(session, url) for url in urls])
        end_time = time.time()
        for result in results:
            print(result)
        print(f"Total time: {end_time - start_time:.2f} s")

# Run the async program
# asyncio.run(fetch_multiple_urls())
```
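gather() waits for everything it is given; when you need a per-task deadline, asyncio.wait_for is the usual tool. A minimal hedged sketch (slow_operation stands in for any slow awaitable):
```python
import asyncio

async def slow_operation():
    """Stands in for a slow network call."""
    await asyncio.sleep(5)
    return "done"

async def main():
    try:
        # Cancel the operation if it takes longer than 2 seconds
        result = await asyncio.wait_for(slow_operation(), timeout=2)
        print(result)
    except asyncio.TimeoutError:
        print("Operation timed out")

asyncio.run(main())
```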
Async generators and queues
```python
import asyncio
import random

async def async_producer(queue, num_items):
    """Asynchronous producer."""
    for i in range(num_items):
        item = random.randint(1, 100)
        await queue.put(item)
        print(f"Producer: produced {item}")
        await asyncio.sleep(0.1)
    # Send the shutdown signal
    await queue.put(None)

async def async_consumer(queue, name):
    """Asynchronous consumer."""
    while True:
        item = await queue.get()
        if item is None:
            # Shutdown signal received
            await queue.put(None)  # pass it on to the other consumers
            break
        print(f"Consumer {name}: consumed {item}")
        await asyncio.sleep(0.2)
        queue.task_done()

async def producer_consumer_demo():
    """Producer-consumer demonstration."""
    # Create an async queue
    queue = asyncio.Queue(maxsize=5)

    # Create the producer and consumer tasks
    producer_task = asyncio.create_task(async_producer(queue, 10))
    consumer_tasks = [
        asyncio.create_task(async_consumer(queue, f"Consumer-{i+1}"))
        for i in range(2)
    ]

    # Wait for all tasks to finish
    await producer_task
    await asyncio.gather(*consumer_tasks)

# asyncio.run(producer_consumer_demo())
```
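The queue demo covers the second half of this subsection's title; for the first half, here is a minimal async generator sketch. An async generator is an async def function that yields values, and it is consumed with async for:
```python
import asyncio

async def number_stream(count):
    """Async generator: yields a value, awaiting between items."""
    for i in range(count):
        await asyncio.sleep(0.1)  # e.g. waiting on a socket or file
        yield i

async def consume_stream():
    # `async for` pulls values as each one becomes available
    async for value in number_stream(5):
        print(f"Received {value}")

asyncio.run(consume_stream())
```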
6. Thread Pools and Process Pools
The concurrent.futures module provides high-level thread pool and process pool interfaces that simplify concurrent programming.
Using a thread pool
The example below fetches URLs with the third-party requests library (pip install requests):
```python
import concurrent.futures
import time
import requests

def fetch_url(url):
    """Fetch a URL (blocking)."""
    try:
        response = requests.get(url, timeout=5)
        return f"URL: {url}, status: {response.status_code}"
    except Exception as e:
        return f"URL: {url}, error: {e}"

def thread_pool_demo():
    """Thread pool demonstration."""
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/3",
        "https://httpbin.org/delay/1"
    ]
    start_time = time.time()

    # Use a thread pool
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Submit the tasks
        future_to_url = {executor.submit(fetch_url, url): url for url in urls}

        # Collect the results as they complete
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                print(result)
            except Exception as e:
                print(f"URL {url} raised an exception: {e}")

    end_time = time.time()
    print(f"Thread pool total time: {end_time - start_time:.2f} s")

# thread_pool_demo()
```
Using a process pool
```python
import concurrent.futures
import time
import math

def cpu_intensive_task(n):
    """CPU-bound task."""
    result = 0
    for i in range(n):
        result += math.sqrt(i)
    return result

def process_pool_demo():
    """Process pool demonstration."""
    tasks = [1000000, 1500000, 2000000, 1200000]
    start_time = time.time()

    # Use a process pool
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # Submit the tasks
        future_to_task = {executor.submit(cpu_intensive_task, task): task for task in tasks}

        # Collect the results as they complete
        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            try:
                result = future.result()
                print(f"Task {task} finished, result: {result:.2f}")
            except Exception as e:
                print(f"Task {task} raised an exception: {e}")

    end_time = time.time()
    print(f"Process pool total time: {end_time - start_time:.2f} s")

# Process pools need the __main__ guard on platforms that spawn worker processes
if __name__ == "__main__":
    process_pool_demo()
```
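Both demos drive the pool with submit() plus as_completed(). For the common pattern of mapping one function over a list of inputs, Executor.map is more compact; a minimal sketch reusing the same CPU-bound task:
```python
import concurrent.futures
import math

def cpu_intensive_task(n):
    """Same CPU-bound work as above."""
    return sum(math.sqrt(i) for i in range(n))

if __name__ == "__main__":
    tasks = [1000000, 1500000, 2000000, 1200000]
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # map() yields results in input order, unlike as_completed()
        for n, result in zip(tasks, executor.map(cpu_intensive_task, tasks)):
            print(f"Task {n}: {result:.2f}")
```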
Code Examples and Hands-On Practice
Let's put the techniques from this chapter together in a complete concurrent application.
Hands-On Project: A Concurrent Web Crawler Framework
The project below uses the third-party aiohttp package for the async crawler and requests for the thread pool variant.
```python
import asyncio
import aiohttp
import time
import json
import re
from typing import List, Optional
from dataclasses import dataclass
from urllib.parse import urljoin, urlparse

@dataclass
class CrawlResult:
    """Data class for one crawl result."""
    url: str
    status_code: int
    title: str
    links: List[str]
    content_length: int
    crawl_time: float

class AsyncWebCrawler:
    """Asynchronous web crawler."""

    def __init__(self, max_concurrent: int = 10, timeout: int = 30):
        self.max_concurrent = max_concurrent
        self.timeout = timeout
        self.session = None
        self.results: List[CrawlResult] = []

    async def __aenter__(self):
        """Async context manager entry."""
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        self.session = aiohttp.ClientSession(timeout=timeout)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()

    def extract_title(self, html: str) -> str:
        """Extract the page title from HTML."""
        match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
        return match.group(1).strip() if match else "No title"

    def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract links from HTML."""
        # Match href attributes
        href_pattern = r'href=["\']([^"\']+)["\']'
        matches = re.findall(href_pattern, html, re.IGNORECASE)
        links = []
        for href in matches:
            # Resolve relative links
            if href.startswith(('http://', 'https://')):
                links.append(href)
            elif href.startswith('//'):
                links.append(f"https:{href}")
            elif href.startswith('/'):
                parsed_base = urlparse(base_url)
                links.append(f"{parsed_base.scheme}://{parsed_base.netloc}{href}")
            elif href.startswith('#'):
                continue  # skip in-page anchors
            else:
                links.append(urljoin(base_url, href))
        return links

    async def crawl_single_url(self, url: str) -> Optional[CrawlResult]:
        """Crawl a single URL."""
        start_time = time.time()
        try:
            async with self.session.get(url) as response:
                content = await response.text()
                elapsed_time = time.time() - start_time
                title = self.extract_title(content)
                links = self.extract_links(content, url)
                result = CrawlResult(
                    url=url,
                    status_code=response.status,
                    title=title,
                    links=links[:10],  # cap the number of stored links
                    content_length=len(content),
                    crawl_time=elapsed_time
                )
                print(f"✓ Crawled: {url} (status: {response.status})")
                return result
        except asyncio.TimeoutError:
            print(f"✗ Timeout: {url}")
            return None
        except Exception as e:
            print(f"✗ Error: {url} - {str(e)}")
            return None

    async def crawl_urls(self, urls: List[str]) -> List[CrawlResult]:
        """Crawl several URLs concurrently."""
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def crawl_with_semaphore(url: str) -> Optional[CrawlResult]:
            async with semaphore:
                return await self.crawl_single_url(url)

        # Create all crawl tasks
        tasks = [crawl_with_semaphore(url) for url in urls]

        # Run them all concurrently
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Keep only the successful results
        valid_results = [result for result in results if isinstance(result, CrawlResult)]
        self.results.extend(valid_results)
        return valid_results

    def save_results(self, filename: str = "crawl_results.json"):
        """Save the crawl results."""
        results_data = []
        for result in self.results:
            results_data.append({
                "url": result.url,
                "status_code": result.status_code,
                "title": result.title,
                "links": result.links,
                "content_length": result.content_length,
                "crawl_time": result.crawl_time
            })
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, ensure_ascii=False, indent=2)
        print(f"Results saved to {filename}")

class ThreadPoolWebCrawler:
    """Thread pool web crawler."""

    def __init__(self, max_workers: int = 10, timeout: int = 30):
        self.max_workers = max_workers
        self.timeout = timeout
        self.results: List[CrawlResult] = []

    def crawl_single_url(self, url: str) -> Optional[CrawlResult]:
        """Crawl a single URL (synchronous version)."""
        import requests  # imported here so the async-only path doesn't require it
        start_time = time.time()
        try:
            response = requests.get(url, timeout=self.timeout)
            elapsed_time = time.time() - start_time
            # Extract the title (simplified version)
            title_match = re.search(r'<title[^>]*>(.*?)</title>', response.text, re.IGNORECASE | re.DOTALL)
            title = title_match.group(1).strip() if title_match else "No title"
            result = CrawlResult(
                url=url,
                status_code=response.status_code,
                title=title,
                links=[],  # simplified: link extraction omitted
                content_length=len(response.content),
                crawl_time=elapsed_time
            )
            print(f"✓ Crawled: {url} (status: {response.status_code})")
            return result
        except requests.Timeout:
            print(f"✗ Timeout: {url}")
            return None
        except Exception as e:
            print(f"✗ Error: {url} - {str(e)}")
            return None

    def crawl_urls(self, urls: List[str]) -> List[CrawlResult]:
        """Crawl several URLs with a thread pool."""
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all the tasks
            future_to_url = {executor.submit(self.crawl_single_url, url): url for url in urls}

            # Collect the results
            results = []
            for future in concurrent.futures.as_completed(future_to_url):
                result = future.result()
                if result:
                    results.append(result)

        self.results.extend(results)
        return results

async def performance_comparison():
    """Performance comparison demo."""
    test_urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/3",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2"
    ]

    print("=== Async crawler performance test ===")
    start_time = time.time()
    async with AsyncWebCrawler(max_concurrent=5) as crawler:
        results = await crawler.crawl_urls(test_urls)
    async_time = time.time() - start_time
    print(f"Async crawler time: {async_time:.2f} s")
    print(f"Successfully crawled: {len(results)} URLs")

    print("\n=== Thread pool crawler performance test ===")
    start_time = time.time()
    thread_crawler = ThreadPoolWebCrawler(max_workers=5)
    # Note: this call blocks the event loop; acceptable here because nothing else is running
    thread_results = thread_crawler.crawl_urls(test_urls)
    thread_time = time.time() - start_time
    print(f"Thread pool crawler time: {thread_time:.2f} s")
    print(f"Successfully crawled: {len(thread_results)} URLs")

    print(f"\nComparison: the async crawler was {((thread_time - async_time) / thread_time * 100):.1f}% faster than the thread pool")

def crawler_demo():
    """Crawler demo."""
    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
        "https://httpbin.org/robots.txt"
    ]

    print("=== Async crawler demo ===")

    async def run_async_crawler():
        async with AsyncWebCrawler(max_concurrent=3) as crawler:
            results = await crawler.crawl_urls(urls)
            crawler.save_results("async_results.json")
            return results

    # Run the async crawler
    results = asyncio.run(run_async_crawler())

    print("\nCrawl statistics:")
    print(f"Total URLs: {len(urls)}")
    print(f"Successfully crawled: {len(results)}")
    print(f"Success rate: {len(results)/len(urls)*100:.1f}%")
    for result in results:
        print(f"- {result.url}: {result.title} ({result.content_length} bytes, {result.crawl_time:.2f} s)")

# Run the demos
# crawler_demo()
# asyncio.run(performance_comparison())
```
Summary and Review
In this chapter we took a deep dive into Python's approaches to concurrent programming. The main topics were:
- Concurrency fundamentals: the concepts of concurrency vs. parallelism and processes vs. threads, which underpin everything that followed.
- Multithreading: using the threading module, including creating threads, synchronization primitives, and communication between threads.
- Multiprocessing: using the multiprocessing module, including inter-process communication and shared memory.
- The GIL: how CPython's Global Interpreter Lock affects concurrency, and when threads or processes are the right choice.
- Asynchronous programming: writing efficient async programs with the asyncio framework.
- Thread pools and process pools: simplifying concurrent code with the concurrent.futures module.
- Hands-on project: a complete concurrent web crawler framework that combined these techniques.
Concurrent programming is an essential skill in modern software development: it can substantially improve program performance and user experience, and it lets you exploit the power of modern multi-core processors. As you gain practical experience, you can move on to more advanced topics such as distributed computing and microservice architectures.
Exercises and Challenges
- Basic exercises
  - Write a multithreaded program that downloads files concurrently
  - Write a multiprocess program that computes the square root of every element of a large array
  - Use asyncio to implement a simple asynchronous HTTP server
- Advanced challenges
  - Implement a thread-safe producer-consumer queue system
  - Build a thread pool scheduler that supports task priorities
  - Develop an asynchronous database connection pool manager
- Capstone projects
  - Build a distributed task-processing system with task dispatch and result collection
  - Develop a real-time data-processing pipeline that handles multiple data streams concurrently
  - Implement a high-performance web crawler cluster for large-scale crawling
Further Reading
- Official documentation: the reference pages for the threading, multiprocessing, asyncio, and concurrent.futures modules
- Advanced libraries:
  - trio: a friendly async I/O library
  - curio: an async I/O library
  - celery: a distributed task queue
  - ray: a distributed computing framework
- Books:
  - Python Parallel Programming Cookbook, by Giancarlo Zaccone
  - Effective Python, by Brett Slatkin
  - Fluent Python, by Luciano Ramalho
- Online resources:
  - Real Python's concurrency tutorials
  - The official Python concurrency guides
  - The asyncio documentation and tutorials
- Related topics:
  - Distributed system design principles
  - Message queues and task scheduling
  - Containerization (Docker) for concurrent applications
  - Microservice architecture and load balancing