Python 并行与并发:案例与实现

阅读时长:约 5 分钟

一、核心概念辨析

| 维度 | 并发 (Concurrency) | 并行 (Parallelism) |
| --- | --- | --- |
| 本质 | 多任务交替执行 | 多任务同时执行 |
| 硬件 | 单核即可 | 必须多核 |
| Python 实现 | threading / asyncio | multiprocessing / concurrent.futures.ProcessPoolExecutor |
| 适用场景 | I/O 密集型(网络、磁盘) | CPU 密集型(计算、加解密) |
| 关键限制 | GIL 限制线程的 CPU 并行 | 进程间通信开销大 |

GIL(全局解释器锁):CPython 解释器同一时刻只允许一个线程执行 Python 字节码,因此多线程对 CPU 密集型任务无加速,但对 I/O 密集型任务有效(I/O 时会释放 GIL)。


二、三种并发模型对比

┌─────────────────┬──────────────┬──────────────┬─────────────────┐
│      模型        │    调度方     │   切换成本    │      场景        │
├─────────────────┼──────────────┼──────────────┼─────────────────┤
│ 多进程           │   操作系统    │     高       │  CPU 密集        │
│ 多线程           │   操作系统    │     中       │  I/O 密集(阻塞)│
│ 协程 (asyncio)  │   事件循环    │     低       │  I/O 密集(非阻塞)│
└─────────────────┴──────────────┴──────────────┴─────────────────┘

三、案例一:I/O 密集型 —— 批量抓取 URL

3.1 串行版本(基线)

import time
import requests

# Five identical endpoints; each one responds after roughly 1 second.
URLS = [
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
]

def fetch(url: str) -> int:
    """Perform a blocking GET on *url* and return the HTTP status code."""
    response = requests.get(url, timeout=10)
    return response.status_code

def main():
    """Fetch every URL one after another and report the total elapsed time."""
    began = time.perf_counter()
    results = []
    for url in URLS:
        results.append(fetch(url))
    print(f"串行耗时 {time.perf_counter() - began:.2f}s, 结果 {results}")

if __name__ == "__main__":
    main()
# Output: serial run takes ~5.20s

3.2 多线程版本(ThreadPoolExecutor)

import time
import requests
from concurrent.futures import ThreadPoolExecutor

URLS = ["https://httpbin.org/delay/1"] * 5

def fetch(url: str) -> int:
    """Perform a blocking GET on *url* and return the HTTP status code."""
    response = requests.get(url, timeout=10)
    return response.status_code

def main():
    """Dispatch all fetches onto a 5-thread pool and time the whole batch."""
    began = time.perf_counter()
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = [status for status in executor.map(fetch, URLS)]
    print(f"多线程耗时 {time.perf_counter() - began:.2f}s, 结果 {results}")

if __name__ == "__main__":
    main()
# Output: threaded run takes ~1.10s

原理:`requests.get` 在等待网络响应时会释放 GIL,其他线程可以执行,从而实现 I/O 并发。

3.3 协程版本(asyncio + aiohttp)

import asyncio
import time
import aiohttp

URLS = ["https://httpbin.org/delay/1"] * 5

async def fetch(session: aiohttp.ClientSession, url: str) -> int:
    """Issue a non-blocking GET on *session* and return the HTTP status code."""
    async with session.get(url) as resp:
        return resp.status

async def main():
    """Fetch all URLs concurrently on a single event loop and time the batch."""
    start = time.perf_counter()
    # Fix: match the 10 s timeout used by the serial/threaded versions —
    # the original session had no timeout, so a hung server would stall forever.
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [fetch(session, u) for u in URLS]
        results = await asyncio.gather(*tasks)
    print(f"协程耗时 {time.perf_counter() - start:.2f}s, 结果 {results}")

if __name__ == "__main__":
    asyncio.run(main())
# Output: coroutine run takes ~1.05s; tens of thousands of concurrent connections fit in one thread

关键点

  • asyncio.gather 并发调度多个协程
  • 单线程事件循环,无线程切换成本
  • 每个连接占用内存远低于线程(~KB vs ~MB)

四、案例二:CPU 密集型 —— 大数质因数分解

4.1 串行版本

import time
import math

def factorize(n: int) -> list[int]:
    """Return the prime factorization of *n* in ascending order (trial division)."""
    primes: list[int] = []
    candidate = 2
    while candidate * candidate <= n:
        if n % candidate:
            candidate += 1
        else:
            primes.append(candidate)
            n //= candidate
    if n > 1:
        primes.append(n)  # whatever remains is itself prime
    return primes

# Large primes — worst case for trial division.
NUMBERS = [112272535095293] * 8

def main():
    """Factorize every number sequentially and report the elapsed time."""
    began = time.perf_counter()
    results = list(map(factorize, NUMBERS))
    print(f"串行耗时 {time.perf_counter() - began:.2f}s")

if __name__ == "__main__":
    main()
# Output: serial run takes ~12.0s (on an 8-core machine)

4.2 多线程版本(无效,仅作对比)

from concurrent.futures import ThreadPoolExecutor

def main():
    """Same workload as 4.1, but on an 8-thread pool.

    Relies on `factorize`, `NUMBERS` and `time` from the 4.1 snippet above.
    The GIL serializes the CPU-bound work, so there is no speedup.
    """
    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=8) as pool:
        results = list(pool.map(factorize, NUMBERS))
    print(f"多线程耗时 {time.perf_counter() - start:.2f}s")

# Fix: the original snippet defined main() but never invoked it;
# add the same entry-point guard every other snippet in this article uses.
if __name__ == "__main__":
    main()
# Output: threaded run takes ~12.0s (no speedup because of the GIL)

4.3 多进程版本(真正的并行)

import time
from concurrent.futures import ProcessPoolExecutor

def factorize(n: int) -> list[int]:
    """Return the prime factorization of *n* in ascending order (trial division)."""
    primes: list[int] = []
    candidate = 2
    while candidate * candidate <= n:
        if n % candidate:
            candidate += 1
        else:
            primes.append(candidate)
            n //= candidate
    if n > 1:
        primes.append(n)  # leftover cofactor is prime
    return primes

NUMBERS = [112272535095293] * 8

def main():
    """Fan the factorizations out across 8 worker processes and time them."""
    began = time.perf_counter()
    with ProcessPoolExecutor(max_workers=8) as executor:
        results = [*executor.map(factorize, NUMBERS)]
    print(f"多进程耗时 {time.perf_counter() - began:.2f}s")

if __name__ == "__main__":
    main()
# Output: multiprocess run takes ~1.6s (near-linear speedup on 8 cores)

注意事项

  • Windows 下必须有 if __name__ == "__main__": 守卫,否则子进程会无限递归 fork
  • 函数与参数必须可 pickle 序列化
  • 进程启动有固定开销(~50-200ms),任务太小反而变慢

五、案例三:混合负载 —— 生产者消费者

5.1 线程 + Queue

import threading
import queue
import time
import random

# Bounded channel shared by the producer and both consumers.
q: queue.Queue[int] = queue.Queue(maxsize=10)
SENTINEL = None  # end-of-stream marker

def producer(n: int):
    """Push *n* random integers, then the sentinel to signal shutdown."""
    for _ in range(n):
        value = random.randint(1, 100)
        q.put(value)
        print(f"生产 {value}")
        time.sleep(0.1)
    q.put(SENTINEL)

def consumer(name: str):
    """Drain the queue until the sentinel appears; re-post it for peers."""
    while True:
        value = q.get()
        if value is SENTINEL:
            q.put(SENTINEL)  # let the other consumer see it too
            break
        print(f"  [{name}] 消费 {value}")
        time.sleep(0.2)
        q.task_done()

if __name__ == "__main__":
    workers = [
        threading.Thread(target=producer, args=(10,)),
        threading.Thread(target=consumer, args=("C1",)),
        threading.Thread(target=consumer, args=("C2",)),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

5.2 asyncio 版本

import asyncio
import random

async def producer(q: asyncio.Queue, n: int):
    """Enqueue *n* random integers, then None as the end-of-stream marker."""
    for _ in range(n):
        value = random.randint(1, 100)
        await q.put(value)
        print(f"生产 {value}")
        await asyncio.sleep(0.1)
    await q.put(None)

async def consumer(q: asyncio.Queue, name: str):
    """Consume until None arrives; re-queue it so sibling consumers stop too."""
    while True:
        value = await q.get()
        if value is None:
            await q.put(None)
            break
        print(f"  [{name}] 消费 {value}")
        await asyncio.sleep(0.2)

async def main():
    """Run one producer and two consumers over the same bounded queue."""
    channel: asyncio.Queue = asyncio.Queue(maxsize=10)
    await asyncio.gather(
        producer(channel, 10),
        consumer(channel, "C1"),
        consumer(channel, "C2"),
    )

if __name__ == "__main__":
    asyncio.run(main())

六、案例四:异步 Web 服务(FastAPI)

# pip install fastapi uvicorn httpx
import asyncio
import httpx
from fastapi import FastAPI

app = FastAPI()

@app.get("/aggregate")
async def aggregate():
    """Aggregate three downstream services concurrently."""
    async with httpx.AsyncClient(timeout=5) as client:
        # Kick off all three requests before awaiting any of them,
        # so the calls overlap instead of running back to back.
        pending = (
            client.get("https://httpbin.org/delay/1"),
            client.get("https://httpbin.org/delay/1"),
            client.get("https://httpbin.org/delay/1"),
        )
        user, order, stock = await asyncio.gather(*pending)
    return {
        "user": user.status_code,
        "order": order.status_code,
        "stock": stock.status_code,
    }

# uvicorn main:app --workers 4
# Multi-process workers (parallelism) + asyncio inside each worker (concurrency) = two layers of speedup

七、同步原语对比

| 场景 | threading | asyncio |
| --- | --- | --- |
| 互斥 | Lock | asyncio.Lock |
| 信号量 | Semaphore | asyncio.Semaphore |
| 事件 | Event | asyncio.Event |
| 队列 | queue.Queue | asyncio.Queue |
| 等待 | Thread.join() | await task / gather |

asyncio.Semaphore 限流示例

import asyncio
import aiohttp

# Cap in-flight requests at 10 concurrent connections.
sem = asyncio.Semaphore(10)

async def fetch_limited(session, url):
    """GET *url*, but only while holding one of the 10 semaphore slots."""
    async with sem:
        async with session.get(url) as resp:
            return await resp.text()

async def main():
    """Fire 100 requests whose concurrency is throttled by the semaphore."""
    urls = [f"https://httpbin.org/get?id={i}" for i in range(100)]
    async with aiohttp.ClientSession() as session:
        coros = [fetch_limited(session, u) for u in urls]
        results = await asyncio.gather(*coros)
    print(f"完成 {len(results)} 个请求")

八、决策树:怎么选?

任务是 CPU 密集?
├── 是 → multiprocessing / ProcessPoolExecutor
│        (或考虑 numpy/numba/Cython 释放 GIL)
└── 否(I/O 密集)
    ├── 已有同步库(如 requests、psycopg2)
    │   └── ThreadPoolExecutor(改造成本低)
    └── 有异步库(aiohttp、asyncpg、httpx)
        └── asyncio(单机最高吞吐)

九、常见陷阱

  1. 多进程下全局变量不共享:每个进程有独立内存,需用 `Manager` 或 `Queue`
  2. 协程中调用阻塞函数会阻塞整个事件循环:用 `loop.run_in_executor` 或 `asyncio.to_thread`
    result = await asyncio.to_thread(blocking_io_func, arg)

  3. 线程池中异常会被吞掉:必须调用 `future.result()` 或使用 `as_completed` 才会抛出
  4. ProcessPoolExecutor 任务必须可 pickle:lambda、嵌套函数会报错
  5. asyncio.gather 中一个失败会取消其他:用 `return_exceptions=True` 收集所有结果

十、Python 3.13+ 自由线程(No-GIL)展望

Python 3.13 引入实验性 --disable-gil 构建(PEP 703),未来多线程也能真正并行 CPU 密集任务。但短期内生态兼容性仍是问题,多进程仍是 CPU 并行的稳妥选择。