摘要:Python慢是共识,但大多数时候不是Python的问题,是代码写法的问题。本文从性能分析工具入手,覆盖数据结构选择、算法优化、内存优化、C扩展加速等实战技巧,帮你找到瓶颈并精准优化。
第一步:找到瓶颈(别猜,用工具)
cProfile(标准库)
import cProfile
import pstats

# Option 1: from the command line
#   python -m cProfile -s cumulative script.py

# Option 2: programmatically
def main():
    """Toy workload: sum of the 100 largest squares of 0..999999."""
    squares = [n * n for n in range(1000000)]
    ordered = sorted(squares, reverse=True)
    return sum(ordered[:100])

prof = cProfile.Profile()
prof.enable()
main()
prof.disable()

stats = pstats.Stats(prof)
stats.sort_stats('cumulative')
stats.print_stats(20)  # show the 20 most expensive functions
line_profiler(逐行分析)
pip install line_profiler
# Under kernprof, `profile` is injected as a builtin. Without this fallback,
# running the script as plain Python raises NameError on the decorator line.
try:
    profile
except NameError:
    def profile(func):
        # No-op replacement so the script also runs outside kernprof.
        return func

@profile
def process_data(data):
    """Keep items > 100, double them, and return them sorted ascending."""
    result = []
    for item in data:            # how much time on the loop itself?
        if item > 100:           # and on the comparison?
            result.append(item * 2)
    return sorted(result)        # and on the sort?

# Run with:
#   kernprof -l -v script.py
输出示例:
Line # Hits Time Per Hit % Time Line Contents
3 1 0.1 0.1 0.0 result = []
4 1000000 120.5 0.0 45.2 for item in data:
5 1000000 80.3 0.0 30.1 if item > 100:
6 500000 35.2 0.0 13.2 result.append(item * 2)
7 1 30.5 30.5 11.5 return sorted(result)
memory_profiler(内存分析)
pip install memory_profiler
# Third-party: pip install memory_profiler
from memory_profiler import profile

# @profile makes memory_profiler report per-line memory deltas for this function.
@profile
def memory_hungry():
    """Allocate two large containers to illustrate line-by-line memory reporting."""
    a = [i for i in range(1000000)]        # ~ +38 MiB (list of 1M ints)
    b = {i: i**2 for i in range(1000000)}  # ~ +80 MiB (dict of 1M entries)
    del a                                  # ~ -38 MiB (list released)
    return b

# Run with: python -m memory_profiler script.py
数据结构优化
列表 vs 生成器
import sys

# Bad: a list comprehension materializes every element up front.
values = [i ** 2 for i in range(10000000)]
print(sys.getsizeof(values))  # ~80MB for the list object itself

# Good: a generator expression yields elements on demand.
values = (i ** 2 for i in range(10000000))
print(sys.getsizeof(values))  # a fixed ~200 bytes

# Real-world case: processing a huge file.
# Bad: readlines() pulls the entire file into memory at once.
lines = open('huge.csv').readlines()
for line in lines:
    process(line)

# Good: iterate the file object itself, one line at a time.
with open('huge.csv') as f:
    for line in f:
        process(line)
字典/集合 vs 列表查找
import time

data = [*range(1000000)]
data_set = {*data}

# Bad: membership test on a list scans elements one by one -> O(n)
start = time.time()
999999 in data
print(f'列表查找: {time.time() - start:.6f}s')  # ~0.01s

# Good: membership test on a set is a hash lookup -> O(1)
start = time.time()
999999 in data_set
print(f'集合查找: {time.time() - start:.6f}s')  # ~0.000001s
# roughly 10000x faster
collections模块
from collections import defaultdict, Counter, deque

# Sample input so the snippet is runnable; replace with real data.
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']

# defaultdict: no "if key in dict" dance before incrementing.
word_count = defaultdict(int)
for word in words:
    word_count[word] += 1  # missing keys start at int() == 0

# Counter: purpose-built for counting.
counter = Counter(words)
top10 = counter.most_common(10)

# deque: double-ended queue, O(1) at both ends.
# Bad:  list.insert(0, x) is O(n) — every element shifts.
# Good: deque.appendleft(x) is O(1).
item = 'task'
queue = deque(maxlen=1000)  # bounded: oldest entries drop off automatically
queue.append(item)
queue.appendleft(item)
slots(减少对象内存)
import sys

class PointNormal:
    """Regular class: every instance carries its own __dict__."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

class PointSlots:
    """__slots__ class: attribute set is fixed, no per-instance __dict__."""

    __slots__ = ('x', 'y')

    def __init__(self, x, y):
        self.x = x
        self.y = y

p1 = PointNormal(1, 2)
p2 = PointSlots(1, 2)
print(sys.getsizeof(p1.__dict__))  # 104 bytes
# p2 has no __dict__, so that per-instance overhead disappears.
# With a million instances the gap is dramatic:
#   PointNormal: ~160MB
#   PointSlots:  ~64MB (about 60% less)
循环优化
# Slow: explicit Python-level loop with repeated append calls.
result = []
for i in range(1000000):
    result.append(i ** 2)

# Faster: list comprehension — the loop runs in optimized bytecode.
result = [i ** 2 for i in range(1000000)]
# typically 30-50% faster than the explicit loop

# NOTE: map() is only a win when paired with a C-implemented builtin.
# With a lambda, every element re-enters Python, so this is usually
# NO faster than the comprehension above.
result = list(map(lambda x: x ** 2, range(1000000)))

# Fastest: NumPy vectorization — the whole loop runs in C.
import numpy as np
arr = np.arange(1000000)
result = arr ** 2
# 100x+ faster
避免重复计算
# Sample data and stubs so the snippet is runnable; replace with real input.
data = ['  Alpha ', ' Beta  ']
result = []

def process(item):
    """Stand-in for real per-item work."""
    pass

# Bad: index-based loop — len() call plus an index lookup per iteration.
for i in range(len(data)):
    process(data[i])

# Good: iterate the sequence directly.
for item in data:
    process(item)

# Bad: attribute lookups (.strip, .lower, .append) repeated every iteration.
for item in data:
    result.append(item.strip().lower())

# Good: bind the lookups to local names once (worth it only in hot loops —
# it trades readability for LOAD_FAST lookups).
strip = str.strip
lower = str.lower
append = result.append
for item in data:
    append(lower(strip(item)))
字符串优化
# Sample values so the snippet is runnable; replace with real data.
strings = ['a', 'b', 'c']
name, age = 'Alice', 30

# Bad: += creates a brand-new string every pass -> O(n^2) overall.
result = ''
for s in strings:
    result += s

# Good: join allocates the final string once -> O(n).
result = ''.join(strings)

# Dated formatting styles: %-interpolation and str.format.
msg = 'Hello %s, you are %d years old' % (name, age)
msg = 'Hello {}, you are {} years old'.format(name, age)

# Good: f-string — most readable and fastest.
msg = f'Hello {name}, you are {age} years old'
缓存优化
from functools import lru_cache

# Memoize automatically: repeated calls with the same argument are answered
# from the cache instead of being recomputed.
@lru_cache(maxsize=128)
def fibonacci(n):
    """Return the n-th Fibonacci number via cached recursion."""
    if n < 2:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)

# Uncached, fibonacci(40) takes tens of seconds (exponential call tree);
# cached, it completes instantly.

# Inspect cache effectiveness:
print(fibonacci.cache_info())
# e.g. CacheInfo(hits=38, misses=41, maxsize=128, currsize=41)

# Drop all cached entries:
fibonacci.cache_clear()
并发加速
import asyncio
import aiohttp  # third-party: pip install aiohttp
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Rule of thumb: IO-bound -> threads / asyncio; CPU-bound -> processes.

async def fetch_all(urls):
    """Fetch all URLs concurrently and return their response bodies as text."""
    async with aiohttp.ClientSession() as session:
        tasks = [session.get(url) for url in urls]
        responses = await asyncio.gather(*tasks)
        return [await r.text() for r in responses]

def cpu_heavy(n):
    """CPU-bound work: sum of squares of 0..n-1."""
    return sum(i ** 2 for i in range(n))

# The __main__ guard is required: worker processes re-import this module,
# and without the guard the pool would be re-created recursively under the
# 'spawn' start method (the default on Windows and macOS).
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(cpu_heavy, [10**6] * 4))
C扩展加速
Cython
# fib.pyx — build with Cython, e.g. `cythonize -i fib.pyx`
def fibonacci_cy(int n):
    # C-typed locals ('cdef int') let Cython compile the loop to raw C arithmetic.
    cdef int a = 0, b = 1, i
    for i in range(n):
        a, b = b, a + b
    return a
# Roughly 50-100x faster than the equivalent pure-Python loop.
NumPy向量化
import numpy as np

# Bad: per-element Python loop.
def distance_python(x1, y1, x2, y2):
    """Euclidean distance between paired points, computed one pair at a time."""
    result = []
    for i in range(len(x1)):
        dx = x1[i] - x2[i]
        dy = y1[i] - y2[i]
        result.append((dx ** 2 + dy ** 2) ** 0.5)
    return result

# Good: NumPy vectorization — one array expression, loop runs in C.
def distance_numpy(x1, y1, x2, y2):
    """Same computation as distance_python, but fully vectorized."""
    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# For 1,000,000 points:
#   pure Python: ~2.5s
#   NumPy:       ~0.01s (about 250x faster)
性能优化清单
按投入产出比排序:
- 选对数据结构(set代替list查找)— 效果:10-10000倍
- 用内置函数和标准库(map/filter/sorted)— 效果:2-5倍
- 列表推导代替循环 — 效果:1.3-2倍
- NumPy向量化 — 效果:10-100倍
- 缓存(lru_cache)— 效果:取决于重复率
- 并发/并行 — 效果:接近核心数倍
- C扩展(Cython/C)— 效果:10-100倍
总结
Python性能优化的核心原则:
- 先测量,再优化(别猜瓶颈在哪)
- 优化算法和数据结构的收益远大于微观优化
- 80%的时间花在20%的代码上,找到那20%
- 能用NumPy/Pandas的场景别写循环
- 实在不够快,考虑Cython或换语言写热点函数
记住:过早优化是万恶之源。先让代码正确,再让它快。