Python性能优化实战:从分析瓶颈到提速10倍

阅读约5分钟

摘要:Python慢是共识,但大多数时候不是Python的问题,是代码写法的问题。本文从性能分析工具入手,覆盖数据结构选择、算法优化、内存优化、C扩展加速等实战技巧,帮你找到瓶颈并精准优化。

第一步:找到瓶颈(别猜,用工具)

cProfile(标准库)

import cProfile
import pstats

# Option 1: from the command line
# python -m cProfile -s cumulative script.py

# Option 2: profile programmatically
def main():
    """Demo workload: square a million ints, sort descending, sum the top 100."""
    squares = [n * n for n in range(1000000)]
    ranked = sorted(squares, reverse=True)
    return sum(ranked[:100])

prof = cProfile.Profile()
prof.enable()
main()
prof.disable()

report = pstats.Stats(prof)
report.sort_stats('cumulative')
report.print_stats(20)  # show the 20 most time-consuming functions

line_profiler(逐行分析)

pip install line_profiler
# 用 @profile 装饰要分析的函数
@profile  # injected at runtime by line_profiler's kernprof; NameError if run standalone
def process_data(data):
    result = []
    for item in data:           # how much time is spent on this line?
        if item > 100:          # and on this one?
            result.append(item * 2)
    return sorted(result)       # how expensive is the sort?

# 运行
# kernprof -l -v script.py

输出示例:

Line #  Hits    Time  Per Hit  % Time  Line Contents
     3  1       0.1     0.1     0.0    result = []
     4  1000000 120.5   0.0    45.2    for item in data:
     5  1000000  80.3   0.0    30.1    if item > 100:
     6  500000   35.2   0.0    13.2    result.append(item * 2)
     7  1        30.5  30.5    11.5    return sorted(result)

memory_profiler(内存分析)

pip install memory_profiler
from memory_profiler import profile

@profile  # memory_profiler reports per-line memory deltas for this function
def memory_hungry():
    a = [i for i in range(1000000)]      # +38 MiB
    b = {i: i**2 for i in range(1000000)} # +80 MiB
    del a                                  # -38 MiB — release the list before returning
    return b

# python -m memory_profiler script.py

数据结构优化

列表 vs 生成器

import sys

# ❌ list comprehension: materializes everything in memory at once
data = [i ** 2 for i in range(10000000)]
print(sys.getsizeof(data))  # ~80MB (shallow size of the list of pointers)

# ✅ generator: produces values on demand, near-zero memory footprint
data = (i ** 2 for i in range(10000000))
print(sys.getsizeof(data))  # 200 bytes

# Real-world case: processing a huge file
# ❌
lines = open('huge.csv').readlines()  # reads the entire file into memory
for line in lines:
    process(line)

# ✅
with open('huge.csv') as f:  # streams one line at a time
    for line in f:
        process(line)

字典/集合 vs 列表查找

import time

data = list(range(1000000))
data_set = set(data)

# ❌ list membership test: O(n) linear scan
start = time.time()
999999 in data
print(f'列表查找: {time.time() - start:.6f}s')  # ~0.01s
# NOTE(review): time.perf_counter() would be more precise for timing a single lookup

# ✅ set membership test: O(1) hash lookup
start = time.time()
999999 in data_set
print(f'集合查找: {time.time() - start:.6f}s')  # ~0.000001s

# ~10000x faster!

collections模块

from collections import defaultdict, Counter, deque

# defaultdict: no need to check whether a key already exists
word_count = defaultdict(int)
for word in words:
    word_count[word] += 1  # no 'if word in word_count' guard required

# Counter: purpose-built for counting
counter = Counter(words)
top10 = counter.most_common(10)

# deque: double-ended queue, O(1) operations at both ends
# ❌ list.insert(0, x) is O(n)
# ✅ deque.appendleft(x) is O(1)
queue = deque(maxlen=1000)  # fixed length; oldest items are dropped automatically
queue.append(item)
queue.appendleft(item)

slots(减少对象内存)

import sys

# 普通类:每个实例有__dict__
class PointNormal:
    """A plain point class; every instance carries its own ``__dict__``."""

    def __init__(self, x, y):
        self.x, self.y = x, y

# slots类:固定属性,无__dict__
class PointSlots:
    """Point with ``__slots__``: fixed attribute set, no per-instance ``__dict__``."""

    __slots__ = ('x', 'y')

    def __init__(self, x, y):
        self.x, self.y = x, y

p1 = PointNormal(1, 2)
p2 = PointSlots(1, 2)

print(sys.getsizeof(p1.__dict__))  # 104 bytes (shallow size of the dict object itself)
# p2 has no __dict__, so that memory is saved entirely

# The gap becomes significant with millions of instances
# PointNormal: ~160MB
# PointSlots:  ~64MB (~60% saved)

循环优化

# ❌ slow: explicit Python-level loop
result = []
for i in range(1000000):
    result.append(i ** 2)

# ✅ fast: list comprehension (the loop runs at C level)
result = [i ** 2 for i in range(1000000)]
# 30-50% faster

# map + lambda — NOTE(review): the original claimed this is "even faster",
# which is wrong: a Python lambda costs a function call per item, so this is
# usually on par with (or slower than) the comprehension. map only wins
# when the callable is a C-implemented built-in.
result = list(map(lambda x: x ** 2, range(1000000)))

# ✅ fastest: NumPy vectorization
import numpy as np
arr = np.arange(1000000)
result = arr ** 2
# 100x+ faster

避免重复计算

# ❌ unidiomatic: index-based iteration
# NOTE(review): the original comment said "len() is called every iteration" —
# that is incorrect: range(len(data)) evaluates len() exactly once. The real
# cost is the per-iteration data[i] indexing and the extra index bookkeeping.
for i in range(len(data)):
    process(data[i])

# ✅ iterate directly over the items
for item in data:
    process(item)

# ❌ repeated attribute/method lookups inside the loop
for item in data:
    result.append(item.strip().lower())

# ✅ cache the lookups in local variables (only worth it in a measured hot loop)
strip = str.strip
lower = str.lower
append = result.append
for item in data:
    append(lower(strip(item)))

字符串优化

# ❌ string concatenation: creates a new string object on every iteration
result = ''
for s in strings:
    result += s  # O(n²) worst case (CPython's += fast path is not guaranteed)

# ✅ join: single-pass concatenation
result = ''.join(strings)  # O(n)

# ❌ dated formatting: % and str.format
msg = 'Hello %s, you are %d years old' % (name, age)
msg = 'Hello {}, you are {} years old'.format(name, age)

# ✅ f-string (fastest)
msg = f'Hello {name}, you are {age} years old'

缓存优化

from functools import lru_cache

# Cache results automatically, keyed by the call arguments
@lru_cache(maxsize=128)
def fibonacci(n):
    """Return the n-th Fibonacci number via memoized recursion."""
    return n if n < 2 else fibonacci(n - 1) + fibonacci(n - 2)

# Without the cache: fibonacci(40) takes tens of seconds (exponential call tree)
# With the cache:    fibonacci(40) finishes instantly

# Inspect the cache hit rate
print(fibonacci.cache_info())
# CacheInfo(hits=38, misses=41, maxsize=128, currsize=41)

# Clear the cache
fibonacci.cache_clear()

并发加速

import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# IO密集型 → 多线程/异步
# CPU密集型 → 多进程

# Async HTTP requests (I/O-bound)
async def fetch_all(urls):
    """Fetch all URLs concurrently and return their response bodies as text.

    Fix: the original awaited bare ``session.get(...)`` objects and read
    ``.text()`` afterwards, never releasing the responses — that leaks
    connections back from aiohttp's pool. Each request is now handled inside
    its own ``async with`` block, which releases the connection when done.
    """
    async def _fetch(session, url):
        # Context manager guarantees the response/connection is released.
        async with session.get(url) as resp:
            return await resp.text()

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(_fetch(session, u) for u in urls))

# Multiprocess computation (CPU-bound)
def cpu_heavy(n):
    """Sum of squares of 0..n-1 — a purely CPU-bound workload."""
    total = 0
    for k in range(n):
        total += k * k
    return total

# Fix: a __main__ guard is required here. On spawn-based platforms (Windows,
# and macOS since Python 3.8) each worker re-imports this module; without the
# guard, module-level pool creation re-runs in every child and the program
# crashes/recurses. The guard keeps the script's behavior when run directly.
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(cpu_heavy, [10**6] * 4))

C扩展加速

Cython

# fib.pyx — Cython source; 'cdef int' keeps the loop variables as C ints
def fibonacci_cy(int n):
    cdef int a = 0, b = 1, i
    for i in range(n):
        a, b = b, a + b
    return a

# 50-100x faster than pure Python

NumPy向量化

import numpy as np

# ❌ Python循环
def distance_python(x1, y1, x2, y2):
    """Euclidean distances between paired points, one Python-level step at a time."""
    dists = []
    for i in range(len(x1)):
        dx = x1[i] - x2[i]
        dy = y1[i] - y2[i]
        dists.append((dx ** 2 + dy ** 2) ** 0.5)
    return dists

# ✅ NumPy向量化
def distance_numpy(x1, y1, x2, y2):
    """Vectorized Euclidean distance: one NumPy expression, no Python loop."""
    dx = x1 - x2
    dy = y1 - y2
    return np.sqrt(dx ** 2 + dy ** 2)

# 100万个点:
# Python: 2.5s
# NumPy:  0.01s(快250倍)

性能优化清单

按投入产出比排序:

  1. 选对数据结构(set代替list查找)— 效果:10-10000倍
  2. 用内置函数和标准库(map/filter/sorted)— 效果:2-5倍
  3. 列表推导代替循环 — 效果:1.3-2倍
  4. NumPy向量化 — 效果:10-100倍
  5. 缓存(lru_cache)— 效果:取决于重复率
  6. 并发/并行 — 效果:接近核心数倍
  7. C扩展(Cython/C)— 效果:10-100倍

总结

Python性能优化的核心原则:

  • 先测量,再优化(别猜瓶颈在哪)
  • 优化算法和数据结构的收益远大于微观优化
  • 80%的时间花在20%的代码上,找到那20%
  • 能用NumPy/Pandas的场景别写循环
  • 实在不够快,考虑Cython或换语言写热点函数

记住:过早优化是万恶之源。先让代码正确,再让它快。