摘要:Python慢是共识,但大多数时候不是Python的问题,是代码写法的问题。本文从性能分析工具入手,覆盖数据结构选择、算法优化、内存优化、C扩展加速等实战技巧,帮你找到瓶颈并精准优化。
第一步:找到瓶颈(别猜,用工具)
cProfile(标准库)
import cProfile
import pstats

# Option 1: from the command line
#   python -m cProfile -s cumulative script.py

# Option 2: programmatically
def main():
    """Toy workload: sum of the 100 largest squares of 0..999999."""
    squares = [n * n for n in range(1000000)]
    ordered = sorted(squares, reverse=True)
    return sum(ordered[:100])

prof = cProfile.Profile()
prof.enable()
main()
prof.disable()

stats = pstats.Stats(prof)
stats.sort_stats('cumulative')
stats.print_stats(20)  # show the 20 most expensive functions
line_profiler(逐行分析)
pip install line_profiler
# Under kernprof, `profile` is injected as a builtin. Without this fallback,
# running the script as plain Python raises NameError on the decorator line.
try:
    profile
except NameError:
    def profile(func):
        # No-op replacement so the script also runs outside kernprof.
        return func

@profile
def process_data(data):
    """Keep items > 100, double them, and return them sorted ascending."""
    result = []
    for item in data:            # how much time on the loop itself?
        if item > 100:           # and on the comparison?
            result.append(item * 2)
    return sorted(result)        # and on the sort?

# Run with:
#   kernprof -l -v script.py
输出示例:
Line # Hits Time Per Hit % Time Line Contents
3 1 0.1 0.1 0.0 result = []
4 1000000 120.5 0.0 45.2 for item in data:
5 1000000 80.3 0.0 30.1 if item > 100:
6 500000 35.2 0.0 13.2 result.append(item * 2)
7 1 30.5 30.5 11.5 return sorted(result)
memory_profiler(内存分析)
pip install memory_profiler
# Third-party: pip install memory_profiler
from memory_profiler import profile

# @profile makes memory_profiler report per-line memory deltas for this function.
@profile
def memory_hungry():
    """Allocate two large containers to illustrate line-by-line memory reporting."""
    a = [i for i in range(1000000)]        # ~ +38 MiB (list of 1M ints)
    b = {i: i**2 for i in range(1000000)}  # ~ +80 MiB (dict of 1M entries)
    del a                                  # ~ -38 MiB (list released)
    return b

# Run with: python -m memory_profiler script.py
数据结构优化
列表 vs 生成器
import sys

# Bad: a list comprehension materializes every element up front.
values = [i ** 2 for i in range(10000000)]
print(sys.getsizeof(values))  # ~80MB for the list object itself

# Good: a generator expression yields elements on demand.
values = (i ** 2 for i in range(10000000))
print(sys.getsizeof(values))  # a fixed ~200 bytes

# Real-world case: processing a huge file.
# Bad: readlines() pulls the entire file into memory at once.
lines = open('huge.csv').readlines()
for line in lines:
    process(line)

# Good: iterate the file object itself, one line at a time.
with open('huge.csv') as f:
    for line in f:
        process(line)
字典/集合 vs 列表查找
import time

data = [*range(1000000)]
data_set = {*data}

# Bad: membership test on a list scans elements one by one -> O(n)
start = time.time()
999999 in data
print(f'列表查找: {time.time() - start:.6f}s')  # ~0.01s

# Good: membership test on a set is a hash lookup -> O(1)
start = time.time()
999999 in data_set
print(f'集合查找: {time.time() - start:.6f}s')  # ~0.000001s
# roughly 10000x faster
collections模块
from collections import defaultdict, Counter, deque

# Sample input so the snippet is runnable; replace with real data.
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']

# defaultdict: no "if key in dict" dance before incrementing.
word_count = defaultdict(int)
for word in words:
    word_count[word] += 1  # missing keys start at int() == 0

# Counter: purpose-built for counting.
counter = Counter(words)
top10 = counter.most_common(10)

# deque: double-ended queue, O(1) at both ends.
# Bad:  list.insert(0, x) is O(n) — every element shifts.
# Good: deque.appendleft(x) is O(1).
item = 'task'
queue = deque(maxlen=1000)  # bounded: oldest entries drop off automatically
queue.append(item)
queue.appendleft(item)
slots(减少对象内存)
import sys

class PointNormal:
    """Regular class: every instance carries its own __dict__."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

class PointSlots:
    """__slots__ class: attribute set is fixed, no per-instance __dict__."""

    __slots__ = ('x', 'y')

    def __init__(self, x, y):
        self.x = x
        self.y = y

p1 = PointNormal(1, 2)
p2 = PointSlots(1, 2)
print(sys.getsizeof(p1.__dict__))  # 104 bytes
# p2 has no __dict__, so that per-instance overhead disappears.
# With a million instances the gap is dramatic:
#   PointNormal: ~160MB
#   PointSlots:  ~64MB (about 60% less)
循环优化
# Slow: explicit Python-level loop with repeated append calls.
result = []
for i in range(1000000):
    result.append(i ** 2)

# Faster: list comprehension — the loop runs in optimized bytecode.
result = [i ** 2 for i in range(1000000)]
# typically 30-50% faster than the explicit loop

# NOTE: map() is only a win when paired with a C-implemented builtin.
# With a lambda, every element re-enters Python, so this is usually
# NO faster than the comprehension above.
result = list(map(lambda x: x ** 2, range(1000000)))

# Fastest: NumPy vectorization — the whole loop runs in C.
import numpy as np
arr = np.arange(1000000)
result = arr ** 2
# 100x+ faster
避免重复计算
# Sample data and stubs so the snippet is runnable; replace with real input.
data = ['  Alpha ', ' Beta  ']
result = []

def process(item):
    """Stand-in for real per-item work."""
    pass

# Bad: index-based loop — len() call plus an index lookup per iteration.
for i in range(len(data)):
    process(data[i])

# Good: iterate the sequence directly.
for item in data:
    process(item)

# Bad: attribute lookups (.strip, .lower, .append) repeated every iteration.
for item in data:
    result.append(item.strip().lower())

# Good: bind the lookups to local names once (worth it only in hot loops —
# it trades readability for LOAD_FAST lookups).
strip = str.strip
lower = str.lower
append = result.append
for item in data:
    append(lower(strip(item)))
字符串优化
# Sample values so the snippet is runnable; replace with real data.
strings = ['a', 'b', 'c']
name, age = 'Alice', 30

# Bad: += creates a brand-new string every pass -> O(n^2) overall.
result = ''
for s in strings:
    result += s

# Good: join allocates the final string once -> O(n).
result = ''.join(strings)

# Dated formatting styles: %-interpolation and str.format.
msg = 'Hello %s, you are %d years old' % (name, age)
msg = 'Hello {}, you are {} years old'.format(name, age)

# Good: f-string — most readable and fastest.
msg = f'Hello {name}, you are {age} years old'
缓存优化
from functools import lru_cache

# Memoize automatically: repeated calls with the same argument are answered
# from the cache instead of being recomputed.
@lru_cache(maxsize=128)
def fibonacci(n):
    """Return the n-th Fibonacci number via cached recursion."""
    if n < 2:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)

# Uncached, fibonacci(40) takes tens of seconds (exponential call tree);
# cached, it completes instantly.

# Inspect cache effectiveness:
print(fibonacci.cache_info())
# e.g. CacheInfo(hits=38, misses=41, maxsize=128, currsize=41)

# Drop all cached entries:
fibonacci.cache_clear()
并发加速
import asyncio
import aiohttp  # third-party: pip install aiohttp
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Rule of thumb: IO-bound -> threads / asyncio; CPU-bound -> processes.

async def fetch_all(urls):
    """Fetch all URLs concurrently and return their response bodies as text."""
    async with aiohttp.ClientSession() as session:
        tasks = [session.get(url) for url in urls]
        responses = await asyncio.gather(*tasks)
        return [await r.text() for r in responses]

def cpu_heavy(n):
    """CPU-bound work: sum of squares of 0..n-1."""
    return sum(i ** 2 for i in range(n))

# The __main__ guard is required: worker processes re-import this module,
# and without the guard the pool would be re-created recursively under the
# 'spawn' start method (the default on Windows and macOS).
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(cpu_heavy, [10**6] * 4))
C扩展加速
Cython
# fib.pyx — build with Cython, e.g. `cythonize -i fib.pyx`
def fibonacci_cy(int n):
    # C-typed locals ('cdef int') let Cython compile the loop to raw C arithmetic.
    cdef int a = 0, b = 1, i
    for i in range(n):
        a, b = b, a + b
    return a
# Roughly 50-100x faster than the equivalent pure-Python loop.
NumPy向量化
import numpy as np

# Bad: per-element Python loop.
def distance_python(x1, y1, x2, y2):
    """Euclidean distance between paired points, computed one pair at a time."""
    result = []
    for i in range(len(x1)):
        dx = x1[i] - x2[i]
        dy = y1[i] - y2[i]
        result.append((dx ** 2 + dy ** 2) ** 0.5)
    return result

# Good: NumPy vectorization — one array expression, loop runs in C.
def distance_numpy(x1, y1, x2, y2):
    """Same computation as distance_python, but fully vectorized."""
    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# For 1,000,000 points:
#   pure Python: ~2.5s
#   NumPy:       ~0.01s (about 250x faster)
性能优化清单
按投入产出比排序:
- 选对数据结构(set代替list查找)— 效果:10-10000倍
- 用内置函数和标准库(map/filter/sorted)— 效果:2-5倍
- 列表推导代替循环 — 效果:1.3-2倍
- NumPy向量化 — 效果:10-100倍
- 缓存(lru_cache)— 效果:取决于重复率
- 并发/并行 — 效果:接近核心数倍
- C扩展(Cython/C)— 效果:10-100倍
总结
Python性能优化的核心原则:
- 先测量,再优化(别猜瓶颈在哪)
- 优化算法和数据结构的收益远大于微观优化
- 80%的时间花在20%的代码上,找到那20%
- 能用NumPy/Pandas的场景别写循环
- 实在不够快,考虑Cython或换语言写热点函数
记住:过早优化是万恶之源。先让代码正确,再让它快。