🔍 一、快速诊断:Python 特有命令
- 进程级连接诊断
# 查看 Python 进程连接状态(替换 <pid> 为实际进程ID)
ss -tanp | grep python | grep <pid>
# 统计各状态连接数
cat /proc/<pid>/net/tcp | awk '{print $4}' | sort | uniq -c
# 查看文件描述符(socket 也是一种 fd)
ls -l /proc/<pid>/fd | grep socket | wc -l
ls -l /proc/<pid>/fd | awk '{print $NF}' | sort | uniq -c | sort -rn
- Python 运行时诊断(无需重启)
# Inject diagnostic code into a running Python process (via gdb or manhole).
import sys
import threading
import gc
import socket

# Show the live threads.
print(f"活跃线程数: {threading.active_count()}")
for t in threading.enumerate():
    print(f" - {t.name}: {t.ident}")

# Inspect socket objects still reachable in memory (possibly unclosed).
sockets = [obj for obj in gc.get_objects() if isinstance(obj, socket.socket)]
print(f"内存中 socket 对象数: {len(sockets)}")
for s in sockets[:10]:  # show only the first 10
    try:
        print(f" fd={s.fileno()}, {s.getpeername()} -> {s.getsockname()}")
    except OSError:
        # getpeername()/getsockname() raise OSError on closed or unconnected
        # sockets.  The original bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit -- catch the specific error instead.
        print(f" fd={s.fileno()}, [未连接或已关闭]")
- 使用 py-spy 进行实时诊断(无需修改代码)
# 安装 py-spy
pip install py-spy
# 实时查看线程状态(类似 top)
py-spy top --pid <pid>
# 导出火焰图(查看 CPU 耗时)
py-spy record -o profile.svg --pid <pid>
# 查看线程堆栈(类似 jstack)
py-spy dump --pid <pid>
🎯 二、Python 常见故障模式
故障模式 Python 特有症状 典型场景
GIL 阻塞 CPU 不高但响应慢,线程状态多为 PyEval_RestoreThread 多线程 CPU 密集型任务 + IO 操作
asyncio 事件循环阻塞 协程不切换,大量任务 pending 在 async 函数中调用了同步阻塞 IO
requests 会话未复用 大量 TIME_WAIT,每次请求新建 TCP 连接 未使用 requests.Session()
线程池未释放 线程数持续增长,最终 OOM ThreadPoolExecutor 未 shutdown()
数据库连接泄漏 CLOSE_WAIT 堆积,连接池耗尽 SQLAlchemy 会话未 close() 或 remove()
🛠️ 三、分层排查与解决方案
第一层:同步代码(requests + ThreadPoolExecutor)
❌ 错误示范:连接泄漏经典案例
import requests
from concurrent.futures import ThreadPoolExecutor

# Mistake 1: a fresh connection per request (causes piles of TIME_WAIT sockets).
def fetch_bad(url):
    resp = requests.get(url, timeout=30)  # opens a brand-new TCP connection every call!
    return resp.text

# Mistake 2: thread pool is neither reused nor shut down.
def batch_fetch_bad(urls):
    for url in urls:  # serial execution -- slow
        fetch_bad(url)
    # ...or using a thread pool but never closing it:
    executor = ThreadPoolExecutor(max_workers=100)  # unbounded growth!
    futures = [executor.submit(fetch_bad, url) for url in urls]
    return [f.result() for f in futures]  # executor.shutdown() is never called

# Mistake 3: no timeout (the call may hang forever).
requests.get("http://slow-api.com")  # can block for hours
✅ 正确实践:连接池化 + 资源管理
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import contextlib
# 1. Build a reusable Session (the heart of connection pooling).
def create_session(
    pool_connections=10,
    pool_maxsize=100,
    max_retries=3,
    backoff_factor=0.5,
    timeout=(5, 30),  # (connect timeout, read timeout) -- new optional knob
):
    """Return a pooled, retrying ``requests.Session`` with a default timeout.

    Parameters mirror ``HTTPAdapter``/``Retry``: *pool_connections* is the
    number of host pools, *pool_maxsize* the connections kept per host.
    The session applies *timeout* to every request unless the caller passes
    an explicit ``timeout=`` of their own.
    """
    session = requests.Session()

    # Retry policy (crucial: keeps transient failures from exhausting the pool).
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
    )

    # Connection-pool settings.
    adapter = HTTPAdapter(
        pool_connections=pool_connections,   # number of host pools
        pool_maxsize=pool_maxsize,           # connections per host
        max_retries=retry_strategy,
        pool_block=True  # block when the pool is full instead of raising (configurable)
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Default timeout for every request.  The original lambda passed
    # ``timeout=(5, 30)`` unconditionally, so any caller supplying their own
    # ``timeout=`` kwarg got ``TypeError: got multiple values for 'timeout'``.
    # ``setdefault`` lets an explicit caller value win.
    def _request_with_timeout(*args, **kwargs):
        kwargs.setdefault("timeout", timeout)
        return requests.Session.request(session, *args, **kwargs)

    session.request = _request_with_timeout
    return session
# 2. Use the thread pool correctly (context manager guarantees shutdown).
def batch_fetch_good(urls, max_workers=10):
    """Fetch *urls* concurrently through one pooled session.

    Returns ``{url: body_text_or_error_string}``.  Failures are captured as
    ``"Error: ..."`` strings rather than propagating.  Fixes vs. the original:
    the session is now actually closed (the old trailing comment claimed the
    ``with`` block did it -- it only shuts the executor down), and the
    exception's response is tested with ``is not None`` because a
    ``requests.Response`` is *falsy* for 4xx/5xx statuses.
    """
    session = create_session(pool_maxsize=max_workers * 2)
    try:
        # Context manager guarantees executor.shutdown(wait=True) on exit.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(session.get, url): url
                for url in urls
            }
            results = {}
            for future in as_completed(future_to_url, timeout=60):  # overall deadline
                url = future_to_url[future]
                try:
                    resp = future.result()
                    results[url] = resp.text
                except Exception as e:
                    results[url] = f"Error: {e}"
                    # Ensure any partially-read response body is closed so the
                    # connection returns to the pool.
                    response = getattr(e, "response", None)
                    if response is not None:  # NOT `if response:` -- falsy on 4xx/5xx
                        response.close()
            return results
    finally:
        # Close the session explicitly; nothing does this automatically.
        session.close()
# 3. Safer streaming iteration (keeps memory bounded).
def batch_fetch_streaming(urls, max_workers=10, chunk_size=100):
    """Generator: process *urls* in batches of *chunk_size*, yielding responses.

    Bounds both memory and concurrency.  Fix vs. the original: the pooled
    session is closed in a ``finally`` (it leaked before, including when the
    consumer abandoned the generator early), and ``import gc`` is hoisted out
    of the loop body.
    """
    import gc

    session = create_session()
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for start in range(0, len(urls), chunk_size):
                chunk = urls[start:start + chunk_size]
                futures = [executor.submit(session.get, url) for url in chunk]
                for future in as_completed(futures):
                    yield future.result()
                # Optional: force a collection between batches.
                gc.collect()
    finally:
        # Runs on normal exhaustion AND on generator close()/GC.
        session.close()
第二层:异步代码(asyncio + aiohttp)
❌ 错误示范:asyncio 阻塞陷阱
import asyncio
import requests  # fatal: a synchronous library inside async code!

async def bad_async_fetch(url):
    # Mistake: requests blocks the entire event loop!
    resp = requests.get(url)  # stalls every other coroutine
    return resp.text

async def main():
    # Mistake: no concurrency limit -- thousands of connections created at once.
    tasks = [bad_async_fetch(f"http://api.com/{i}") for i in range(10000)]
    await asyncio.gather(*tasks)  # resource explosion!

# Mistake: no timeout (the default None can hang forever).
asyncio.wait_for(some_coroutine, timeout=None)
✅ 正确实践:纯异步 + 信号量控制
import asyncio
import aiohttp
from aiohttp import ClientTimeout, TCPConnector
import asyncpg # 异步数据库
# 1. Create a rate-limited session (the core of connection control).
async def create_async_session(
    limit=100,             # total connection-pool size
    limit_per_host=30,     # connections per single host
    ttl_dns_cache=300,     # cache DNS lookups for 5 minutes
    use_dns_cache=True
):
    """Build an ``aiohttp.ClientSession`` with pooling, DNS caching and timeouts.

    The caller owns the returned session and must ``await session.close()``
    when finished with it.
    """
    # TCP connector settings (finer-grained than requests).
    connector = TCPConnector(
        limit=limit,
        limit_per_host=limit_per_host,
        ttl_dns_cache=ttl_dns_cache,
        use_dns_cache=use_dns_cache,
        enable_cleanup_closed=True,  # reap closed connections automatically
        force_close=False,           # keep long-lived connections
        ssl=False  # NOTE(review): in aiohttp ``ssl=False`` *skips certificate
                   # verification*; for HTTPS prefer the default None or an
                   # ssl.SSLContext -- confirm this is intended.
    )
    # Timeout settings (layered protection).
    timeout = ClientTimeout(
        total=30,      # overall deadline (connect + send + read)
        connect=5,     # connection-establishment timeout
        sock_read=10   # per-read timeout
    )
    session = aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={"Connection": "keep-alive"}
    )
    return session
# 2. Semaphore-bounded fetch (guards against request stampedes).
async def fetch_with_semaphore(session, url, semaphore):
    """GET *url* via *session* while holding *semaphore*.

    Returns the response body on success, or None on timeout / client error
    (client errors are printed, never propagated).
    """
    async with semaphore:  # cap the number of in-flight requests
        try:
            async with session.get(url) as response:
                # Turn HTTP 4xx/5xx into exceptions up front.
                response.raise_for_status()
                body = await response.text()
            return body
        except asyncio.TimeoutError:
            # Timed out -- the context manager above recycles the connection.
            return None
        except aiohttp.ClientError as e:
            # Log and contain the failure instead of letting it spread.
            print(f"Request failed: {e}")
            return None
# 3. Full concurrency-control example.
async def controlled_fetch(urls, max_concurrent=50):
    """Async generator: fetch *urls* with bounded concurrency, yielding results.

    Uses a semaphore to cap in-flight requests and a backpressure step so at
    most ~1000 tasks are pending at once.

    NOTE(review): the backpressure branch re-raises task exceptions (via
    ``await d``) while the final ``gather(..., return_exceptions=True)``
    yields them as values -- confirm which behavior is intended.
    """
    semaphore = asyncio.Semaphore(max_concurrent)
    session = await create_async_session(limit=max_concurrent * 2)
    try:
        # Create tasks; they start running immediately (keeps memory bounded).
        tasks = []
        for url in urls:
            task = asyncio.create_task(
                fetch_with_semaphore(session, url, semaphore)
            )
            tasks.append(task)
            # Backpressure: once too many tasks pile up, drain a batch first.
            if len(tasks) >= 1000:
                done, pending = await asyncio.wait(
                    tasks,
                    return_when=asyncio.FIRST_COMPLETED
                )
                tasks = list(pending)
                for d in done:
                    yield await d
        # Drain whatever remains.
        if tasks:
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for r in results:
                yield r
    finally:
        # Crucial: closing the session also closes the underlying pool.
        await session.close()
# 4. Async database connection pool (PostgreSQL example).
async def create_db_pool():
    """Create a bounded asyncpg connection pool.

    Fix: asyncpg's keyword is ``max_inactive_connection_lifetime`` -- the
    original ``max_inactive_time`` is not an ``asyncpg.create_pool`` parameter
    and raises ``TypeError`` at startup.
    """
    return await asyncpg.create_pool(
        "postgresql://user:pass@localhost/db",
        min_size=5,                            # minimum pooled connections
        max_size=20,                           # hard upper bound (crucial!)
        max_queries=50000,                     # recycle a connection after N queries (leak guard)
        max_inactive_connection_lifetime=300,  # reclaim after 5 idle minutes
        command_timeout=60,                    # per-query timeout, seconds
        server_settings={
            'jit': 'off'  # JIT hurts short queries
        }
    )
# Usage example.
async def main():
    """Acquire a pooled connection, run one query in a transaction, return rows."""
    pool = await create_db_pool()
    try:
        # One statement: connection checkout + transaction scope,
        # both released automatically in reverse order.
        async with pool.acquire() as conn, conn.transaction():
            return await conn.fetch("SELECT * FROM users WHERE id = $1", 1)
    finally:
        # Always hand the pool back, even if the query raised.
        await pool.close()
第三层:WSGI/ASGI 服务器配置
Gunicorn + Flask/FastAPI 优化
# gunicorn.conf.py - production configuration
import multiprocessing
import os

# Worker process model (the key choice):
# - sync: one thread per request, simple but low concurrency
# - gevent: coroutine model, recommended for high concurrency (needs monkey patching)
# - uvicorn.workers.UvicornWorker: async ASGI
worker_class = "gevent"  # or "uvicorn.workers.UvicornWorker"

# Worker process count (CPU-bound: 2-4x CPUs; IO-bound: can go higher).
workers = multiprocessing.cpu_count() * 2 + 1
worker_connections = 1000  # max concurrency per worker in gevent mode

# Thread pool (effective in sync mode only).
threads = 4

# Connection timeouts (stop slow clients from hogging connections).
timeout = 30
graceful_timeout = 10
keepalive = 5  # keep-alive duration, seconds

# Request limits (guard against abusive clients).
max_requests = 10000        # restart a worker after 10000 requests (fights memory leaks)
max_requests_jitter = 1000  # random jitter so workers don't all restart at once

# Logging.
accesslog = "-"
errorlog = "-"
loglevel = "warning"

# Preload the application (saves memory).
preload_app = True

# Crucial: cap request sizes (mitigates attacks).
limit_request_line = 4094
limit_request_fields = 100
limit_request_field_size = 8190
启动命令
# 使用 gevent(高并发 HTTP)
gunicorn -c gunicorn.conf.py "app:create_app()"
# 使用 uvicorn(纯异步 ASGI)
gunicorn -k uvicorn.workers.UvicornWorker -w 4 "app:asgi_app"
# 使用 gunicorn + uvicorn(生产推荐)
gunicorn -k uvicorn.workers.UvicornH11Worker -w 4 -b 0.0.0.0:8000 "app:asgi_app"
第四层:监控与可观测性
使用 prometheus_client 暴露指标
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import functools
import time
import asyncio
# Metric definitions (help strings are exported at runtime and left as-is).
ACTIVE_REQUESTS = Gauge(
    'http_requests_active',
    '当前处理中的请求数',
    ['method', 'endpoint']
)
REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    '请求处理耗时',
    ['method', 'endpoint', 'status'],
    # 5ms-10s buckets, tuned for typical HTTP latencies.
    buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0]
)
CONNECTION_POOL_SIZE = Gauge(
    'connection_pool_size',
    '连接池大小',
    ['pool_name']
)
# Decorator that records in-flight count and latency for an async endpoint.
def monitor_async(func):
    """Wrap *func* so each call updates ACTIVE_REQUESTS and REQUEST_DURATION.

    The endpoint label is the wrapped function's name; the method label is
    read from a ``method`` kwarg (defaults to ``GET``).
    """
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        method = kwargs.get('method', 'GET')
        endpoint = func.__name__
        with ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).track_inprogress():
            start = time.time()
            # Pre-set the failure status so `finally` never sees an unbound
            # name: a BaseException such as asyncio.CancelledError skipped
            # both assignments in the original and raised NameError here,
            # masking the real error.  This also makes the old
            # `except Exception: status = 500; raise` clause redundant.
            status = 500
            try:
                result = await func(*args, **kwargs)
                status = 200
                return result
            finally:
                duration = time.time() - start
                REQUEST_DURATION.labels(
                    method=method,
                    endpoint=endpoint,
                    status=status
                ).observe(duration)
    return wrapper
# Monitor a connection pool (aiohttp example).
async def monitor_aiohttp_session(session, name="default"):
    """Periodically export *session*'s connector size and limit as gauges.

    Runs forever -- schedule it as a background task.
    NOTE(review): relies on the private ``_connector`` attribute of
    ``aiohttp.ClientSession``; may break across aiohttp versions.
    """
    if hasattr(session, '_connector'):
        connector = session._connector
        while True:
            CONNECTION_POOL_SIZE.labels(pool_name=name).set(connector.size)
            CONNECTION_POOL_SIZE.labels(pool_name=f"{name}_limit").set(connector.limit)
            await asyncio.sleep(10)  # refresh every 10 seconds
# Start the metrics HTTP server (call once at application startup).
def start_metrics_server(port=9090):
    """Expose Prometheus metrics over HTTP on *port* via ``start_http_server``."""
    start_http_server(port)
    print(f"Metrics server started on port {port}")
健康检查端点(Kubernetes 必备)
from fastapi import FastAPI, HTTPException
import asyncio
import psutil
app = FastAPI()
# Connection-pool status check (custom).
class ConnectionPoolHealth:
    """Registry of connection pools plus a health snapshot over all of them.

    A pool counts as healthy while its usage ratio stays below 90%.
    """

    def __init__(self):
        # name -> {'obj': pool, 'max'/'used'/'avail': attribute names to read}
        self.pools = {}

    def register_pool(self, name, pool_obj, max_size_attr='maxsize',
                      used_attr='size', available_attr='available'):
        """Track *pool_obj* under *name*; the counter attribute names are configurable."""
        self.pools[name] = {
            'obj': pool_obj,
            'max': max_size_attr,
            'used': used_attr,
            'avail': available_attr,
        }

    def _snapshot(self, config):
        """Read one registered pool's counters and derive its health entry."""
        pool = config['obj']
        max_size = getattr(pool, config['max'], 0)
        used = getattr(pool, config['used'], 0)
        available = getattr(pool, config['avail'], 0)
        usage = used / max_size if max_size > 0 else 0
        return {
            'healthy': usage < 0.9,  # below 90% utilisation counts as healthy
            'usage': f"{usage:.1%}",
            'used': used,
            'max': max_size,
            'available': available,
        }

    def check(self):
        """Return ``{pool_name: health_dict}``; any failure becomes an unhealthy entry."""
        status = {}
        for name, config in self.pools.items():
            try:
                status[name] = self._snapshot(config)
            except Exception as e:
                status[name] = {'healthy': False, 'error': str(e)}
        return status
pool_health = ConnectionPoolHealth()
@app.get("/health")
async def health_check():
    """Liveness probe: check CPU, memory, and all registered connection pools.

    Returns HTTP 200 when healthy, 503 otherwise.  Fix: the original returned
    a ``(dict, status_code)`` tuple (a Flask idiom); FastAPI serialises that
    as a two-element JSON array with status 200, so Kubernetes never saw the
    failure.  A ``JSONResponse`` carries the status code properly.
    """
    from fastapi.responses import JSONResponse  # local import keeps the snippet self-contained

    # System resources.
    cpu_percent = psutil.cpu_percent(interval=0.1)
    memory = psutil.virtual_memory()
    # Connection pools.
    pool_status = pool_health.check()
    pools_healthy = all(s.get('healthy', False) for s in pool_status.values())
    healthy = (
        cpu_percent < 90 and
        memory.percent < 90 and
        pools_healthy
    )
    status_code = 200 if healthy else 503
    return JSONResponse(
        status_code=status_code,  # actually propagate 503 to the probe
        content={
            "status": "healthy" if healthy else "unhealthy",
            "cpu": f"{cpu_percent}%",
            "memory": f"{memory.percent}%",
            "pools": pool_status
        },
    )
@app.get("/ready")
async def readiness_check():
    """Readiness probe: report whether downstream dependencies are available.

    Returns 200 with ``{"ready": True}`` on success, 503 via HTTPException on
    failure.  NOTE(review): the actual dependency check is commented out, so
    the except branch is currently unreachable -- wire in a real check.
    """
    try:
        # await check_db_connection()
        return {"ready": True}
    except Exception as e:
        raise HTTPException(status_code=503, detail=str(e))
🔧 五、Python 专用应急工具
- 现场诊断脚本(直接注入运行进程)
#!/usr/bin/env python3
# debug_injector.py - 用于诊断运行中的 Python 进程
import sys
import threading
import gc
import asyncio
import inspect
def diagnose():
    """Print a one-shot snapshot of interpreter, thread, socket and pool state."""
    separator = "=" * 50
    print(separator)
    print(f"Python 版本: {sys.version}")
    print(f"活跃线程数: {threading.active_count()}")
    print(f"当前线程: {threading.current_thread().name}")

    # asyncio event-loop state (only meaningful when called from inside a loop).
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        print("无运行中的事件循环")
    else:
        print(f"事件循环运行中: {running_loop.is_running()}")
        print(f"默认执行器: {running_loop._default_executor}")
        if hasattr(running_loop, '_scheduled'):
            print(f"计划任务数: {len(running_loop._scheduled)}")

    # Socket objects still reachable via the garbage collector.
    import socket
    live_sockets = [o for o in gc.get_objects() if isinstance(o, socket.socket)]
    print(f"\n内存中 socket 对象: {len(live_sockets)}")

    # Thread pools (reads private executor internals -- diagnostics only).
    from concurrent.futures import ThreadPoolExecutor
    pools = [o for o in gc.get_objects() if isinstance(o, ThreadPoolExecutor)]
    print(f"\nThreadPoolExecutor 实例: {len(pools)}")
    for i, exe in enumerate(pools):
        print(f" Executor {i}: workers={exe._max_workers}, "
              f"active={len(exe._threads)}, "
              f"queue={exe._work_queue.qsize()}")

    # aiohttp sessions, when the library is installed.
    try:
        import aiohttp
    except ImportError:
        pass
    else:
        client_sessions = [o for o in gc.get_objects()
                           if isinstance(o, aiohttp.ClientSession)]
        print(f"\naiohttp ClientSession: {len(client_sessions)}")
        for client in client_sessions:
            if hasattr(client, '_connector'):
                conn = client._connector
                print(f" Connector: size={conn.size}, limit={conn.limit}, "
                      f"limit_per_host={conn.limit_per_host}")

    print(separator)


if __name__ == "__main__":
    diagnose()
使用方法:
# 方法1:使用 gdb 注入运行进程
gdb -p <pid> -ex 'call PyRun_SimpleString("exec(open(\"debug_injector.py\").read())")' -ex 'detach' -ex 'quit'
# 方法2:使用 manhole(需预先安装)
pip install manhole
# 在应用启动时添加:
import manhole
manhole.install()
# 然后连接诊断
python -m manhole <pid>
- 连接泄漏检测装饰器
import functools
import weakref
import logging

logger = logging.getLogger(__name__)


def track_connections(cls):
    """Class decorator: track every live instance of *cls* via weak references.

    Adds ``get_live_instances()`` and ``log_leaked(threshold)`` classmethods
    and stamps each instance with its creation stack for leak forensics.
    """
    live_set = weakref.WeakSet()
    wrapped_init = cls.__init__

    @functools.wraps(wrapped_init)
    def tracking_init(self, *args, **kwargs):
        wrapped_init(self, *args, **kwargs)
        live_set.add(self)
        # Remember where this instance was created, for later reporting.
        import traceback
        self._creation_stack = traceback.format_stack()

    @classmethod
    def get_live_instances(cls):
        return list(live_set)

    @classmethod
    def log_leaked(cls, threshold=100):
        live = cls.get_live_instances()
        if len(live) > threshold:
            logger.warning(f"检测到 {cls.__name__} 泄漏: {len(live)} 个存活实例")
            for i, inst in enumerate(live[:5]):  # report at most the first five
                if hasattr(inst, '_creation_stack'):
                    logger.warning(f"实例 {i} 创建位置:\n{''.join(inst._creation_stack[-3:])}")

    cls.__init__ = tracking_init
    cls.get_live_instances = get_live_instances
    cls.log_leaked = log_leaked
    return cls
# Usage example.
@track_connections
class DatabaseConnection:
    """Toy connection object used to demonstrate leak tracking."""

    def __init__(self, dsn):
        # dsn: data-source name string; no real connection is opened here.
        self.dsn = dsn
        self.conn = None

    def close(self):
        # Idempotent: safe to call more than once.
        if self.conn:
            self.conn.close()
            self.conn = None
# Periodic leak check.
import asyncio

async def leak_monitor():
    """Background task: every 60s, warn if live DatabaseConnection count exceeds 50."""
    while True:
        DatabaseConnection.log_leaked(threshold=50)
        await asyncio.sleep(60)
📋 六、Python 排查 checklist
□ 确认 Python 版本(3.8+ 的 asyncio 更稳定)
□ 检查是否混用同步/异步代码(requests 在 async def 中?)
□ 验证 Session/ClientSession 是否复用(不要每次新建)
□ 确认所有池化资源有界(ThreadPoolExecutor、连接池)
□ 检查超时配置(连接、读取、总超时缺一不可)
□ 验证资源关闭逻辑(try/finally 或 async with)
□ 确认使用健康检查端点(Kubernetes 场景)
□ 部署监控指标(prometheus_client)
□ 配置 Gunicorn/Uvicorn 工作模式与参数
□ 测试故障注入(模拟下游超时、连接拒绝)