python Web开发从入门到精通(二十四)FastAPI性能优化实战 - 支撑千万级并发访问(下)

6 阅读1分钟

第4部分:异步任务处理 - 解放主请求线程

4.1 为什么需要异步任务队列

FastAPI的BackgroundTasks适合轻量级操作,但对于耗时任务(发邮件、生成报表、图片处理等),应该使用专业的任务队列:

from fastapi import BackgroundTasks
import asyncio

# ❌ Not suitable for long-running work (anti-pattern shown on purpose)
@app.post("/generate-report")
async def generate_report(bg: BackgroundTasks):
    """Anti-pattern demo: BackgroundTasks runs in-process after the
    response, so a 30-second synchronous job still ties up the worker.
    NOTE(review): `time` is not imported in this snippet — assumes the
    enclosing example imports it."""
    def generate_report_sync():
        time.sleep(30)  # simulates a 30-second job
        return "报告生成完成"
    
    bg.add_task(generate_report_sync)  # blocks the worker process!
    return {"status": "processing"}

# ✅ 正确做法:使用Celery或RQ

4.2 集成Celery异步任务队列

创建 app/celery_app.py

from celery import Celery
from celery.schedules import crontab
import asyncio
from app.database import AsyncSessionLocal
from app.models import Order, User
from datetime import datetime
import logging

logger = logging.getLogger(__name__)

# Create the Celery application
celery_app = Celery(
    'fastapi_tasks',
    broker='redis://localhost:6379/1',  # Redis as the message broker
    backend='redis://localhost:6379/2',  # Redis as the result backend
    include=['app.tasks']  # modules containing task definitions
)

# Celery configuration
celery_app.conf.update(
    task_serializer='json',
    accept_content=['json'],
    result_serializer='json',
    timezone='Asia/Shanghai',
    enable_utc=True,
    task_track_started=True,
    task_time_limit=30 * 60,  # hard task timeout: 30 minutes
    task_soft_time_limit=25 * 60,
    worker_prefetch_multiplier=1,  # each worker prefetches only one task at a time
    worker_max_tasks_per_child=1000,  # recycle a worker after 1000 tasks (guards against leaks)
    beat_schedule={
        # Purge expired cache entries at 01:00 every day
        'clean_expired_cache': {
            'task': 'app.tasks.clean_expired_cache',
            'schedule': crontab(hour=1, minute=0),
        },
        # Sync user activity every 5 minutes
        'sync_user_activity': {
            'task': 'app.tasks.sync_user_activity',
            'schedule': 300.0,  # every 300 seconds
        },
    }
)

# Async-aware task base class
class AsyncTask(celery_app.Task):
    """Celery task base class supporting async database operations.

    Fix: a class passed as ``base=`` to ``@celery_app.task`` must subclass
    ``celery.Task`` (here via ``celery_app.Task``); the original plain
    object would fail when the decorated task is registered.
    """

    @staticmethod
    async def get_db_session():
        """Async generator yielding a database session.

        Commits after the consumer finishes successfully and rolls back
        (re-raising) on error.  Consume with ``async for`` or wrap with
        ``contextlib.asynccontextmanager`` to use ``async with``.
        """
        async with AsyncSessionLocal() as session:
            try:
                yield session
                await session.commit()
            except Exception:
                await session.rollback()
                raise

# Using an async database session inside a Celery task
@celery_app.task(bind=True, base=AsyncTask, name="process_order")
def process_order_task(self, order_id: int):
    """Process an order asynchronously.

    Celery invokes sync callables, so the async body is driven via
    ``asyncio.run`` (one fresh event loop per invocation).

    Args:
        order_id: primary key of the order to process.

    Returns:
        dict with ``status`` plus ``order_id`` on success or ``reason`` on failure.
    """
    # Fix: `select` was never imported in this file; calling the task
    # would raise NameError.
    from sqlalchemy import select

    async def async_process():
        async with AsyncSessionLocal() as session:
            # Load the order
            result = await session.execute(
                select(Order).filter(Order.id == order_id)
            )
            order = result.scalar_one_or_none()

            if order:
                # Business logic: mark the order as processed
                order.status = 'processed'
                order.processed_at = datetime.now()
                await session.commit()

                logger.info(f"✅ 订单处理完成: {order_id}")
                return {"status": "success", "order_id": order_id}

            return {"status": "failed", "reason": "订单不存在"}

    # Drive the async code from the synchronous Celery entry point
    return asyncio.run(async_process())

创建 app/tasks.py

import asyncio
from datetime import datetime, timedelta
from typing import List, Dict, Any
import logging

from app.celery_app import celery_app
from app.database import AsyncSessionLocal
from app.models import User, Order, Product
from app.cache import cache

logger = logging.getLogger(__name__)

@celery_app.task
def send_welcome_email(user_id: int):
    """Send a welcome e-mail to *user_id* (runs as a Celery task)."""
    # Simulated send; real delivery logic would go here.
    logger.info(f"📧 正在给用户 {user_id} 发送欢迎邮件...")
    sent_at = datetime.now().isoformat()
    return {"status": "sent", "user_id": user_id, "timestamp": sent_at}

@celery_app.task
def generate_sales_report(start_date: str, end_date: str):
    """Generate a sales report for [start_date, end_date] (long-running task).

    Returns the report dict and caches it for one hour under
    ``sales_report:{start_date}:{end_date}``.
    """
    # Fix: `select` was never imported in this file; calling the task
    # would raise NameError.
    from sqlalchemy import select

    async def async_generate():
        async with AsyncSessionLocal() as session:
            # Orders within the requested period
            result = await session.execute(
                select(Order)
                .filter(Order.created_at >= start_date)
                .filter(Order.created_at <= end_date)
            )
            orders = result.scalars().all()

            # Aggregate totals
            total_sales = sum(order.total_amount for order in orders)
            total_orders = len(orders)
            top_products = {}

            for order in orders:
                # Simplified: a real report would join the product table
                if order.product_id not in top_products:
                    top_products[order.product_id] = 0
                top_products[order.product_id] += 1

            report = {
                # Fix: separator added — the original concatenated the two
                # dates with no delimiter, producing an unreadable period.
                "period": f"{start_date} ~ {end_date}",
                "total_sales": total_sales,
                "total_orders": total_orders,
                "top_products": sorted(
                    top_products.items(),
                    key=lambda x: x[1],
                    reverse=True
                )[:10],
                "generated_at": datetime.now().isoformat()
            }

            # Cache the finished report
            cache_key = f"sales_report:{start_date}:{end_date}"
            await cache.set(cache_key, report, ttl=3600)

            return report

    return asyncio.run(async_generate())

@celery_app.task
def update_product_popularity():
    """Recompute product popularity from the last 24h of orders (periodic task)."""
    # Fix: `select`, `update` and `func` were never imported in this file;
    # calling the task would raise NameError.
    from sqlalchemy import select, update, func

    async def async_update():
        async with AsyncSessionLocal() as session:
            # Orders from the last 24 hours
            yesterday = datetime.now() - timedelta(days=1)

            result = await session.execute(
                select(Order.product_id, func.count(Order.id).label('order_count'))
                .filter(Order.created_at >= yesterday)
                .group_by(Order.product_id)
            )

            product_stats = result.all()

            # Write popularity back, one UPDATE per product
            for product_id, order_count in product_stats:
                await session.execute(
                    update(Product)
                    .where(Product.id == product_id)
                    .values(
                        popularity=order_count,
                        last_updated=datetime.now()
                    )
                )

            await session.commit()

            # Invalidate related caches
            cache_keys = ["hot_products:*", "product:*"]
            for pattern in cache_keys:
                # TODO: requires pattern-based deletion support in the cache layer
                pass

            logger.info(f"✅ 商品热度更新完成,处理了 {len(product_stats)} 个商品")
            return {"status": "success", "count": len(product_stats)}

    return asyncio.run(async_update())

第5部分:部署架构优化 - 从单机到高可用集群

5.1 Uvicorn + Gunicorn 多进程架构

创建 gunicorn_conf.py 配置文件:

import multiprocessing
import os

# Server binding
host = os.getenv("HOST", "0.0.0.0")
port = os.getenv("PORT", "8000")
bind = f"{host}:{port}"

# Worker processes
workers = int(os.getenv("WORKERS", multiprocessing.cpu_count() * 2 + 1))
worker_class = "uvicorn.workers.UvicornWorker"
worker_connections = 1000

# Process management
max_requests = 1000
max_requests_jitter = 50
timeout = 120
keepalive = 2

# Logging
accesslog = "-"
errorlog = "-"
loglevel = "info"

# Performance tuning
preload_app = True  # preload the app before forking, speeding worker startup
reload = False  # disable hot reload in production

# Monitoring
statsd_host = os.getenv("STATSD_HOST", "localhost:9125")

生产环境启动命令:

# Use Gunicorn to manage Uvicorn workers
gunicorn -c gunicorn_conf.py app.main:app

# Or run uvicorn directly (suits containerized deployments)
uvicorn app.main:app \
  --host 0.0.0.0 \
  --port 8000 \
  --workers 8 \
  --loop uvloop \
  --http httptools \
  --log-level info \
  --no-access-log  # disable access logs in production to avoid the overhead

5.2 Nginx负载均衡配置

创建 nginx.conf 配置文件:

# Map the client's Upgrade header to a Connection header value:
# "upgrade" for WebSocket handshakes, empty otherwise. The original
# hard-coded `Connection "upgrade"`, which closes every proxied HTTP/1.1
# connection and defeats the upstream keepalive pool below.
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      '';
}

upstream fastapi_backend {
    # Load-balancing strategy: least connections
    least_conn;

    # Backend servers (extend dynamically as needed)
    server 127.0.0.1:8000 weight=10 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:8001 weight=10 max_fails=3 fail_timeout=30s;
    server 127.0.0.1:8002 weight=10 max_fails=3 fail_timeout=30s;

    # Idle keepalive connections cached per worker
    # (requires proxy_http_version 1.1 and a non-"close" Connection header)
    keepalive 32;
}

server {
    listen 80;
    server_name api.example.com;

    # Static files served directly by Nginx (offloads FastAPI)
    location /static/ {
        alias /var/www/static/;
        expires 30d;
        add_header Cache-Control "public, immutable";
    }

    # Forward API traffic to the FastAPI upstream
    location / {
        proxy_pass http://fastapi_backend;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        # Fix: use the mapped value so plain requests reuse upstream
        # connections while WebSocket upgrades still work.
        proxy_set_header Connection $connection_upgrade;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts
        proxy_connect_timeout 60s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;

        # Buffering
        proxy_buffering on;
        proxy_buffer_size 4k;
        proxy_buffers 8 4k;
        proxy_busy_buffers_size 8k;
    }

    # Health-check endpoint answered by Nginx itself
    location /health {
        access_log off;
        return 200 "healthy\n";
        add_header Content-Type text/plain;
    }
}

5.3 Docker容器化部署

创建 Dockerfile

# Official slim Python base image
FROM python:3.10-slim

# Working directory
WORKDIR /app

# System dependencies (gcc for building wheels, psql client for tooling)
RUN apt-get update && apt-get install -y \
    gcc \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency manifest first to maximize Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY . .

# Run as a non-root user
RUN useradd -m -u 1000 fastapi && chown -R fastapi:fastapi /app
USER fastapi

# Exposed port
EXPOSE 8000

# Health check against the app's /health endpoint
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"

# Start command (can be overridden per environment)
CMD ["gunicorn", "-c", "gunicorn_conf.py", "app.main:app"]

创建 docker-compose.yml 用于本地开发和测试:

version: '3.8'

services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: fastapi
      POSTGRES_PASSWORD: fastapi123
      POSTGRES_DB: fastapi_db
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U fastapi"]
      interval: 10s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  fastapi:
    build: .
    ports:
      - "8000:8000"
    environment:
      DATABASE_URL: postgresql+asyncpg://fastapi:fastapi123@postgres:5432/fastapi_db
      REDIS_URL: redis://redis:6379/0
      WORKERS: 4
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    volumes:
      - ./app:/app/app
      - ./static:/var/www/static

  celery:
    build: .
    command: celery -A app.celery_app worker --loglevel=info --concurrency=4
    environment:
      DATABASE_URL: postgresql+asyncpg://fastapi:fastapi123@postgres:5432/fastapi_db
      REDIS_URL: redis://redis:6379/0
    depends_on:
      - redis
      - postgres

  celery-beat:
    build: .
    command: celery -A app.celery_app beat --loglevel=info
    environment:
      DATABASE_URL: postgresql+asyncpg://fastapi:fastapi123@postgres:5432/fastapi_db
      # Fix: must reference the `redis` service by name — `localhost` inside
      # this container points at the container itself, not Redis.
      REDIS_URL: redis://redis:6379/0
    depends_on:
      - redis
      - postgres

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      # Fix: mount into conf.d — the provided nginx.conf holds only
      # http-level directives (upstream/server) and is invalid as the main
      # /etc/nginx/nginx.conf, which requires an `events` block.
      - ./nginx.conf:/etc/nginx/conf.d/default.conf
      - ./static:/var/www/static
    depends_on:
      - fastapi

volumes:
  postgres_data:
  redis_data:

第6部分:监控与告警 - 可观测性体系

6.1 Prometheus + Grafana 监控方案

创建 app/monitoring.py

import time
import asyncio
from typing import Dict, Any, Optional
from datetime import datetime
import logging
import psutil
import os
from prometheus_client import Counter, Histogram, Gauge, generate_latest, REGISTRY
from fastapi import Request, Response
from contextlib import asynccontextmanager

# 配置日志
logger = logging.getLogger(__name__)

# ==================== Prometheus metric definitions ====================

# Request metrics
REQUEST_COUNT = Counter(
    'fastapi_requests_total',
    '总请求数',
    ['method', 'endpoint', 'status']
)

REQUEST_LATENCY = Histogram(
    'fastapi_request_duration_seconds',
    '请求延迟',
    ['method', 'endpoint'],
    buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0)
)

ACTIVE_REQUESTS = Gauge(
    'fastapi_active_requests',
    '活跃请求数'
)

# Cache metrics
CACHE_HITS_TOTAL = Counter(
    'cache_hits_total',
    '缓存命中数',
    ['cache_layer']
)

CACHE_MISSES_TOTAL = Counter(
    'cache_misses_total',
    '缓存未命中数',
    ['cache_layer']
)

CACHE_SIZE_ITEMS = Gauge(
    'cache_size_items',
    '缓存条目数',
    ['cache_layer']
)

# Database metrics
DB_CONNECTIONS_IN_USE = Gauge(
    'db_connections_in_use',
    '正在使用的数据库连接数'
)

DB_CONNECTIONS_IDLE = Gauge(
    'db_connections_idle',
    '空闲的数据库连接数'
)

DB_CONNECTION_WAIT_SECONDS = Histogram(
    'db_connection_wait_seconds',
    '获取数据库连接的等待时间',
    buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0)
)

# System metrics
SYSTEM_CPU_PERCENT = Gauge(
    'system_cpu_usage_percent',
    '系统CPU使用率'
)

SYSTEM_MEMORY_PERCENT = Gauge(
    'system_memory_usage_percent',
    '系统内存使用率'
)

PROCESS_MEMORY_USAGE_BYTES = Gauge(
    'process_memory_usage_bytes',
    '进程内存使用量'
)

# Business metrics
PRODUCT_VIEWS_TOTAL = Counter(
    'product_views_total',
    '商品查看总数',
    ['product_id']
)

ORDER_CREATED_TOTAL = Counter(
    'orders_created_total',
    '订单创建总数'
)

# ==================== Monitoring middleware ====================

async def monitor_request(request: Request, call_next) -> Response:
    """
    Monitoring middleware: records per-request metrics.

    Responsibilities:
    1. Request latency histogram
    2. Request counters (method / endpoint / status)
    3. Active-request gauge
    4. Request tracing ID (X-Request-ID header)
    """
    start_time = time.time()

    # One more request in flight
    ACTIVE_REQUESTS.inc()

    # Tracing ID: millisecond timestamp + worker PID
    request_id = f"req_{int(start_time * 1000)}_{os.getpid()}"
    request.state.request_id = request_id

    try:
        # Run the downstream handler
        response = await call_next(request)

        latency = time.time() - start_time

        # Count the request
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=request.url.path,
            status=response.status_code
        ).inc()

        # Record latency
        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=request.url.path
        ).observe(latency)

        # Expose tracing / timing info to clients
        response.headers["X-Request-ID"] = request_id
        response.headers["X-Response-Time"] = f"{latency:.3f}s"

        # Flag slow requests
        if latency > 1.0:  # more than 1 second counts as slow
            logger.warning(f"🚨 慢请求检测: {request.method} {request.url.path} - {latency:.3f}s")

        return response

    except Exception as e:
        # Count failed requests as 500s
        latency = time.time() - start_time
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=request.url.path,
            status=500
        ).inc()

        # Fix: failed requests were missing from the latency histogram,
        # which skewed percentile metrics under error load.
        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=request.url.path
        ).observe(latency)

        logger.error(f"❌ 请求处理异常: {request.method} {request.url.path} - {str(e)}")
        raise

    finally:
        # Request no longer in flight
        ACTIVE_REQUESTS.dec()

# ==================== System monitoring ====================

async def update_system_metrics():
    """Refresh CPU / memory gauges for the host and the current process."""
    try:
        # Host CPU utilization (non-blocking sample)
        SYSTEM_CPU_PERCENT.set(psutil.cpu_percent(interval=None))

        # Host memory utilization
        vm = psutil.virtual_memory()
        SYSTEM_MEMORY_PERCENT.set(vm.percent)

        # Resident set size of this worker process
        rss = psutil.Process(os.getpid()).memory_info().rss
        PROCESS_MEMORY_USAGE_BYTES.set(rss)

        # Possible extensions:
        # - disk I/O
        # - network I/O
        # - open file descriptors
        # - thread count

    except Exception as e:
        logger.error(f"❌ 更新系统指标失败: {e}")

# ==================== Database monitoring ====================

async def monitor_database_pool():
    """Publish DB pool gauges and warn when the pool is nearly exhausted."""
    try:
        from app.database import get_pool_stats

        stats = await get_pool_stats()

        if stats:
            in_use = stats.get('connections_in_use', 0)
            idle = stats.get('connections_idle', 0)
            DB_CONNECTIONS_IN_USE.set(in_use)
            DB_CONNECTIONS_IDLE.set(idle)

            # Pool utilization check
            total = in_use + idle
            pool_size = stats.get('pool_size', 20)

            if total > 0 and pool_size > 0:
                usage_rate = in_use / pool_size
                if usage_rate > 0.8:
                    logger.warning(f"⚠️ 数据库连接池使用率过高: {usage_rate:.1%}")

    except Exception as e:
        logger.error(f"❌ 数据库连接池监控失败: {e}")

# ==================== Cache monitoring ====================

async def monitor_cache_stats():
    """Publish hit/miss/size metrics for the local and Redis cache layers."""
    try:
        from app.cache import cache

        stats = cache.get_stats()

        # NOTE(review): if get_stats() returns *cumulative* totals, calling
        # Counter.inc(total) on every monitoring cycle double-counts; this is
        # only correct if the stats are deltas since the previous call —
        # confirm against app.cache.
        local_stats = stats.get('local_cache', {})
        CACHE_HITS_TOTAL.labels(cache_layer='local').inc(local_stats.get('hits', 0))
        CACHE_MISSES_TOTAL.labels(cache_layer='local').inc(local_stats.get('misses', 0))
        CACHE_SIZE_ITEMS.labels(cache_layer='local').set(local_stats.get('size', 0))

        redis_stats = stats.get('redis_cache', {})
        CACHE_HITS_TOTAL.labels(cache_layer='redis').inc(redis_stats.get('hits', 0))
        CACHE_MISSES_TOTAL.labels(cache_layer='redis').inc(redis_stats.get('misses', 0))

    except Exception as e:
        logger.error(f"❌ 缓存监控失败: {e}")

# ==================== Combined monitoring service ====================

class MonitoringService:
    """Background service that periodically refreshes all metrics."""

    def __init__(self):
        # Handle of the periodic refresh task (None until started)
        self.metrics_update_task = None
        self.is_running = False

    async def start(self):
        """Start the monitoring service (idempotent)."""
        if self.is_running:
            return

        self.is_running = True

        # Kick off the periodic metric refresh loop
        self.metrics_update_task = asyncio.create_task(self._periodic_update())

        logger.info("✅ 监控服务已启动")

    async def stop(self):
        """Stop the monitoring service and wait for the loop to exit."""
        self.is_running = False

        if self.metrics_update_task:
            self.metrics_update_task.cancel()
            try:
                await self.metrics_update_task
            except asyncio.CancelledError:
                pass

        logger.info("✅ 监控服务已停止")

    async def _periodic_update(self):
        """Refresh system / DB / cache / app metrics every 30 seconds."""
        while self.is_running:
            try:
                # Host-level gauges
                await update_system_metrics()

                # Database pool gauges
                await monitor_database_pool()

                # Cache statistics
                await monitor_cache_stats()

                # Application-level metrics
                await self._record_app_metrics()

                # Refresh interval
                await asyncio.sleep(30)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"❌ 周期性监控更新失败: {e}")
                await asyncio.sleep(60)  # back off longer after a failure

    async def _record_app_metrics(self):
        """Record application-specific metrics (currently placeholders)."""
        try:
            # Process uptime (placeholder — not yet exported as a metric)
            uptime = time.time() - psutil.Process(os.getpid()).create_time()

            # Force a GC pass so memory readings reflect live objects
            import gc
            gc.collect()

            # Live (unfinished) asyncio tasks (placeholder)
            tasks = [task for task in asyncio.all_tasks() if not task.done()]

            # Fix: removed the original `asyncio.get_event_loop()` call —
            # deprecated inside a coroutine since Python 3.10 and its result
            # was unused. Event-loop lag could be measured via
            # asyncio.get_running_loop() if needed.

            # Further app-specific metrics can be added here.

        except Exception as e:
            logger.error(f"❌ 记录应用指标失败: {e}")

    async def get_system_info(self) -> Dict[str, Any]:
        """Return a snapshot of host / memory / disk / process information."""
        try:
            import platform

            return {
                "timestamp": datetime.now().isoformat(),
                "system": {
                    "platform": platform.platform(),
                    "python_version": platform.python_version(),
                    "cpu_count": psutil.cpu_count(),
                    "cpu_physical_cores": psutil.cpu_count(logical=False),
                },
                "memory": {
                    "total": psutil.virtual_memory().total,
                    "available": psutil.virtual_memory().available,
                    "percent": psutil.virtual_memory().percent,
                },
                "disk": {
                    "total": psutil.disk_usage("/").total,
                    "used": psutil.disk_usage("/").used,
                    "free": psutil.disk_usage("/").free,
                    "percent": psutil.disk_usage("/").percent,
                },
                "process": {
                    "pid": os.getpid(),
                    "memory_rss": psutil.Process(os.getpid()).memory_info().rss,
                    "cpu_percent": psutil.Process(os.getpid()).cpu_percent(),
                }
            }

        except Exception as e:
            logger.error(f"❌ 获取系统信息失败: {e}")
            return {"error": str(e)}

# ==================== FastAPI endpoints ====================

def setup_monitoring_endpoints(app):
    """Register monitoring endpoints (/metrics, /health, /system/*) on *app*."""

    @app.get("/metrics")
    async def metrics():
        """Prometheus scrape endpoint."""
        try:
            # Refresh gauges right before exposition
            await update_system_metrics()
            await monitor_database_pool()
            await monitor_cache_stats()

            # Emit the Prometheus text exposition format
            return Response(
                content=generate_latest(REGISTRY),
                media_type="text/plain"
            )

        except Exception as e:
            logger.error(f"❌ 生成指标失败: {e}")
            return Response(
                content=f"Error generating metrics: {str(e)}",
                status_code=500,
                media_type="text/plain"
            )

    @app.get("/health")
    async def health_check():
        """Aggregate health check: database, cache, and host load."""
        try:
            # Database health
            from app.database import check_database_health
            db_health = await check_database_health()

            # Cache health
            from app.cache import check_cache_health
            cache_health = await check_cache_health()

            # Host load
            cpu_percent = psutil.cpu_percent(interval=0.1)
            memory_percent = psutil.virtual_memory().percent

            # Overall status: unhealthy if any component or resource is bad
            overall_status = "healthy"

            if db_health.get("status") != "healthy":
                overall_status = "unhealthy"
            elif cache_health.get("status") != "healthy":
                overall_status = "unhealthy"
            elif cpu_percent > 90:
                overall_status = "unhealthy"
            elif memory_percent > 90:
                overall_status = "unhealthy"

            return {
                "status": overall_status,
                "timestamp": datetime.now().isoformat(),
                "database": db_health,
                "cache": cache_health,
                "system": {
                    "cpu_percent": cpu_percent,
                    "memory_percent": memory_percent,
                },
                "uptime": time.time() - psutil.Process(os.getpid()).create_time(),
            }

        except Exception as e:
            logger.error(f"❌ 健康检查失败: {e}")
            return {
                "status": "unhealthy",
                "timestamp": datetime.now().isoformat(),
                "error": str(e),
            }

    @app.get("/system/info")
    async def system_info():
        """System information endpoint."""
        try:
            monitoring_service = MonitoringService()
            info = await monitoring_service.get_system_info()
            return info

        except Exception as e:
            logger.error(f"❌ 获取系统信息失败: {e}")
            return {"error": str(e)}

    @app.get("/system/stats")
    async def system_stats():
        """System statistics endpoint (DB pool, cache, host, app counters)."""
        try:
            # Database pool statistics
            from app.database import get_pool_stats
            db_stats = await get_pool_stats()

            # Cache statistics
            from app.cache import cache
            cache_stats = cache.get_stats()

            return {
                "database": db_stats,
                "cache": cache_stats,
                "system": {
                    "cpu_count": psutil.cpu_count(),
                    "cpu_percent": psutil.cpu_percent(interval=0.1),
                    "memory_total": psutil.virtual_memory().total,
                    "memory_used": psutil.virtual_memory().used,
                    "memory_percent": psutil.virtual_memory().percent,
                    "disk_total": psutil.disk_usage("/").total,
                    "disk_used": psutil.disk_usage("/").used,
                    "disk_percent": psutil.disk_usage("/").percent,
                },
                "application": {
                    "active_requests": ACTIVE_REQUESTS._value.get(),
                    # Fix: the original called self._calculate_request_rate(),
                    # but `self` does not exist in this plain function — it
                    # raised NameError. Call the module-level helper directly.
                    "request_rate": _calculate_request_rate(),
                }
            }

        except Exception as e:
            logger.error(f"❌ 获取系统统计失败: {e}")
            return {"error": str(e)}

# ==================== 工具函数 ====================

def _calculate_request_rate():
    """计算请求率(简化实现)"""
    # 实际项目中可以从日志或专门的计数器中计算
    return 0

def setup_request_logging():
    """Install a JSON line formatter on the root logger for request logs."""
    import logging
    import json

    class RequestLogFormatter(logging.Formatter):
        """Formats each log record as a single-line JSON document."""

        def format(self, record):
            # Base fields present on every record
            payload = {
                "timestamp": self.formatTime(record),
                "level": record.levelname,
                "message": record.getMessage(),
                "module": record.module,
                "funcName": record.funcName,
                "lineno": record.lineno,
            }

            # Optional request-tracing fields, attached by the middleware
            if hasattr(record, 'request_id'):
                payload['request_id'] = record.request_id

            if hasattr(record, 'method'):
                payload['method'] = record.method
                payload['endpoint'] = record.endpoint
                payload['status'] = record.status
                payload['latency'] = record.latency

            return json.dumps(payload, ensure_ascii=False)

    # Wire the formatter into a stream handler on the root logger
    json_handler = logging.StreamHandler()
    json_handler.setFormatter(RequestLogFormatter())
    logging.getLogger().addHandler(json_handler)

    logger.info("✅ 请求日志配置完成")

# Global monitoring service instance
monitoring_service = MonitoringService()

# Start the monitoring service
async def start_monitoring():
    """Start the global monitoring service."""
    await monitoring_service.start()

# Stop the monitoring service
async def stop_monitoring():
    """Stop the global monitoring service.

    Fix: the original awaited ``monitoringitoring_service.stop()`` — a
    typo'd name that raised NameError whenever shutdown ran.
    """
    await monitoring_service.stop()

第7部分:高级优化技巧 - 超越基础配置

7.1 连接池深度调优

除了基本的连接池配置,还有一些高级优化手段:

# Advanced connection-pool configuration
engine = create_async_engine(
    DATABASE_URL,
    # Basic settings
    pool_size=20,
    max_overflow=30,
    
    # Advanced tuning
    pool_timeout=30,        # max wait when acquiring a connection
    pool_recycle=1800,      # recycle connections (avoids server-side idle timeouts)
    pool_pre_ping=True,     # health-check a connection before handing it out
    pool_use_lifo=True,     # LIFO checkout improves cache locality
    
    # Performance
    echo=False,             # disable SQL logging in production
    echo_pool=False,        # pool event logging
    hide_parameters=True,   # hide SQL parameters in production logs
    
    # Connection settings
    connect_args={
        "server_settings": {
            "jit": "off",                  # disable PostgreSQL JIT compilation
            "effective_cache_size": "8GB", # advertise available cache to the planner
        },
        "command_timeout": 30,             # per-statement timeout
    }
)

7.2 查询优化策略

批量查询优化

# ❌ Inefficient: one query per id (N round-trips) — anti-pattern shown on purpose
async def get_users(ids: List[int], db: AsyncSession):
    users = []
    for user_id in ids:
        result = await db.execute(select(User).filter(User.id == user_id))
        user = result.scalar_one_or_none()
        users.append(user)
    return users

# ✅ Efficient: a single batched IN query
async def get_users_batch(ids: List[int], db: AsyncSession):
    result = await db.execute(select(User).filter(User.id.in_(ids)))
    return result.scalars().all()

懒加载与预加载

# Lazy loading (query on demand)
async def get_user_with_orders_lazy(user_id: int, db: AsyncSession):
    result = await db.execute(select(User).filter(User.id == user_id))
    user = result.scalar_one()
    # Orders are only queried when user.orders is first accessed.
    # NOTE(review): with async SQLAlchemy, plain lazy loading outside the
    # session's greenlet context raises MissingGreenlet — confirm the
    # relationship uses an asyncio-safe lazy strategy.
    return user

# Eager loading (single joined query)
async def get_user_with_orders_eager(user_id: int, db: AsyncSession):
    result = await db.execute(
        select(User).options(joinedload(User.orders)).filter(User.id == user_id)
    )
    # unique() is required when joinedload() fans rows out over a collection
    return result.unique().scalar_one()

7.3 缓存高级策略

热点数据识别与预热

class HotSpotDetector:
    """Tracks cache-key access frequency and identifies hot keys."""

    def __init__(self):
        # key -> access count accumulated since construction
        self.access_counts = {}
        self.last_reset_time = time.time()

    def record_access(self, key: str):
        """Count one access to *key*."""
        current = self.access_counts.get(key, 0)
        self.access_counts[key] = current + 1

    def get_hot_keys(self, threshold: int = 100) -> List[str]:
        """Return the keys accessed at least *threshold* times."""
        hot = []
        for key, count in self.access_counts.items():
            if count >= threshold:
                hot.append(key)
        return hot

    async def preload_hot_data(self, db: AsyncSession):
        """Load hot product rows from the database and push them into the cache."""
        for hot_key in self.get_hot_keys():
            if not hot_key.startswith('product:'):
                continue
            pid = int(hot_key.split(':')[1])
            result = await db.execute(
                select(Product).filter(Product.id == pid)
            )
            row = result.scalar_one_or_none()
            if row:
                await cache.set(hot_key, row, ttl=3600)

缓存雪崩防护

# Fix: this standalone snippet used `random`, `time`, `asyncio`, `Callable`
# and `Any` without importing them; the annotations alone would raise
# NameError when the class is defined.
import asyncio
import random
import time
from typing import Any, Callable

class CacheAvalancheProtection:
    """Cache avalanche / breakdown protection.

    On a cache miss, a mutex ensures only one caller rebuilds the value
    while the others wait briefly and re-read (double-checked locking).
    TTLs are jittered so entries do not all expire at the same instant.
    """

    def __init__(self, cache_instance):
        self.cache = cache_instance
        self.mutexes = {}  # kept for interface compatibility (currently unused)

    async def get_with_protection(self, key: str, loader_func: Callable, ttl: int = 300) -> Any:
        """Fetch *key* from cache, rebuilding via *loader_func* under a mutex.

        Fix: the original recursed on lock contention, risking very deep
        stacks under heavy load; this version retries in a loop.
        """
        while True:
            # Fast path: cache hit
            cached_data = await self.cache.get(key)
            if cached_data is not None:
                return cached_data

            mutex_key = f"mutex:{key}"
            mutex_acquired = False

            try:
                # Try to become the single rebuilder
                mutex_acquired = await self._acquire_mutex(mutex_key)

                if mutex_acquired:
                    # Double-check: another worker may have filled the cache
                    cached_data = await self.cache.get(key)
                    if cached_data is not None:
                        return cached_data

                    # Run the loader (sync or async)
                    if asyncio.iscoroutinefunction(loader_func):
                        data = await loader_func()
                    else:
                        data = loader_func()

                    # Jittered TTL prevents synchronized expiry (avalanche)
                    randomized_ttl = ttl + random.randint(-60, 60)
                    await self.cache.set(key, data, randomized_ttl)

                    return data

                # Lost the race: back off briefly, then retry the loop
                await asyncio.sleep(0.01)

            finally:
                if mutex_acquired:
                    await self._release_mutex(mutex_key)

    async def _acquire_mutex(self, key: str, timeout: int = 10) -> bool:
        """Best-effort distributed lock via SET NX EX.

        NOTE(review): assumes ``cache.set`` accepts redis-style ``ex``/``nx``
        keyword arguments, unlike the positional ttl used above — confirm
        against the cache layer's API.
        """
        try:
            result = await self.cache.set(
                key, 
                f"locked_{int(time.time())}", 
                ex=timeout, 
                nx=True  # set only if the key does not exist
            )
            return result
        except Exception:
            return False

    async def _release_mutex(self, key: str):
        """Release the mutex; failures are ignored (the lock expires via TTL)."""
        try:
            await self.cache.delete(key)
        except Exception:
            pass

第8部分:实战案例分析 - 大流量场景应对

8.1 秒杀场景优化

挑战

  • 瞬时超高并发
  • 库存一致性要求
  • 防止超卖

解决方案

@router.post("/seckill/{product_id}")
async def seckill_product(
    product_id: int,
    # FIX: the original body used `background_tasks` without declaring it,
    # raising NameError on every successful path. FastAPI injects this
    # parameter; requires `from fastapi import BackgroundTasks` at the top
    # of the file.
    background_tasks: BackgroundTasks,
    user_id: int = Depends(get_current_user),
    db: AsyncSession = Depends(get_db)
):
    """
    秒杀商品接口

    Layered defences:
      1. Per-user rate limit (3 attempts per minute).
      2. Stock pre-check against a short-lived Redis cache (5s).
      3. Distributed lock + atomic DECR to prevent overselling.
      4. Order persistence deferred to a background task.

    Raises:
        HTTPException: 429 (rate limited), 400 (sold out / no stock),
            409 (lock contention).
    """
    # 1. Rate limiting: INCR + first-hit EXPIRE gives a 60s window.
    redis = await cache.connect()
    key = f"seckill:rate:{user_id}:{product_id}"
    attempts = await redis.incr(key)

    if attempts == 1:
        await redis.expire(key, 60)  # per-minute window

    if attempts > 3:
        raise HTTPException(status_code=429, detail="请求过于频繁")

    # 2. Stock pre-check from cache; fall back to the database and cache
    #    the value briefly to shield the DB from the thundering herd.
    stock_key = f"seckill:stock:{product_id}"
    stock = await redis.get(stock_key)

    if not stock:
        result = await db.execute(
            select(Product.stock).filter(Product.id == product_id)
        )
        db_stock = result.scalar() or 0

        # Short TTL: stale stock is tolerable for 5s, the DECR below is
        # what actually enforces correctness.
        await redis.setex(stock_key, 5, db_stock)
        stock = db_stock
    else:
        stock = int(stock)

    if stock <= 0:
        raise HTTPException(status_code=400, detail="商品已售罄")

    # 3. Distributed lock so stock mutation is serialized per product.
    lock_key = f"seckill:lock:{product_id}"
    lock_token = f"{user_id}_{int(time.time()*1000)}"
    lock_acquired = False

    try:
        lock_acquired = await redis.set(
            lock_key,
            lock_token,
            ex=5,    # 5s expiry bounds a crashed holder
            nx=True  # set only if the key does not exist yet
        )

        if not lock_acquired:
            raise HTTPException(status_code=409, detail="系统繁忙,请重试")

        # 4. Atomic stock decrement in Redis.
        remaining = await redis.decr(stock_key)

        if remaining < 0:
            # Roll the decrement back before reporting failure.
            await redis.incr(stock_key)
            raise HTTPException(status_code=400, detail="库存不足")

        # 5. Persist the order off the request path.
        background_tasks.add_task(
            create_seckill_order,
            product_id,
            user_id
        )

        return {
            "status": "success",
            "message": "秒杀成功",
            "remaining_stock": remaining
        }

    finally:
        # Release only when we actually hold the lock, and only if the
        # token still matches (it may have expired and been re-taken).
        # FIX: Redis clients may return bytes, so the original str
        # comparison never matched and the lock always died by expiry.
        # NOTE(review): GET-then-DEL is not atomic; a Lua script or
        # redis.lock would make the check-and-delete race-free.
        if lock_acquired:
            current_token = await redis.get(lock_key)
            if isinstance(current_token, bytes):
                current_token = current_token.decode()
            if current_token == lock_token:
                await redis.delete(lock_key)

8.2 高并发查询优化

挑战

  • 数据库连接池压力
  • 查询响应时间波动
  • 缓存命中率不稳定

解决方案

class QueryOptimizer:
    """Caching and monitoring wrapper for ad-hoc async query callables.

    Serves results from cache when a key is supplied, logs and tallies
    slow queries, and adapts the cache TTL to how expensive the query
    turned out to be (costlier queries are cached longer).
    """

    def __init__(self):
        # Aggregate stats per query name: {count, total_time, max_time}.
        self.query_stats = {}
        # Queries slower than this (seconds) are logged as slow.
        self.slow_query_threshold = 1.0

    async def optimized_query(self, query_func: Callable, cache_key: str = None) -> Any:
        """Run *query_func* with cache-first lookup and slow-query tracking.

        Args:
            query_func: zero-argument async callable producing the result.
            cache_key: optional cache key; when None, caching is skipped.

        Returns:
            The cached or freshly computed result.

        Raises:
            Whatever query_func raises (after logging the failure).
        """
        # perf_counter is monotonic, so elapsed time is immune to NTP/DST
        # wall-clock jumps (time.time() is not).
        start_time = time.perf_counter()

        # 1. Cache first.
        if cache_key:
            cached_data = await cache.get(cache_key)
            if cached_data is not None:
                return cached_data

        # 2. Execute the query.
        try:
            result = await query_func()
        except Exception as e:
            query_time = time.perf_counter() - start_time
            logger.error(f"❌ 查询失败: {query_func.__name__}, 耗时{query_time:.3f}s, 错误{str(e)}")
            raise

        # 3. Record query performance.
        query_time = time.perf_counter() - start_time
        if query_time > self.slow_query_threshold:
            self._log_slow_query(query_func.__name__, query_time)

        # 4. Refresh the cache with a latency-adaptive TTL.
        if cache_key:
            ttl = self._calculate_ttl(query_time)
            await cache.set(cache_key, result, ttl)

        return result

    def _calculate_ttl(self, query_time: float) -> int:
        """Map query latency (seconds) to a cache TTL (seconds)."""
        if query_time > 2.0:
            return 3600  # very slow queries cache the longest
        elif query_time > 1.0:
            return 1800
        elif query_time > 0.5:
            return 900
        return 300

    def _log_slow_query(self, query_name: str, query_time: float):
        """Log a slow query and fold it into the running statistics."""
        logger.warning(f"⚠️ 慢查询: {query_name}, 耗时{query_time:.3f}s")

        # setdefault replaces the manual "if not in dict: init" dance.
        stats = self.query_stats.setdefault(
            query_name,
            {"count": 0, "total_time": 0, "max_time": 0},
        )
        stats["count"] += 1
        stats["total_time"] += query_time
        stats["max_time"] = max(stats["max_time"], query_time)

第9部分:总结与最佳实践

9.1 核心优化要点总结

经过完整的性能优化实践,我们总结出以下关键要点:

  1. 异步架构是基础

    • 使用异步数据库驱动(asyncpg/aiomysql)
    • 避免在异步函数中调用同步代码
    • 合理使用asyncio任务和协程
  2. 连接池优化是关键

    • 设置合理的pool_size和max_overflow
    • 启用pool_pre_ping防止失效连接
    • 监控连接池使用情况
  3. 缓存策略是核武器

    • 实施多级缓存(进程内+Redis)
    • 防缓存穿透和缓存击穿
    • 合理的缓存失效策略
  4. 部署架构是保障

    • 使用Gunicorn管理Uvicorn workers
    • Nginx负载均衡和静态文件服务
    • 容器化部署和自动扩缩容
  5. 可观测性是眼睛

    • 全面的监控指标
    • 及时的告警机制
    • 性能基准测试

9.2 持续优化建议

性能优化不是一次性工作,而是一个持续的过程:

  1. 定期性能评估

    • 每周review关键指标
    • 每月分析瓶颈和优化机会
    • 每季度架构评估
  2. 渐进式优化

    • 一次只优化一个方面
    • 每次优化都要有数据支撑
    • 避免过度优化
  3. 技术栈更新

    • 关注FastAPI和依赖库更新
    • 及时应用性能改进
    • 测试新特性的影响
  4. 容量规划

    • 根据业务增长预测资源需求
    • 提前规划扩容方案
    • 成本效益分析

9.3 行动号召

现在,轮到你动手实践了:

  1. 诊断现有应用

    • 使用本文提供的工具分析你的FastAPI应用
    • 识别性能瓶颈和优化机会
  2. 逐步实施优化

    • 从连接池优化开始
    • 然后添加缓存层
    • 最后优化部署架构
  3. 建立监控体系

    • 部署Prometheus和Grafana
    • 设置关键指标告警
    • 建立性能基准
  4. 持续学习和改进

    • 关注FastAPI社区和最佳实践
    • 定期review和优化你的应用
    • 分享你的经验和成果

记住:性能优化是一场马拉松,不是短跑。通过系统化的方法、持续的改进和正确的工具,你的FastAPI应用也能轻松应对千万级并发访问!

👋 我是扣子,一名专注于Python后端技术的开发者。如果你有任何问题或想要深入讨论,欢迎在评论区留言!

🚀 让我们一起构建高性能、高可用的后端系统!