LLM Microservice Architecture: Building Scalable AI Backend Services

Why LLMs Need a Special Microservice Architecture

Wrapping an LLM call as a microservice sounds simple: isn't it just wrapping an API call? In practice, the characteristics of LLMs make such a service considerably more complex than an ordinary HTTP service:

  1. High, unpredictable latency: P99 latency can reach 30-60 seconds, so ordinary service timeout policies simply don't apply (see the timeout sketch right after this list)
  2. Tokens are a scarce resource: billing is by the amount of text processed, not per request, so usage must be controlled precisely
  3. Streaming responses are the norm: users expect to see output in real time, not a single flush after a 30-second wait
  4. Unusual failure modes: model hallucinations, safety filtering, and content truncation all require handling at the business layer
  5. Complex context state: managing session state for multi-turn conversations is far more involved than for an ordinary API
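
To make point 1 concrete, here is a minimal sketch of client-side timeout settings tuned for LLM traffic, using httpx (the specific numbers are assumptions to be adapted to your observed P99):

import httpx

# Connections should still be established quickly, but reads must tolerate
# generations that legitimately take tens of seconds to complete
llm_client = httpx.AsyncClient(
    timeout=httpx.Timeout(
        connect=5.0,   # TCP/TLS setup: fail fast
        read=120.0,    # allow long pauses while tokens are generated
        write=10.0,
        pool=5.0,
    )
)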

This article lays out a production-ready LLM microservice architecture, covering the full chain from interface design to operations and monitoring.


Core Architecture: The LLM Gateway Pattern

Clients (Web / App / Other Services)
                    │
                    ▼
         ┌──────────────────────┐
         │     LLM Gateway      │  ← unified entry point
         │  - authentication    │
         │  - model routing     │
         │  - rate limiting &   │
         │    circuit breaking  │
         │  - request logging   │
         └──────────┬───────────┘
                    │
        ┌───────────┼───────────┐
        ▼           ▼           ▼
   ┌─────────┐ ┌─────────┐ ┌─────────┐
   │  Chat   │ │   RAG   │ │  Agent  │  ← business service layer
   │ Service │ │ Service │ │ Service │
   └─────────┘ └─────────┘ └─────────┘
        │           │           │
        └───────────┼───────────┘
                    ▼
          ┌───────────────────┐
          │ LLM Provider Pool │  ← model invocation layer
          │  - OpenAI         │
          │  - Anthropic      │
          │  - self-hosted    │
          └───────────────────┘

Layer 1: Implementing the LLM Gateway

A basic Gateway

from fastapi import FastAPI, HTTPException
import time
import uuid

app = FastAPI()

class LLMGateway:
    def __init__(self):
        # Supporting components; minimal RateLimiter/CircuitBreaker
        # sketches follow this snippet
        self.rate_limiter = RateLimiter()
        self.circuit_breaker = CircuitBreaker()
        self.logger = StructuredLogger()
        self.router = ModelRouter()
    
    async def handle_request(
        self,
        request: LLMRequest,
        user: User,
        stream: bool = False
    ):
        request_id = str(uuid.uuid4())
        start_time = time.time()
        
        # 1. Rate-limit check
        if not await self.rate_limiter.allow(user.id):
            raise HTTPException(429, "Rate limit exceeded")
        
        # 2. Circuit-breaker check
        if not self.circuit_breaker.is_healthy():
            raise HTTPException(503, "Service temporarily unavailable")
        
        # 3. Routing (pick the best model/provider for this request)
        provider = self.router.select(request, user)
        
        # 4. Execute the request
        try:
            if stream:
                return await self._stream_request(request, provider, request_id)
            else:
                response = await self._complete_request(request, provider, request_id)
                
                # 5. Structured request logging
                await self.logger.log_completion(
                    request_id=request_id,
                    user_id=user.id,
                    model=provider.model,
                    input_tokens=response.usage.prompt_tokens,
                    output_tokens=response.usage.completion_tokens,
                    latency_ms=int((time.time() - start_time) * 1000),
                    success=True
                )
                return response
        except Exception:
            self.circuit_breaker.record_failure()
            raise
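
The Gateway above references a RateLimiter and a CircuitBreaker without defining them. Here is a minimal in-process sketch of both (class names match the Gateway; the thresholds are illustrative assumptions, and a production rate limiter would usually live in Redis so limits hold across replicas):

import time
from collections import defaultdict, deque

class RateLimiter:
    """Sliding-window limiter: at most max_requests per window per user."""
    def __init__(self, max_requests: int = 60, window_seconds: float = 60.0):
        self.max_requests = max_requests
        self.window = window_seconds
        self._hits: dict[str, deque] = defaultdict(deque)

    async def allow(self, user_id: str) -> bool:
        now = time.time()
        hits = self._hits[user_id]
        # Drop timestamps that have fallen out of the window
        while hits and hits[0] <= now - self.window:
            hits.popleft()
        if len(hits) >= self.max_requests:
            return False
        hits.append(now)
        return True

class CircuitBreaker:
    """Opens after N consecutive failures, re-closes after a cooldown."""
    def __init__(self, failure_threshold: int = 5, reset_seconds: float = 30.0):
        self.failure_threshold = failure_threshold
        self.reset_seconds = reset_seconds
        self._failures = 0
        self._opened_at: float | None = None

    def is_healthy(self) -> bool:
        if self._opened_at is None:
            return True
        # Half-open: allow traffic again once the cooldown has passed
        if time.time() - self._opened_at >= self.reset_seconds:
            self._opened_at = None
            self._failures = 0
            return True
        return False

    def record_failure(self):
        self._failures += 1
        if self._failures >= self.failure_threshold:
            self._opened_at = time.time()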

The model router

class ModelRouter:
    """Pick the best model for a request based on its characteristics."""
    
    def __init__(self):
        # Prices in $ per 1M tokens (matching the billing table further below)
        self.models = {
            "gpt-4o": {"cost_per_1m_input": 5, "cost_per_1m_output": 15, "max_context": 128000},
            "gpt-4o-mini": {"cost_per_1m_input": 0.15, "cost_per_1m_output": 0.6, "max_context": 128000},
            "claude-3-5-sonnet": {"cost_per_1m_input": 3, "cost_per_1m_output": 15, "max_context": 200000},
        }
    
    def select(self, request: LLMRequest, user: User) -> ProviderConfig:
        token_count = self._estimate_tokens(request)
        
        # Very long context → pick a model with a large context window
        if token_count > 100000:
            return ProviderConfig(model="claude-3-5-sonnet")
        
        # Enterprise users → highest-quality model
        if user.tier == "enterprise":
            return ProviderConfig(model="gpt-4o")
        
        # Simple tasks (classification, summarization, translation) → cheap model
        if request.task_type in ["classification", "summarization", "translation"]:
            return ProviderConfig(model="gpt-4o-mini")
        
        # Default: the best cost/quality trade-off
        return ProviderConfig(model="gpt-4o-mini")
    
    def _estimate_tokens(self, request: LLMRequest) -> int:
        # Rough heuristic: ~4 characters per token (for English text)
        total_chars = sum(len(m.content) for m in request.messages)
        return total_chars // 4
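
The four-characters-per-token heuristic is crude, and it skews badly for non-English text. When the estimate feeds routing or budget checks, a real tokenizer is cheap enough to run inline; a sketch using tiktoken (the cl100k_base fallback is an assumption for models the library doesn't recognize):

import tiktoken

def count_tokens(text: str, model: str = "gpt-4o") -> int:
    """Count tokens with the model's actual tokenizer when available."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model: fall back to a general-purpose encoding
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))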

Layer 2: Streaming Response Design

Streaming is the core user experience of LLM applications and must be supported correctly at the architecture level:

from fastapi import APIRouter, Depends
from fastapi.responses import StreamingResponse
from openai import AsyncOpenAI
from typing import AsyncIterator
import json

router = APIRouter()
client = AsyncOpenAI()

@router.post("/v1/chat/stream")
async def stream_chat(request: ChatRequest, user: User = Depends(get_current_user)):
    """Streaming chat endpoint."""
    
    async def generate() -> AsyncIterator[str]:
        usage_tokens = {"prompt": 0, "completion": 0}
        
        try:
            stream = await client.chat.completions.create(
                model=request.model,
                messages=request.messages,
                stream=True,
                stream_options={"include_usage": True},  # ask for token usage in the stream
            )
            
            # Emit a start event
            yield f"data: {json.dumps({'type': 'start', 'request_id': request.id})}\n\n"
            
            async for chunk in stream:
                if chunk.choices and chunk.choices[0].delta.content:
                    content = chunk.choices[0].delta.content
                    yield f"data: {json.dumps({'type': 'delta', 'content': content})}\n\n"
                
                # Capture usage info (it arrives in the final chunk)
                if chunk.usage:
                    usage_tokens["prompt"] = chunk.usage.prompt_tokens
                    usage_tokens["completion"] = chunk.usage.completion_tokens
            
            # Emit a done event
            yield f"data: {json.dumps({'type': 'done', 'usage': usage_tokens})}\n\n"
            yield "data: [DONE]\n\n"
            
        except Exception as e:
            # Send errors as SSE too, so the client can handle them gracefully
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
        finally:
            # Record usage with the billing system
            await billing.record(user.id, usage_tokens)
    
    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # disable Nginx proxy buffering (important!)
        }
    )
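
One failure mode the endpoint above ignores: the user closes the tab mid-stream. Starlette's Request exposes is_disconnected(), so the generator can stop pulling tokens from the provider once nobody is listening. A variant sketch of the same endpoint (error handling and usage tracking omitted for brevity):

from fastapi import Request

@router.post("/v1/chat/stream")
async def stream_chat(request: ChatRequest, raw_request: Request):
    async def generate() -> AsyncIterator[str]:
        stream = await client.chat.completions.create(
            model=request.model, messages=request.messages, stream=True
        )
        async for chunk in stream:
            # Stop consuming (and paying for) tokens once the client is gone
            if await raw_request.is_disconnected():
                await stream.close()  # release the upstream connection early
                break
            if chunk.choices and chunk.choices[0].delta.content:
                data = json.dumps({'type': 'delta', 'content': chunk.choices[0].delta.content})
                yield f"data: {data}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")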

Consuming the stream on the client

// Consume the SSE stream in frontend JavaScript
async function streamChat(messages, onDelta, onDone) {
    const response = await fetch('/v1/chat/stream', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({ messages })
    });
    
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    
    while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        
        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        buffer = lines.pop();  // keep the incomplete trailing line
        
        for (const line of lines) {
            if (line.startsWith('data: ')) {
                const data = line.slice(6);
                if (data === '[DONE]') {
                    onDone();
                    return;
                }
                const event = JSON.parse(data);
                if (event.type === 'delta') {
                    onDelta(event.content);
                }
            }
        }
    }
}

Layer 3: Conversation State Management

Multi-turn chat requires server-side storage of conversation history:

import json
import time
import redis
from dataclasses import dataclass, asdict

@dataclass
class ConversationMessage:
    role: str          # "user" | "assistant" | "system"
    content: str
    timestamp: float
    token_count: int

class ConversationManager:
    def __init__(self, redis_client: redis.Redis):
        self.redis = redis_client
        self.max_history_tokens = 50000  # keep the most recent ~50k tokens of history
        self.session_ttl = 3600 * 24     # sessions expire after 24 hours
    
    async def get_messages(self, session_id: str) -> list[dict]:
        """Fetch conversation history, truncating over-long histories automatically."""
        key = f"conv:{session_id}"
        raw_messages = self.redis.lrange(key, 0, -1)
        
        messages = [json.loads(m) for m in raw_messages]
        
        # Sum the token counts; when over budget, drop the oldest turns first,
        # but always keep the system message (if any) at index 0
        total_tokens = sum(m.get("token_count", 0) for m in messages)
        while total_tokens > self.max_history_tokens and len(messages) > 2:
            drop_index = 1 if messages[0]["role"] == "system" else 0
            removed = messages.pop(drop_index)
            total_tokens -= removed.get("token_count", 0)
        
        return [{"role": m["role"], "content": m["content"]} for m in messages]
    
    async def append_messages(
        self,
        session_id: str,
        user_message: str,
        assistant_message: str,
        user_tokens: int,
        assistant_tokens: int
    ):
        """Append one turn of conversation."""
        key = f"conv:{session_id}"
        pipe = self.redis.pipeline()
        
        for role, content, tokens in [
            ("user", user_message, user_tokens),
            ("assistant", assistant_message, assistant_tokens)
        ]:
            msg = ConversationMessage(
                role=role,
                content=content,
                timestamp=time.time(),
                token_count=tokens
            )
            pipe.rpush(key, json.dumps(asdict(msg)))
        
        pipe.expire(key, self.session_ttl)
        pipe.execute()
    
    async def clear_session(self, session_id: str):
        self.redis.delete(f"conv:{session_id}")
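
Wired into a request handler, one chat turn looks roughly like the sketch below. Note that the per-message token counts must come from your own tokenizer (the count_tokens helper from earlier): the provider's prompt_tokens covers the whole rebuilt history, not just the new message.

import redis
from openai import AsyncOpenAI

conversations = ConversationManager(redis.Redis())
client = AsyncOpenAI()

async def handle_chat_turn(session_id: str, user_input: str) -> str:
    # Rebuild the prompt: stored (already truncated) history + the new user message
    history = await conversations.get_messages(session_id)
    messages = history + [{"role": "user", "content": user_input}]
    
    resp = await client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    answer = resp.choices[0].message.content
    
    # Persist the turn with per-message token counts for future truncation
    await conversations.append_messages(
        session_id, user_input, answer,
        user_tokens=count_tokens(user_input),
        assistant_tokens=count_tokens(answer),
    )
    return answer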

Layer 4: Token Metering and Billing

from datetime import datetime

class TokenBudgetManager:
    """Manage each user's token budget."""
    
    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.pricing = {
            "gpt-4o": {"input": 5.0, "output": 15.0},     # $ per 1M tokens
            "gpt-4o-mini": {"input": 0.15, "output": 0.6},
            "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
        }
    
    async def check_budget(self, user_id: str, estimated_tokens: int) -> bool:
        """Check whether the user has enough budget left."""
        remaining = await self.get_remaining_budget(user_id)
        return remaining >= estimated_tokens
    
    async def get_remaining_budget(self, user_id: str) -> int:
        """Get the user's remaining token budget for the current month."""
        # Check the Redis cache first
        key = f"budget:{user_id}:{self._current_month()}"
        cached = self.redis.get(key)
        if cached:
            return int(cached)
        
        # Cache miss: hit the database
        plan = await self.db.get_user_plan(user_id)
        used = await self.db.get_monthly_usage(user_id)
        remaining = plan.monthly_token_limit - used
        
        # Cache for 1 minute (doesn't need to be exact)
        self.redis.setex(key, 60, remaining)
        return remaining
    
    async def record_usage(self, user_id: str, model: str, input_tokens: int, output_tokens: int):
        """Record usage and update the bill."""
        cost = (
            input_tokens / 1_000_000 * self.pricing[model]["input"] +
            output_tokens / 1_000_000 * self.pricing[model]["output"]
        )
        
        await self.db.insert_usage_record({
            "user_id": user_id,
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": cost,
            "timestamp": datetime.now()
        })
        
        # Invalidate the cached budget
        key = f"budget:{user_id}:{self._current_month()}"
        self.redis.delete(key)
    
    def _current_month(self) -> str:
        return datetime.now().strftime("%Y-%m")
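
A caveat with check-then-record: two concurrent requests can both pass check_budget and jointly blow past the limit. If that matters, one option is to reserve tokens atomically in Redis before the call and refund the overestimate afterwards. A sketch under the assumption that the key holds the remaining balance (initialized from the user's plan), which differs slightly from the cache semantics above:

class AtomicBudget:
    """Reserve tokens up-front with an atomic decrement; refund unused ones."""
    
    def __init__(self, redis_client):
        self.redis = redis_client
    
    def reserve(self, user_id: str, month: str, estimated_tokens: int) -> bool:
        key = f"budget:{user_id}:{month}"
        # DECRBY is atomic: concurrent requests cannot both sneak past the limit
        remaining = self.redis.decrby(key, estimated_tokens)
        if remaining < 0:
            self.redis.incrby(key, estimated_tokens)  # roll the reservation back
            return False
        return True
    
    def refund(self, user_id: str, month: str, unused_tokens: int):
        # Once real usage is known, return the over-estimated portion
        self.redis.incrby(f"budget:{user_id}:{month}", unused_tokens)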

Layer 5: Observability

import functools
import time

from opentelemetry import trace, metrics
from opentelemetry.trace import Status, StatusCode

tracer = trace.get_tracer("llm-service")
meter = metrics.get_meter("llm-service")

# Metric definitions
latency_histogram = meter.create_histogram(
    "llm.request.latency",
    unit="ms",
    description="Distribution of LLM request latency"
)

token_counter = meter.create_counter(
    "llm.tokens.total",
    unit="tokens",
    description="Total token usage"
)

def instrument_llm_call(model: str, user_id: str):
    """Decorator: automatically trace an LLM call."""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            with tracer.start_as_current_span(
                f"llm.complete.{model}",
                attributes={
                    "llm.model": model,
                    "llm.user_id": user_id,
                }
            ) as span:
                start = time.time()
                try:
                    result = await func(*args, **kwargs)
                    
                    # Record metrics
                    latency = (time.time() - start) * 1000
                    latency_histogram.record(latency, {"model": model})
                    
                    if result.usage:
                        token_counter.add(
                            result.usage.total_tokens,
                            {"model": model, "type": "total"}
                        )
                    
                    span.set_status(Status(StatusCode.OK))
                    return result
                    
                except Exception as e:
                    span.set_status(Status(StatusCode.ERROR, str(e)))
                    span.record_exception(e)
                    raise
        return wrapper
    return decorator
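
Usage looks like the sketch below. Because user_id is rarely known at import time, the wrapped function is built per request here; complete_for_user is a hypothetical helper, and in a larger codebase you might set these attributes on the span instead.

from openai import AsyncOpenAI

client = AsyncOpenAI()

def complete_for_user(model: str, user_id: str):
    # Bind model and user at request time, then apply the tracing decorator
    @instrument_llm_call(model=model, user_id=user_id)
    async def _complete(messages: list[dict]):
        return await client.chat.completions.create(model=model, messages=messages)
    return _complete

# Every call is now traced and feeds the latency/token metrics:
#   fn = complete_for_user("gpt-4o-mini", "user-123")
#   result = await fn([{"role": "user", "content": "Hello"}])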

Deployment Configuration Reference

# kubernetes/llm-service.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-gateway
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-gateway
  template:
    metadata:
      labels:
        app: llm-gateway
    spec:
      containers:
      - name: llm-gateway
        image: myapp/llm-gateway:v1.0
        resources:
          requests:
            cpu: "500m"
            memory: "512Mi"
          limits:
            cpu: "2000m"
            memory: "2Gi"
        env:
        - name: OPENAI_API_KEY
          valueFrom:
            secretKeyRef:
              name: llm-secrets
              key: openai-api-key
        # LLM services need longer probe timeouts
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          timeoutSeconds: 5
          periodSeconds: 10
---
# HPA configuration tuned for long-lived LLM connections
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-gateway
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-gateway
  metrics:
  # Scale on request queue depth, not CPU
  - type: External
    external:
      metric:
        name: llm_queue_depth
      target:
        type: AverageValue
        averageValue: "10"  # scale out when the backlog exceeds 10 per pod
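
For this HPA to work, something has to publish llm_queue_depth: external metrics reach the autoscaler through an adapter such as prometheus-adapter or KEDA. A sketch of the service side using prometheus_client (the metric name and port are assumptions that must match your adapter configuration, and process() stands in for your real handler):

from prometheus_client import Gauge, start_http_server

# Scraped by Prometheus; an adapter (prometheus-adapter, KEDA, ...) then
# surfaces it to the HPA as the external metric `llm_queue_depth`
QUEUE_DEPTH = Gauge("llm_queue_depth", "Number of LLM requests waiting to be processed")

start_http_server(9100)  # expose /metrics on :9100

async def handle(request):
    QUEUE_DEPTH.inc()      # request enters the queue
    try:
        return await process(request)
    finally:
        QUEUE_DEPTH.dec()  # request leaves the queue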

Closing Thoughts

The core challenge of LLM microservice architecture is that an LLM service is not an ordinary synchronous request/response service. Long latencies, streaming output, complex state, cost sensitivity: these characteristics mean you have to treat them as first-class concerns from the very first architecture design.

The Gateway pattern gives you a unified control plane, conversation management tames the complexity of multi-turn dialogue, token metering keeps the business sustainable, and observability gives you a clear picture of how the system actually behaves.

None of this is an "optimization". It is the baseline infrastructure of a production LLM service. Build it early and spare yourself the regret.