Why LLMs Need Their Own Microservice Architecture
Wrapping an LLM call in a microservice sounds simple enough — isn't it just another API call behind a facade? In practice, the characteristics of LLMs make such a service considerably more complex than an ordinary HTTP service:
- High, unpredictable latency: P99 latency can reach 30-60 seconds, so the timeout policies used for ordinary services simply do not apply (a concrete example follows this list)
- Tokens are a scarce resource: billing is by the amount of text processed, not by the request, so usage must be controlled precisely
- Streaming is the norm: users expect to see output in real time, not a wall of text that appears all at once after 30 seconds
- Unusual failure modes: hallucinations, safety filtering, and truncated output all need handling at the business layer
- Complex context state: managing session state for multi-turn conversations is far more involved than for a typical API
This article lays out an LLM microservice architecture suited to production, covering the full path from interface design to operational monitoring.
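To make the first point above concrete: a sensible client-side timeout for LLM calls looks nothing like the few-second defaults used for ordinary HTTP services. A minimal sketch with httpx, where the exact numbers are illustrative assumptions rather than recommendations:

import httpx

# LLM calls want a short connect timeout but a very long read timeout:
# the first byte of a streamed response may take tens of seconds,
# and a full completion can run for a minute or more.
llm_timeout = httpx.Timeout(
    connect=5.0,   # fail fast if the provider is unreachable
    read=120.0,    # allow slow first tokens and long generations
    write=10.0,
    pool=5.0,
)

client = httpx.AsyncClient(timeout=llm_timeout)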
Core Architecture: The LLM Gateway Pattern
Clients (Web / App / Other Services)
          │
          ▼
┌────────────────────┐
│    LLM Gateway     │  ← unified entry point
│  - authn & authz   │
│  - model routing   │
│  - rate limiting & │
│    circuit breaking│
│  - request logging │
└──────────┬─────────┘
           │
   ┌───────┼───────┐
   ▼       ▼       ▼
┌───────┐ ┌───────┐ ┌───────┐
│ Chat  │ │  RAG  │ │ Agent │  ← business service layer
│Service│ │Service│ │Service│
└───────┘ └───────┘ └───────┘
    │         │         │
    └─────────┼─────────┘
              ▼
┌─────────────────────┐
│  LLM Provider Pool  │  ← model invocation layer
│  - OpenAI           │
│  - Anthropic        │
│  - self-hosted      │
└─────────────────────┘
Layer 1: Implementing the LLM Gateway
A Basic Gateway
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
import httpx
import asyncio
import time
import uuid
from typing import AsyncIterator
app = FastAPI()
class LLMGateway:
    def __init__(self):
        self.rate_limiter = RateLimiter()
        self.circuit_breaker = CircuitBreaker()
        self.logger = StructuredLogger()
        self.router = ModelRouter()

    async def handle_request(
        self,
        request: LLMRequest,
        user: User,
        stream: bool = False
    ):
        request_id = str(uuid.uuid4())
        start_time = time.time()

        # 1. Rate limit check
        if not await self.rate_limiter.allow(user.id):
            raise HTTPException(429, "Rate limit exceeded")

        # 2. Circuit breaker check
        if not self.circuit_breaker.is_healthy():
            raise HTTPException(503, "Service temporarily unavailable")

        # 3. Routing decision (pick the best model/provider for this request)
        provider = self.router.select(request, user)

        # 4. Execute the request
        try:
            if stream:
                return await self._stream_request(request, provider, request_id)
            else:
                response = await self._complete_request(request, provider, request_id)
                # 5. Structured request log
                await self.logger.log_completion(
                    request_id=request_id,
                    user_id=user.id,
                    model=provider.model,
                    input_tokens=response.usage.prompt_tokens,
                    output_tokens=response.usage.completion_tokens,
                    latency_ms=int((time.time() - start_time) * 1000),
                    success=True
                )
                self.circuit_breaker.record_success()
                return response
        except Exception:
            self.circuit_breaker.record_failure()
            raise
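The gateway assumes RateLimiter, CircuitBreaker, StructuredLogger, and ModelRouter implementations exist. The router is shown next; for the circuit breaker, a minimal failure-counting version is enough to make the is_healthy / record_failure contract concrete (the thresholds are illustrative assumptions):

import time

class CircuitBreaker:
    """Minimal circuit breaker: opens after N consecutive failures,
    then lets traffic through again after a cooldown window."""

    def __init__(self, failure_threshold: int = 5, cooldown_seconds: float = 30.0):
        self.failure_threshold = failure_threshold
        self.cooldown_seconds = cooldown_seconds
        self._consecutive_failures = 0
        self._opened_at: float | None = None

    def is_healthy(self) -> bool:
        if self._opened_at is None:
            return True
        # Half-open: after the cooldown, allow requests to probe recovery
        return time.time() - self._opened_at >= self.cooldown_seconds

    def record_failure(self) -> None:
        self._consecutive_failures += 1
        if self._consecutive_failures >= self.failure_threshold:
            self._opened_at = time.time()

    def record_success(self) -> None:
        self._consecutive_failures = 0
        self._opened_at = None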
The Model Router
class ModelRouter:
    """Pick the best model for a request based on its characteristics."""

    def __init__(self):
        # Prices are $ per 1M tokens
        self.models = {
            "gpt-4o": {"cost_per_1m_input": 5, "cost_per_1m_output": 15, "max_context": 128000},
            "gpt-4o-mini": {"cost_per_1m_input": 0.15, "cost_per_1m_output": 0.6, "max_context": 128000},
            "claude-3-5-sonnet": {"cost_per_1m_input": 3, "cost_per_1m_output": 15, "max_context": 200000},
        }

    def select(self, request: LLMRequest, user: User) -> ProviderConfig:
        token_count = self._estimate_tokens(request)

        # Very long context -> a model that supports it
        if token_count > 100000:
            return ProviderConfig(model="claude-3-5-sonnet")

        # Enterprise users -> the high-quality model
        if user.tier == "enterprise":
            return ProviderConfig(model="gpt-4o")

        # Simple tasks (classification, summarization, translation) -> the cheap model
        if request.task_type in ["classification", "summarization", "translation"]:
            return ProviderConfig(model="gpt-4o-mini")

        # Default: the best cost/quality trade-off
        return ProviderConfig(model="gpt-4o-mini")

    def _estimate_tokens(self, request: LLMRequest) -> int:
        # Rough heuristic: ~4 characters per token
        total_chars = sum(len(m.content) for m in request.messages)
        return total_chars // 4
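The 4-characters-per-token rule is deliberately crude and was tuned for English; for Chinese text, where a single character is often a token or more, it can underestimate substantially. If the tiktoken package is available, a more faithful estimate is a small change — a sketch, assuming the o200k_base encoding used by the gpt-4o family:

import tiktoken

# o200k_base is the encoding used by the gpt-4o model family; falling back
# to the character heuristic keeps routing working if tiktoken is absent.
try:
    _encoding = tiktoken.get_encoding("o200k_base")
except Exception:
    _encoding = None

def estimate_tokens(messages) -> int:
    text = "".join(m.content for m in messages)
    if _encoding is not None:
        return len(_encoding.encode(text))
    return len(text) // 4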
Layer 2: Streaming Response Design
Streaming is the core user experience of an LLM application, and it has to be supported properly at the architecture level:
from fastapi import APIRouter, Depends
from fastapi.responses import StreamingResponse
from openai import AsyncOpenAI
from typing import AsyncIterator
import json

router = APIRouter()
client = AsyncOpenAI()

@router.post("/v1/chat/stream")
async def stream_chat(request: ChatRequest, user: User = Depends(get_current_user)):
    """Streaming chat endpoint (Server-Sent Events)."""

    async def generate() -> AsyncIterator[str]:
        usage_tokens = {"prompt": 0, "completion": 0}
        try:
            stream = await client.chat.completions.create(
                model=request.model,
                messages=request.messages,
                stream=True,
                stream_options={"include_usage": True},  # ask the API to report token usage
            )
            # Emit a start event
            yield f"data: {json.dumps({'type': 'start', 'request_id': request.id})}\n\n"

            async for chunk in stream:
                if chunk.choices and chunk.choices[0].delta.content:
                    content = chunk.choices[0].delta.content
                    yield f"data: {json.dumps({'type': 'delta', 'content': content})}\n\n"
                # Capture usage info (it arrives in the final chunk)
                if chunk.usage:
                    usage_tokens["prompt"] = chunk.usage.prompt_tokens
                    usage_tokens["completion"] = chunk.usage.completion_tokens

            # Emit an end event
            yield f"data: {json.dumps({'type': 'done', 'usage': usage_tokens})}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            # Errors are also sent as SSE events so the client can handle them gracefully
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
        finally:
            # Report usage to the billing system
            await billing.record(user.id, usage_tokens)

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # disable Nginx buffering (important!)
        }
    )
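One practical gap in the generator above: if the model is slow to produce its first token, an idle load balancer or proxy may cut the connection. A common mitigation is to emit SSE comment lines as heartbeats while the upstream is silent — a sketch of a wrapper you could apply around generate() before handing it to StreamingResponse (the 15-second interval is an illustrative assumption):

import asyncio

async def with_heartbeat(source, interval: float = 15.0):
    """Wrap an SSE chunk generator, yielding ': ping' comment lines
    whenever the upstream stays silent for `interval` seconds."""
    queue: asyncio.Queue = asyncio.Queue()

    async def pump():
        try:
            async for chunk in source:
                await queue.put(chunk)
        finally:
            await queue.put(None)  # sentinel: upstream finished (or failed)

    task = asyncio.create_task(pump())
    try:
        while True:
            try:
                chunk = await asyncio.wait_for(queue.get(), timeout=interval)
            except asyncio.TimeoutError:
                yield ": ping\n\n"  # SSE comment line, invisible to clients
                continue
            if chunk is None:
                break
            yield chunk
    finally:
        task.cancel()

SSE comment lines (anything starting with a colon) are ignored by EventSource and by the line parser in the client code below, so the heartbeats never reach the UI.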
Consuming the Stream on the Client
// Frontend JavaScript consuming the SSE stream
async function streamChat(messages, onDelta, onDone) {
  const response = await fetch('/v1/chat/stream', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({ messages })
  });
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    buffer = lines.pop();  // keep any incomplete trailing line

    for (const line of lines) {
      if (line.startsWith('data: ')) {
        const data = line.slice(6);
        if (data === '[DONE]') {
          onDone();
          return;
        }
        const event = JSON.parse(data);
        if (event.type === 'delta') {
          onDelta(event.content);
        }
      }
    }
  }
}
Layer 3: Conversation State Management
Multi-turn conversations require the server to keep the conversation history:
import json
import time
import redis
from dataclasses import dataclass, asdict

@dataclass
class ConversationMessage:
    role: str  # "user" | "assistant" | "system"
    content: str
    timestamp: float
    token_count: int

class ConversationManager:
    def __init__(self, redis_client: redis.Redis):
        self.redis = redis_client
        self.max_history_tokens = 50000  # keep at most the last 50k tokens of history
        self.session_ttl = 3600 * 24     # sessions expire after 24 hours

    async def get_messages(self, session_id: str) -> list[dict]:
        """Fetch the conversation history (truncating it automatically when it grows too long)."""
        key = f"conv:{session_id}"
        raw_messages = self.redis.lrange(key, 0, -1)
        messages = [json.loads(m) for m in raw_messages]

        # Count total tokens and trim from the oldest end when over budget,
        # but always keep a leading system message if there is one.
        total_tokens = sum(m.get("token_count", 0) for m in messages)
        start = 1 if messages and messages[0].get("role") == "system" else 0
        while total_tokens > self.max_history_tokens and len(messages) > start + 2:
            removed = messages.pop(start)  # drop the oldest non-system message
            total_tokens -= removed.get("token_count", 0)

        return [{"role": m["role"], "content": m["content"]} for m in messages]

    async def append_messages(
        self,
        session_id: str,
        user_message: str,
        assistant_message: str,
        user_tokens: int,
        assistant_tokens: int
    ):
        """Append one turn of the conversation."""
        key = f"conv:{session_id}"
        pipe = self.redis.pipeline()
        for role, content, tokens in [
            ("user", user_message, user_tokens),
            ("assistant", assistant_message, assistant_tokens)
        ]:
            msg = ConversationMessage(
                role=role,
                content=content,
                timestamp=time.time(),
                token_count=tokens
            )
            pipe.rpush(key, json.dumps(asdict(msg)))
        pipe.expire(key, self.session_ttl)
        pipe.execute()

    async def clear_session(self, session_id: str):
        self.redis.delete(f"conv:{session_id}")
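Tying the pieces together, a chat endpoint would typically read the trimmed history, call the model, then persist the new turn. A minimal sketch, reusing the AsyncOpenAI client from the streaming section (the chat_turn function itself is an assumption):

async def chat_turn(session_id: str, user_message: str, conv: ConversationManager) -> str:
    history = await conv.get_messages(session_id)
    messages = history + [{"role": "user", "content": user_message}]

    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
    answer = response.choices[0].message.content

    # Note: usage.prompt_tokens counts the whole prompt (history included);
    # using it for the single user message overcounts, so a per-message
    # tokenizer count would be more precise for trimming decisions.
    await conv.append_messages(
        session_id=session_id,
        user_message=user_message,
        assistant_message=answer,
        user_tokens=response.usage.prompt_tokens,
        assistant_tokens=response.usage.completion_tokens,
    )
    return answer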
Layer 4: Token Metering and Billing
from datetime import datetime

class TokenBudgetManager:
    """Manage each user's token budget."""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.pricing = {
            "gpt-4o": {"input": 5.0, "output": 15.0},  # $ per 1M tokens
            "gpt-4o-mini": {"input": 0.15, "output": 0.6},
            "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
        }

    async def check_budget(self, user_id: str, estimated_tokens: int) -> bool:
        """Check whether the user has enough budget left."""
        remaining = await self.get_remaining_budget(user_id)
        return remaining >= estimated_tokens

    async def get_remaining_budget(self, user_id: str) -> int:
        """Get the remaining token budget for the current month."""
        # Try the Redis cache first
        key = f"budget:{user_id}:{self._current_month()}"
        cached = self.redis.get(key)
        if cached:
            return int(cached)

        # Cache miss: fall back to the database
        plan = await self.db.get_user_plan(user_id)
        used = await self.db.get_monthly_usage(user_id)
        remaining = plan.monthly_token_limit - used

        # Cache for one minute (this number does not need to be exact)
        self.redis.setex(key, 60, remaining)
        return remaining

    async def record_usage(self, user_id: str, model: str, input_tokens: int, output_tokens: int):
        """Record usage and update the bill."""
        cost = (
            input_tokens / 1_000_000 * self.pricing[model]["input"] +
            output_tokens / 1_000_000 * self.pricing[model]["output"]
        )
        await self.db.insert_usage_record({
            "user_id": user_id,
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": cost,
            "timestamp": datetime.now()
        })
        # Invalidate the cached budget
        key = f"budget:{user_id}:{self._current_month()}"
        self.redis.delete(key)

    def _current_month(self) -> str:
        return datetime.now().strftime("%Y-%m")
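In the request path, the budget manager sits on both sides of the provider call: a rough pre-check before, exact accounting after. A sketch of that wiring (guarded_completion and provider.complete are assumed names, and the estimate reuses the crude character heuristic from the router):

async def guarded_completion(user_id: str, request, budget: TokenBudgetManager, provider):
    # Rough pre-check: refuse up front rather than discover mid-request
    # that the user has run out of budget.
    estimated = sum(len(m.content) for m in request.messages) // 4
    if not await budget.check_budget(user_id, estimated):
        raise HTTPException(status_code=402, detail="Monthly token budget exhausted")

    response = await provider.complete(request)

    # Bill on the provider's authoritative counts, not on the estimate.
    await budget.record_usage(
        user_id=user_id,
        model=provider.model,
        input_tokens=response.usage.prompt_tokens,
        output_tokens=response.usage.completion_tokens,
    )
    return response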
Layer 5: Observability
import time
from functools import wraps

from opentelemetry import trace, metrics
from opentelemetry.trace import Status, StatusCode

tracer = trace.get_tracer("llm-service")
meter = metrics.get_meter("llm-service")

# Metric definitions
latency_histogram = meter.create_histogram(
    "llm.request.latency",
    unit="ms",
    description="LLM request latency distribution"
)
token_counter = meter.create_counter(
    "llm.tokens.total",
    unit="tokens",
    description="Token usage"
)

def instrument_llm_call(model: str, user_id: str):
    """Decorator: add tracing and metrics around an LLM call."""
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            with tracer.start_as_current_span(
                f"llm.complete.{model}",
                attributes={
                    "llm.model": model,
                    "llm.user_id": user_id,
                }
            ) as span:
                start = time.time()
                try:
                    result = await func(*args, **kwargs)

                    # Record metrics
                    latency = (time.time() - start) * 1000
                    latency_histogram.record(latency, {"model": model})
                    if result.usage:
                        token_counter.add(
                            result.usage.total_tokens,
                            {"model": model, "type": "total"}
                        )
                    span.set_status(Status(StatusCode.OK))
                    return result
                except Exception as e:
                    span.set_status(Status(StatusCode.ERROR, str(e)))
                    span.record_exception(e)
                    raise
        return wrapper
    return decorator
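Applied to the actual provider call, the decorator opens a span and emits the latency and token metrics automatically. A short usage sketch, again assuming the AsyncOpenAI client from earlier (complete_with_tracing is an assumed name):

async def complete_with_tracing(model: str, user_id: str, messages: list[dict]):
    # Wrap the provider call so each completion gets its own span,
    # a latency sample, and a token count.
    @instrument_llm_call(model, user_id)
    async def call_model():
        return await client.chat.completions.create(model=model, messages=messages)

    return await call_model()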
Deployment Configuration Reference
# kubernetes/llm-service.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-gateway
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-gateway
  template:
    metadata:
      labels:
        app: llm-gateway
    spec:
      containers:
        - name: llm-gateway
          image: myapp/llm-gateway:v1.0
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "2Gi"
          env:
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: llm-secrets
                  key: openai-api-key
          # LLM services need longer timeouts than usual
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            timeoutSeconds: 5
            periodSeconds: 10
---
# HPA tuned for long-lived LLM connections
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-gateway
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-gateway
  minReplicas: 3
  maxReplicas: 20   # illustrative bounds
  metrics:
    # Scale on request queue depth rather than CPU
    - type: External
      external:
        metric:
          name: llm_queue_depth
        target:
          type: AverageValue
          averageValue: "10"  # scale out when the backlog per pod exceeds 10
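The HPA above scales on an external llm_queue_depth metric, which the service itself has to publish and an adapter (for example prometheus-adapter or KEDA) has to expose to the autoscaler. A minimal sketch of publishing such a gauge with prometheus_client — the port and the provider.complete call are assumptions:

from prometheus_client import Gauge, start_http_server

# In-flight LLM requests: incremented when a request is accepted,
# decremented when the provider answers (or fails).
queue_depth = Gauge("llm_queue_depth", "LLM requests accepted but not yet completed")

start_http_server(9100)  # expose /metrics on port 9100 (illustrative)

async def tracked_call(provider, request):
    queue_depth.inc()
    try:
        return await provider.complete(request)
    finally:
        queue_depth.dec()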
Closing Thoughts
The core challenge of an LLM microservice architecture is that it is not an ordinary synchronous request-response service. Long latencies, streaming output, complex state, cost sensitivity — these properties mean you have to treat them as first-class concerns from the architecture design stage onward.
The gateway pattern provides a unified control plane, conversation management tames the complexity of multi-turn dialogue, token metering keeps the business sustainable, and observability gives you a clear picture of how the system behaves.
None of these are "optimizations" — they are the basic infrastructure of a production LLM service. Build them early and spare yourself the regret.