大模型版本更新、Prompt 迭代,一旦直接全量上线,风险极高。用户体验的任何下滑都直接影响口碑和留存。灰度发布(Canary Release)是 LLM 应用安全迭代的核心工程实践。
本文系统梳理 LLM 应用的灰度发布策略,从最基础的流量分割到自动化 rollback。
为什么 LLM 应用特别需要灰度
传统软件的灰度发布已经相当成熟,但 LLM 应用有几个特殊之处:
1. 变更对象不只是代码 LLM 应用的"变更"包括:
- 模型版本升级(GPT-4o → GPT-4o-2026-05)
- System Prompt 修改
- 检索策略变化(RAG)
- Temperature、top_p 等超参数调整
2. 质量难以用传统指标衡量 代码发布看错误率,LLM 发布要看"回答质量"——这是主观的,需要特定评估方法。
3. 模型行为的随机性 相同输入可能产生不同输出,使得 A/B 测试结果需要更大的样本量才有统计显著性。
灰度架构设计
核心组件
用户请求
↓
流量路由层(Traffic Router)
├── 5% → Canary 版本(新 Prompt/模型)
└── 95% → Stable 版本(当前生产)
↓
LLM 调用层
↓
质量监控层(Quality Monitor)
├── 实时评分
├── 异常检测
└── 自动回滚触发
路由层实现
import hashlib
import random
from typing import Optional
from dataclasses import dataclass
from enum import Enum
class DeploymentVariant(str, Enum):
    """Which deployment serves a given request.

    Inherits from ``str`` so members compare equal to their plain string
    values and serialize naturally into JSON payloads and log lines.
    """

    STABLE = "stable"      # current production version
    CANARY = "canary"      # new version under evaluation
    ROLLBACK = "rollback"  # forced fallback after a detected regression
@dataclass
class RouterConfig:
    """Routing configuration for the canary traffic split.

    Attributes:
        canary_percentage: Fraction of traffic (0.0-1.0) routed to canary.
        sticky_sessions: Route a given user to the same variant every time.
        exclude_users: Users that must never enter the canary (e.g. VIPs).
        include_only: If set, ONLY these users enter the canary (beta testers).
    """

    canary_percentage: float = 0.05  # 5% of traffic goes to canary
    sticky_sessions: bool = True
    # BUGFIX: annotated as Optional — the original `list = None` annotation
    # contradicted the None default. None (not a shared mutable []) keeps
    # dataclass defaults safe.
    exclude_users: Optional[list] = None
    include_only: Optional[list] = None
class CanaryRouter:
    """Per-request decision of which deployment variant serves a user."""

    def __init__(self, config: RouterConfig):
        self.config = config
        self._force_rollback = False  # set by trigger_rollback()

    def get_variant(self, user_id: str, request_id: str = None) -> DeploymentVariant:
        """Pick the variant for this request.

        Precedence: forced rollback > exclusion list > allow-list >
        percentage-based split (sticky hash or per-request random).
        """
        # Forced-rollback state overrides everything.
        if self._force_rollback:
            return DeploymentVariant.ROLLBACK

        cfg = self.config

        # Excluded users (e.g. VIPs) never enter the canary.
        if cfg.exclude_users and user_id in cfg.exclude_users:
            return DeploymentVariant.STABLE

        # Closed-beta mode: allow-listed users always get canary,
        # everyone else stays on stable.
        if cfg.include_only:
            return (
                DeploymentVariant.CANARY
                if user_id in cfg.include_only
                else DeploymentVariant.STABLE
            )

        if cfg.sticky_sessions:
            # Hash the user id into [0, 1) so the same user always lands on
            # the same side of the split (md5 chosen for distribution, not
            # for security).
            digest = hashlib.md5(user_id.encode()).hexdigest()
            bucket = (int(digest, 16) % 10000) / 10000
        else:
            # Fresh random draw on every request.
            bucket = random.random()

        if bucket < cfg.canary_percentage:
            return DeploymentVariant.CANARY
        return DeploymentVariant.STABLE

    def trigger_rollback(self):
        """Force all traffic onto the stable variant."""
        self._force_rollback = True
        print("⚠️ 已触发回滚,所有流量切换到 Stable 版本")

    def resume_canary(self, new_percentage: float = None):
        """Lift a rollback and optionally set a new canary percentage."""
        self._force_rollback = False
        if new_percentage is not None:
            self.config.canary_percentage = new_percentage
        print(f"✅ 已恢复 Canary,流量占比: {self.config.canary_percentage:.1%}")
版本配置管理
from dataclasses import dataclass, field
from typing import Optional, Dict, Any
@dataclass
class LLMVersionConfig:
    """One deployable LLM configuration.

    A "version" here covers everything that can change model behavior:
    model id, system prompt, sampling parameters, and retrieval strategy.
    """
    version_id: str
    model: str
    system_prompt: str
    temperature: float = 0.7
    max_tokens: int = 2000
    retrieval_strategy: Optional[str] = None  # RAG retrieval strategy name, if any
    extra_params: Dict[str, Any] = field(default_factory=dict)
    # Version metadata (audit trail)
    description: str = ""
    deployed_at: Optional[str] = None  # presumably an ISO timestamp — TODO confirm
    deployed_by: str = ""
class VersionConfigStore:
    """In-memory registry of version configs plus stable/canary pointers."""

    def __init__(self):
        self._configs: Dict[str, LLMVersionConfig] = {}
        self._stable_version: Optional[str] = None
        self._canary_version: Optional[str] = None

    def register_version(self, config: LLMVersionConfig):
        """Add (or overwrite) a version config, keyed by its version_id."""
        self._configs[config.version_id] = config
        print(f"已注册版本: {config.version_id}")

    def set_stable(self, version_id: str):
        """Mark a registered version as stable.

        Raises:
            ValueError: if the version was never registered.
        """
        if version_id not in self._configs:
            raise ValueError(f"版本不存在: {version_id}")
        self._stable_version = version_id
        print(f"Stable 版本: {version_id}")

    def set_canary(self, version_id: str):
        """Mark a registered version as the canary.

        Raises:
            ValueError: if the version was never registered.
        """
        if version_id not in self._configs:
            raise ValueError(f"版本不存在: {version_id}")
        self._canary_version = version_id
        print(f"Canary 版本: {version_id}")

    def get_config(self, variant: DeploymentVariant) -> Optional[LLMVersionConfig]:
        """Resolve a routing variant to its version config.

        BUGFIX: the return annotation is now Optional — the original claimed
        a non-optional return but yielded None before any stable version was
        set.

        NOTE: ROLLBACK currently serves the *current* stable config; the
        original comment claimed "previous version" but no previous-version
        tracking is implemented (TODO).
        """
        if variant == DeploymentVariant.CANARY and self._canary_version:
            return self._configs[self._canary_version]
        # Both ROLLBACK and STABLE fall through to the stable pointer.
        if self._stable_version is None:
            return None
        return self._configs[self._stable_version]
# Usage example: register a stable and a canary version, then point
# the store at each.
store = VersionConfigStore()
# Register the current stable version
store.register_version(LLMVersionConfig(
    version_id="v1.2.0",
    model="gpt-4o",
    system_prompt="你是一个专业助手...",
    temperature=0.7,
    description="当前稳定版本",
))
# Register the canary version (new prompt)
store.register_version(LLMVersionConfig(
    version_id="v1.3.0-canary",
    model="gpt-4o",
    system_prompt="你是一个专业助手(改进版)...",
    temperature=0.5,  # temperature lowered vs. stable
    description="优化了回答格式,降低了 temperature",
    deployed_by="张三",
))
store.set_stable("v1.2.0")
store.set_canary("v1.3.0-canary")
质量监控与自动回滚
import statistics
from collections import deque
from datetime import datetime, timedelta
import threading
class QualityMonitor:
    """Real-time quality monitor that auto-triggers rollback on regressions.

    Keeps a sliding window of per-request metrics for the stable and canary
    variants, and trips the router's rollback switch when the canary degrades
    on error rate, user rating, or P95 latency.
    """

    # Minimum canary samples before any judgement is made.
    _MIN_CANARY_SAMPLES = 20
    # Minimum rated samples (per variant) before comparing average ratings.
    _MIN_RATED_SAMPLES = 10

    def __init__(self, router: CanaryRouter, window_size: int = 100):
        self.router = router
        self.window_size = window_size
        # Sliding windows of per-request metric dicts, one per variant.
        self.stable_metrics = deque(maxlen=window_size)
        self.canary_metrics = deque(maxlen=window_size)
        # Rollback thresholds.
        self.error_rate_threshold = 0.05    # canary error rate above 5%
        self.rating_drop_threshold = 0.5    # avg rating drop > 0.5 (1-5 scale)
        self.latency_spike_threshold = 2.0  # canary P95 > 2x stable P95
        self._lock = threading.Lock()

    def record_metric(
        self,
        variant: DeploymentVariant,
        is_error: bool = False,
        rating: Optional[float] = None,  # user rating, 1-5
        latency_ms: Optional[int] = None,
    ):
        """Record one request's quality metrics, then re-evaluate rollback."""
        metric = {
            "ts": datetime.utcnow(),  # naive UTC; used for logging only
            "is_error": is_error,
            "rating": rating,
            "latency_ms": latency_ms,
        }
        with self._lock:
            if variant == DeploymentVariant.STABLE:
                self.stable_metrics.append(metric)
            elif variant == DeploymentVariant.CANARY:
                self.canary_metrics.append(metric)
        # Evaluate outside the lock; the check takes its own snapshot.
        self._check_and_rollback()

    @staticmethod
    def _error_rate(metrics) -> float:
        """Fraction of requests flagged as errors (0.0 for an empty window)."""
        if not metrics:
            return 0.0
        return sum(1 for m in metrics if m["is_error"]) / len(metrics)

    @staticmethod
    def _p95(latencies) -> int:
        """P95 latency via nearest-rank on the sorted (non-empty) sample."""
        return sorted(latencies)[int(len(latencies) * 0.95)]

    def _check_and_rollback(self):
        """Compare canary vs stable; auto-rollback on any tripped threshold."""
        with self._lock:
            # Snapshot under the lock so the checks see a consistent view.
            stable_data = list(self.stable_metrics)
            canary_data = list(self.canary_metrics)
        if len(canary_data) < self._MIN_CANARY_SAMPLES:
            return  # not enough canary samples to judge

        # 1) Error rate: absolute threshold first, then relative to stable.
        canary_errors = self._error_rate(canary_data)
        stable_errors = self._error_rate(stable_data)
        if canary_errors > self.error_rate_threshold:
            self._auto_rollback(
                f"Canary 错误率过高: {canary_errors:.1%} (阈值: {self.error_rate_threshold:.1%})"
            )
            return
        if stable_errors > 0 and canary_errors > stable_errors * 3:
            self._auto_rollback(f"Canary 错误率是 Stable 的 {canary_errors/stable_errors:.1f} 倍")
            return

        # 2) Average user rating (only with enough rated samples on both sides).
        canary_ratings = [m["rating"] for m in canary_data if m["rating"] is not None]
        stable_ratings = [m["rating"] for m in stable_data if m["rating"] is not None]
        if (len(canary_ratings) >= self._MIN_RATED_SAMPLES
                and len(stable_ratings) >= self._MIN_RATED_SAMPLES):
            canary_avg = statistics.mean(canary_ratings)
            stable_avg = statistics.mean(stable_ratings)
            if stable_avg - canary_avg > self.rating_drop_threshold:
                self._auto_rollback(
                    f"Canary 评分下降: {canary_avg:.2f} vs Stable {stable_avg:.2f}"
                )
                # BUGFIX: the original fell through to the latency check here
                # and could trigger _auto_rollback a second time.
                return

        # 3) P95 latency spike.
        canary_latency = [m["latency_ms"] for m in canary_data if m["latency_ms"]]
        stable_latency = [m["latency_ms"] for m in stable_data if m["latency_ms"]]
        if canary_latency and stable_latency:
            canary_p95 = self._p95(canary_latency)
            stable_p95 = self._p95(stable_latency)
            if canary_p95 > stable_p95 * self.latency_spike_threshold:
                self._auto_rollback(
                    f"Canary P95 延迟过高: {canary_p95}ms vs Stable {stable_p95}ms"
                )

    def _auto_rollback(self, reason: str):
        """Trip the router's rollback switch and send an alert."""
        print(f"🚨 自动回滚触发: {reason}")
        print(f" 时间: {datetime.utcnow().isoformat()}")
        self.router.trigger_rollback()
        # Send an alert
        self._send_alert(f"LLM 灰度自动回滚\n原因: {reason}")

    def _send_alert(self, message: str):
        """Alert hook (integrate DingTalk / Slack / PagerDuty here)."""
        print(f"[ALERT] {message}")
        # TODO: wire up the real alerting system

    def get_comparison_report(self) -> dict:
        """Summarize both windows and recommend a winner.

        BUGFIX: the original recomputed calc_stats repeatedly and compared
        possibly-None average ratings, raising TypeError whenever a window
        had no rated samples. Stats are now computed once, and the canary
        only "wins" when both averages exist.
        """
        with self._lock:
            stable = list(self.stable_metrics)
            canary = list(self.canary_metrics)

        def calc_stats(metrics):
            # Per-window summary; empty dict for an empty window.
            if not metrics:
                return {}
            errors = sum(1 for m in metrics if m["is_error"])
            ratings = [m["rating"] for m in metrics if m["rating"] is not None]
            latencies = [m["latency_ms"] for m in metrics if m["latency_ms"]]
            return {
                "count": len(metrics),
                "error_rate": errors / len(metrics),
                "avg_rating": statistics.mean(ratings) if ratings else None,
                "p50_latency": sorted(latencies)[len(latencies) // 2] if latencies else None,
                "p95_latency": self._p95(latencies) if latencies else None,
            }

        stable_stats = calc_stats(stable)
        canary_stats = calc_stats(canary)
        canary_avg = canary_stats.get("avg_rating")
        stable_avg = stable_stats.get("avg_rating")
        canary_winning = (
            canary_avg is not None
            and stable_avg is not None
            and canary_avg > stable_avg
        )
        return {
            "stable": stable_stats,
            "canary": canary_stats,
            "recommendation": "canary_winning" if canary_winning else "stable_winning",
        }
完整发布流程
class GradualRolloutManager:
    """Progressive rollout manager: ramps canary traffic stage by stage.

    At each stage it raises the canary percentage, waits an observation
    period, and aborts if the QualityMonitor tripped an automatic rollback.
    """

    STAGES = [0.01, 0.05, 0.10, 0.25, 0.50, 1.0]  # traffic ramp-up schedule

    def __init__(self, router: CanaryRouter, monitor: QualityMonitor):
        self.router = router
        self.monitor = monitor
        self.current_stage = 0  # index into STAGES of the stage in progress

    async def start_rollout(
        self,
        new_version_id: str,
        store: VersionConfigStore,
        stage_duration_hours: int = 2,
    ) -> bool:
        """Run the full ramp-up.

        Returns:
            True when the canary is promoted to stable, False if an
            automatic rollback aborted the rollout.
        """
        # BUGFIX: asyncio was used but never imported anywhere in the file,
        # which raised NameError at the first await.
        import asyncio

        print(f"开始发布 {new_version_id}")
        store.set_canary(new_version_id)

        for stage_index, stage_pct in enumerate(self.STAGES):
            # BUGFIX: current_stage was initialized but never updated.
            self.current_stage = stage_index
            print(f"\n--- 阶段: {stage_pct:.0%} 流量 ---")
            # resume_canary() both clears any rollback flag and sets the
            # percentage, so the original's separate assignment was redundant.
            self.router.resume_canary(stage_pct)

            # Wait for the observation window before judging this stage.
            print(f"等待 {stage_duration_hours} 小时观察...")
            await asyncio.sleep(stage_duration_hours * 3600)

            # Abort if the monitor force-rolled-back during the wait.
            # NOTE(review): reads the router's private flag — consider adding
            # a public accessor on CanaryRouter.
            if self.router._force_rollback:
                print("❌ 发布失败:触发了自动回滚")
                return False

            report = self.monitor.get_comparison_report()
            print(f"当前质量对比: {report}")

            # Final stage (100%): promote the canary to stable.
            if stage_pct == 1.0:
                store.set_stable(new_version_id)
                print(f"✅ 发布完成!{new_version_id} 已成为 Stable 版本")
                return True
        return True
快速回滚手册
场景1:发现问题,立即手动回滚
# Trigger a rollback through the admin API
curl -X POST https://api.yourapp.com/admin/llm/rollback \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-d '{"reason": "用户投诉增加30%"}'
场景2:回滚后分析原因
# Pull the before/after comparison data
report = monitor.get_comparison_report()
print(json.dumps(report, indent=2, ensure_ascii=False))
# Find the canary version's worst-rated samples
# NOTE(review): illustrative snippet — assumes `json` is imported and `db.query`
# accepts a raw SQL string; neither is defined in this article's code.
bad_canary = db.query("""
SELECT request_id, prompt, response, rating, user_feedback
FROM llm_requests
WHERE variant='canary' AND rating <= 2
ORDER BY created_at DESC LIMIT 20
""")
总结
LLM 应用的灰度发布核心在于:
- 流量路由:小比例先行,逐步扩大
- 质量监控:实时追踪错误率、评分、延迟
- 自动回滚:设好阈值,出问题自动处理
- 渐进式:每个阶段有足够的观察时间
这套体系让你可以安全地迭代 LLM 应用,快速验证新版本是否真的更好,有问题立即止损。大胆迭代,安全发布。