引言:没有评估就没有进步
"我们的 AI 助手回答质量变差了,但我们说不清楚哪里差,也不知道哪个版本好。"
这是 AI 应用团队最常见的困境之一。没有系统的评估体系,Prompt 迭代、模型切换、RAG 优化都变成了"感觉驱动"——可能做了 10 次"优化",其中 6 次实际上让质量下降了,但你不知道。
2026 年,LLM 评估已经从"有没有"问题变成了"做得好不好"的竞争力问题。本文构建一套完整的评估工程体系,涵盖评估维度设计、评测数据集构建、自动化评测流水线和持续监控方案。
一、评估维度框架
1.1 不同场景的评估维度
对话助手:
├── 相关性(回答是否回应了问题)
├── 准确性(信息是否正确)
├── 完整性(是否充分回答)
├── 无害性(是否包含有害内容)
└── 格式合规(是否符合输出格式要求)
RAG 系统:
├── 检索质量
│ ├── 召回率(Recall):相关文档是否都被检索到
│ └── 精确率(Precision):检索到的文档有多少是相关的
├── 生成质量
│ ├── 忠实度(Faithfulness):答案是否基于检索到的上下文
│ └── 答案相关性(Answer Relevancy):答案是否回应了问题
└── 端到端质量
└── 上下文精确率/召回率
代码助手:
├── 代码正确性(是否可执行、是否通过测试)
├── 代码规范性(是否符合项目风格)
└── 任务完成率(是否完成了要求的功能)
1.2 RAGAS 框架:RAG 评估的工业标准
from ragas import evaluate
from ragas.metrics import (
    faithfulness,       # is the answer grounded in the retrieved context?
    answer_relevancy,   # does the answer address the question?
    context_precision,  # how much of the retrieved context is relevant?
    context_recall,     # was all relevant context actually retrieved?
)
from datasets import Dataset

# Assemble the evaluation samples: questions, system answers, retrieved
# contexts, and human-annotated reference answers (ground truth).
samples = {
    "question": [
        "什么是 RAG 技术?",
        "如何优化向量检索的精度?",
    ],
    "answer": [
        "RAG(检索增强生成)是一种结合检索和生成的 AI 技术...",
        "可以通过以下方法优化向量检索精度:1. 使用更好的 embedding 模型...",
    ],
    "contexts": [
        ["RAG 全称 Retrieval-Augmented Generation,是..."],
        ["向量检索的精度优化方法包括 Reranking、HyDE..."],
    ],
    "ground_truth": [  # human-annotated reference answers
        "RAG 是检索增强生成技术,通过检索外部知识库来增强大模型的回答质量。",
        "优化向量检索精度的主要方法有:使用 Reranking 模型、HyDE 查询扩展、提高 ef 参数等。",
    ],
}

dataset = Dataset.from_dict(samples)

# Run all four RAGAS metrics over the dataset in one pass.
result = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)
print(result)
# Example output:
# {'faithfulness': 0.92, 'answer_relevancy': 0.87,
#  'context_precision': 0.85, 'context_recall': 0.79}
二、LLM 作为评判者(LLM-as-Judge)
2.1 为什么需要 LLM-as-Judge
传统 NLP 指标(BLEU、ROUGE)依赖参考答案的字面匹配,不适合评估 LLM 输出:
- "汽车"和"轿车"语义相同,但字符串匹配得分为 0
- 同一问题可以有多个正确答案
LLM-as-Judge 用另一个 LLM 来评判输出质量,更接近人类判断:
from openai import OpenAI
import json
# Module-level OpenAI client shared by the judge helpers below.
# NOTE(review): relies on OPENAI_API_KEY being set in the environment.
client = OpenAI()
def llm_judge_response(
    question: str,
    response: str,
    reference: "str | None" = None,
    criteria: "list[str] | None" = None,
) -> dict:
    """Score a single AI answer with an LLM judge (Chinese judge prompt).

    Args:
        question: the user question that was asked.
        response: the AI answer to grade.
        reference: optional human-written reference answer shown to the judge.
        criteria: optional list of "name:description" criteria; defaults to
            accuracy / relevance / completeness / clarity (in Chinese).

    Returns:
        The judge's parsed JSON: per-criterion 1-5 scores, an overall 1-10
        score, strengths, weaknesses and reasoning.
    """
    default_criteria = [
        "准确性:信息是否正确,没有明显错误",
        "相关性:是否直接回答了问题",
        "完整性:是否充分回答,没有遗漏关键点",
        "清晰度:表达是否清晰,易于理解",
    ]
    active_criteria = criteria or default_criteria
    criteria_text = "\n".join(f"{i+1}. {c}" for i, c in enumerate(active_criteria))
    # Bug fix: the JSON schema in the prompt used to hard-code the four
    # default criterion names, so custom `criteria` were scored under the
    # wrong keys. Derive the score keys from the active criteria instead
    # (the name is the part before the first full-width or ASCII colon).
    names = [c.split(":", 1)[0].split(":", 1)[0] for c in active_criteria]
    scores_schema = ",\n".join(f'"{name}": 分数' for name in names)
    reference_text = f"\n\n参考标准答案:{reference}" if reference else ""
    prompt = f"""你是一个客观的 AI 评审员。请对以下 AI 回答进行评分。
问题:{question}
AI 回答:{response}{reference_text}
请按以下标准评分(每项 1-5 分):
{criteria_text}
输出 JSON 格式:
{{
"scores": {{
{scores_schema}
}},
"overall": 综合分数(1-10),
"strengths": ["优点1", "优点2"],
"weaknesses": ["不足1", "不足2"],
"reasoning": "评分理由"
}}"""
    response_obj = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,  # judging must be stable → temperature=0
    )
    return json.loads(response_obj.choices[0].message.content)
2.2 避免 LLM Judge 的偏见
def llm_judge_pairwise(
    question: str,
    response_a: str,
    response_b: str,
) -> dict:
    """A/B-compare two answers with an LLM judge, controlling position bias.

    The judge is called twice — once with A first, once with B first — and a
    verdict is accepted only when both orderings agree; otherwise a
    low-confidence tie is returned. (Fixed: the old comment claimed a random
    swap with averaging, and an unused `import random` backed that up; the
    actual strategy is deterministic both-orders agreement.)

    Returns:
        {"winner": "A" | "B" | "tie", "confidence": "high" | "low"}
    """
    verdicts = []
    for swapped in (False, True):
        if swapped:
            first, second, first_label, second_label = response_b, response_a, "B", "A"
        else:
            first, second, first_label, second_label = response_a, response_b, "A", "B"
        prompt = f"""比较两个 AI 对同一问题的回答,选出更好的那个。
问题:{question}
回答1:{first}
回答2:{second}
哪个回答更好?输出 JSON:
{{"winner": "1或2", "reason": "原因", "margin": "slight/moderate/significant"}}"""
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            temperature=0,  # judging must be stable
        )
        parsed = json.loads(completion.choices[0].message.content)
        # Map the judge's "1"/"2" back to the original A/B labels for this ordering.
        winner = first_label if parsed["winner"] == "1" else second_label
        verdicts.append({"winner": winner, "margin": parsed["margin"]})
    # Accept only when both orderings agree; disagreement signals position bias.
    if verdicts[0]["winner"] == verdicts[1]["winner"]:
        return {"winner": verdicts[0]["winner"], "confidence": "high"}
    return {"winner": "tie", "confidence": "low"}
三、自动化评测流水线
3.1 评测数据集管理
import json
from pathlib import Path
from dataclasses import dataclass, asdict
from datetime import datetime
@dataclass
class EvalCase:
    """One evaluation example: a question, its reference answer, and routing
    metadata (category / difficulty / tags) used to slice results."""

    id: str
    question: str
    expected_answer: str
    category: str               # e.g. factual / reasoning / creative / code
    difficulty: str             # easy / medium / hard
    tags: list[str]
    # ISO-8601 creation timestamp; auto-filled when not supplied.
    # (Annotation fixed: was `str = None`, which mistypes the default.)
    created_at: "str | None" = None

    def __post_init__(self):
        if not self.created_at:
            self.created_at = datetime.now().isoformat()
class EvalDataset:
    """A JSON-file-backed collection of EvalCase records."""

    def __init__(self, name: str, path: str):
        self.name = name
        self.path = Path(path)
        self.cases: "list[EvalCase]" = []
        self._load()

    def _load(self):
        """Load cases from disk if the backing file already exists."""
        if self.path.exists():
            with open(self.path, "r", encoding="utf-8") as f:
                data = json.load(f)
            self.cases = [EvalCase(**c) for c in data]

    def _save(self):
        """Persist all cases to the backing JSON file.

        Bug fix: `add_case` called `self._save()`, but this method was
        missing entirely — the first `add_case` raised AttributeError.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.path, "w", encoding="utf-8") as f:
            json.dump([asdict(c) for c in self.cases], f, ensure_ascii=False, indent=2)

    def add_case(self, case: "EvalCase"):
        """Append a case and persist the dataset immediately."""
        self.cases.append(case)
        self._save()

    def filter(self, category: "str | None" = None, difficulty: "str | None" = None) -> "list[EvalCase]":
        """Return the cases matching the given category and/or difficulty."""
        selected = self.cases
        if category:
            selected = [c for c in selected if c.category == category]
        if difficulty:
            selected = [c for c in selected if c.difficulty == difficulty]
        return selected
3.2 完整评测流水线
import asyncio
from concurrent.futures import ThreadPoolExecutor
class EvalPipeline:
    """Runs an evaluation dataset against a target system and grades each
    answer with an LLM judge."""

    def __init__(self, target_system, judge_model: str = "gpt-4o"):
        # target_system must expose `async generate(question) -> str`
        # (assumed from usage in _eval_single — confirm against callers).
        self.target = target_system
        # NOTE(review): judge_model is stored but llm_judge_response hard-codes
        # its own model — confirm intended wiring.
        self.judge_model = judge_model
        self.results = []  # kept for interface compatibility

    async def run(self, dataset: "EvalDataset", batch_size: int = 10) -> dict:
        """Evaluate every case in `dataset`, `batch_size` cases concurrently,
        and return the aggregated summary dict."""
        cases = dataset.cases
        all_results = []
        for start in range(0, len(cases), batch_size):
            batch = cases[start:start + batch_size]
            batch_results = await asyncio.gather(*[
                self._eval_single(case) for case in batch
            ])
            all_results.extend(batch_results)
            print(f"进度: {min(start + batch_size, len(cases))}/{len(cases)}")
        return self._summarize(all_results)

    async def _eval_single(self, case: "EvalCase") -> dict:
        """Generate an answer for one case and have the LLM judge score it."""
        response = await self.target.generate(case.question)
        judge_result = llm_judge_response(
            question=case.question,
            response=response,
            reference=case.expected_answer,
        )
        return {
            "case_id": case.id,
            "category": case.category,
            "difficulty": case.difficulty,
            "response": response,
            "scores": judge_result["scores"],
            "overall": judge_result["overall"],
        }

    def _summarize(self, results: list[dict]) -> dict:
        """Aggregate per-case judge scores into overall and per-category stats."""
        # Robustness fix: an empty run used to raise ZeroDivisionError here.
        if not results:
            return {
                "total_cases": 0,
                "overall_mean": 0.0,
                "overall_median": 0.0,
                "by_category": {},
                "raw_results": [],
            }
        overall_scores = [r["overall"] for r in results]
        category_scores: dict[str, list] = {}
        for r in results:
            category_scores.setdefault(r["category"], []).append(r["overall"])
        return {
            "total_cases": len(results),
            "overall_mean": sum(overall_scores) / len(overall_scores),
            # Upper median (matches the original index-based computation).
            "overall_median": sorted(overall_scores)[len(overall_scores) // 2],
            "by_category": {
                cat: sum(scores) / len(scores)
                for cat, scores in category_scores.items()
            },
            "raw_results": results,
        }
四、持续监控:线上质量追踪
import random
from datetime import datetime
class ProductionEvalMonitor:
    """Production quality monitor: randomly samples completed requests and
    batch-evaluates them with an LLM judge, alerting on quality drops."""

    def __init__(self, sample_rate: float = 0.05, batch_size: int = 100,
                 alert_threshold: float = 7.0):
        """
        Args:
            sample_rate: fraction of requests to sample (default 5%).
            batch_size: queue length that triggers a batch evaluation
                (previously hard-coded to 100).
            alert_threshold: alert when the batch's mean overall score drops
                below this value (previously hard-coded to 7.0).
        """
        self.sample_rate = sample_rate
        self.batch_size = batch_size
        self.alert_threshold = alert_threshold
        self.eval_queue = []

    def on_request_complete(self, request: str, response: str, metadata: dict):
        """Hook to call after every completed request; samples and enqueues."""
        if random.random() < self.sample_rate:
            self.eval_queue.append({
                "timestamp": datetime.now().isoformat(),
                "request": request,
                "response": response,
                "metadata": metadata,
            })
            # Evaluate in batches once the queue is full.
            if len(self.eval_queue) >= self.batch_size:
                self._process_queue()

    def _process_queue(self):
        """Judge one batch of sampled requests and alert on low quality."""
        batch = self.eval_queue[:self.batch_size]
        self.eval_queue = self.eval_queue[self.batch_size:]
        scores = []
        for item in batch:
            result = llm_judge_response(
                question=item["request"],
                response=item["response"],
            )
            scores.append(result["overall"])
        if not scores:  # guard: nothing sampled, nothing to average
            return
        avg_score = sum(scores) / len(scores)
        if avg_score < self.alert_threshold:
            self._trigger_alert(avg_score, scores)

    def _trigger_alert(self, avg_score: float, scores: list):
        """Emit a quality alert.

        Bug fix: this method was called but never defined, so any quality drop
        crashed with AttributeError instead of alerting. Stub implementation —
        wire it to real alerting (Slack / PagerDuty / metrics) in production.
        """
        print(
            f"[ALERT] production answer quality dropped: "
            f"avg={avg_score:.2f} over {len(scores)} sampled responses"
        )
结语
LLM 评估工程的核心价值,在于把"AI 质量好不好"这个模糊问题,变成可量化、可追踪、可比较的具体指标。
从 LLM-as-Judge 开始,建立基础评测能力;再建设评测数据集,形成标准基准;最后引入持续监控,实现质量的线上追踪。这三步构成了完整的 AI 应用质量保障体系。
有了评估体系,每一次模型升级、Prompt 优化、系统改动,才能有据可依地做出"这次改动让质量提升了 X%"的判断。