Trae Agent Evaluation 模块架构分析
概述
Evaluation 模块是 Trae Agent 项目的评估系统,用于在 SWE-bench、SWE-bench-Live 和 Multi-SWE-bench 等基准测试上评估 Agent 的性能。该模块采用 Docker 容器化执行、并行处理和模块化设计。
一、整体架构
evaluation/
├── run_evaluation.py # 主评估入口
├── utils.py # 工具函数
├── setup.sh # 环境设置脚本
├── patch_selection/ # Patch 选择模块(Selector Agent)
│ ├── selector.py # Selector 主入口
│ ├── analysis.py # 结果分析工具
│ └── trae_selector/ # Selector Agent 核心
│ ├── selector_evaluation.py # 评估逻辑
│ ├── selector_agent.py # Selector Agent 实现
│ ├── sandbox.py # Docker 沙箱
│ └── tools/ # 工具集
└── README.md # 文档
二、核心组件详解
2.1 BenchmarkEvaluation 类
文件: evaluation/run_evaluation.py
这是主评估类,负责管理整个评估流程:
class BenchmarkEvaluation:
"""
主评估类,处理:
1. Docker 镜像管理
2. 环境准备
3. Patch 生成
4. 结果评估
"""
def __init__(
self,
benchmark: str, # 基准名称 (swe_bench/swe_bench_live/multi_swe_bench)
working_dir: str, # 工作目录
trae_config_file_name: str, # Trae Agent 配置
dataset: str = "SWE-bench_Verified", # 数据集
max_workers: int = 4, # 并行工作数
instance_ids: list[str] | None = None, # 指定实例
):
# 加载配置和数据集
self.config = BENCHMARK_CONFIG[benchmark]
self.dataset = self.config.load_dataset(dataset)
self.docker_client = docker.from_env()
# 并行执行设置
self.max_workers = max_workers
self.instance_ids = instance_ids
核心方法
| 方法 | 功能 |
|---|---|
| pull_images() | 拉取所需 Docker 镜像 |
| prepare_trae_agent() | 构建 Trae Agent 环境 |
| run_instances() | 并行运行实例评估 |
| evaluate() | 执行评估并收集结果 |
执行流程
def run_instances(self):
"""并行执行多个实例"""
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# 提交所有任务
futures = {
executor.submit(self._run_single_instance, instance_id): instance_id
for instance_id in self.instance_ids
}
# 收集结果
for future in as_completed(futures):
instance_id = futures[future]
try:
result = future.result()
self._save_result(instance_id, result)
except Exception as e:
print(f"Instance {instance_id} failed: {e}")
2.2 配置系统 (BENCHMARK_CONFIG)
文件: evaluation/utils.py
@dataclass
class BenchmarkConfig:
"""基准测试配置"""
name: str
valid_datasets: list[str]
load_dataset: Callable[[str], list[dict]] # 数据集加载函数
image_name: Callable[[str], str] # 镜像名称生成
evaluate_harness: Callable[..., None] # 评估函数
# 配置注册表
BENCHMARK_CONFIG = {
"swe_bench": BenchmarkConfig(
name="swe_bench",
valid_datasets=["SWE-bench_Verified", "SWE-bench_Lite", "SWE-bench_Test"],
load_dataset=lambda name: load_dataset(name),
image_name=lambda instance_id: f"swebench/sweb.eval.x86_64.{instance_id}",
evaluate_harness=swebench_evaluate_harness_after,
),
"swe_bench_live": BenchmarkConfig(...),
"multi_swe_bench": BenchmarkConfig(...),
}
设计优点:
- 统一接口,支持多种基准测试
- 易于扩展新的基准
- 配置与逻辑分离
2.3 Docker 执行环境
镜像管理
def _check_images(self):
"""检查 Docker 镜像是否存在"""
for item in self.dataset:
instance_id = item["instance_id"]
image_name = self._image_name(instance_id)
try:
self.docker_client.images.get(image_name)
self.image_status[instance_id] = True
except ImageNotFound:
self.image_status[instance_id] = False
def pull_images(self):
"""拉取缺失的镜像"""
instance_ids = [id for id, exists in self.image_status.items() if not exists]
for instance_id in tqdm(instance_ids, desc="Downloading images"):
image_name = self._image_name(instance_id)
self.docker_client.images.pull(image_name)
容器执行
def docker_exec(container: Container, command: str) -> tuple[int, str]:
"""在容器中执行命令"""
exec_result: ExecResult = container.exec_run(cmd=command)
return_code = exec_result[0]
output = exec_result[1].decode("utf-8")
return return_code, output
三、Patch Selection 模块(Selector Agent)
3.1 架构概述
Selector Agent 是一个基于 Agent 的集成推理系统,用于从多个候选 Patch 中选择最佳方案。
patch_selection/
├── selector.py # 主入口
├── analysis.py # 结果分析
└── trae_selector/
├── selector_evaluation.py # 评估协调
├── selector_agent.py # Agent 实现
├── sandbox.py # Docker 沙箱
└── tools/ # 工具集
├── execute_bash.py
├── execute_str_replace_editor.py
└── ...
3.2 SelectorEvaluation 类
文件: evaluation/patch_selection/trae_selector/selector_evaluation.py
class SelectorEvaluation:
"""
Patch 选择评估器
支持分组处理和多数投票
"""
def __init__(
self,
llm_config: ModelConfig,
num_candidate: int, # 候选 Patch 数量
max_retry: int, # 最大重试次数
max_turn: int, # 最大对话轮数
log_path: str,
output_path: str,
patches_path: str,
instance_list: list[dict],
candidate_dic: dict,
tools_path: str,
statistics_path: str,
group_size: int, # 分组大小
majority_voting: bool = True, # 是否启用多数投票
):
...
分组处理策略
def run_instance(
instance,
candidate_log,
num_candidate: int,
group_size: int,
...
):
"""将候选 Patch 分组处理"""
# 将 N 个候选分为 M 组(每组 group_size 个)
groups = []
for i in range(0, num_candidate, group_size):
this_group = {
"instance_id": candidate_log["instance_id"],
"patches": candidate_log["patches"][i:i + group_size],
"regressions": candidate_log["regressions"][i:i + group_size],
"success_id": candidate_log["success_id"][i:i + group_size],
}
groups.append(this_group)
# 每组独立选择
for group_id, group in enumerate(groups):
run_instance_by_group(
instance=instance,
candidate_log=group,
group_id=group_id,
...
)
多数投票机制
def run_instance_by_group(..., majority_voting=True):
"""多数投票选择最佳 Patch"""
if majority_voting:
# 多次运行,统计选择频率
selections = []
for trial in range(num_trials):
selected_patch = selector_agent.run()
selections.append(selected_patch)
# 选择频率最高的 Patch
counter = Counter(selections)
final_selection = counter.most_common(1)[0][0]
else:
# 单次选择
final_selection = selector_agent.run()
3.3 SelectorAgent 类
文件: evaluation/patch_selection/trae_selector/selector_agent.py
class SelectorAgent:
"""
Patch 选择 Agent
职责:分析候选 Patch 并选择最佳方案
"""
def __init__(
self,
llm_config: ModelConfig,
sandbox: Sandbox,
project_path: str,
issue_description: str,
candidate_list: list[CandidatePatch],
max_turn: int = 50,
):
self.llm_config = llm_config
self.sandbox = sandbox
self.candidate_list = candidate_list
self.max_turn = max_turn
# 初始化工具
self.tools = [
tools_registry[tool_name](model_provider=llm_config.model_provider.provider)
for tool_name in ["bash", "str_replace_based_edit_tool"]
]
# 初始化 LLM 客户端
self.llm_client = LLMClient(llm_config)
# 构建系统提示词
self.initial_messages = [
LLMMessage(role="system", content=build_system_prompt(len(candidate_list)))
]
系统提示词设计
def build_system_prompt(candidate_length: int) -> str:
"""构建系统提示词"""
return f"""\
# ROLE: Act as an expert code evaluator.
Given a codebase, a GitHub issue and **{candidate_length} candidate patches**
proposed by your colleagues, your responsibility is to **select the correct one**
to solve the issue.
# WORK PROCESS:
1. Understand the Issue and Codebase
2. Analyze the Candidate Patches
3. Validate Functionality (Optional but Recommended)
4. Select the Best Patch
# FINAL REPORT:
### Status: succeed
### Result: Patch-x
### Analysis: [Explain why Patch-x is correct.]
"""
执行循环
def run(self):
"""Agent 执行主循环"""
messages = self.initial_messages
turn = 0
while turn < self.max_turn:
turn += 1
# 1. 调用 LLM
llm_response = self.llm_client.chat(messages, self.llm_config, self.tools)
# 2. 检查是否完成选择
if self._check_final_answer(llm_response.content):
return self._extract_selection(llm_response.content)
# 3. 执行工具调用
tool_results = parse_tool_response(llm_response, ...)
# 4. 更新消息历史
messages.extend(tool_results)
3.4 Sandbox 沙箱系统
文件: evaluation/patch_selection/trae_selector/sandbox.py
class Sandbox:
"""
Docker 沙箱环境
提供隔离的执行环境
"""
def __init__(
self,
namespace: str,
name: str,
tag: str,
instance: dict,
tools_path: str,
):
self.namespace = namespace
self.name = name
self.tag = tag
self.client = docker.from_env()
self.commit_id = instance["base_commit"]
self.tools_path = tools_path
self.container = None
self.shell = None
def start_container(self):
"""启动 Docker 容器"""
image = f"{self.namespace}/{self.name}:{self.tag}"
self.container = self.client.containers.run(
image,
detach=True,
tty=True,
stdin_open=True,
privileged=True,
volumes={
"/tmp": {"bind": "/tmp", "mode": "rw"}
}
)
# 复制工具到容器
cmd = f"docker cp {self.tools_path} {self.container.name}:/home/swe-bench/"
subprocess.run(cmd, check=True, shell=True)
# 检出代码
self.container.exec_run(f"git checkout {self.commit_id}")
交互式 Shell 会话
def start_shell(self):
"""启动交互式 Shell"""
command = f"docker exec -it {self.container.id} /bin/bash"
self.shell = pexpect.spawn(command, maxread=200000)
self.shell.expect([r"\$ ", r"# "], timeout=10)
def get_session(self):
"""获取可执行会话"""
self.start_shell()
class Session:
def __init__(self, sandbox):
self.sandbox = sandbox
def execute(self, command, timeout=60):
"""在沙箱中执行命令"""
self.sandbox.shell.sendline(command)
self.sandbox.shell.expect([r"swe-bench@.*:.*\$ "], timeout)
return self.sandbox.shell.before.decode("utf-8")
return Session(self)
四、工具执行系统
4.1 工具代理模式
文件: evaluation/patch_selection/trae_selector/tools/tools/execute_bash.py
#!/usr/bin/env python3
"""
在 Docker 容器内执行 Bash 命令的代理脚本
"""
import argparse
import subprocess
import sys
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--command", required=True)
args = parser.parse_args()
try:
result = subprocess.run(
args.command,
shell=True,
capture_output=True,
text=True,
timeout=300
)
print(result.stdout)
print(result.stderr, file=sys.stderr)
if result.returncode == 0:
print("Tool Call Status: 0")
else:
print("Tool Call Status: -1")
except Exception as e:
print(f"Error: {e}")
print("Tool Call Status: -1")
if __name__ == "__main__":
main()
4.2 工具调用解析
def parse_tool_response(answer: LLMResponse, finish_reason: str, sandbox_session):
"""解析 LLM 的工具调用请求"""
result = []
for tool_call in answer.tool_calls:
tool_name = tool_call.name
tool_arguments = tool_call.arguments
# 构建执行命令
if tool_name == "str_replace_based_edit_tool":
cmd = "cd /home/swe-bench/tools/ && /home/swe-bench/py312/bin/python3 execute_str_replace_editor.py"
elif tool_name == "bash":
cmd = "cd /home/swe-bench/tools/ && /home/swe-bench/py312/bin/python3 execute_bash.py"
# 添加参数
for key, value in tool_arguments.items():
cmd += f" --{key} {shlex.quote(str(value))}"
# 在沙箱中执行
output = sandbox_session.execute(cmd)
# 解析执行状态
if "Tool Call Status: 0" in output:
success = True
else:
success = False
result.append(LLMMessage(
role="user",
content=output,
tool_result=ToolResult(...)
))
return result
五、结果分析
5.1 输出结构
results/
├── log/ # LLM 交互日志
│ └── group_0/
│ └── instance_id_voting_0_trial_1.json
├── output/ # 标准输出/错误
│ └── group_0/
│ └── instance_id.log
├── patch/ # 选中的 Patch
│ └── group_0/
│ └── instance_id_1.patch
└── statistics/ # 统计结果
└── group_0/
└── instance_id.json
5.2 统计分析
文件: evaluation/patch_selection/analysis.py
def analyze_results(result_path: str):
"""分析选择结果"""
statistics = []
for stats_file in Path(result_path).glob("statistics/**/*.json"):
with open(stats_file) as f:
data = json.load(f)
statistics.append({
"instance_id": data["instance_id"],
"selected_patch": data["selected_patch"],
"is_correct": data["is_correct"],
"success_rate": data["success_rate"],
})
# 计算整体准确率
total = len(statistics)
correct = sum(1 for s in statistics if s["is_correct"])
accuracy = correct / total if total > 0 else 0
print(f"Total instances: {total}")
print(f"Correct selections: {correct}")
print(f"Accuracy: {accuracy:.2%}")
六、设计模式总结
| 设计模式 | 应用场景 | 实现位置 |
|---|---|---|
| 策略模式 | 不同基准测试的配置策略 | BENCHMARK_CONFIG |
| 工厂模式 | LLMClient、Sandbox 创建 | LLMClient.create(), Sandbox.__init__() |
| 模板方法模式 | 评估流程定义 | BenchmarkEvaluation.run_instances() |
| 代理模式 | Docker 容器执行 | docker_exec(), Sandbox |
| 命令模式 | 工具调用封装 | ToolCall, execute_bash.py |
| 观察者模式 | 结果收集 | ThreadPoolExecutor + as_completed() |
| 建造者模式 | 系统提示词构建 | build_system_prompt() |
| 迭代器模式 | 分组处理 | range(0, num_candidate, group_size) |
七、使用示例
7.1 运行完整评估
# 设置环境
chmod +x setup.sh
./setup.sh swe_bench
# 运行评估
python -m evaluation.run_evaluation \
--benchmark swe_bench \
--dataset SWE-bench_Verified \
--working_dir ./workspace \
--trae_config_file_name trae_config.yaml \
--max_workers 4
7.2 运行 Patch 选择
python evaluation/patch_selection/selector.py \
--instances_path "swebench-verified.json" \
--candidate_path "patch_candidates.jsonl" \
--result_path "./results" \
--num_candidate 10 \
--group_size 10 \
--max_workers 10 \
--config_file trae_config.yaml \
--model_name trae_agent_model \
--majority_voting
八、核心特点
- 容器化执行:使用 Docker 提供隔离环境
- 并行处理:支持多实例并行评估
- 模块化设计:易于扩展新的基准测试
- 可复现性:固定 commit 和依赖版本
- Agent 集成:支持 Selector Agent 智能选择
- 容错机制:重试和错误处理
最后更新: 2026-03-16