第10章 MCP系统的监控与调试
前言
MCP系统在生产环境中运行时,监控和调试能力至关重要。本章讲解如何建立完善的监控体系、诊断问题、优化性能,确保MCP系统稳定可靠。
10.1 MCP通信的日志与追踪
10.1.1 结构化日志系统
graph TB
A["日志系统"] --> B["日志等级"]
A --> C["日志内容"]
A --> D["日志存储"]
A --> E["日志分析"]
B --> B1["DEBUG - 详细信息"]
B --> B2["INFO - 一般信息"]
B --> B3["WARNING - 警告信息"]
B --> B4["ERROR - 错误信息"]
B --> B5["CRITICAL - 严重错误"]
C --> C1["时间戳"]
C --> C2["事件类型"]
C --> C3["操作标识"]
C --> C4["关键数据"]
D --> D1["本地文件"]
D --> D2["中央日志系统"]
D --> D3["时间序列数据库"]
E --> E1["聚合分析"]
E --> E2["异常检测"]
E --> E3["趋势追踪"]
结构化日志设计 (Python):
import logging
import json
from datetime import datetime
from typing import Dict, Any, Optional
class StructuredLogger:
    """Structured (JSON) logger for MCP events.

    Every record is emitted as one JSON object containing a timestamp,
    an event type, a level, and arbitrary extra fields, which makes the
    log stream machine-parseable.
    """

    def __init__(self, name: str, log_level: str = "INFO"):
        self.logger = logging.getLogger(name)
        self.logger.setLevel(getattr(logging, log_level))
        # Bug fix: logging.getLogger(name) returns the SAME logger object
        # for the same name, so unconditionally adding a handler on every
        # construction duplicated each record. Attach the handler only once.
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def log_event(self, event_type: str, level: str = "INFO",
                  **kwargs):
        """
        Emit one structured log record.

        Args:
            event_type: Category of the event (e.g. "connection").
            level: Log level name ("DEBUG", "INFO", "WARNING", ...).
            **kwargs: Additional fields merged into the JSON record.
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "event_type": event_type,
            "level": level,
            **kwargs
        }
        log_func = getattr(self.logger, level.lower())
        log_func(json.dumps(log_entry, ensure_ascii=False))

    def log_connection_event(self, server_name: str, state: str,
                             duration: Optional[float] = None,
                             error: Optional[str] = None):
        """Record a connection state change (WARNING unless connected)."""
        self.log_event(
            "connection",
            "INFO" if state == "connected" else "WARNING",
            server_name=server_name,
            state=state,
            # Bug fix: use an explicit None check so a legitimate 0.0s
            # duration is logged as 0 instead of being dropped (the
            # original `if duration` treated 0.0 as "missing").
            duration_ms=int(duration * 1000) if duration is not None else None,
            error=error
        )

    def log_tool_execution(self, tool_name: str, execution_time: float,
                           success: bool, error: Optional[str] = None,
                           arguments_hash: Optional[str] = None):
        """Record one tool invocation (ERROR level on failure)."""
        self.log_event(
            "tool_execution",
            "INFO" if success else "ERROR",
            tool_name=tool_name,
            execution_time_ms=int(execution_time * 1000),
            success=success,
            error=error,
            arguments_hash=arguments_hash
        )

    def log_resource_access(self, resource_uri: str, operation: str,
                            execution_time: float, success: bool,
                            size_bytes: Optional[int] = None,
                            error: Optional[str] = None):
        """Record one resource access (ERROR level on failure)."""
        self.log_event(
            "resource_access",
            "INFO" if success else "ERROR",
            resource_uri=resource_uri,
            operation=operation,
            execution_time_ms=int(execution_time * 1000),
            success=success,
            size_bytes=size_bytes,
            error=error
        )
# Usage example: create a DEBUG-level structured logger and emit
# one connection event and one tool-execution event.
logger = StructuredLogger("MCP", "DEBUG")
logger.log_connection_event(
    server_name="database",
    state="connected",
    duration=0.123
)
logger.log_tool_execution(
    tool_name="query_sales",
    execution_time=0.456,
    success=True,
    arguments_hash="abc123"
)
10.1.2 分布式追踪
import asyncio
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional
@dataclass
class Span:
    """A single timed operation (span) within a distributed trace."""
    trace_id: str
    span_id: str
    parent_span_id: Optional[str]
    operation_name: str
    start_time: datetime
    end_time: Optional[datetime] = None
    # Bug fix: default_factory gives every Span its own containers. The
    # original defaulted these to None, which crashed tags.update() /
    # logs.append() on spans constructed without explicit values.
    tags: Dict[str, str] = field(default_factory=dict)
    logs: List[Dict] = field(default_factory=list)

    def duration_ms(self) -> Optional[float]:
        """Elapsed time in milliseconds, or None while the span is open."""
        if not self.end_time:
            return None
        return (self.end_time - self.start_time).total_seconds() * 1000


class Tracer:
    """Minimal in-process distributed tracer.

    Open spans are kept on a stack so nested operations automatically
    pick up their parent; finished spans are collected per trace_id.
    """

    def __init__(self):
        self.spans: Dict[str, List[Span]] = {}
        self.current_trace_id: Optional[str] = None
        self.current_span_stack: List[Span] = []

    def start_trace(self) -> str:
        """Begin a new trace, make it current, and return its id."""
        trace_id = str(uuid.uuid4())
        self.current_trace_id = trace_id
        self.spans[trace_id] = []
        return trace_id

    def start_span(self, operation_name: str, tags: Dict = None) -> Span:
        """Open a span under the current trace (auto-starting one if needed).

        Args:
            operation_name: Logical name of the traced operation.
            tags: Optional initial key/value tags.
        """
        # Robustness fix: the original produced spans with trace_id=None
        # when start_span() was called before start_trace().
        if self.current_trace_id is None:
            self.start_trace()
        parent_span_id = None
        if self.current_span_stack:
            parent_span_id = self.current_span_stack[-1].span_id
        span = Span(
            trace_id=self.current_trace_id,
            span_id=str(uuid.uuid4()),
            parent_span_id=parent_span_id,
            operation_name=operation_name,
            start_time=datetime.now(),
            tags=tags or {}
        )
        self.current_span_stack.append(span)
        return span

    def end_span(self, span: Span, tags: Dict = None):
        """Close a span, merge final tags, and record it under its trace."""
        span.end_time = datetime.now()
        if tags:
            span.tags.update(tags)
        if span in self.current_span_stack:
            self.current_span_stack.remove(span)
        if span.trace_id in self.spans:
            self.spans[span.trace_id].append(span)

    def get_trace(self, trace_id: str) -> List[Span]:
        """Return the finished spans recorded for trace_id ([] if unknown)."""
        return self.spans.get(trace_id, [])

    def export_trace_to_jaeger(self, trace_id: str) -> Dict:
        """Export a trace as a Jaeger-style dict (times in microseconds)."""
        spans = self.get_trace(trace_id)
        return {
            "traceID": trace_id,
            "spans": [
                {
                    "traceID": span.trace_id,
                    "spanID": span.span_id,
                    "parentSpanID": span.parent_span_id,
                    "operationName": span.operation_name,
                    "startTime": int(span.start_time.timestamp() * 1e6),
                    # Bug fix: duration_ms() is None for an unfinished
                    # span; the original crashed on `None * 1000`.
                    "duration": int((span.duration_ms() or 0) * 1000),
                    "tags": span.tags
                }
                for span in spans
            ]
        }
# Usage example
tracer = Tracer()
async def example_trace():
    """Tracing example: one connect span, one query span, then export."""
    trace_id = tracer.start_trace()
    # Step 1: connect
    conn_span = tracer.start_span("connect_server", {"server": "database"})
    await asyncio.sleep(0.1) # simulated work
    tracer.end_span(conn_span, {"success": "true"})
    # Step 2: query
    query_span = tracer.start_span("execute_query")
    await asyncio.sleep(0.2) # simulated query
    tracer.end_span(query_span, {"rows": "1000"})
    # Export the collected trace as Jaeger-style JSON
    trace_data = tracer.export_trace_to_jaeger(trace_id)
    print(json.dumps(trace_data, indent=2))
10.2 调试工具与技巧
10.2.1 MCP消息检查器
from typing import List, Dict
from enum import Enum
class MessageDirection(Enum):
    """Direction of a captured MCP message relative to the client."""
    OUTBOUND = "outbound"  # sent by the client
    INBOUND = "inbound"    # sent by the server


class MCPMessageInspector:
    """Capture, filter, and summarize MCP protocol messages.

    Keeps a bounded, chronological history of captured messages for
    interactive debugging and offline export.
    """

    def __init__(self, capture_size: int = 1000):
        self.message_history: List[Dict] = []
        self.max_size = capture_size

    def capture_message(self, message: Dict, direction: MessageDirection,
                        server_name: str = "default"):
        """
        Record one message with its direction, server, and encoded size.

        Args:
            message: The raw JSON-RPC message.
            direction: Whether the client sent or received it.
            server_name: Which server the message belongs to.
        """
        entry = {
            "timestamp": datetime.now().isoformat(),
            "direction": direction.value,
            "server": server_name,
            "message": message,
            "size_bytes": len(json.dumps(message).encode('utf-8'))
        }
        self.message_history.append(entry)
        # Evict the oldest entries once the bounded buffer overflows.
        while len(self.message_history) > self.max_size:
            del self.message_history[0]

    def filter_messages(self, method: str = None, direction: str = None,
                        success_only: bool = False) -> List[Dict]:
        """
        Return the captured messages matching all given criteria.

        Args:
            method: JSON-RPC method to match (top level or inside params).
            direction: "outbound" or "inbound".
            success_only: Keep only messages without an "error" member.

        Returns:
            Matching entries, oldest first.
        """
        selected = list(self.message_history)
        if method:
            def _method_matches(entry):
                body = entry["message"]
                return (body.get("method") == method
                        or body.get("params", {}).get("method") == method)
            selected = [e for e in selected if _method_matches(e)]
        if direction:
            selected = [e for e in selected if e["direction"] == direction]
        if success_only:
            selected = [e for e in selected if "error" not in e["message"]]
        return selected

    def get_message_statistics(self) -> Dict:
        """Summarize counts, total bytes, and average message size."""
        history = self.message_history
        total = len(history)
        byte_total = sum(e["size_bytes"] for e in history)
        return {
            "total_messages": total,
            "outbound_count": sum(1 for e in history
                                  if e["direction"] == "outbound"),
            "inbound_count": sum(1 for e in history
                                 if e["direction"] == "inbound"),
            "total_bytes": byte_total,
            "avg_message_size": byte_total / total if total > 0 else 0
        }

    def export_messages(self, filename: str):
        """Dump statistics plus the full history to a JSON file."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                "export_time": datetime.now().isoformat(),
                "statistics": self.get_message_statistics(),
                "messages": self.message_history
            }, f, indent=2, ensure_ascii=False)
# Usage example
inspector = MCPMessageInspector()
# Capture an outgoing request
outgoing_msg = {
    "jsonrpc": "2.0",
    "method": "tools/list",
    "id": "1"
}
inspector.capture_message(outgoing_msg, MessageDirection.OUTBOUND)
# Inspect aggregate statistics
stats = inspector.get_message_statistics()
print(f"Total messages: {stats['total_messages']}")
print(f"Avg size: {stats['avg_message_size']:.0f} bytes")
# Export the captured history to disk
inspector.export_messages("mcp_messages.json")
10.2.2 交互式调试器
import cmd
from typing import Optional
class MCPDebugger(cmd.Cmd):
    """Interactive command-line debugger for an MCP client session.

    Each do_* method is a debugger command; its docstring doubles as
    the `help` text shown by cmd.Cmd.
    """

    def __init__(self, client, inspector: MCPMessageInspector):
        super().__init__()
        self.client = client
        self.inspector = inspector
        self.prompt = "mcp> "

    def do_messages(self, arg):
        """Show message history
        Usage: messages [method] [--outbound|--inbound]
        """
        parts = arg.split()
        method = parts[0] if parts else None
        direction = parts[1].replace("--", "") if len(parts) > 1 else None
        messages = self.inspector.filter_messages(method=method, direction=direction)
        # Show only the 10 most recent matches to keep output readable.
        for i, msg in enumerate(messages[-10:]):
            print(f"\n[{i}] {msg['timestamp']} ({msg['direction']})")
            print(json.dumps(msg['message'], indent=2, ensure_ascii=False))

    def do_stats(self, arg):
        """Show message statistics"""
        stats = self.inspector.get_message_statistics()
        print("\n=== Message Statistics ===")
        for key, value in stats.items():
            print(f"{key}: {value}")

    def do_tools(self, arg):
        """List all available tools"""
        # Would call the real client's tool-listing method here.
        print("Available tools:")
        # ... implement tool listing

    def do_call(self, arg):
        """Call a tool
        Usage: call tool_name arg1=value1 arg2=value2
        """
        parts = arg.split()
        if not parts:
            print("Usage: call tool_name [args...]")
            return
        tool_name = parts[0]
        arguments = {}
        # Parse key=value pairs; values stay strings here.
        for part in parts[1:]:
            if "=" in part:
                key, value = part.split("=", 1)
                arguments[key] = value
        print(f"Calling {tool_name} with {arguments}...")
        # ... implement the actual tool call

    def do_export(self, arg):
        """Export captured messages to a file"""
        filename = arg or "mcp_debug_export.json"
        self.inspector.export_messages(filename)
        # Bug fix: the original printed the literal text "(unknown)"
        # instead of interpolating the target filename.
        print(f"Exported to {filename}")

    def do_quit(self, arg):
        """Quit the debugger"""
        return True
# 使用示例
# debugger = MCPDebugger(client, inspector)
# debugger.cmdloop()
10.3 性能分析与优化
10.3.1 性能指标收集
from collections import defaultdict
from statistics import mean, stdev
class PerformanceMetrics:
    """Collect latency, error, and cache metrics for named operations."""

    def __init__(self):
        # All recorded durations (ms) per operation name.
        self.operation_times: Dict[str, List[float]] = defaultdict(list)
        # Failure count per operation name.
        self.error_counts: Dict[str, int] = defaultdict(int)
        self.cache_hits: int = 0
        self.cache_misses: int = 0

    def record_operation(self, operation_name: str, duration_ms: float,
                         success: bool = True, error: Optional[str] = None):
        """Record one operation execution.

        Args:
            operation_name: Logical name of the operation.
            duration_ms: Wall-clock duration in milliseconds.
            success: Whether the operation succeeded.
            error: Optional error description.
        """
        self.operation_times[operation_name].append(duration_ms)
        # Bug fix: count every failure. The original also required a
        # non-empty `error` string, so failures reported without a
        # message were silently missing from the error statistics.
        if not success:
            self.error_counts[operation_name] += 1

    def record_cache_hit(self):
        """Record a cache hit."""
        self.cache_hits += 1

    def record_cache_miss(self):
        """Record a cache miss."""
        self.cache_misses += 1

    def get_statistics(self, operation_name: str = None) -> Dict:
        """Return latency/error statistics.

        Args:
            operation_name: A single operation to summarize; when None,
                a mapping of every operation to its stats is returned.
        """
        if operation_name:
            times = self.operation_times.get(operation_name, [])
            if not times:
                return {}
            ordered = sorted(times)  # sort once for all percentiles
            return {
                "operation": operation_name,
                "count": len(times),
                "avg_ms": mean(times),
                "min_ms": min(times),
                "max_ms": max(times),
                "stdev": stdev(times) if len(times) > 1 else 0,
                "p50_ms": ordered[len(ordered)//2],
                "p95_ms": ordered[int(len(ordered)*0.95)],
                "p99_ms": ordered[int(len(ordered)*0.99)],
                "errors": self.error_counts.get(operation_name, 0)
            }
        # No name given: statistics for every recorded operation.
        return {
            op: self.get_statistics(op)
            for op in self.operation_times.keys()
        }

    def get_cache_stats(self) -> Dict:
        """Return hit/miss counts and the overall hit rate (0 if no accesses)."""
        total = self.cache_hits + self.cache_misses
        return {
            "cache_hits": self.cache_hits,
            "cache_misses": self.cache_misses,
            "total_accesses": total,
            "hit_rate": self.cache_hits / total if total > 0 else 0
        }

    def generate_report(self) -> str:
        """Render a human-readable performance report."""
        report = []
        report.append("=== Performance Report ===\n")
        # Per-operation latency and error summary
        report.append("## Operation Performance\n")
        stats = self.get_statistics()
        for op_name, op_stats in stats.items():
            report.append(f"\n{op_name}:")
            report.append(f" Count: {op_stats['count']}")
            report.append(f" Avg: {op_stats['avg_ms']:.2f}ms")
            report.append(f" Min: {op_stats['min_ms']:.2f}ms")
            report.append(f" Max: {op_stats['max_ms']:.2f}ms")
            report.append(f" P95: {op_stats['p95_ms']:.2f}ms")
            report.append(f" P99: {op_stats['p99_ms']:.2f}ms")
            report.append(f" Errors: {op_stats['errors']}")
        # Cache effectiveness summary
        report.append("\n## Cache Statistics\n")
        cache_stats = self.get_cache_stats()
        report.append(f"Hit Rate: {cache_stats['hit_rate']*100:.1f}%")
        report.append(f"Total Accesses: {cache_stats['total_accesses']}")
        return "\n".join(report)
10.3.2 性能瓶颈识别
class BottleneckDetector:
    """Find slow and error-prone operations from collected metrics."""

    def __init__(self, metrics: PerformanceMetrics,
                 threshold_percentile: float = 95):
        self.metrics = metrics
        self.threshold_percentile = threshold_percentile

    def identify_slow_operations(self) -> List[Dict]:
        """
        Identify operations with samples above their percentile threshold.

        Returns:
            Summaries sorted by the share of slow samples, largest first.
        """
        findings = []
        for name, samples in self.metrics.operation_times.items():
            if not samples:
                continue
            # Per-operation cutoff at the configured percentile.
            cutoff = sorted(samples)[int(len(samples)*self.threshold_percentile/100)]
            # How many samples exceed that cutoff.
            above = sum(1 for value in samples if value > cutoff)
            if above > 0:
                findings.append({
                    "operation": name,
                    "slow_count": above,
                    "total_count": len(samples),
                    "threshold_ms": cutoff,
                    "percentage": above / len(samples) * 100
                })
        findings.sort(key=lambda item: item['percentage'], reverse=True)
        return findings

    def identify_error_prone_operations(self) -> List[Dict]:
        """Identify operations with at least one recorded error, worst first."""
        findings = []
        for name, failures in self.metrics.error_counts.items():
            attempts = len(self.metrics.operation_times.get(name, []))
            if attempts == 0:
                continue
            rate = failures / attempts
            if rate > 0:
                findings.append({
                    "operation": name,
                    "error_count": failures,
                    "total_count": attempts,
                    "error_rate": rate * 100
                })
        findings.sort(key=lambda item: item['error_rate'], reverse=True)
        return findings

    def get_optimization_recommendations(self) -> List[str]:
        """Build a list of human-readable tuning suggestions."""
        suggestions = []
        # A hit rate under 70% suggests the cache strategy needs work.
        cache_stats = self.metrics.get_cache_stats()
        if cache_stats['hit_rate'] < 0.7:
            suggestions.append(
                f"⚠️ 低缓存命中率 ({cache_stats['hit_rate']*100:.1f}%)。"
                "考虑调整缓存策略或增加缓存大小。"
            )
        # Surface the single slowest operation, if any.
        slow_ops = self.identify_slow_operations()
        if slow_ops:
            worst_slow = slow_ops[0]
            suggestions.append(
                f"⚠️ {worst_slow['operation']} 操作较慢 "
                f"({worst_slow['percentage']:.1f}% 超过阈值)。"
                "考虑优化或缓存该操作。"
            )
        # Surface the operation with the highest error rate, if any.
        error_ops = self.identify_error_prone_operations()
        if error_ops:
            worst_err = error_ops[0]
            suggestions.append(
                f"🔴 {worst_err['operation']} 错误率较高 "
                f"({worst_err['error_rate']:.1f}%)。"
                "需要增强错误处理和重试机制。"
            )
        return suggestions
10.4 常见问题排查
10.4.1 连接问题诊断
class ConnectionDiagnostics:
    """Diagnose the health of an MCP connection."""

    @staticmethod
    async def diagnose_connection(connection: 'MCPConnection') -> Dict:
        """
        Probe a connection and report state, uptime, and responsiveness.

        Returns:
            A diagnosis dict with connection state, timing figures, and a
            list of human-readable issues found.
        """
        report = {
            "connection_state": connection.state.value,
            "connected": connection.state == ConnectionState.CONNECTED,
            "uptime_seconds": 0,
            "last_activity_seconds_ago": 0,
            "ping_response_time_ms": 0,
            "issues": []
        }
        # How long the connection has been up, if it ever connected.
        if connection.connection_time:
            report["uptime_seconds"] = (
                datetime.now() - connection.connection_time
            ).total_seconds()
        # How long since the connection last saw traffic.
        if connection.last_activity:
            report["last_activity_seconds_ago"] = (
                datetime.now() - connection.last_activity
            ).total_seconds()
        # Only probe responsiveness when the connection claims to be up.
        if connection.state == ConnectionState.CONNECTED:
            try:
                sent_at = datetime.now()
                await connection._send_message({
                    "jsonrpc": "2.0",
                    "method": "ping",
                    "id": "diagnostic_ping"
                })
                rtt_ms = (datetime.now() - sent_at).total_seconds() * 1000
                report["ping_response_time_ms"] = rtt_ms
                # Anything over one second is flagged as sluggish.
                if rtt_ms > 1000:
                    report["issues"].append(
                        f"连接响应缓慢 ({rtt_ms:.0f}ms)"
                    )
            except TimeoutError:
                report["issues"].append("Ping超时 - 连接可能不稳定")
            except Exception as e:
                report["issues"].append(f"Ping失败: {str(e)}")
        return report
10.4.2 工具调用故障排查
class ToolExecutionDiagnostics:
    """Step-by-step diagnosis of a single tool invocation."""

    @staticmethod
    async def diagnose_tool_call(executor: 'ToolExecutor',
                                 tool_name: str,
                                 arguments: Dict) -> Dict:
        """Run a tool call and record each diagnostic step.

        Returns:
            A dict with the executed steps, a success flag, the result on
            success, and the error message on failure.
        """
        report = {
            "tool_name": tool_name,
            "arguments": arguments,
            "steps": [],
            "success": False,
            "error": None
        }
        try:
            # Step 1: tool name accepted.
            report["steps"].append("✓ 工具名称有效")
            # Step 2: argument format accepted.
            report["steps"].append("✓ 参数格式有效")
            # Step 3: perform the actual call and time it.
            started = datetime.now()
            outcome = await executor.call_tool(tool_name, arguments)
            elapsed = (datetime.now() - started).total_seconds()
            report["steps"].append(f"✓ 工具调用成功 ({elapsed:.2f}s)")
            report["success"] = True
            report["result"] = outcome
        except Exception as e:
            report["error"] = str(e)
            report["steps"].append(f"✗ 调用失败: {str(e)}")
        return report
本章总结
| 核心概念 | 关键点 |
|---|---|
| 结构化日志 | JSON格式、多等级、事件分类 |
| 分布式追踪 | 追踪ID、跨度、时序关系 |
| 消息检查 | 捕获、过滤、统计、导出 |
| 交互式调试 | 命令行工具、实时查询 |
| 性能指标 | 操作时间、缓存率、错误率 |
| 瓶颈识别 | 慢操作检测、错误率分析 |
| 连接诊断 | 状态检查、响应性测试 |
| 故障排查 | 步骤拆解、逐项诊断 |
常见问题
Q1: 如何在生产环境启用详细日志而不影响性能? A: 使用异步日志处理、采样机制、配置级别控制。重要事件总是记录,普通事件可采样。
Q2: 如何追踪跨越多个服务器的请求? A: 使用统一的trace_id,在所有消息中传递,记录服务间的调用关系。
Q3: 性能指标应该多久聚合一次? A: 实时收集,但聚合分析可以1分钟或5分钟一次,根据系统规模调整。
Q4: 如何识别内存泄漏? A: 监控内存占用趋势、对象引用计数、连接池大小变化。
Q5: 调试器对性能有多大影响? A: 结构化日志<1%,消息检查2-5%,完整追踪5-10%,按需启用。
实战要点
✅ 推荐做法
- 使用结构化日志便于自动分析
- 实施分布式追踪以跟踪跨服务请求
- 定期生成性能报告
- 建立性能基线
- 设置告警阈值
- 保留诊断数据用于事后分析
- 自动化瓶颈识别
❌ 避免的做法
- 不要记录过多日志影响性能
- 不要遗漏关键的错误信息
- 不要在生产环境忘记关闭调试模式
- 不要长期保存巨量日志
- 不要忽视异常模式
- 不要在关键路径上做复杂分析
延伸阅读
- 分布式追踪系统:www.jaegertracing.io/
- 日志聚合ELK Stack:www.elastic.co/what-is/elk-stack
- Prometheus监控:prometheus.io/
- 可观测性最佳实践:opentelemetry.io/
第二部分完成!
恭喜!您已经掌握了MCP开发的完整基础框架——从服务器、工具、资源、提示到客户端,再到监控调试。下一部分将进入行业应用案例,看MCP如何在真实场景中发挥威力!