Chapter 15: Monitoring, Alerting, and Operations MCP Applications
Introduction
A reliable operations system is the foundation of stable business operations. This chapter shows how to integrate Claude into IT operations via MCP to enable automated fault diagnosis, intelligent alerting, and decision support.
15.1 Case Study 1: IT Operations Automation System
15.1.1 Application Scenario
Monitoring data and alert rules both feed a Claude-driven operations layer, which handles diagnosis, root cause analysis, and remediation. The scenario is summarized in the following Mermaid diagram:

graph TB
    A["Monitoring System"] --> B["Server Monitoring"]
    A --> C["Application Logs"]
    A --> D["Performance Metrics"]
    A --> E["Network Traffic"]
    F["Alert Rules"] --> F1["Threshold Alerts"]
    F --> F2["Anomaly Detection"]
    F --> F3["Trend Analysis"]
    G["Claude Operations"] --> G1["Fault Diagnosis"]
    G --> G2["Root Cause Analysis"]
    G --> G3["Remediation Suggestions"]
    G --> G4["Automated Handling"]
    A --> G
    F --> G
15.1.2 Implementation Architecture
from typing import Dict, List, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
import json

class Severity(Enum):
    """Alert severity levels"""
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"

class Status(Enum):
    """System health status"""
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    UNKNOWN = "unknown"

@dataclass
class Metric:
    """A single metric sample"""
    name: str
    value: float
    timestamp: datetime
    unit: str
    host: Optional[str] = None

@dataclass
class Alert:
    """An alert raised by the monitoring system"""
    id: str
    title: str
    description: str
    severity: Severity
    source: str
    timestamp: datetime
    resolved_at: Optional[datetime] = None
    related_metrics: Optional[List[Dict]] = None
class MetricsCollector:
    """Collects and stores per-host time-series metrics"""

    def __init__(self):
        self.metrics: Dict[str, List[Metric]] = {}
        self.alerts: List[Alert] = []

    async def collect_system_metrics(self, host: str) -> Dict:
        """
        Collect system metrics for a host.

        Args:
            host: Hostname

        Returns:
            Dictionary of system metrics
        """
        # Simulated metric collection (in production, gather via SSH or a monitoring API)
        now = datetime.now()
        metrics = {
            "cpu_usage": 45.2,
            "memory_usage": 62.8,
            "disk_usage": 78.5,
            "network_in": 1024 * 100,   # bytes/s
            "network_out": 1024 * 200,  # bytes/s
            "load_average": 2.3
        }
        # Store each reading in its per-host time series
        for metric_name, value in metrics.items():
            key = f"{host}:{metric_name}"
            if key not in self.metrics:
                self.metrics[key] = []
            metric = Metric(
                name=metric_name,
                value=value,
                timestamp=now,
                unit=self._get_unit(metric_name),
                host=host
            )
            self.metrics[key].append(metric)
            # Trim the time series to a 7-day retention window
            cutoff = now - timedelta(days=7)
            self.metrics[key] = [m for m in self.metrics[key] if m.timestamp > cutoff]
        return metrics

    def _get_unit(self, metric_name: str) -> str:
        """Look up the unit for a metric"""
        units = {
            "cpu_usage": "%",
            "memory_usage": "%",
            "disk_usage": "%",
            "network_in": "bytes/s",
            "network_out": "bytes/s",
            "load_average": ""
        }
        return units.get(metric_name, "")
    async def detect_anomalies(self, host: str,
                               lookback_hours: int = 24) -> List[Alert]:
        """
        Detect anomalies in a host's recent metrics.

        Args:
            host: Hostname
            lookback_hours: How far back to look, in hours

        Returns:
            List of alerts raised
        """
        alerts = []
        now = datetime.now()
        threshold_time = now - timedelta(hours=lookback_hours)
        # Check the key resource metrics
        cpu_key = f"{host}:cpu_usage"
        memory_key = f"{host}:memory_usage"
        # CPU anomaly: sustained average above 80%
        if cpu_key in self.metrics:
            cpu_metrics = [m for m in self.metrics[cpu_key] if m.timestamp > threshold_time]
            if cpu_metrics:
                avg_cpu = sum(m.value for m in cpu_metrics) / len(cpu_metrics)
                if avg_cpu > 80:
                    alert = Alert(
                        id=f"cpu_high_{now.timestamp()}",
                        title=f"High CPU Usage on {host}",
                        description=f"CPU usage is {avg_cpu:.1f}%",
                        severity=Severity.ERROR if avg_cpu > 90 else Severity.WARNING,
                        source=f"monitor:{host}",
                        timestamp=now
                    )
                    alerts.append(alert)
        # Memory anomaly: sustained average above 85%
        if memory_key in self.metrics:
            memory_metrics = [m for m in self.metrics[memory_key] if m.timestamp > threshold_time]
            if memory_metrics:
                avg_memory = sum(m.value for m in memory_metrics) / len(memory_metrics)
                if avg_memory > 85:
                    alert = Alert(
                        id=f"memory_high_{now.timestamp()}",
                        title=f"High Memory Usage on {host}",
                        description=f"Memory usage is {avg_memory:.1f}%",
                        severity=Severity.ERROR if avg_memory > 95 else Severity.WARNING,
                        source=f"monitor:{host}",
                        timestamp=now
                    )
                    alerts.append(alert)
        return alerts
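With the collector defined, a quick smoke test of the collect-then-detect loop looks like this (a minimal sketch; `server-01` is a placeholder hostname, and because the mocked readings above stay below the 80%/85% thresholds, no alerts will fire unless you raise the values or lower the thresholds):

import asyncio

async def demo_collector():
    collector = MetricsCollector()
    # Collect one round of (mocked) metrics for a placeholder host
    metrics = await collector.collect_system_metrics("server-01")
    print("Collected:", metrics)
    # Scan the stored samples for threshold breaches
    alerts = await collector.detect_anomalies("server-01")
    for alert in alerts:
        print(f"[{alert.severity.value}] {alert.title}: {alert.description}")

asyncio.run(demo_collector())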
class OperationsAnalyzer:
    """Analyzes alerts against collected metrics"""

    def __init__(self, metrics_collector: MetricsCollector):
        self.collector = metrics_collector

    async def analyze_alert(self, alert: Alert) -> Dict:
        """
        Analyze an alert.

        Args:
            alert: The alert to analyze

        Returns:
            Analysis result
        """
        # Parse the host out of the alert source ("monitor:<host>")
        parts = alert.source.split(":")
        host = parts[1] if len(parts) > 1 else "unknown"
        # Fetch historical data for the affected metric
        history = self._get_metric_history(host, alert.title)
        # Analyze the trend
        trend = self._analyze_trend(history)
        # Produce a root cause analysis
        root_causes = self._identify_root_causes(host, alert)
        # Produce remediation recommendations
        recommendations = self._generate_recommendations(alert, root_causes)
        return {
            "alert_id": alert.id,
            "host": host,
            "severity": alert.severity.value,
            "title": alert.title,
            "trend": trend,
            "root_causes": root_causes,
            "recommendations": recommendations,
            "estimated_impact": self._estimate_impact(alert),
            "urgency": self._calculate_urgency(alert, trend)
        }

    def _get_metric_history(self, host: str, metric_title: str,
                            lookback_hours: int = 24) -> List[Metric]:
        """Fetch history for the metric named in an alert title"""
        # Infer the metric name from the alert title
        metric_type = "cpu_usage" if "CPU" in metric_title else "memory_usage"
        key = f"{host}:{metric_type}"
        if key not in self.collector.metrics:
            return []
        threshold = datetime.now() - timedelta(hours=lookback_hours)
        return [m for m in self.collector.metrics[key] if m.timestamp > threshold]
    def _analyze_trend(self, history: List[Metric]) -> str:
        """Classify the trend of a metric series"""
        if len(history) < 2:
            return "insufficient_data"
        # Simple trend analysis: compare the averages of the two halves of the series
        early_avg = sum(m.value for m in history[:len(history)//2]) / (len(history)//2)
        late_avg = sum(m.value for m in history[len(history)//2:]) / (len(history) - len(history)//2)
        if late_avg > early_avg * 1.2:
            return "increasing"
        elif late_avg < early_avg * 0.8:
            return "decreasing"
        else:
            return "stable"

    def _identify_root_causes(self, host: str, alert: Alert) -> List[str]:
        """Identify likely root causes"""
        causes = []
        if "CPU" in alert.title:
            causes.append("High process activity")
            causes.append("Inefficient application")
            causes.append("Scheduled batch jobs")
        elif "Memory" in alert.title:
            causes.append("Memory leak in application")
            causes.append("Cache accumulation")
            causes.append("Insufficient swap space")
        return causes

    def _generate_recommendations(self, alert: Alert,
                                  root_causes: List[str]) -> List[str]:
        """Generate remediation recommendations"""
        recommendations = []
        if "CPU" in alert.title:
            recommendations.append("Check top consuming processes")
            recommendations.append("Review cron jobs schedule")
            recommendations.append("Consider process migration")
        elif "Memory" in alert.title:
            recommendations.append("Investigate memory usage per process")
            recommendations.append("Restart affected services")
            recommendations.append("Increase physical memory")
        return recommendations

    def _estimate_impact(self, alert: Alert) -> str:
        """Estimate the blast radius of an alert"""
        if alert.severity == Severity.CRITICAL:
            return "high"
        elif alert.severity == Severity.ERROR:
            return "medium"
        else:
            return "low"

    def _calculate_urgency(self, alert: Alert, trend: str) -> str:
        """Calculate urgency from severity and trend"""
        if alert.severity == Severity.CRITICAL and trend == "increasing":
            return "immediate"
        elif alert.severity == Severity.ERROR:
            return "high"
        else:
            return "normal"
class AutomatedRemediator:
    """Executes automated remediation actions"""

    async def execute_remediation(self, alert: Alert,
                                  action: str) -> Dict:
        """
        Execute a remediation action.

        Args:
            alert: The alert being remediated
            action: Name of the remediation action

        Returns:
            Execution result
        """
        if action == "restart_service":
            return await self._restart_service(alert)
        elif action == "scale_out":
            return await self._scale_out(alert)
        elif action == "increase_memory":
            return await self._increase_memory(alert)
        else:
            return {"success": False, "error": f"Unknown action: {action}"}

    async def _restart_service(self, alert: Alert) -> Dict:
        """Restart the affected service"""
        # In production, execute via SSH or an orchestration API
        return {
            "success": True,
            "action": "restart_service",
            "message": "Service restarted successfully",
            "timestamp": datetime.now().isoformat()
        }

    async def _scale_out(self, alert: Alert) -> Dict:
        """Scale out horizontally"""
        return {
            "success": True,
            "action": "scale_out",
            "message": "Instances scaled out by 1",
            "timestamp": datetime.now().isoformat()
        }

    async def _increase_memory(self, alert: Alert) -> Dict:
        """Increase memory allocation"""
        return {
            "success": True,
            "action": "increase_memory",
            "message": "Memory increased by 2GB",
            "timestamp": datetime.now().isoformat()
        }
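The remediation stubs above only return canned results. In a real deployment, `_restart_service` would run a command on the affected host. One minimal sketch shells out to the system `ssh` client via asyncio's subprocess API; parsing the host from the `monitor:<host>` source matches the collector above, while the `sudo systemctl` invocation and the `myapp` unit name are illustrative assumptions:

import asyncio

async def restart_service_over_ssh(alert: Alert, service: str = "myapp") -> Dict:
    """Sketch: restart a systemd unit on the alerting host over SSH."""
    host = alert.source.split(":")[-1]  # assumes source is "monitor:<host>"
    proc = await asyncio.create_subprocess_exec(
        "ssh", host, "sudo", "systemctl", "restart", service,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    return {
        "success": proc.returncode == 0,
        "action": "restart_service",
        "message": stderr.decode() if proc.returncode else "Service restarted",
        "timestamp": datetime.now().isoformat(),
    }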
15.2 Case Study 2: Application Performance Monitoring (APM)
@dataclass
class TraceSpan:
    """A span within a distributed trace"""
    span_id: str
    trace_id: str
    operation_name: str
    duration_ms: float
    start_time: datetime
    tags: Dict[str, str]
    errors: Optional[List[str]] = None

class ApplicationPerformanceMonitor:
    """Application performance monitoring"""

    def __init__(self):
        self.traces: Dict[str, List[TraceSpan]] = {}
        self.performance_baselines: Dict[str, float] = {}

    async def analyze_latency(self, trace_id: str) -> Dict:
        """
        Analyze the latency of a trace.

        Args:
            trace_id: Trace ID

        Returns:
            Latency analysis result
        """
        if trace_id not in self.traces:
            return {}
        spans = self.traces[trace_id]
        total_duration = sum(s.duration_ms for s in spans)
        # Find the slowest span
        slowest = max(spans, key=lambda x: x.duration_ms)
        # Identify the critical path (spans longer than 10 ms)
        critical_path = [s for s in spans if s.duration_ms > 10]
        return {
            "trace_id": trace_id,
            "total_duration_ms": total_duration,
            "span_count": len(spans),
            "slowest_span": {
                "operation": slowest.operation_name,
                "duration_ms": slowest.duration_ms
            },
            "critical_path": [
                {
                    "operation": s.operation_name,
                    "duration_ms": s.duration_ms
                }
                for s in critical_path
            ],
            "performance_status": self._evaluate_performance(total_duration)
        }

    def _evaluate_performance(self, duration_ms: float) -> str:
        """Grade overall performance by total duration"""
        if duration_ms < 100:
            return "excellent"
        elif duration_ms < 500:
            return "good"
        elif duration_ms < 2000:
            return "acceptable"
        else:
            return "poor"
15.3 MCP Server Integration
class MonitoringMCPServer:
    """MCP server exposing monitoring tools"""

    def __init__(self, analyzer: OperationsAnalyzer,
                 remediator: AutomatedRemediator,
                 apm: ApplicationPerformanceMonitor):
        self.analyzer = analyzer
        self.remediator = remediator
        self.apm = apm

    def get_tools(self) -> List[Dict]:
        """Define the tools exposed over MCP"""
        return [
            {
                "name": "analyze_alert",
                "description": "Analyze an alert",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "alert_id": {"type": "string"},
                        "title": {"type": "string"},
                        "description": {"type": "string"},
                        "severity": {"type": "string"}
                    },
                    "required": ["alert_id", "title"]
                }
            },
            {
                "name": "execute_remediation",
                "description": "Execute a remediation action",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "alert_id": {"type": "string"},
                        "action": {"type": "string"}
                    },
                    "required": ["alert_id", "action"]
                }
            },
            {
                "name": "analyze_performance",
                "description": "Analyze trace performance",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "trace_id": {"type": "string"}
                    },
                    "required": ["trace_id"]
                }
            }
        ]

    async def call_tool(self, tool_name: str, arguments: Dict) -> str:
        """Dispatch a tool call"""
        try:
            if tool_name == "analyze_alert":
                alert = Alert(
                    id=arguments["alert_id"],
                    title=arguments["title"],
                    description=arguments.get("description", ""),
                    severity=Severity(arguments.get("severity", "warning")),
                    source="mcp",
                    timestamp=datetime.now()
                )
                result = await self.analyzer.analyze_alert(alert)
            elif tool_name == "execute_remediation":
                alert = Alert(
                    id=arguments["alert_id"],
                    title="",
                    description="",
                    severity=Severity.ERROR,
                    source="mcp",
                    timestamp=datetime.now()
                )
                result = await self.remediator.execute_remediation(
                    alert,
                    arguments["action"]
                )
            elif tool_name == "analyze_performance":
                result = await self.apm.analyze_latency(arguments["trace_id"])
            else:
                return json.dumps({"error": f"Unknown tool: {tool_name}"})
            return json.dumps(result, ensure_ascii=False, indent=2)
        except Exception as e:
            return json.dumps({"error": str(e)})
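MonitoringMCPServer defines tool schemas and a dispatcher, but it does not itself speak the MCP wire protocol. One way to expose it, assuming the official `mcp` Python SDK's low-level stdio server (the decorator names and types follow that SDK and may differ between versions):

import asyncio
from mcp.server import Server
from mcp.server.stdio import stdio_server
import mcp.types as types

def build_server(mcp_impl: MonitoringMCPServer) -> Server:
    server = Server("monitoring")

    @server.list_tools()
    async def list_tools() -> list[types.Tool]:
        # Reuse the schemas defined in get_tools()
        return [types.Tool(**t) for t in mcp_impl.get_tools()]

    @server.call_tool()
    async def call_tool(name: str, arguments: dict) -> list[types.TextContent]:
        text = await mcp_impl.call_tool(name, arguments)
        return [types.TextContent(type="text", text=text)]

    return server

async def serve(mcp_impl: MonitoringMCPServer):
    server = build_server(mcp_impl)
    async with stdio_server() as (read_stream, write_stream):
        await server.run(read_stream, write_stream,
                         server.create_initialization_options())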
15.4 Advanced Alert Management
class AlertAggregator:
    """Groups related alerts"""

    def __init__(self):
        self.alerts: Dict[str, Alert] = {}
        self.alert_groups: Dict[str, List[str]] = {}

    def aggregate_related_alerts(self, alert: Alert) -> str:
        """
        Group an alert with related alerts.

        Args:
            alert: The alert to group

        Returns:
            The alert group ID
        """
        # Group by alert source (which encodes the originating host)
        group_key = f"{alert.source}"
        if group_key not in self.alert_groups:
            self.alert_groups[group_key] = []
        self.alert_groups[group_key].append(alert.id)
        self.alerts[alert.id] = alert
        return group_key

    def get_alert_summary(self) -> Dict:
        """Summarize alerts by severity"""
        by_severity = {}
        for alert in self.alerts.values():
            severity = alert.severity.value
            by_severity[severity] = by_severity.get(severity, 0) + 1
        return {
            "total_alerts": len(self.alerts),
            "by_severity": by_severity,
            "groups": len(self.alert_groups)
        }
class AlertIncidentManager:
    """Manages incidents created from alerts"""

    def __init__(self):
        self.incidents: Dict[str, Dict] = {}
        self.incident_counter = 0

    async def create_incident(self, alerts: List[Alert]) -> Dict:
        """
        Create an incident from a batch of alerts.

        Args:
            alerts: The alerts to attach

        Returns:
            The incident record
        """
        self.incident_counter += 1
        incident_id = f"INC_{self.incident_counter:06d}"
        # The incident severity is the highest severity among its alerts
        max_severity = max([a.severity.value for a in alerts],
                           key=lambda x: {"info": 0, "warning": 1, "error": 2, "critical": 3}.get(x, 0))
        self.incidents[incident_id] = {
            "id": incident_id,
            "alerts": [a.id for a in alerts],
            "severity": max_severity,
            "created_at": datetime.now().isoformat(),
            "status": "open",
            "assigned_to": None,
            "resolution_notes": []
        }
        return self.incidents[incident_id]

    async def resolve_incident(self, incident_id: str,
                               resolution_note: str) -> Dict:
        """Resolve an incident"""
        if incident_id not in self.incidents:
            return {"error": "Incident not found"}
        incident = self.incidents[incident_id]
        incident["status"] = "resolved"
        incident["resolution_notes"].append({
            "timestamp": datetime.now().isoformat(),
            "note": resolution_note
        })
        incident["resolved_at"] = datetime.now().isoformat()
        return incident
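A minimal incident lifecycle, from creation to resolution, then looks like this (a sketch with placeholder alert data):

import asyncio

async def demo_incident_lifecycle():
    manager = AlertIncidentManager()
    alert = Alert(
        id="mem_high_demo",
        title="High Memory Usage on server-01",
        description="Memory usage is 96.0%",
        severity=Severity.CRITICAL,
        source="monitor:server-01",
        timestamp=datetime.now()
    )
    incident = await manager.create_incident([alert])
    print("Opened:", incident["id"], "severity:", incident["severity"])
    resolved = await manager.resolve_incident(
        incident["id"], "Restarted leaking worker process"
    )
    print("Status:", resolved["status"])

asyncio.run(demo_incident_lifecycle())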
15.5 The Complete Operations Workflow
class OperationsWorkflow:
    """End-to-end operations workflow"""

    def __init__(self, analyzer: OperationsAnalyzer,
                 remediator: AutomatedRemediator,
                 aggregator: AlertAggregator,
                 incident_manager: AlertIncidentManager):
        self.analyzer = analyzer
        self.remediator = remediator
        self.aggregator = aggregator
        self.incidents = incident_manager

    async def handle_critical_alert(self, alert: Alert) -> Dict:
        """
        Full workflow for handling a serious alert.

        Args:
            alert: The alert to handle

        Returns:
            Workflow result
        """
        workflow = {
            "alert_id": alert.id,
            "steps": [],
            "outcome": None
        }
        # Step 1: Aggregate
        workflow["steps"].append({"name": "Aggregating", "status": "running"})
        group_id = self.aggregator.aggregate_related_alerts(alert)
        workflow["steps"][-1]["status"] = "completed"
        workflow["steps"][-1]["data"] = {"group_id": group_id}
        # Step 2: Analyze
        workflow["steps"].append({"name": "Analyzing", "status": "running"})
        analysis = await self.analyzer.analyze_alert(alert)
        workflow["steps"][-1]["status"] = "completed"
        workflow["steps"][-1]["data"] = analysis
        # Step 3: Decide whether auto-remediation is warranted
        workflow["steps"].append({"name": "Auto-remediation", "status": "running"})
        if alert.severity == Severity.CRITICAL and analysis["urgency"] == "immediate":
            # Pick the recommended remediation action
            recommended_action = self._get_recommended_action(analysis["root_causes"])
            if recommended_action:
                # Execute it automatically
                remediation_result = await self.remediator.execute_remediation(
                    alert,
                    recommended_action
                )
                workflow["steps"][-1]["status"] = "completed"
                workflow["steps"][-1]["data"] = remediation_result
            else:
                workflow["steps"][-1]["status"] = "skipped"
                workflow["steps"][-1]["reason"] = "No recommended action"
        else:
            workflow["steps"][-1]["status"] = "skipped"
            workflow["steps"][-1]["reason"] = "Not critical enough for auto-remediation"
        # Step 4: Create an incident
        workflow["steps"].append({"name": "Create Incident", "status": "running"})
        incident = await self.incidents.create_incident([alert])
        workflow["steps"][-1]["status"] = "completed"
        workflow["steps"][-1]["data"] = incident
        workflow["outcome"] = {
            "status": "handled",
            "incident_id": incident["id"],
            "recommendations": analysis["recommendations"]
        }
        return workflow
    def _get_recommended_action(self, root_causes: List[str]) -> Optional[str]:
        """Map root causes to a recommended remediation action"""
        # A simple rule engine. Substring matching is used because the cause
        # strings from _identify_root_causes are full phrases (e.g. "Memory
        # leak in application"), so exact list membership would never match.
        if any("High process activity" in c for c in root_causes):
            return "restart_service"
        elif any("Memory leak" in c for c in root_causes):
            return "restart_service"
        elif any("Inefficient application" in c for c in root_causes):
            return "scale_out"
        return None
    async def monitor_and_respond(self, hosts: List[str],
                                  interval_seconds: int = 60) -> Dict:
        """
        Monitor hosts and respond to anomalies.

        Args:
            hosts: Hosts to monitor
            interval_seconds: Monitoring interval in seconds

        Returns:
            Monitoring result
        """
        monitoring_result = {
            "duration": f"{interval_seconds}s",
            "hosts": hosts,
            "events": []
        }
        # In production this would be a long-running loop;
        # for the example we run a single pass
        for host in hosts:
            await self.analyzer.collector.collect_system_metrics(host)
            anomalies = await self.analyzer.collector.detect_anomalies(host)
            for alert in anomalies:
                result = await self.handle_critical_alert(alert)
                monitoring_result["events"].append(result)
        return monitoring_result
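The single pass above is easy to turn into the long-running loop the comment alludes to; a minimal sketch (signal handling and per-pass error recovery are omitted):

import asyncio

async def run_monitoring_loop(workflow: OperationsWorkflow,
                              hosts: List[str],
                              interval_seconds: int = 60):
    """Continuously run monitoring passes at a fixed interval."""
    while True:
        result = await workflow.monitor_and_respond(hosts, interval_seconds)
        if result["events"]:
            print(f"Handled {len(result['events'])} event(s)")
        await asyncio.sleep(interval_seconds)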
15.6 Deep-Dive Performance Analysis
class AdvancedPerformanceAnalyzer:
    """Advanced performance analysis on top of the APM"""

    def __init__(self, apm: ApplicationPerformanceMonitor):
        self.apm = apm
        self.performance_history: Dict[str, List[Dict]] = {}

    async def detect_performance_anomaly(self, trace_id: str) -> Dict:
        """
        Detect performance anomalies for a trace.

        Args:
            trace_id: Trace ID

        Returns:
            Anomaly detection result
        """
        analysis = await self.apm.analyze_latency(trace_id)
        if "slowest_span" in analysis:
            slowest = analysis["slowest_span"]
            # Record the measurement in the trace's history
            if trace_id not in self.performance_history:
                self.performance_history[trace_id] = []
            self.performance_history[trace_id].append({
                "timestamp": datetime.now().isoformat(),
                "duration_ms": analysis["total_duration_ms"],
                "slowest_span": slowest
            })
            # With at least two data points, compare against the historical average
            if len(self.performance_history[trace_id]) > 1:
                return self._analyze_trend(trace_id)
        return analysis

    def _analyze_trend(self, trace_id: str) -> Dict:
        """Compare the latest measurement against the historical average"""
        history = self.performance_history[trace_id]
        recent = history[-1]["duration_ms"]
        previous_avg = sum(h["duration_ms"] for h in history[:-1]) / (len(history) - 1)
        degradation = (recent - previous_avg) / previous_avg if previous_avg > 0 else 0
        if degradation > 0.2:  # more than 20% degradation
            return {
                "anomaly_detected": True,
                "degradation_percentage": f"{degradation * 100:.1f}%",
                "severity": "high" if degradation > 0.5 else "medium"
            }
        return {"anomaly_detected": False}
class DashboardMetrics:
    """Dashboard metric snapshots"""

    def __init__(self):
        self.metrics_snapshots: List[Dict] = []

    async def collect_snapshot(self, collector: MetricsCollector,
                               hosts: List[str]) -> Dict:
        """
        Collect a snapshot across hosts.

        Args:
            collector: The metrics collector
            hosts: Hosts to snapshot

        Returns:
            Snapshot data
        """
        snapshot = {
            "timestamp": datetime.now().isoformat(),
            "hosts": {}
        }
        for host in hosts:
            metrics = await collector.collect_system_metrics(host)
            snapshot["hosts"][host] = metrics
        self.metrics_snapshots.append(snapshot)
        # Keep only the most recent 100 snapshots
        if len(self.metrics_snapshots) > 100:
            self.metrics_snapshots = self.metrics_snapshots[-100:]
        return snapshot

    def get_trend_data(self, host: str, metric_name: str,
                       last_n: int = 10) -> List[Dict]:
        """Extract trend data for one metric on one host"""
        trend = []
        for snapshot in self.metrics_snapshots[-last_n:]:
            if host in snapshot["hosts"] and metric_name in snapshot["hosts"][host]:
                trend.append({
                    "timestamp": snapshot["timestamp"],
                    "value": snapshot["hosts"][host][metric_name]
                })
        return trend
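Feeding a chart widget is then a matter of pulling the series out (a sketch; `server-01` is a placeholder):

import asyncio

async def demo_dashboard():
    collector = MetricsCollector()
    dashboard = DashboardMetrics()
    # Take a few snapshots to build up a short series
    for _ in range(3):
        await dashboard.collect_snapshot(collector, ["server-01"])
    series = dashboard.get_trend_data("server-01", "cpu_usage", last_n=3)
    for point in series:
        print(point["timestamp"], point["value"])

asyncio.run(demo_dashboard())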
15.7 Complete Usage Example
async def main():
    """Complete usage example"""
    # Initialize components
    collector = MetricsCollector()
    analyzer = OperationsAnalyzer(collector)
    remediator = AutomatedRemediator()
    aggregator = AlertAggregator()
    incident_manager = AlertIncidentManager()
    # Initialize the workflow
    workflow = OperationsWorkflow(
        analyzer,
        remediator,
        aggregator,
        incident_manager
    )
    apm = ApplicationPerformanceMonitor()
    perf_analyzer = AdvancedPerformanceAnalyzer(apm)
    dashboard = DashboardMetrics()
    # 1. Collect metrics
    print("📊 Collecting metrics...")
    await collector.collect_system_metrics("server-01")
    # 2. Detect anomalies
    print("🔍 Detecting anomalies...")
    alerts = await collector.detect_anomalies("server-01")
    print(f"Found {len(alerts)} anomalies")
    # 3. Handle alerts
    if alerts:
        print("⚠️ Handling alerts...")
        for alert in alerts:
            result = await workflow.handle_critical_alert(alert)
            print(f"Alert handled: {alert.title}")
    # 4. Collect a dashboard snapshot
    print("📸 Collecting dashboard snapshot...")
    snapshot = await dashboard.collect_snapshot(collector, ["server-01"])
    print(f"Snapshot: {snapshot}")
    # 5. Print summary statistics
    print("📈 Summary...")
    summary = aggregator.get_alert_summary()
    print(json.dumps(summary, indent=2))

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
Chapter Summary

| Key Point | Description |
|---|---|
| Metric collection | Multi-dimensional system monitoring |
| Anomaly detection | Threshold- and trend-based detection |
| Root cause analysis | Automated fault diagnosis |
| Remediation recommendations | Intelligent operational suggestions |
| Automated remediation | Automated fault handling |
| Performance analysis | Application-level performance monitoring |
FAQ

Q1: How do you handle alert storms? A: Alert aggregation, deduplication, severity tiering, and dynamic threshold tuning.
Q2: How do you improve root cause analysis accuracy? A: Machine learning, historical comparison, and multi-dimensional correlation analysis.
Q3: How do you avoid false positives? A: Statistical methods, business-hour-aware thresholds, and dynamic baseline adjustment; a baseline sketch follows this list.
Q4: How do you implement intelligent remediation? A: Confidence scoring, staged rollout, and automatic rollback mechanisms.
Q5: How do you manage remediation permissions? A: Tiered permissions, approval workflows, and operation logging with audit trails.
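To make Q3's dynamic baseline concrete, here is a minimal sketch in which the alerting cutoff tracks a rolling average instead of a fixed number (the window size and multiplier are illustrative assumptions):

from collections import deque

class DynamicThreshold:
    """Alert when a value exceeds a multiple of its rolling baseline."""

    def __init__(self, window: int = 60, multiplier: float = 1.5):
        self.samples: deque = deque(maxlen=window)
        self.multiplier = multiplier

    def update_and_check(self, value: float) -> bool:
        baseline = sum(self.samples) / len(self.samples) if self.samples else value
        self.samples.append(value)
        return value > baseline * self.multiplier

# Usage: feed in each new CPU sample; True means "raise an alert"
threshold = DynamicThreshold()
for sample in [40.0, 42.0, 41.0, 45.0, 95.0]:
    if threshold.update_and_check(sample):
        print(f"Anomalous sample: {sample}")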
Coming up next: Chapter 16 covers marketing and customer service MCP applications!