Prometheus + Grafana + Alertmanager Is Not a Good Fit for Business Log Alerting
Storage model differences

Prometheus storage characteristics:
- Time-series database: built to store numeric metrics only
- Scrape-based sampling: values are collected at fixed intervals, so anything that happens between scrapes is invisible
- Memory-optimized: recent data is kept readily available while historical data is compressed
What log storage needs instead:
- Full retention: every log entry must be kept in its entirety
- Full-text search: keyword search across log bodies
- Complex queries: combined filters over multiple fields

Those needs belong to a dedicated log pipeline (ELK, Loki, and the like). Prometheus can still alert on logs indirectly, though: if the service exports log-derived counters, the stack configured below alerts on those metrics, as sketched next.
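The alert rules later in this post fire on a counter named log_entries_total with level and error_type labels. go-zero does not export such a metric out of the box, so the service has to register it itself. A minimal sketch with prometheus/client_golang (the logmetrics package and CountError helper are names invented here; the label values match the rules below):

```go
package logmetrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// logEntries is the counter the alert rules query as log_entries_total.
// promauto registers it with the default registry, which the service's
// /metrics endpoint serves.
var logEntries = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "log_entries_total",
		Help: "Log entries by level and error type.",
	},
	[]string{"level", "error_type"},
)

// CountError increments the error counter; call it wherever the service
// logs an error, with errorType set to "database", "business", or "panic"
// to match the alert rules below.
func CountError(errorType string) {
	logEntries.WithLabelValues("error", errorType).Inc()
}
```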
compose.yaml
services:
  prometheus:
    image: prom/prometheus:v2.45.0
    container_name: prometheus
    restart: always
    ports:
      - "9090:9090"
    volumes:
      - D:/codeapp/docker_data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - D:/codeapp/docker_data/monitor/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
      - D:/codeapp/docker_data/monitor/prometheus/data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.enable-lifecycle"
    networks:
      - monitor-net
    depends_on:
      - alertmanager

  alertmanager:
    image: prom/alertmanager:v0.25.0
    container_name: alertmanager
    restart: always
    ports:
      - "9093:9093"
    volumes:
      - D:/codeapp/docker_data/monitor/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - D:/codeapp/docker_data/monitor/alertmanager/data:/data
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/data"
    networks:
      - monitor-net

  grafana:
    image: grafana/grafana:9.5.2
    container_name: grafana
    restart: always
    ports:
      - "3000:3000"
    volumes:
      - D:/codeapp/docker_data/monitor/grafana/data:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - monitor-net
    depends_on:
      - prometheus

networks:
  monitor-net:
    driver: bridge
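With the three config files below placed at the mounted paths (the D:/ paths assume a Windows host; adjust for Linux or macOS), `docker compose up -d` brings the stack up. Because Prometheus runs with `--web.enable-lifecycle`, config edits can be applied without a restart: `curl -X POST http://localhost:9090/-/reload`.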
alert_rules.yml
groups:
  - name: go_zero_error_log_alerts
    rules:
      # Error log volume
      - alert: HighErrorLogRate
        expr: increase(log_entries_total{level="error"}[5m]) > 10
        for: 2m
        labels:
          severity: "warning"
          service: "productstock"
        annotations:
          summary: "High error log rate"
          description: "Service {{ $labels.job }} produced {{ $value }} error log entries in the last 5 minutes"
      # Database errors
      - alert: DatabaseErrorDetected
        expr: increase(log_entries_total{level="error",error_type="database"}[5m]) > 0
        for: 1m
        labels:
          severity: "critical"
          service: "productstock"
        annotations:
          summary: "Database error detected"
          description: "Service {{ $labels.job }} is reporting database errors"
      # Business-logic errors
      - alert: BusinessLogicErrorDetected
        expr: increase(log_entries_total{level="error",error_type="business"}[5m]) > 0
        for: 1m
        labels:
          severity: "warning"
          service: "productstock"
        annotations:
          summary: "Business-logic error detected"
          description: "Service {{ $labels.job }} is reporting business-logic errors"
      # Panics
      - alert: PanicErrorDetected
        expr: increase(log_entries_total{level="error",error_type="panic"}[5m]) > 0
        for: 1m
        labels:
          severity: "critical"
          service: "productstock"
        annotations:
          summary: "Panic detected"
          description: "Service {{ $labels.job }} is reporting panics"
  - name: go_zero_builtin_alerts
    rules:
      # Service availability
      - alert: GoZeroServiceDown
        expr: up{job="productstock-rpc"} == 0
        for: 30s
        labels:
          severity: "critical"
          service: "productstock"
        annotations:
          summary: "Stock service down"
          description: "The stock RPC service has been down for more than 30 seconds"
      # HTTP error rate
      - alert: HighHTTPErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) * 100 > 5
        for: 5m
        labels:
          severity: "warning"
          service: "productstock"
        annotations:
          summary: "High HTTP error rate"
          description: "HTTP 5xx error rate for {{ $labels.job }} is above 5% (current: {{ $value }}%)"
      # RPC error rate
      - alert: HighRPCErrorRate
        expr: rate(grpc_server_handled_total{grpc_code!="OK"}[5m]) / rate(grpc_server_handled_total[5m]) * 100 > 5
        for: 5m
        labels:
          severity: "warning"
          service: "productstock"
        annotations:
          summary: "High RPC error rate"
          description: "RPC error rate for {{ $labels.job }} is above 5% (current: {{ $value }}%)"
  - name: system_alerts
    rules:
      # CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "CPU usage on {{ $labels.instance }} is above 80%"
      # Memory usage
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage on {{ $labels.instance }} is above 85%"
      # Disk usage
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High disk usage"
          description: "Disk usage on {{ $labels.instance }} is above 90%"
alertmanager.yml
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
receivers:
  - name: 'default'
    # webhook_configs:
    #   - url: 'http://host.docker.internal:5001/'  # test endpoint on the Windows host
    #     send_resolved: true
    #     timeout: 10s
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'job']
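One caveat: as written, the 'default' receiver has no delivery method, so routed alerts are accepted and then go nowhere until the webhook_configs block is uncommented. For local testing, here is a minimal Go sketch of the receiver behind that commented-out URL (the payload struct covers only the fields this handler reads; port 5001 matches the URL above):

```go
package main

import (
	"encoding/json"
	"log"
	"net/http"
)

// webhookPayload mirrors the relevant parts of Alertmanager's
// webhook JSON body.
type webhookPayload struct {
	Status string `json:"status"` // "firing" or "resolved"
	Alerts []struct {
		Status      string            `json:"status"`
		Labels      map[string]string `json:"labels"`
		Annotations map[string]string `json:"annotations"`
	} `json:"alerts"`
}

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		var p webhookPayload
		if err := json.NewDecoder(r.Body).Decode(&p); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		// Log each alert; replace this with mail/IM delivery as needed.
		for _, a := range p.Alerts {
			log.Printf("[%s] %s: %s",
				a.Status, a.Labels["alertname"], a.Annotations["summary"])
		}
	})
	log.Fatal(http.ListenAndServe(":5001", nil))
}
```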
prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
rule_files:
  - "alert_rules.yml"
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093  # reachable by container name on monitor-net
scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  # The product-stock RPC service
  - job_name: 'productstock-rpc'
    static_configs:
      - targets: ['172.29.119.29:6470']
    metrics_path: '/metrics'
    scrape_interval: 10s
    scrape_timeout: 5s
  # Alertmanager (again by container name)
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
    scrape_interval: 15s
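The 172.29.119.29:6470 target assumes the go-zero service serves its metrics there. In go-zero that endpoint is enabled through the Prometheus block of the service's own config file; a sketch that would match the scrape target above (the file name and Host value are assumptions, and in recent go-zero releases the metrics listener may instead be configured under the DevServer settings):

```yaml
# productstock.yaml (excerpt)
Prometheus:
  Host: 0.0.0.0
  Port: 6470
  Path: /metrics
```

Once everything is running, http://localhost:9090/targets should list all three jobs as UP, and firing rules show up under http://localhost:9090/alerts before being routed to Alertmanager.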