微服务监控-Docker 安装prometheus、设置告警规则

80 阅读2分钟

Prometheus+Grafana+Alertmanager并不适合业务日志告警

存储机制差异 Prometheus 存储特点: 时间序列数据库:专门存储数值型指标 采样存储:按固定间隔采样,会丢失中间数据 内存优化:优先存储最近数据,历史数据压缩

日志存储需求: 全量存储:需要保存每条日志的完整信息 全文搜索:需要支持关键词搜索 复杂查询:需要支持多字段组合查询

image.png

image.png compose.yaml

  # Prometheus server: scrapes metrics, evaluates the alert rules mounted
  # below, and pushes firing alerts to the alertmanager container.
  prometheus:
    image: prom/prometheus:v2.45.0
    container_name: prometheus
    restart: always
    ports:
      - "9090:9090"  # web UI / HTTP API
    volumes:
      # Windows-host bind mounts: main config, rule file, and TSDB data dir
      - D:/codeapp/docker_data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - D:/codeapp/docker_data/monitor/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
      - D:/codeapp/docker_data/monitor/prometheus/data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      # Enables POST /-/reload so config changes apply without a restart
      - "--web.enable-lifecycle"
    networks:
      - monitor-net
    # Start Alertmanager first so the alerting endpoint is resolvable
    depends_on:
      - alertmanager

  # Alertmanager: receives alerts from Prometheus, handles grouping,
  # inhibition, and routing to receivers (see alertmanager.yml).
  alertmanager:
    image: prom/alertmanager:v0.25.0
    container_name: alertmanager
    restart: always
    ports:
      - "9093:9093"  # web UI / API
    volumes:
      - D:/codeapp/docker_data/monitor/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      # Notification log / silences state; path matches --storage.path below
      - D:/codeapp/docker_data/monitor/alertmanager/data:/data
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/data"
    networks:
      - monitor-net

  # Grafana: dashboard UI; reads metrics from Prometheus as a data source.
  grafana:
    image: grafana/grafana:9.5.2
    container_name: grafana
    restart: always
    ports:
      - "3000:3000"  # web UI
    volumes:
      # Persists dashboards, data sources, and the SQLite DB across restarts
      - D:/codeapp/docker_data/monitor/grafana/data:/var/lib/grafana
    environment:
      # NOTE(review): hardcoded admin password — fine for local testing,
      # should come from a secret/env file in any shared environment
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - monitor-net
    depends_on:
      - prometheus

# Shared bridge network so containers can reach each other by service name
# (e.g. Prometheus targets "alertmanager:9093").
networks:
  monitor-net:
    driver: bridge

alert_rules.yml(与 compose 挂载路径及 prometheus.yml 中 rule_files 引用的文件名保持一致)

# Prometheus rule files must nest rule groups under a top-level `groups:`
# key; without it, promtool/Prometheus rejects the file on load.
groups:
- name: go_zero_error_log_alerts
  rules:
  # Error-log volume: fires when more than 10 error entries accumulate
  # within 5 minutes and the condition holds for 2 minutes.
  - alert: HighErrorLogRate
    expr: increase(log_entries_total{level="error"}[5m]) > 10
    for: 2m
    labels:
      severity: "warning"
      service: "productstock"
    annotations:
      summary: "错误日志频率过高"
      description: "服务 {{ $labels.job }} 在最近 5 分钟内产生了 {{ $value }} 条错误日志"

  # Database errors: any occurrence within 5 minutes is critical.
  - alert: DatabaseErrorDetected
    expr: increase(log_entries_total{level="error",error_type="database"}[5m]) > 0
    for: 1m
    labels:
      severity: "critical"
      service: "productstock"
    annotations:
      summary: "检测到数据库错误"
      description: "服务 {{ $labels.job }} 出现数据库错误"

  # Business-logic errors: warning severity, same detection window.
  - alert: BusinessLogicErrorDetected
    expr: increase(log_entries_total{level="error",error_type="business"}[5m]) > 0
    for: 1m
    labels:
      severity: "warning"
      service: "productstock"
    annotations:
      summary: "检测到业务逻辑错误"
      description: "服务 {{ $labels.job }} 出现业务逻辑错误"

  # Panic errors: critical — any panic indicates a crashed request path.
  - alert: PanicErrorDetected
    expr: increase(log_entries_total{level="error",error_type="panic"}[5m]) > 0
    for: 1m
    labels:
      severity: "critical"
      service: "productstock"
    annotations:
      summary: "检测到 Panic 错误"
      description: "服务 {{ $labels.job }} 出现 Panic 错误"

- name: go_zero_builtin_alerts
  rules:
  # Service availability: target has been down (scrape failing) for 30s.
  - alert: GoZeroServiceDown
    expr: up{job="productstock-rpc"} == 0
    for: 30s
    labels:
      severity: "critical"
      service: "productstock"
    annotations:
      summary: "库存服务异常"
      description: "库存 RPC 服务已停止运行超过 30 秒"

  # HTTP error rate. The numerator and denominator must be aggregated
  # before dividing: a bare rate()/rate() division vector-matches series
  # on ALL labels (including `status`), so each 5xx series would only
  # divide by itself and the ratio would always be 100%. `sum by (job)`
  # collapses the status dimension while keeping `job` for the annotation.
  - alert: HighHTTPErrorRate
    expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (job) (rate(http_requests_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: "warning"
      service: "productstock"
    annotations:
      summary: "HTTP 错误率过高"
      description: "服务 {{ $labels.job }} 的 HTTP 5xx 错误率超过 5%,当前值: {{ $value }}%"

  # RPC error rate — same aggregation fix as above: without `sum by (job)`
  # the `grpc_code` label makes every non-OK series divide by itself.
  - alert: HighRPCErrorRate
    expr: sum by (job) (rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) / sum by (job) (rate(grpc_server_handled_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: "warning"
      service: "productstock"
    annotations:
      summary: "RPC 错误率过高"
      description: "服务 {{ $labels.job }} 的 RPC 错误率超过 5%,当前值: {{ $value }}%"

# Host-level alerts; these rely on node_exporter metrics (node_* series).
- name: system_alerts
  rules:
  # CPU usage: 100 minus the per-instance average idle fraction (irate of
  # the `idle` mode counter, in CPU-seconds per second) scaled to percent.
  - alert: HighCPUUsage
    expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "高 CPU 使用率"
      description: "实例 {{ $labels.instance }} 的 CPU 使用率超过 80%"

  # Memory usage based on MemAvailable (accounts for reclaimable caches).
  - alert: HighMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "高内存使用率"
      description: "实例 {{ $labels.instance }} 的内存使用率超过 85%"

  # Disk usage per filesystem.
  # NOTE(review): no fstype filter, so pseudo filesystems (tmpfs, overlay)
  # can also trigger this — consider adding {fstype!~"tmpfs|overlay"} if
  # that proves noisy; verify against the actual node_exporter targets.
  - alert: HighDiskUsage
    expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "高磁盘使用率"
      description: "实例 {{ $labels.instance }} 的磁盘使用率超过 90%"

alertmanager.yml(与 compose 挂载路径保持一致)

  resolve_timeout: 5m

# Root route: all alerts fall through to the 'default' receiver.
route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  # NOTE(review): 10s is very aggressive for group_interval (the common
  # default is 5m) — fine for demos, noisy in production; confirm intent.
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

receivers:
# A receiver with no *_configs is a valid "null" receiver: alerts are
# accepted and grouped but no notification is sent.
- name: 'default'
  # webhook_configs:
  # - url: 'http://host.docker.internal:5001/'  # Windows host test address
  #   send_resolved: true
  #   timeout: 10s

# Suppress warning-level alerts while a critical alert with the same
# alertname and job is firing, to avoid duplicate notifications.
# NOTE(review): source_match/target_match are deprecated in newer
# Alertmanager releases in favor of source_matchers/target_matchers;
# they still work on the pinned v0.25.0.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'job']

prometheus.yml(与 compose 挂载路径保持一致)

  scrape_interval: 15s
  evaluation_interval: 15s

# Rule file path is relative to the config dir; matches the compose bind
# mount at /etc/prometheus/alert_rules.yml.
rule_files:
  - "alert_rules.yml"

# Where Prometheus sends firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093  # container name resolves on monitor-net

scrape_configs:
  # Prometheus scraping itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # The product-stock RPC service (go-zero)
  # NOTE(review): hardcoded host IP — presumably a WSL/host address, which
  # can change across reboots; verify or replace with a stable hostname.
  - job_name: 'productstock-rpc'
    static_configs:
      - targets: ['172.29.119.29:6470']
    metrics_path: '/metrics'
    scrape_interval: 10s
    scrape_timeout: 5s

  # Alertmanager's own metrics (reached via container name)
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']  # container name
    scrape_interval: 15s