02 kube-prometheus-stack 部署操作指南

0 阅读2分钟

介绍

kube-prometheus-stack 是一个基于 Prometheus Operator 的完整 Kubernetes 监控解决方案,通过 Helm Chart 一键部署,提供端到端的集群监控能力。该堆栈集成了 Prometheus、Alertmanager、Grafana 等核心组件,同时包含 kube-state-metrics、node-exporter 等导出器,能够自动发现和监控集群中的服务、Pod 等资源。它预置了丰富的 Grafana 仪表板和 Prometheus 告警规则,支持高可用部署、持久化存储、自定义指标采集和多种告警通知渠道,是云原生环境下监控 Kubernetes 集群的首选方案。

部署操作

  1. 添加仓库
# Register the prometheus-community chart repository, then refresh the local chart index.
helm repo add prometheus-community \
  https://prometheus-community.github.io/helm-charts
helm repo update
  2. 基础配置文件
# Print the values file that is passed to helm with -f in the install commands.
cat /opt/values.yaml

# 1. Prometheus core settings
prometheus:
  # Expose the Prometheus UI under its own hostname.
  ingress:
    enabled: true
    ingressClassName: higress
    pathType: Prefix
    hosts:
      - prometheus.aassd.com
    paths:
      - /

  prometheusSpec:
    # Retention period for stored metrics.
    retention: 15d

    # Empty selectors with the *NilUsesHelmValues switches turned off.
    # NOTE(review): presumably meant to pick up ServiceMonitors and rules that
    # were not created by this release - confirm against the chart's values docs.
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector: {}
    ruleSelectorNilUsesHelmValues: false
    ruleSelector: {}

    # Persistent volume claim for Prometheus data.
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: nfs-storage
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 10Gi

# 2. Alertmanager settings
alertmanager:
  enabled: true

  # --- Ingress hostname access ---
  ingress:
    enabled: true
    ingressClassName: higress
    pathType: Prefix
    hosts:
      - alertmanager.aassd.com
    paths:
      - /

  alertmanagerSpec:
    # --- Storage ---
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: nfs-storage
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 5Gi

  # --- Alerting policy ---
  config:
    global:
      resolve_timeout: 5m

    route:
      # No webhook service exists yet, so the default route points at "null".
      receiver: "null"
      group_by:
        - instance
        - alertname
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      routes:
        - receiver: "null"
          continue: false
          matchers:
            - alertname = "Watchdog"

    # Keep the official inhibition rule to reduce redundant alerts.
    inhibit_rules:
      - source_matchers:
          - "severity = critical"
        target_matchers:
          - "severity =~ warning|info"
        equal:
          - instance
          - alertname

    receivers:
      - name: "null"
      # Not referenced by any route in this file; the URL is a placeholder.
      - name: default-receiver
        webhook_configs:
          - url: "http://test.com/webhook"
            send_resolved: true

# 3. Grafana settings
grafana:
  ingress:
    enabled: true
    ingressClassName: higress
    hosts:
      - grafana.aassd.com

  persistence:
    enabled: true
    storageClassName: nfs-storage
    size: 10Gi

  # NOTE(review): "admin" is a trivially guessable password and Grafana is
  # exposed through the ingress above - replace it before real use
  # (TODO confirm whether the chart's existing-secret option fits here).
  adminPassword: admin
      
# Non-production environments can disable scrape targets and alerts that are
# not needed. Each component is listed exactly once: duplicate top-level keys
# are invalid YAML, and most parsers silently keep only the last occurrence.
#
# The API server stays monitored; the remaining control-plane components and
# kube-proxy are switched off.
kubeApiServer:
  enabled: true

kubeEtcd:
  enabled: false

kubeControllerManager:
  enabled: false

kubeScheduler:
  enabled: false

kubeProxy:
  enabled: false

  3. 执行部署
# The first helm install has to download the chart .tgz archive; it can also be
# downloaded manually from GitHub and installed from the local file instead:
# helm install prometheus-stack /opt/kube-prometheus-stack.tgz -f /opt/values.yaml -n monitoring --create-namespace

# Validate the values file: render the release without installing anything.
# Uses the same namespace as the real install so the rendered manifests match.
helm install prometheus-stack prometheus-community/kube-prometheus-stack \
  -n monitoring \
  -f /opt/values.yaml \
  --dry-run

# Install the release into the monitoring namespace, creating it if missing.
helm install prometheus-stack prometheus-community/kube-prometheus-stack \
  -n monitoring \
  --create-namespace \
  -f /opt/values.yaml

# Uninstall the release, then delete the persistent volume claims by label.
helm uninstall prometheus-stack -n monitoring
# NOTE(review): PVCs created for the operator-managed Prometheus/Alertmanager
# StatefulSets may carry a different app.kubernetes.io/instance value than the
# release name - verify with `kubectl get pvc -n monitoring --show-labels`.
kubectl delete pvc -l app.kubernetes.io/instance=prometheus-stack -n monitoring