介绍
kube-prometheus-stack 是一个基于 Prometheus Operator 的完整 Kubernetes 监控解决方案,通过 Helm Chart 一键部署,提供端到端的集群监控能力。该堆栈集成了 Prometheus、Alertmanager、Grafana 等核心组件,同时包含 kube-state-metrics、node-exporter 等导出器,能够自动发现和监控集群中的服务、Pod 等资源。它预置了丰富的 Grafana 仪表板和 Prometheus 告警规则,支持高可用部署、持久化存储、自定义指标采集和多种告警通知渠道,是云原生环境下监控 Kubernetes 集群的首选方案。
部署操作
- 添加仓库
# Register the prometheus-community chart repository under the local alias "prometheus-community"
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# Refresh the local chart index so the newly added repository can be resolved
helm repo update
- 基础配置文件
# Print the values file that is passed to `helm install -f` below; its content follows
cat /opt/values.yaml
# 1. Prometheus core configuration
prometheus:
  prometheusSpec:
    # An empty selector plus "...NilUsesHelmValues: false" lets Prometheus pick up
    # ServiceMonitors and PrometheusRules that were not created by this Helm release.
    serviceMonitorSelector: {}
    serviceMonitorSelectorNilUsesHelmValues: false
    ruleSelector: {}
    ruleSelectorNilUsesHelmValues: false
    # Keep metrics for 15 days
    retention: 15d
    # Persist the TSDB on a PVC provisioned by the "nfs-storage" StorageClass
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: nfs-storage
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi
  # Expose the Prometheus UI through an Ingress host.
  # `ingress` is a sibling of `prometheusSpec`, not a child of it.
  ingress:
    enabled: true
    ingressClassName: higress
    hosts:
      - prometheus.aassd.com
    paths:
      - /
    pathType: Prefix
# 2. Alertmanager configuration
alertmanager:
  enabled: true
  alertmanagerSpec:
    # --- Storage: persist Alertmanager state on a PVC from the "nfs-storage" StorageClass ---
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: nfs-storage
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 5Gi
  # --- Ingress: expose the Alertmanager UI through an Ingress host ---
  ingress:
    enabled: true
    ingressClassName: higress
    hosts:
      - alertmanager.aassd.com
    paths:
      - /
    pathType: Prefix
  # --- Alerting policy (rendered into the Alertmanager configuration) ---
  config:
    global:
      resolve_timeout: 5m
    # Inhibition rule kept to reduce redundant alerts: while a critical alert is
    # firing, warning/info alerts with the same instance and alertname are muted.
    inhibit_rules:
      - source_matchers: ['severity = critical']
        target_matchers: ['severity =~ warning|info']
        equal: ['instance', 'alertname']
    route:
      group_by: ['instance', 'alertname']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      # No webhook service exists yet, so the default route goes to the 'null' receiver
      receiver: 'null'
      routes:
        # Drop the always-firing Watchdog alert explicitly
        - matchers:
            - alertname = "Watchdog"
          receiver: 'null'
          continue: false
    receivers:
      # Receiver without any integration: alerts routed here are discarded
      - name: 'null'
      # Placeholder webhook receiver; it is not referenced by any route above, so it
      # receives nothing until `route.receiver` (or a sub-route) points at it.
      - name: 'default-receiver'
        webhook_configs:
          - url: 'http://test.com/webhook'
            send_resolved: true
# 3. Grafana configuration
grafana:
  # NOTE(review): trivial admin password stored in plain text. Acceptable only for a
  # throw-away test cluster; change it before exposing Grafana through the Ingress below.
  adminPassword: admin
  # Keep dashboards and settings across pod restarts on an "nfs-storage" PVC
  persistence:
    enabled: true
    storageClassName: nfs-storage
    size: 10Gi
  # Expose the Grafana UI through an Ingress host
  ingress:
    enabled: true
    ingressClassName: higress
    hosts:
      - grafana.aassd.com
# Control-plane component monitoring.
# In non-production clusters the unneeded scrape targets and their alerts can be
# switched off. Only the API server stays monitored here; etcd, the controller
# manager, the scheduler and kube-proxy are disabled.
# Each key is declared exactly once (duplicate mapping keys are invalid YAML).
kubeApiServer:
  enabled: true
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false
- 执行部署
# The first install has to download the chart .tgz archive. If the repository is not
# reachable, download the archive from GitHub manually and install from the local file:
# helm install prometheus-stack /opt/kube-prometheus-stack.tgz -f /opt/values.yaml -n monitoring --create-namespace
# Validate the values file: render the release for the target namespace without applying it
helm install prometheus-stack prometheus-community/kube-prometheus-stack \
  -n monitoring \
  -f /opt/values.yaml \
  --dry-run
# Real install: create the "monitoring" namespace if it is missing and deploy the release
helm install prometheus-stack prometheus-community/kube-prometheus-stack \
  -n monitoring \
  --create-namespace \
  -f /opt/values.yaml
# Uninstall: remove the Helm release from the "monitoring" namespace
helm uninstall prometheus-stack -n monitoring
# Then delete the PersistentVolumeClaims selected by the release instance label.
# NOTE(review): check with `kubectl get pvc -n monitoring --show-labels` that this selector
# really matches the Prometheus/Alertmanager PVCs before relying on it - TODO confirm.
kubectl delete pvc -l app.kubernetes.io/instance=prometheus-stack -n monitoring