1. Deploying with kube-prometheus
- Option 1: the official release tarball; cumbersome and effectively abandoned here. Option 2 is recommended.
- Option 2: kube-prometheus, the simplest deployment path.
1.1 Option 1: manual deployment (not recommended)
https://github.com/prometheus/prometheus/releases
https://github.com/prometheus/prometheus/releases/download/v2.30.3/prometheus-2.30.3.linux-amd64.tar.gz
[root@VM-16-14-centos data]# rz
rz waiting to receive.**B0100000023be50
[root@VM-16-14-centos data]# # Received /Users/shencaifeiyangdekk/Downloads/prometheus-2.30.3.linux-amd64.tar.gz
[root@VM-16-14-centos data]# tar zxvf prometheus-2.30.3.linux-amd64.tar.gz
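For completeness, option 1 would then run the extracted binary directly; a minimal sketch, using the sample prometheus.yml shipped in the tarball:

[root@VM-16-14-centos data]# cd prometheus-2.30.3.linux-amd64
[root@VM-16-14-centos prometheus-2.30.3.linux-amd64]# ./prometheus --config.file=prometheus.yml --web.listen-address=:9090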
[root@VM-16-14-centos prometheus-2.30.3.linux-amd64]# docker pull prom/prometheus
Using default tag: latest
latest: Pulling from prom/prometheus
aa2a8d90b84c: Pull complete
b45d31ee2d7f: Pull complete
7579d86a00c9: Pull complete
8583d0bc7e17: Pull complete
b32caf1c5e65: Pull complete
e53f205885a2: Pull complete
6366df248f46: Pull complete
a63db3af7b6e: Pull complete
94cd9f02fa61: Pull complete
2511fa13a76c: Pull complete
50c2584d9f31: Pull complete
22749d939f03: Pull complete
Digest: sha256:e9620d250b16ffe0eb9e3eac7dd76151848424c17d577632ae9ca61d1328687e
Status: Downloaded newer image for prom/prometheus:latest
docker.io/prom/prometheus:latest
[root@VM-16-14-centos prometheus-2.30.3.linux-amd64]# docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
quay.io/coreos/flannel v0.15.0 09b38f011a29 9 days ago 69.5MB
k8s.gcr.io/ingress-nginx/controller v1.0.4 a9f76bcccfb5 2 weeks ago 284MB
k8s.gcr.io/ingress-nginx/kube-webhook-certgen v1.1.1 c41e9fcadf5a 2 weeks ago 47.7MB
rancher/mirrored-flannelcni-flannel-cni-plugin v1.2 98660e6e4c3a 2 weeks ago 8.98MB
prom/prometheus latest 227ae20e1b04 3 weeks ago 193MB
- Image pitfall: when a workload had multiple replicas, kubectl describe pod did not point out which replica was failing. So in a multi-node cluster, make sure the image service on every node is healthy and usable: every required image must be pulled or imported on each node, as the sketch below illustrates.
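A minimal sketch of such a per-node check, assuming SSH access from the control machine and docker as the runtime (node names taken from this cluster):

for node in vm-16-4-centos vm-16-6-centos vm-16-14-centos; do
  echo "== $node =="
  ssh "$node" "docker images | grep -E 'prometheus|node-exporter|grafana'"
done
# Preload an image onto a node that cannot pull it:
docker save prom/prometheus:latest | ssh vm-16-4-centos docker load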
1.2 Option 2: kube-prometheus deployment (recommended)
[root@VM-16-14-centos data]# rz
rz waiting to receive.**B0100000023be50
[root@VM-16-14-centos data]# # Received /Users/shencaifeiyangdekk/Downloads/kube-prometheus-0.9.0.tar.gz
[root@VM-16-14-centos data]# tar zxvf kube-prometheus-0.9.0.tar.gz
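Alternatively, with outbound internet access the same archive can be fetched directly rather than uploaded via rz (the URL below follows GitHub's standard tag-archive pattern):

[root@VM-16-14-centos data]# wget -O kube-prometheus-0.9.0.tar.gz https://github.com/prometheus-operator/kube-prometheus/archive/refs/tags/v0.9.0.tar.gz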
[root@VM-16-14-centos manifests]# docker pull prom/node-exporter
Using default tag: latest
latest: Pulling from prom/node-exporter
aa2a8d90b84c: Already exists
b45d31ee2d7f: Already exists
6c8262c5af55: Pull complete
Digest: sha256:a990408ed288669bbad5b5b374fe1584e54825cde4a911c1a3d6301a907a030c
Status: Downloaded newer image for prom/node-exporter:latest
docker.io/prom/node-exporter:latest
[root@VM-16-14-centos manifests]# docker pull grafana/grafana
Using default tag: latest
latest: Pulling from grafana/grafana
a0d0a0d46f8b: Pull complete
4ea30d1bad29: Pull complete
d4013d3e00e6: Pull complete
a057b814b467: Pull complete
c5ef01975842: Pull complete
4f4fb700ef54: Pull complete
6eaab6edc5fb: Pull complete
f5a6be398aeb: Pull complete
Digest: sha256:b338711d103c017c28be8496d2b3128cccce7b13fa5768b8fc3c6067ca54bdb1
Status: Downloaded newer image for grafana/grafana:latest
docker.io/grafana/grafana:latest
[root@VM-16-14-centos manifests]#
Both images now show up in docker images:
grafana/grafana latest 5c0692a90154 6 days ago 249MB
prom/node-exporter latest 0fafea149859 2 months ago 21.2MB
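Note that pulling :latest does not guarantee a match with the versions the manifests actually deploy; the exact image references can be listed from the manifests themselves:

[root@VM-16-14-centos manifests]# grep -rh 'image:' . | sort -u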
[root@VM-16-14-centos manifests]# tree
.
├── alertmanager-alertmanager.yaml
├── alertmanager-podDisruptionBudget.yaml
├── alertmanager-prometheusRule.yaml
├── alertmanager-secret.yaml
├── alertmanager-serviceAccount.yaml
├── alertmanager-serviceMonitor.yaml
├── alertmanager-service.yaml
├── blackbox-exporter-clusterRoleBinding.yaml
├── blackbox-exporter-clusterRole.yaml
├── blackbox-exporter-configuration.yaml
├── blackbox-exporter-deployment.yaml
├── blackbox-exporter-serviceAccount.yaml
├── blackbox-exporter-serviceMonitor.yaml
├── blackbox-exporter-service.yaml
├── grafana-dashboardDatasources.yaml
├── grafana-dashboardDefinitions.yaml
├── grafana-dashboardSources.yaml
├── grafana-deployment.yaml
├── grafana-serviceAccount.yaml
├── grafana-serviceMonitor.yaml
├── grafana-service.yaml
├── kube-prometheus-prometheusRule.yaml
├── kubernetes-prometheusRule.yaml
├── kubernetes-serviceMonitorApiserver.yaml
├── kubernetes-serviceMonitorCoreDNS.yaml
├── kubernetes-serviceMonitorKubeControllerManager.yaml
├── kubernetes-serviceMonitorKubelet.yaml
├── kubernetes-serviceMonitorKubeScheduler.yaml
├── kube-state-metrics-clusterRoleBinding.yaml
├── kube-state-metrics-clusterRole.yaml
├── kube-state-metrics-deployment.yaml
├── kube-state-metrics-prometheusRule.yaml
├── kube-state-metrics-serviceAccount.yaml
├── kube-state-metrics-serviceMonitor.yaml
├── kube-state-metrics-service.yaml
├── node-exporter-clusterRoleBinding.yaml
├── node-exporter-clusterRole.yaml
├── node-exporter-daemonset.yaml
├── node-exporter-prometheusRule.yaml
├── node-exporter-serviceAccount.yaml
├── node-exporter-serviceMonitor.yaml
├── node-exporter-service.yaml
├── prometheus-adapter-apiService.yaml
├── prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml
├── prometheus-adapter-clusterRoleBindingDelegator.yaml
├── prometheus-adapter-clusterRoleBinding.yaml
├── prometheus-adapter-clusterRoleServerResources.yaml
├── prometheus-adapter-clusterRole.yaml
├── prometheus-adapter-configMap.yaml
├── prometheus-adapter-deployment.yaml
├── prometheus-adapter-podDisruptionBudget.yaml
├── prometheus-adapter-roleBindingAuthReader.yaml
├── prometheus-adapter-serviceAccount.yaml
├── prometheus-adapter-serviceMonitor.yaml
├── prometheus-adapter-service.yaml
├── prometheus-clusterRoleBinding.yaml
├── prometheus-clusterRole.yaml
├── prometheus-operator-prometheusRule.yaml
├── prometheus-operator-serviceMonitor.yaml
├── prometheus-podDisruptionBudget.yaml
├── prometheus-prometheusRule.yaml
├── prometheus-prometheus.yaml
├── prometheus-roleBindingConfig.yaml
├── prometheus-roleBindingSpecificNamespaces.yaml
├── prometheus-roleConfig.yaml
├── prometheus-roleSpecificNamespaces.yaml
├── prometheus-serviceAccount.yaml
├── prometheus-serviceMonitor.yaml
├── prometheus-service.yaml
└── setup
├── 0namespace-namespace.yaml
├── prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml
├── prometheus-operator-0alertmanagerCustomResourceDefinition.yaml
├── prometheus-operator-0podmonitorCustomResourceDefinition.yaml
├── prometheus-operator-0probeCustomResourceDefinition.yaml
├── prometheus-operator-0prometheusCustomResourceDefinition.yaml
├── prometheus-operator-0prometheusruleCustomResourceDefinition.yaml
├── prometheus-operator-0servicemonitorCustomResourceDefinition.yaml
├── prometheus-operator-0thanosrulerCustomResourceDefinition.yaml
├── prometheus-operator-clusterRoleBinding.yaml
├── prometheus-operator-clusterRole.yaml
├── prometheus-operator-deployment.yaml
├── prometheus-operator-serviceAccount.yaml
└── prometheus-operator-service.yaml
1 directory, 83 files
[root@VM-16-14-centos manifests]# pwd
/data/kube-prometheus-0.9.0/manifests
[root@VM-16-14-centos manifests]# kubectl apply -f setup/
namespace/monitoring created
customresourcedefinition.apiextensions.k8s.io/alertmanagerconfigs.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/alertmanagers.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/podmonitors.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/probes.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/prometheuses.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/prometheusrules.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/servicemonitors.monitoring.coreos.com created
customresourcedefinition.apiextensions.k8s.io/thanosrulers.monitoring.coreos.com created
clusterrole.rbac.authorization.k8s.io/prometheus-operator created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-operator created
deployment.apps/prometheus-operator created
service/prometheus-operator created
serviceaccount/prometheus-operator created
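setup/ has to go first: it creates the monitoring namespace, the CRDs, and the Operator itself. Before applying the remaining manifests, it can help to wait until all CRDs are established so the custom resources below are accepted (a minimal sketch):

[root@VM-16-14-centos manifests]# kubectl wait --for=condition=Established --all crd --timeout=120s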
[root@VM-16-14-centos manifests]# kubectl apply -f .
alertmanager.monitoring.coreos.com/main created
Warning: policy/v1beta1 PodDisruptionBudget is deprecated in v1.21+, unavailable in v1.25+; use policy/v1 PodDisruptionBudget
poddisruptionbudget.policy/alertmanager-main created
prometheusrule.monitoring.coreos.com/alertmanager-main-rules created
secret/alertmanager-main created
service/alertmanager-main created
serviceaccount/alertmanager-main created
servicemonitor.monitoring.coreos.com/alertmanager created
clusterrole.rbac.authorization.k8s.io/blackbox-exporter created
clusterrolebinding.rbac.authorization.k8s.io/blackbox-exporter created
configmap/blackbox-exporter-configuration created
deployment.apps/blackbox-exporter created
service/blackbox-exporter created
serviceaccount/blackbox-exporter created
servicemonitor.monitoring.coreos.com/blackbox-exporter created
secret/grafana-datasources created
configmap/grafana-dashboard-alertmanager-overview created
configmap/grafana-dashboard-apiserver created
configmap/grafana-dashboard-cluster-total created
configmap/grafana-dashboard-controller-manager created
configmap/grafana-dashboard-k8s-resources-cluster created
configmap/grafana-dashboard-k8s-resources-namespace created
configmap/grafana-dashboard-k8s-resources-node created
configmap/grafana-dashboard-k8s-resources-pod created
configmap/grafana-dashboard-k8s-resources-workload created
configmap/grafana-dashboard-k8s-resources-workloads-namespace created
configmap/grafana-dashboard-kubelet created
configmap/grafana-dashboard-namespace-by-pod created
configmap/grafana-dashboard-namespace-by-workload created
configmap/grafana-dashboard-node-cluster-rsrc-use created
configmap/grafana-dashboard-node-rsrc-use created
configmap/grafana-dashboard-nodes created
configmap/grafana-dashboard-persistentvolumesusage created
configmap/grafana-dashboard-pod-total created
configmap/grafana-dashboard-prometheus-remote-write created
configmap/grafana-dashboard-prometheus created
configmap/grafana-dashboard-proxy created
configmap/grafana-dashboard-scheduler created
configmap/grafana-dashboard-workload-total created
configmap/grafana-dashboards created
deployment.apps/grafana created
service/grafana created
serviceaccount/grafana created
servicemonitor.monitoring.coreos.com/grafana created
prometheusrule.monitoring.coreos.com/kube-prometheus-rules created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics created
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics created
deployment.apps/kube-state-metrics created
prometheusrule.monitoring.coreos.com/kube-state-metrics-rules created
service/kube-state-metrics created
serviceaccount/kube-state-metrics created
servicemonitor.monitoring.coreos.com/kube-state-metrics created
prometheusrule.monitoring.coreos.com/kubernetes-monitoring-rules created
servicemonitor.monitoring.coreos.com/kube-apiserver created
servicemonitor.monitoring.coreos.com/coredns created
servicemonitor.monitoring.coreos.com/kube-controller-manager created
servicemonitor.monitoring.coreos.com/kube-scheduler created
servicemonitor.monitoring.coreos.com/kubelet created
clusterrole.rbac.authorization.k8s.io/node-exporter created
clusterrolebinding.rbac.authorization.k8s.io/node-exporter created
daemonset.apps/node-exporter created
prometheusrule.monitoring.coreos.com/node-exporter-rules created
service/node-exporter created
serviceaccount/node-exporter created
servicemonitor.monitoring.coreos.com/node-exporter created
apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io configured
clusterrole.rbac.authorization.k8s.io/prometheus-adapter created
clusterrole.rbac.authorization.k8s.io/system:aggregated-metrics-reader configured
clusterrolebinding.rbac.authorization.k8s.io/prometheus-adapter created
clusterrolebinding.rbac.authorization.k8s.io/resource-metrics:system:auth-delegator created
clusterrole.rbac.authorization.k8s.io/resource-metrics-server-resources created
configmap/adapter-config created
deployment.apps/prometheus-adapter created
poddisruptionbudget.policy/prometheus-adapter created
rolebinding.rbac.authorization.k8s.io/resource-metrics-auth-reader created
service/prometheus-adapter created
serviceaccount/prometheus-adapter created
servicemonitor.monitoring.coreos.com/prometheus-adapter created
clusterrole.rbac.authorization.k8s.io/prometheus-k8s created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-k8s created
prometheusrule.monitoring.coreos.com/prometheus-operator-rules created
servicemonitor.monitoring.coreos.com/prometheus-operator created
poddisruptionbudget.policy/prometheus-k8s created
prometheus.monitoring.coreos.com/k8s created
prometheusrule.monitoring.coreos.com/prometheus-k8s-prometheus-rules created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s-config created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s created
role.rbac.authorization.k8s.io/prometheus-k8s-config created
role.rbac.authorization.k8s.io/prometheus-k8s created
role.rbac.authorization.k8s.io/prometheus-k8s created
role.rbac.authorization.k8s.io/prometheus-k8s created
service/prometheus-k8s created
serviceaccount/prometheus-k8s created
servicemonitor.monitoring.coreos.com/prometheus-k8s created
[root@VM-16-14-centos manifests]#
[root@VM-16-14-centos ~]# kubectl get po,svc -n monitoring -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
pod/alertmanager-main-0 2/2 Running 0 15m 172.30.1.166 vm-16-6-centos <none> <none>
pod/alertmanager-main-1 2/2 Running 0 15m 172.30.2.94 vm-16-4-centos <none> <none>
pod/alertmanager-main-2 2/2 Running 0 15m 172.30.1.167 vm-16-6-centos <none> <none>
pod/blackbox-exporter-6798fb5bb4-d6mvc 3/3 Running 0 15m 172.30.2.95 vm-16-4-centos <none> <none>
pod/grafana-7476b4c65b-r7q9m 1/1 Running 0 15m 172.30.2.96 vm-16-4-centos <none> <none>
pod/kube-state-metrics-74964b6cd4-k5plk 3/3 Running 0 15m 172.30.1.168 vm-16-6-centos <none> <none>
pod/node-exporter-8lsdz 2/2 Running 0 15m 10.206.16.4 vm-16-4-centos <none> <none>
pod/node-exporter-kmjnl 2/2 Running 0 15m 10.206.16.6 vm-16-6-centos <none> <none>
pod/node-exporter-wxcln 2/2 Running 0 15m 10.206.16.14 vm-16-14-centos <none> <none>
pod/prometheus-adapter-5b8db7955f-888rx 1/1 Running 0 15m 172.30.1.169 vm-16-6-centos <none> <none>
pod/prometheus-adapter-5b8db7955f-j4tq9 1/1 Running 0 15m 172.30.2.97 vm-16-4-centos <none> <none>
pod/prometheus-k8s-0 2/2 Running 0 15m 172.30.1.170 vm-16-6-centos <none> <none>
pod/prometheus-k8s-1 2/2 Running 0 15m 172.30.2.98 vm-16-4-centos <none> <none>
pod/prometheus-operator-75d9b475d9-dghn9 2/2 Running 0 15m 172.30.2.93 vm-16-4-centos <none> <none>
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR
service/alertmanager-main ClusterIP 10.254.131.128 <none> 9093/TCP 15m alertmanager=main,app.kubernetes.io/component=alert-router,app.kubernetes.io/name=alertmanager,app.kubernetes.io/part-of=kube-prometheus,app=alertmanager
service/alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 15m app.kubernetes.io/name=alertmanager
service/blackbox-exporter ClusterIP 10.254.250.76 <none> 9115/TCP,19115/TCP 15m app.kubernetes.io/component=exporter,app.kubernetes.io/name=blackbox-exporter,app.kubernetes.io/part-of=kube-prometheus
service/grafana ClusterIP 10.254.196.172 <none> 3000/TCP 15m app.kubernetes.io/component=grafana,app.kubernetes.io/name=grafana,app.kubernetes.io/part-of=kube-prometheus
service/kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 15m app.kubernetes.io/component=exporter,app.kubernetes.io/name=kube-state-metrics,app.kubernetes.io/part-of=kube-prometheus
service/node-exporter ClusterIP None <none> 9100/TCP 15m app.kubernetes.io/component=exporter,app.kubernetes.io/name=node-exporter,app.kubernetes.io/part-of=kube-prometheus
service/prometheus-adapter ClusterIP 10.254.253.205 <none> 443/TCP 15m app.kubernetes.io/component=metrics-adapter,app.kubernetes.io/name=prometheus-adapter,app.kubernetes.io/part-of=kube-prometheus
service/prometheus-k8s ClusterIP 10.254.236.103 <none> 9090/TCP 15m app.kubernetes.io/component=prometheus,app.kubernetes.io/name=prometheus,app.kubernetes.io/part-of=kube-prometheus,app=prometheus,prometheus=k8s
service/prometheus-operated ClusterIP None <none> 9090/TCP 15m app.kubernetes.io/name=prometheus
service/prometheus-operator ClusterIP None <none> 8443/TCP 15m app.kubernetes.io/component=controller,app.kubernetes.io/name=prometheus-operator,app.kubernetes.io/part-of=kube-prometheus
[root@VM-16-14-centos ~]#
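Before wiring up an Ingress, the UIs can be spot-checked with kubectl port-forward (the approach the kube-prometheus README also suggests):

[root@VM-16-14-centos ~]# kubectl -n monitoring port-forward svc/grafana 3000 &
[root@VM-16-14-centos ~]# kubectl -n monitoring port-forward svc/prometheus-k8s 9090 &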
1.3 Creating an Ingress
[root@VM-16-14-centos data]# cat my-ingress-grafana.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  annotations:
    kubernetes.io/ingress.class: "nginx"
  namespace: monitoring
spec:
  rules:
  - host: kkkk.com
    http:
      paths:
      - path: "/"
        pathType: Prefix
        backend:
          service:
            name: grafana
            port:
              number: 3000
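The manifest still needs to be applied, and kkkk.com must resolve to the ingress-nginx controller's address (e.g., via a client-side /etc/hosts entry):

[root@VM-16-14-centos data]# kubectl apply -f my-ingress-grafana.yaml

Analogous Ingress objects can expose the Prometheus and Alertmanager UIs. A minimal sketch for Prometheus, reusing the prometheus-k8s Service and port 9090 from the listing above (the hostname is hypothetical):

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  annotations:
    kubernetes.io/ingress.class: "nginx"
  namespace: monitoring
spec:
  rules:
  - host: prometheus.kkkk.com   # hypothetical hostname
    http:
      paths:
      - path: "/"
        pathType: Prefix
        backend:
          service:
            name: prometheus-k8s
            port:
              number: 9090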
- With the Ingress in place, Grafana is reachable at the configured host. Default credentials: admin/admin.
- (Screenshots omitted: the Grafana home page and the Prometheus UI's metrics, targets, and alerts pages.)
2. Dashboard management
2.1 Importing dashboards
- Import the dashboards bundled with kube-prometheus
- Import 315: grafana.com/grafana/das…
- Import 6417: kubernetes-cluster-prometheus
- Import others as needed (a fetch-by-ID sketch follows this list)
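Dashboard JSON can also be fetched by ID from grafana.com and then imported through the Grafana UI's Import entry; the download URL below follows grafana.com's public revision-download pattern:

curl -sL https://grafana.com/api/dashboards/315/revisions/latest/download -o dashboard-315.json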
2.2 Official dashboards on grafana.com
- grafana.com/grafana/das…
- 12006: Kubernetes apiserver
- 3070: Etcd by Prometheus
- 5926: CoreDNS
- 3131: Kubernetes All Nodes
- 893: Docker and system monitoring
- 7279: CoreDNS 2
- 9614: NGINX Ingress controller
3. Alert management
3.1 KubeControllerManagerDown (1 active)
- This alert fires because the kube-controller-manager ServiceMonitor expects a matching Service in kube-system, which this cluster does not have by default. Creating a headless Service plus a manual Endpoints object pointing at the controller-manager's metrics port gives Prometheus a scrape target:
[root@VM-16-14-centos ~]# vim /data/kube-controller-manager-svc-ep.yaml
apiVersion: v1
kind: Service
metadata:
  name: kube-controller-manager
  namespace: kube-system
  labels:
    k8s-app: kube-controller-manager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: https-metrics
    port: 10257
    targetPort: 10257
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: kube-controller-manager
  namespace: kube-system
  labels:
    k8s-app: kube-controller-manager
subsets:
- addresses:
  - ip: 172.16.1.71
  ports:
  - name: https-metrics
    port: 10257
    protocol: TCP
[root@VM-16-14-centos ~]# kubectl apply -f /data/kube-controller-manager-svc-ep.yaml
[root@VM-16-14-centos ~]# curl -X POST http://10.254.236.103:9090/-/reload
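- Note: /-/reload only works when Prometheus runs with --web.enable-lifecycle; the Operator enables it so its config-reloader sidecar can trigger reloads, which is also why manual reloads are usually unnecessary. Whether the new target was picked up can be checked against the HTTP API (a sketch reusing the prometheus-k8s ClusterIP above):

[root@VM-16-14-centos ~]# curl -s 'http://10.254.236.103:9090/api/v1/targets?state=active' | grep -o 'kube-controller-manager' | sort -u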