# Helm install
## Helm 2
# Download the Helm 2 client. The release tarball is served from
# get.helm.sh — a bare filename is not fetchable.
wget https://get.helm.sh/helm-v2.16.1-linux-amd64.tar.gz
tar -zxvf helm-v2.16.1-linux-amd64.tar.gz
# The tarball extracts into ./linux-amd64/, so the binary is not at ./helm.
./linux-amd64/helm init --history-max 200  # add local repo and install the Tiller pod

# Give the tiller service account the cluster-admin role.
kubectl create serviceaccount -n kube-system tiller
kubectl create clusterrolebinding tiller-cluster-admin \
  --clusterrole=cluster-admin \
  --serviceaccount=kube-system:tiller
# Patch the Tiller deployment so it runs under that service account.
kubectl --namespace kube-system patch deploy tiller-deploy \
  -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
## Prometheus install (Helm 2)
# Install the Prometheus Operator chart (Helm 2 client, here named "helm2").
helm2 install stable/prometheus-operator --name prometheus-operator --namespace monitoring
# Expose Prometheus (9090) and Grafana (8082 -> service port 80) on all
# interfaces, in the background; nohup plus the redirects keep the
# port-forwards alive after the shell session ends.
nohup kubectl port-forward svc/prometheus-operator-prometheus -n monitoring 9090:9090 --address 0.0.0.0 </dev/null >/dev/null 2>&1 &
nohup kubectl port-forward svc/prometheus-operator-grafana -n monitoring 8082:80 --address 0.0.0.0 </dev/null >/dev/null 2>&1 &
## Helm 3 install method
# Helm 3 flow (no Tiller): apply the chart's CRDs first, then install the
# chart from the current directory with CRD creation disabled so the
# operator does not try to create them again.
kubectl apply -f crds/
helm install prometheus --namespace=monitoring ./ --set prometheusOperator.createCustomResource=false
## Custom monitoring items
### Custom alert rules and notification methods
Edit the prometheusrules.monitoring.coreos.com resource and add:
# Rule group added to the PrometheusRule resource. Fires PodFail when a pod
# in the model-deployment namespace has been Unknown or Failed for 1 minute;
# the service=kuai_pod label is what routes the alert to the webhook
# receiver in the Alertmanager config below.
# NOTE: indentation restored — the flat original was not valid YAML.
- name: kuai.rules
  rules:
  - alert: PodFail
    expr: kube_pod_status_phase{phase="Unknown", namespace="model-deployment"} == 1 or kube_pod_status_phase{phase="Failed", namespace="model-deployment"} == 1
    for: 1m
    labels:
      service: kuai_pod
Edit the Secret's alertmanager.yaml (base64-encode the config below):
# Produce the base64-encoded Alertmanager config for the alertmanager-main
# Secret. The YAML's indentation matters — the flat text in the original
# note no longer round-trips to the base64 blob recorded in the Secret and
# would be rejected by Alertmanager. Indentation below was reconstructed
# from that recorded base64. Single quotes keep the content literal.
alertmanager_cfg='"global":
  "resolve_timeout": "5m"
"receivers":
- "name": "null"
- "name": "podEventWebHook"
  "webhook_configs":
  - "url": "http://10.244.4.30:5000/hpa"
    "send_resolved": true
"route":
  "group_by":
  - "job"
  "group_interval": "5m"
  "group_wait": "30s"
  "receiver": "null"
  "repeat_interval": "12h"
  "routes":
  - "match":
      "alertname": "DeadMansSwitch"
    "receiver": "null"
  - "match":
      "service": "kuai_pod"
    "receiver": "podEventWebHook"'
printf '%s\n' "$alertmanager_cfg" | base64
$ cat alertmanager.yaml
# Kubernetes Secret consumed by the Prometheus Operator's Alertmanager pods.
# data.alertmanager.yaml holds the base64 of the routing config generated
# above (routes service=kuai_pod alerts to the podEventWebHook receiver).
# NOTE: indentation restored — the flat original was not valid YAML; the
# base64 payload is unchanged.
apiVersion: v1
data:
  alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKLSAibmFtZSI6ICJwb2RFdmVudFdlYkhvb2siCiAgIndlYmhvb2tfY29uZmlncyI6CiAgLSAidXJsIjogImh0dHA6Ly8xMC4yNDQuNC4zMDo1MDAwL2hwYSIKICAgICJzZW5kX3Jlc29sdmVkIjogdHJ1ZQoicm91dGUiOgogICJncm91cF9ieSI6CiAgLSAiam9iIgogICJncm91cF9pbnRlcnZhbCI6ICI1bSIKICAiZ3JvdXBfd2FpdCI6ICIzMHMiCiAgInJlY2VpdmVyIjogIm51bGwiCiAgInJlcGVhdF9pbnRlcnZhbCI6ICIxMmgiCiAgInJvdXRlcyI6CiAgLSAibWF0Y2giOgogICAgICAiYWxlcnRuYW1lIjogIkRlYWRNYW5zU3dpdGNoIgogICAgInJlY2VpdmVyIjogIm51bGwiCiAgLSAibWF0Y2giOgogICAgICAic2VydmljZSI6ICJrdWFpX3BvZCIKICAgICJyZWNlaXZlciI6ICJwb2RFdmVudFdlYkhvb2siCg==
kind: Secret
metadata:
  name: alertmanager-main
  namespace: monitoring
type: Opaque
## Add the webhook receiver (Flask)
from flask import Flask, request
import json

app = Flask(__name__)


@app.route("/hpa", methods=["POST"])
def hpa():
    """Webhook receiver for Alertmanager notifications.

    Alertmanager POSTs a JSON payload describing the firing/resolved
    alerts; parse it and react (e.g. trigger the scaling action).
    """
    content = request.get_json()
    # Analyze the payload fields and extract the relevant data here.
    # .......
    # .......
    # Python 3 print call (the original used the Python 2 `print content`
    # statement, which is a SyntaxError on Python 3).
    print(content)
    # A Flask view must return a response; returning None raises a
    # runtime error on every delivery, so Alertmanager would see 500s.
    return "ok"


if __name__ == "__main__":
    app.run("0.0.0.0")