VictoriaMetrics 自监控以及告警规则(rules)语句

1,232 阅读2分钟

在 Prometheus 配置文件的 scrape_configs 下添加以下采集配置


  # One scrape job per VictoriaMetrics cluster component.
  # NOTE(review): these are list items, presumably placed under
  # `scrape_configs:` in prometheus.yml - confirm against the full file.
  - job_name: 'vmstorage'
    static_configs:
      - targets:
          - '192.168.26.100:8482'
          - '192.168.26.102:8482'
  - job_name: 'vminsert'
    static_configs:
      - targets:
          - '192.168.26.101:8480'
  - job_name: 'vmselect'
    static_configs:
      - targets:
          - '192.168.26.102:8481'

image.png

开启远程写入(remote_write)

# Remote-write samples to the remote VM storage.
remote_write:
  - url: 'http://192.168.26.101:8480/insert/0/prometheus/'

重启 Prometheus

# Restart the Prometheus systemd unit (the article does this right after editing the configuration).
systemctl restart prometheus.service

集群监控

# Alerting rule group for the VictoriaMetrics cluster components.
groups:
  - name: vmcluster
    # How often the rules of this group are evaluated.
    interval: 30s
    rules:
      # Fires when, at the ingestion rate averaged over the last day, the
      # remaining free disk space would last less than 3 days.
      - alert: DiskRunsOutOfSpaceIn3Days
        # free bytes / (net rows added per second * average bytes per row)
        # gives the estimated number of seconds until the disk is full.
        expr: |
          vm_free_disk_space_bytes / ignoring(path)
          (
             (
              rate(vm_rows_added_to_storage_total[1d]) -
              ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d])
             )
            * scalar(
              sum(vm_data_size_bytes{type!~"indexdb.*"}) /
              sum(vm_rows{type!~"indexdb.*"})
             )
          ) < 3 * 24 * 3600 > 0
        for: 30m
        labels:
          severity: critical
        annotations:
          # Uses the same dashboard host as the other alerts in this group;
          # "localhost" only resolves on the dashboard machine itself.
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=113&var-instance={{ $labels.instance }}"
          summary: "实例 {{ $labels.instance }} 的磁盘空间将在3天内用尽"
          description: "按当前的写入速率,实例 {{ $labels.instance }} 的剩余磁盘空间仅足够 {{ $value | humanizeDuration }},请考虑限制写入速率、减少数据保留时间或增加磁盘空间。"

      # Fires when, per instance, stored data takes more than 80% of the
      # combined (data + free) disk space for 30 minutes.
      - alert: DiskRunsOutOfSpace
        expr: |
          sum(vm_data_size_bytes) by(instance) /
          (
           sum(vm_free_disk_space_bytes) by(instance) +
           sum(vm_data_size_bytes) by(instance)
          ) > 0.8
        for: 30m
        labels:
          severity: critical
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=200&var-instance={{ $labels.instance }}"
          # "instance" is a VictoriaMetrics instance, not a virtual machine.
          summary: "实例 {{ $labels.instance }} 的磁盘空间即将用尽"
          description: "实例 {{ $labels.instance }} 的磁盘使用率超过80%,剩余不足20%的磁盘空间可能导致合并过程和整体性能降低,如果可能,请考虑限制写入速率、减少数据保留时间或增加磁盘空间。"

      # Fires when the HTTP request error counter keeps increasing for 15 minutes.
      - alert: RequestErrorsToAPI
        expr: increase(vm_http_request_errors_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
          show_at: dashboard
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=52&var-instance={{ $labels.instance }}"
          # "instance" is a VictoriaMetrics instance, not a virtual machine.
          summary: "{{ $labels.job }} 的路径 {{ $labels.path }}(实例 {{ $labels.instance }})出现过多错误"
          description: "路径 {{ $labels.path }} 的请求出现错误,请验证客户端是否发送正确的请求。"

      # Fires when the combined increase of RPC connection, dial and handshake
      # errors per job/instance stays above zero for 15 minutes.
      - alert: RPCErrors
        expr: |
          (
           sum(increase(vm_rpc_connection_errors_total[5m])) by(job, instance)
           +
           sum(increase(vm_rpc_dial_errors_total[5m])) by(job, instance)
           +
           sum(increase(vm_rpc_handshake_errors_total[5m])) by(job, instance)
          ) > 0
        for: 15m
        labels:
          severity: warning
          show_at: dashboard
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=44&var-instance={{ $labels.instance }}"
          # "instance" is a VictoriaMetrics instance, not a virtual machine.
          summary: "{{ $labels.job }}(实例 {{ $labels.instance }})存在过多的RPC错误"
          description: "RPC错误是集群组件之间的连接错误,错误可能的原因包括配置错误、负载过重、网络中断或无法访问的组件。"

      # Fires when rows keep being ignored on ingestion, grouped by instance and reason.
      - alert: RowsRejectedOnIngestion
        expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=135&var-instance={{ $labels.instance }}"
          # "VM" here stands for VictoriaMetrics, not a virtual machine.
          summary: "尝试写入实例 \"{{ $labels.instance }}\" 时存在被拒绝的数据行"
          description: "由于以下原因,VictoriaMetrics 拒绝写入实例 \"{{ $labels.instance }}\" 的数据行:\"{{ $labels.reason }}\""

      # Fires when the ratio of newly created time series to inserted rows
      # (both summed over all series) stays above 10% for 15 minutes.
      - alert: TooHighChurnRate
        expr: |
          (
             sum(rate(vm_new_timeseries_created_total[5m]))
             /
             sum(rate(vm_rows_inserted_total[5m]))
           ) > 0.1
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=102"
          summary: "最近15分钟内的数据变动率超过10%"
          # The subject is VictoriaMetrics, not a virtual machine.
          description: "VictoriaMetrics 持续创建新的时间序列,这种现象称为数据变动率(Churn Rate),高数据变动率与数据库性能密切相关,可能导致意外的OOM(Out of Memory)错误或查询缓慢。"

      # Fires when the number of series created in the last 24h exceeds three
      # times the entry count of the storage/hour_metric_ids cache.
      - alert: TooHighChurnRate24h
        expr: |
          sum(increase(vm_new_timeseries_created_total[24h]))
          >
          (sum(vm_cache_entries{type="storage/hour_metric_ids"})* 3)
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=102"
          summary: "过去24小时内创建的新时间序列数量过高"
          description: "过去24小时内创建的新时间序列数量是当前活跃时间序列数量的3倍以上,这种现象称为数据变动率(Churn Rate),高数据变动率与数据库性能密切相关,可能导致意外的OOM(Out of Memory)错误或查询缓慢。"

      # Fires when slow row inserts make up more than 5% of all inserted rows
      # for 15 minutes.
      - alert: TooHighSlowInsertsRate
        expr: |
          (
             sum(rate(vm_slow_row_inserts_total[5m]))
             /
             sum(rate(vm_rows_inserted_total[5m]))
           ) > 0.05
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=108"
          summary: "最近15分钟内缓慢插入的比例超过5%"
          description: "高比例的缓慢插入可能表明当前负载下资源不足,可能需要更多的RAM来优化处理当前活跃时间序列的数量,请参考 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183"

      # Fires when a process has fewer than 100 file descriptors left before
      # reaching its limit.
      # NOTE(review): an alert with the same name and expression is also defined
      # in the vm-health group; if both rule files are loaded, it will presumably
      # fire twice - confirm and keep only one.
      - alert: ProcessNearFDLimits
        expr: (process_max_fds - process_open_fds) < 100
        for: 5m
        labels:
          severity: critical
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}"
          summary: "最近5分钟内 \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")的空闲文件描述符数不足100"
          description: "用尽操作系统的文件描述符限制可能严重降低进程性能,请尽快考虑增加文件描述符限制。"

      # Fires when metrics keep having labels dropped on ingestion, per instance.
      - alert: LabelsLimitExceededOnIngestion
        expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=116&var-instance={{ $labels.instance }}"
          # "instance" is a VictoriaMetrics instance, not a virtual machine.
          summary: "实例 {{ $labels.instance }} 上的指标超过了标签限制"
          description: "VictoriaMetrics对每个指标的标签数量有限制,可通过 `-maxLabelsPerTimeseries` 命令行标志进行配置,这可以防止导入具有过多标签的指标。请验证 `-maxLabelsPerTimeseries` 的配置是否正确,或者验证发送这些指标的客户端行为是否正常。"

      # Fires when the per-second rate of vm_rpc_send_duration_seconds_total
      # stays above 0.9 for 15 minutes (reported below as >90% saturation).
      - alert: VminsertVmstorageConnectionIsSaturated
        expr: rate(vm_rpc_send_duration_seconds_total[5m]) > 0.9
        for: 15m
        labels:
          severity: warning
          show_at: dashboard
        annotations:
          dashboard: "http://192.168.26.100:3000/d/oS7Bi_0Wz?viewPanel=139&var-instance={{ $labels.instance }}"
          # "instance" is a VictoriaMetrics component instance, not a virtual machine.
          summary: "vminsert(实例 {{ $labels.instance }})与vmstorage(实例 {{ $labels.addr }})之间的连接已饱和"
          description: "vminsert(实例 {{ $labels.instance }})与vmstorage(实例 {{ $labels.addr }})之间的连接饱和超过90%,vminsert无法跟上写入速率,这通常意味着必须向集群中添加更多的vminsert或vmstorage节点以增加总的vminsert -> vmstorage连接数。"

VictoriaMetrics 健康监控

# Rule group with process-level health alerts for VictoriaMetrics components.
groups:
  - name: vm-health
    # No `interval` is set for this group, unlike the vmcluster group.
    rules:
      # Fires as soon as the process start time of a matching job changed more
      # than twice within 15 minutes (no `for` delay).
      - alert: TooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~"victoriametrics.*|vmselect.*|vminsert.*|vmstorage.*|vmagent.*|vmalert.*|vmsingle.*|vmalertmanager.*"}[15m]) > 2'
        labels:
          severity: critical
        annotations:
          summary: '{{ $labels.job }} 过多重启(实例 {{ $labels.instance }})'
          description: 'Job {{ $labels.job }} (实例 {{ $labels.instance }}) 在过去的 15 分钟内重启超过两次,可能出现了 crashlooping 问题。'

      # Fires when the `up` series of a matching job has been 0 for 2 minutes.
      - alert: ServiceDown
        expr: 'up{job=~"victoriametrics.*|vmselect.*|vminsert.*|vmstorage.*|vmagent.*|vmalert.*|vmsingle.*|vmalertmanager.*"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '{{ $labels.job }} 服务已停止在 {{ $labels.instance }}'
          description: '{{ $labels.instance }} 上的 {{ $labels.job }} 服务已经停止超过 2 分钟。'

      # Fires when a process has fewer than 100 file descriptors left before
      # reaching its limit.
      - alert: ProcessNearFDLimits
        expr: (process_max_fds - process_open_fds) < 100
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "过去 5 分钟内,\"{{ $labels.job }}\"(\"{{ $labels.instance }}\") 的可用文件描述符数少于 100"
          # Kept on a single line: a wrapped double-quoted scalar needs its
          # continuation lines indented deeper than the key, and line folding
          # would insert a space between the two sentences.
          description: "耗尽操作系统文件描述符限制可能导致进程严重降级。考虑尽快增加限制。"

      # Fires when anonymous resident memory exceeds 80% of the memory reported
      # by vm_available_memory_bytes for 5 minutes.
      - alert: TooHighMemoryUsage
        expr: '(process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.8'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '过去 5 分钟内,"{{ $labels.job }}"("{{ $labels.instance }}") 内存使用量超过 80%'
          description: '内存使用量过高可能导致多种问题,如OOM或性能下降,考虑增加可用内存或降低进程负载。'

      # Fires when CPU seconds consumed per second, divided by the available
      # cores, stays above 0.9 for 5 minutes.
      - alert: TooHighCPUUsage
        expr: 'rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '过去 5 分钟内,"{{ $labels.job }}"("{{ $labels.instance }}") CPU 使用率超过 90%'
          description: 'CPU 使用率过高可能表明资源不足,并导致进程不稳定,考虑增加可用的CPU资源或降低进程负载。'

      # Fires when error-level log messages keep being emitted by a job/instance
      # for 15 minutes.
      - alert: TooManyLogs
        expr: 'sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: '过去 15 分钟内,"{{ $labels.job }}" ({{ $labels.instance }}) 打印过多日志'
          description: '过去 15 分钟内,"{{ $labels.job }}" ({{ $labels.instance }}) 的日志打印率为 {{ $value }},建议检查特定错误消息的日志。'

      # Fires when vm_missing_tsids_for_metric_id_total keeps growing for a
      # job/instance for 10 minutes.
      - alert: TooManyTSIDMisses
        expr: 'sum(rate(vm_missing_tsids_for_metric_id_total[5m])) by (job, instance) > 0'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: '过去 10 分钟内,"{{ $labels.job }}" ({{ $labels.instance }}) TSID 错误数过多'
          description: '查询中查找 TSID 错误率对于"{{ $labels.job }}" ({{ $labels.instance }}) 过高,确保你正在运行 VictoriaMetrics 的 v1.85.3 或更高版本,相关问题:https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502'

      # Fires when the 1-minute average of in-flight concurrent inserts stays at
      # or above the configured capacity for 15 minutes.
      - alert: ConcurrentInsertsHitTheLimit
        expr: 'avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: '{{ $labels.job }} 在实例 {{ $labels.instance }} 上不断达到并发插入限制'
          description: '实例 {{ $labels.instance }} 上的并发插入限制取决于 CPU 的数量,当组件不断达到限制时,可能是组件负载过高,需要更多的 CPU 资源,对于 vmagent 或 vminsert 这样的组件,如果有太多的客户端尝试写入,可能会触发警报,如果 vmagent 或 vminsert 的 CPU 使用率和网络饱和度处于正常水平,那么可能值得调整 `-maxConcurrentInserts` 命令行标志。'

以上告警文案都是用 GPT 翻译的,具体使用时还需要自行修改。

image.png