node_exporter and Prometheus Queries


Adding monitored targets and querying exporter data

Introduction to node_exporter

Fetching the metrics locally with curl

[root@prome_master_01 tgzs]# curl  -s  localhost:9100/metrics |grep node_  |head -20
# HELP node_arp_entries ARP entries by device
# TYPE node_arp_entries gauge
node_arp_entries{device="eth0"} 3
# HELP node_boot_time_seconds Node boot time, in unixtime.
# TYPE node_boot_time_seconds gauge
node_boot_time_seconds 1.616987084e+09
# HELP node_context_switches_total Total number of context switches.
# TYPE node_context_switches_total counter
node_context_switches_total 2.105979e+06
# HELP node_cooling_device_cur_state Current throttle state of the cooling device
# TYPE node_cooling_device_cur_state gauge
node_cooling_device_cur_state{name="0",type="Processor"} 0
node_cooling_device_cur_state{name="1",type="Processor"} 0
node_cooling_device_cur_state{name="2",type="Processor"} 0
node_cooling_device_cur_state{name="3",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 0
node_cooling_device_max_state{name="1",type="Processor"} 0
node_cooling_device_max_state{name="2",type="Processor"} 0

Project repository: github.com/prometheus/node_exporter

Viewing the startup logs

Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.315Z caller=node_exporter.go:178 msg="Starting node_exporter" version="(version=1.1.2, branch=HEAD, revision=b597c1244d7bef49e6f3359c87a56dd7707f6719)"
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.315Z caller=node_exporter.go:179 msg="Build context" build_context="(go=go1.15.8, user=root@f07de8ca602a, date=20210305-09:29:10)"
Mar 29 15:38:51 prome_master_01 node_exporter: level=warn ts=2021-03-29T07:38:51.315Z caller=node_exporter.go:181 msg="Node Exporter is running as root user. This exporter is designed to run as unpriviledged user, root is not required."
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=filesystem_common.go:74 collector=filesystem msg="Parsed flag --collector.filesystem.ignored-mount-points" flag=^/(dev|proc|sys|var/lib/docker/.+)($|/)
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=filesystem_common.go:76 collector=filesystem msg="Parsed flag --collector.filesystem.ignored-fs-types" flag=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:106 msg="Enabled collectors"
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=arp
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=bcache
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=bonding
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=btrfs
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=conntrack
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=cpu
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=cpufreq
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=diskstats
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=edac
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=entropy
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=fibrechannel
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=filefd
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=filesystem
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=hwmon
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=infiniband
Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=ipvs
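
The startup lines above are syslog-formatted; how you tail them depends on how node_exporter was installed. A minimal sketch, assuming a systemd unit named node_exporter on a CentOS-style host (both names are assumptions, adjust to your setup):

# follow the exporter's logs through journald (unit name is an assumption)
journalctl -u node_exporter -f

# or grep the classic syslog file
grep node_exporter /var/log/messages | tail -20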

Collectors enabled by default

(Figure: node_exporter collectors that are disabled by default)

  • Blacklist: disable a single collector that is enabled by default

--no-collector.<name>
# before disabling the cpu collector
[root@prome_master_01 node_exporter]# curl  -s  localhost:9100/metrics |grep node_cpu
# HELP node_cpu_guest_seconds_total Seconds the CPUs spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="0",mode="user"} 0
node_cpu_guest_seconds_total{cpu="1",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="1",mode="user"} 0
node_cpu_guest_seconds_total{cpu="2",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="2",mode="user"} 0
node_cpu_guest_seconds_total{cpu="3",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="3",mode="user"} 0
# HELP node_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 17691.27
node_cpu_seconds_total{cpu="0",mode="iowait"} 8.9
node_cpu_seconds_total{cpu="0",mode="irq"} 0
node_cpu_seconds_total{cpu="0",mode="nice"} 0.32
node_cpu_seconds_total{cpu="0",mode="softirq"} 0.28
node_cpu_seconds_total{cpu="0",mode="steal"} 2.7# 关闭cpu采集
 ./node_exporter --no-collector.cpu
curl  -s  localhost:9100/metrics |grep node_cpu
​
​
  • 白名单:关闭默认采集项而只开启某些采集
 --collector.disable-defaults --collector.<name> .
​
# enable only the meminfo collector
 ./node_exporter --collector.disable-defaults --collector.meminfo

# enable only the meminfo and cpu collectors
./node_exporter --collector.disable-defaults --collector.meminfo --collector.cpu
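
To confirm the whitelist took effect, a quick check against the meminfo-only instance started above (a sketch; the second grep should print nothing):

# meminfo metrics are still exposed
curl -s localhost:9100/metrics | grep '^node_memory_' | head -3

# every other collector, e.g. cpu, is gone (expect no output)
curl -s localhost:9100/metrics | grep '^node_cpu'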

Why the disabled-by-default collectors are disabled

  • Too heavy: high cardinality
  • Too slow: prolonged runtime that exceeds the Prometheus scrape_interval or scrape_timeout
  • Too expensive: significant resource demands on the host

(Figure: node_exporter collectors that are enabled by default)

Disabling the Go SDK metrics

  • Use --web.disable-exporter-metrics (see the sketch after the sample output below)
  • promhttp_* metrics describe HTTP serving of the /metrics endpoint itself
[root@prome_master_01 tgzs]# curl  -s  localhost:9100/metrics |grep promhttp_
# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler.
# TYPE promhttp_metric_handler_errors_total counter
promhttp_metric_handler_errors_total{cause="encoding"} 0
promhttp_metric_handler_errors_total{cause="gathering"} 0
# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
# TYPE promhttp_metric_handler_requests_in_flight gauge
promhttp_metric_handler_requests_in_flight 1
# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
# TYPE promhttp_metric_handler_requests_total counter
promhttp_metric_handler_requests_total{code="200"} 8
promhttp_metric_handler_requests_total{code="500"} 0
promhttp_metric_handler_requests_total{code="503"} 0
  • go_* metrics describe the Go runtime (goroutines, GC, memory stats, etc.)
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 7
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.15.8"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 2.781752e+06

  • process_* metrics describe the exporter process itself (CPU time, file descriptors, resident memory, etc.)
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 0.54
# HELP process_max_fds Maximum number of open file descriptors.
# TYPE process_max_fds gauge
process_max_fds 1024
# HELP process_open_fds Number of open file descriptors.
# TYPE process_open_fds gauge
process_open_fds 9
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 1.5720448e+07
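
A quick sketch of the flag in action: restarted with it, all three metric families above (promhttp_, go_, process_) disappear from /metrics:

# start the exporter without its own SDK metrics
./node_exporter --web.disable-exporter-metrics

# expect no output from any of the three families
curl -s localhost:9100/metrics | grep -E '^(promhttp_|go_|process_)'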

Reporting self-instrumented (custom) metrics from the node

  • --collector.textfile.directory="" configures the local directory to collect from
  • Create .prom files in that directory, following the Prometheus text exposition format
# create the directory
mkdir ./text_file_dir
# prepare a .prom file
cat <<EOF > ./text_file_dir/test.prom
# HELP nyy_test_metric just test
# TYPE nyy_test_metric gauge
nyy_test_metric{method="post",code="200"} 1027
EOF

# start the service
./node_exporter --collector.textfile.directory=./text_file_dir

# check the data with curl
[root@prome_master_01 tgzs]# curl  -s  localhost:9100/metrics |grep nyy
# HELP nyy_test_metric just test
# TYPE nyy_test_metric gauge
nyy_test_metric{code="200",method="post"} 1027
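
One practical note: the textfile collector may scrape a file mid-write, so the usual pattern is to write to a temporary file and rename it into place, since a rename within the same filesystem is atomic. A sketch, updating the test.prom from above (the collector only picks up files matching *.prom, so the .tmp file is ignored while being written):

# write the new sample to a temp file, then atomically swap it in
cat <<EOF > ./text_file_dir/test.prom.tmp
# HELP nyy_test_metric just test
# TYPE nyy_test_metric gauge
nyy_test_metric{method="post",code="200"} 1028
EOF
mv ./text_file_dir/test.prom.tmp ./text_file_dir/test.prom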

Filtering metrics by collector via HTTP query parameters

  • How it works: the handler reads the collect[] HTTP query parameters and builds a filtered set of collectors on the fly
func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    filters := r.URL.Query()["collect[]"]
    level.Debug(h.logger).Log("msg", "collect query:", "filters", filters)
​
    if len(filters) == 0 {
        // No filters, use the prepared unfiltered handler.
        h.unfilteredHandler.ServeHTTP(w, r)
        return
    }
    // To serve filtered metrics, we create a filtering handler on the fly.
    filteredHandler, err := h.innerHandler(filters...)
    if err != nil {
        level.Warn(h.logger).Log("msg", "Couldn't create filtered metrics handler:", "err", err)
        w.WriteHeader(http.StatusBadRequest)
        w.Write([]byte(fmt.Sprintf("Couldn't create filtered metrics handler: %s", err)))
        return
    }
    filteredHandler.ServeHTTP(w, r)
}
  • HTTP access:

# metrics from the cpu collector only
http://192.168.0.112:9100/metrics?collect[]=cpu

# metrics from the cpu and meminfo collectors only
http://192.168.0.112:9100/metrics?collect[]=cpu&collect[]=meminfo
  • Prometheus configuration (completed here into a minimal scrape_configs sketch; the job_name and target are illustrative):

scrape_configs:
  - job_name: "node_exporter"
    params:
      collect[]:
        - cpu
        - meminfo
    static_configs:
      - targets: ["192.168.0.112:9100"]
  • Difference from Prometheus relabel_config: collect[] filters by collector, whereas relabel rules filter by metric_name or label

Importing a node_exporter template from the Grafana dashboard marketplace

(Figure: the Grafana dashboard marketplace)

(Figure: a Grafana node_exporter dashboard)


Querying data on the Prometheus graph page

node_cpu_seconds_total{mode="user"}
node_cpu_seconds_total{cpu="0", instance="172.20.70.205:9100", job="prometheus", mode="user"}
53.43
node_cpu_seconds_total{cpu="0", instance="172.20.70.215:9100", job="prometheus", mode="user"}
8.17
node_cpu_seconds_total{cpu="1", instance="172.20.70.205:9100", job="prometheus", mode="user"}
28.96
node_cpu_seconds_total{cpu="1", instance="172.20.70.215:9100", job="prometheus", mode="user"}
12.32
node_cpu_seconds_total{cpu="2", instance="172.20.70.205:9100", job="prometheus", mode="user"}
31.54
node_cpu_seconds_total{cpu="2", instance="172.20.70.215:9100", job="prometheus", mode="user"}
8.32
node_cpu_seconds_total{cpu="3", instance="172.20.70.205:9100", job="prometheus", mode="user"}
53.88
node_cpu_seconds_total{cpu="3", instance="172.20.70.215:9100", job="prometheus", mode="user"}
6.38

Prometheus queries and data concepts

Prometheus basics

sample: a data point

type sample struct {
    t int64
    v float64
}
  • A sample represents a single data point
  • Size: 16 bytes, one 8-byte int64 timestamp plus one 8-byte float64 value

(Figure: sample)

Label

type Label struct {
    Name, Value string
}
  • A single label pair, e.g. cpu="0" or mode="user"

(Figure: label)

Labels: the label set

type Labels []Label
  • Simply all of a metric's label (tag) values

The four Prometheus query types

  1. Instant vector: a set of time series, each containing a single sample, with all samples sharing the same timestamp

    On the Prometheus UI this is the Table view; it corresponds to the /api/v1/query endpoint (a curl sketch contrasting the two vector types follows this list)

(Figure: instant vector)

    Vector

    type Vector []Sample
    
    • Vector is an alias for a slice of samples that all share the same timestamp; it is the usual result of an instant query
  2. Range vector: a set of time series, each containing a range of data points over time

    On the Prometheus UI this is the Graph view; it corresponds to the /api/v1/query_range endpoint

    Matrix

    type Matrix []Series
    
    • Matrix is a slice of Series; it is the usual result of a range query

(Figure: range vector)

  3. Scalar: a simple numeric floating-point value

  4. String: a simple string value; currently unused
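
A minimal curl sketch contrasting the two vector result types, assuming a Prometheus server on localhost:9090 that scrapes node_exporter (host, port, and timestamps are illustrative):

# instant query -> resultType "vector": one sample per matching series
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=node_load1'

# range query -> resultType "matrix": a list of samples per matching series
curl -s 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=node_load1' \
  --data-urlencode 'start=2021-03-29T07:00:00Z' \
  --data-urlencode 'end=2021-03-29T07:05:00Z' \
  --data-urlencode 'step=30s'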

The four label matching operators

  1. = equality match

    • Query user-mode CPU seconds on core 0: node_cpu_seconds_total{mode="user",cpu="0"}
  2. != negative match

    • Query bytes received on every NIC except lo: node_network_receive_bytes_total{device!="lo"}
  3. =~ regex match

    • Query available bytes on filesystems mounted under /run: node_filesystem_avail_bytes{mountpoint=~"^/run.*"}
  4. !~ regex non-match

    • Query bytes read from block devices whose name does not contain vda: node_disk_read_bytes_total{device!~".*vda.*"}
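
You can also test any of these matchers without running a full query by asking the series metadata API which series they select; a sketch, again assuming Prometheus on localhost:9090:

# list the series selected by a matcher expression
curl -s 'http://localhost:9090/api/v1/series' \
  --data-urlencode 'match[]=node_network_receive_bytes_total{device!="lo"}'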

The four metric types

  • gauge: an instantaneous value that can go up or down
node_memory_MemFree_bytes
  • counter: a cumulative metric representing a monotonically increasing count; its value can only increase, or reset to zero on restart. For example, use a counter for the number of requests served, tasks completed, or errors.
http_requests_total
  • histogram: samples observations (usually things like request durations or response sizes) and counts them in configurable buckets; it also provides a sum of all observed values
# average latency across all HTTP handlers: sum/count gives the mean
prometheus_http_request_duration_seconds_sum / prometheus_http_request_duration_seconds_count

# overall 95th-percentile latency across all handlers
histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket[1m])) by (le))

# per-handler variant:
# histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket[5m])) by (le,handler))

# 95th-percentile latency of the range_query endpoint
histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket{handler="/api/v1/query_range"}[5m])) by (le))
  • summary: samples observations (usually things like request durations and response sizes); while it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window
# GC pause durations
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0.000135743
go_gc_duration_seconds{quantile="0.25"} 0.000872805
go_gc_duration_seconds{quantile="0.5"} 0.000965516
go_gc_duration_seconds{quantile="0.75"} 0.001055636
go_gc_duration_seconds{quantile="1"} 0.006464756# summary 平均值
go_gc_duration_seconds_sum /go_gc_duration_seconds_count

Range Vector Selectors

  • Range vectors work like instant vectors, except that they select a range of samples back from the current instant. Syntactically, a duration is appended in square brackets ([]) at the end of a vector selector to specify how far back in time values should be fetched for each resulting range vector element.
  • rate()-style functions over range vectors should only be applied to counters

Duration units

ms - milliseconds
s - seconds
m - minutes
h - hours
d - days (a day is always assumed to be 24h)
w - weeks (a week is always assumed to be 7d)
y - years (a year is always assumed to be 365d)

Querying a range vector directly returns an error: node_network_receive_bytes_total{device!="lo"}[1m]

Error executing query: invalid expression type "range vector" for range query, must be Scalar or instant Vector

You need to apply a function that consumes a range vector, such as rate, irate, delta, idelta, sum_over_time, etc.

  • Compute inbound network throughput: rate(node_network_receive_bytes_total{device!="lo"}[1m])

The time range must not be shorter than the scrape interval

  • With a 30-second scrape interval, a 10-second window returns no data, because rate() needs at least two samples inside the window (see the curl sketch below)

  • rate(node_network_receive_bytes_total{device!="lo"}[10s])
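
A sketch of this failure mode via the HTTP API, assuming a 30s scrape_interval and a Prometheus server on localhost:9090 (host and port are assumptions):

# empty result: the 10s window holds fewer than the two samples rate() needs
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=rate(node_network_receive_bytes_total{device!="lo"}[10s])'
# => {"status":"success","data":{"resultType":"vector","result":[]}}

# widening the window so it covers at least two scrapes returns data again
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=rate(node_network_receive_bytes_total{device!="lo"}[1m])'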