Prometheus 部署

1,417 阅读8分钟

准备环境

部署软件及环境

设置主机名

hostnamectl set-hostname prome-master01

设置时区

timedatectl

[root@prometheus_master01 ~]# timedatectl 
      Local time:  2021-03-27 22:39:41 CST
  Universal time:  2021-03-27 14:39:41 UTC
        RTC time:  2021-03-27 14:39:41
       Time zone: Asia/Shanghai (CST, +0800)
     NTP enabled: yes
NTP synchronized: yes
 RTC in local TZ: no
      DST active: n/a

timedatectl set-timezone Asia/Shanghai

关闭防火墙和selinux

systemctl stop firewalld
systemctl disable firewalld
systemctl status firewalld
​
setenforce 0
sed -i '/^SELINUX/s/enforcing/disabled/' /etc/selinux/config
getenforce

关闭sshd dns反解

sed -i 's/^#UseDNS yes/UseDNS no/'  /etc/ssh/sshd_config
systemctl restart sshd 

搭建阶段

设置国内yum源

mkdir /tmp/yum_repo_bk
/bin/mv -f /etc/yum.repos.d/* /tmp/yum_repo_bk
​
# 阿里云源
wget -O /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
​
# epel源
wget  -O /etc/yum.repos.d/epel-7.repo https://mirrors.aliyun.com/repo/epel-7.repo
​
yum makecache

安装必备工具

# rzsz 
yum -y install lrzsz  yum-utils

准备数据目录等

# 安装包目录
mkdir -pv /opt/tgzs
# 程序目录
mkdir -pv /opt/app
​

设置history文件 ulimit

cat <<EOF >> /etc/profile
export HISTFILESIZE=
export HISTSIZE=
EOF
source /etc/profile

下载prometheus 组件最新版本包

# 地址
# https://github.com/prometheus/prometheus/releases/tag/v2.25.2
# prometheus 
wget -O /opt/tgzs/prometheus-2.25.2.linux-amd64.tar.gz  https://github.com/prometheus/prometheus/releases/download/v2.25.2/prometheus-2.25.2.linux-amd64.tar.gz
​
# node_exporter
wget -O /opt/tgzs/node_exporter-1.1.2.linux-amd64.tar.gz https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz
​
# alertmanager
wget -O /opt/tgzs/alertmanager-0.21.0.linux-amd64.tar.gz https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz
​
# pushgateway
wget -O  /opt/tgzs/pushgateway-1.4.0.linux-amd64.tar.gz https://github.com/prometheus/pushgateway/releases/download/v1.4.0/pushgateway-1.4.0.linux-amd64.tar.gz
​
# process-exporter
wget -O  /opt/tgzs/process-exporter-0.7.5.linux-amd64.tar.gz https://github.com/ncabatoff/process-exporter/releases/download/v0.7.5/process-exporter-0.7.5.linux-amd64.tar.gz
​
# blackbox_exporter
wget -O  /opt/tgzs/blackbox_exporter-0.18.0.linux-amd64.tar.gz https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
​
# redis_exporter
wget -O  /opt/tgzs/redis_exporter-v1.20.0.linux-amd64.tar.gz https://github.com/oliver006/redis_exporter/releases/download/v1.20.0/redis_exporter-v1.20.0.linux-amd64.tar.gz
​
# mysql_exporter
wget -O  /opt/tgzs/mysqld_exporter-0.12.1.linux-amd64.tar.gz https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz

安装mysql及配置

# 下载mysql源安装包
wget http://dev.mysql.com/get/mysql57-community-release-el7-8.noarch.rpm
​
# 安装mysql源
yum localinstall mysql57-community-release-el7-8.noarch.rpm -y 
​
# 检查mysql源是否安装成功
yum repolist enabled | grep "mysql.*-community.*"

# 安装MySQL
yum install mysql-community-server -y 
​
​
# 3、启动MySQL服务
systemctl start mysqld
​
# 查看MySQL的启动状态
systemctl status mysqld
​
#4、开机启动
systemctl enable mysqld
systemctl daemon-reload
​
# 5、修改root本地登录密码
# mysql安装完成之后,在/var/log/mysqld.log文件中给root生成了一个默认密码。通过下面的方式找到root默认密码,然后登录mysql进行修改:
grep 'temporary password' /var/log/mysqld.log
mysql -uroot -p
​
# mysql5.7默认安装了密码安全检查插件(validate_password),默认密码检查策略要求密码必须包含:大小写字母、数字和特殊符号,
# 并且长度不能少于8位。否则会提示ERROR 1819 (HY000): Your password does not satisfy the current policy requirements错误
# 如果不需要密码策略,在my.cnf文件中添加如下配置禁用即可:
# 配置默认编码为utf8
# 关闭客户端dns反解
echo -e "validate_password = off\ncharacter_set_server=utf8\ninit_connect='SET NAMES utf8'\nskip-name-resolve\n" >> /etc/my.cnf 
systemctl restart mysqld 
​
mysql -uroot -p 
​
​
## 授权
alter user 'root'@'localhost' identified by '123123';
​
grant all privileges on *.* to root@'%' identified by '123123' with grant option;
flush privileges;
​

安装redis及配置

yum -y install redis 

编译安装redis-6.2.1

yum install -y gcc gcc-c++ tcl
​
wget  -O /opt/tgzs/redis-6.2.1.tar.gz https://download.redis.io/releases/redis-6.2.1.tar.gz
cd /opt/tgzs/
​
​
#解压redis
tar xf redis-6.2.1.tar.gz
#进入解压后的目录
cd redis-6.2.1
#分配器allocator:如果设置了 MALLOC 这个环境变量,编译 Redis 时会使用该变量指定的分配器。
#默认的分配器并不是 libc,而是 jemalloc,因为 jemalloc 已被证明比 libc 有更少的内存碎片问题(fragmentation problems)。
#但是如果系统中没有 jemalloc 而只有 libc,make 就会出错。所以加上这样一个参数,运行如下命令:
​
make MALLOC=libc -j 20
​
#编译
# make -j 20
#创建redis安装目录,主要用于存放redis所需bin文件
mkdir -p /usr/local/redis
#安装redis并指定安装目录
make PREFIX=/usr/local/redis/ install
​
# 设置path
vim /etc/profile
export PATH=$PATH:/usr/local/redis/bin
source /etc/profile
​
#去除默认配置文件中的注释和空行,生成精简的示例配置文件(稍后复制到/etc)
egrep -v  "^$|#" redis.conf   > redis_sample.conf
#修改配置文件监听IP为0.0.0.0,否则只能本地登录
sed -i s/bind\ 127.0.0.1/bind\ 0.0.0.0/g redis_sample.conf
#修改运行方式为后台运行
sed -i s/daemonize\ no/daemonize\ yes/g redis_sample.conf
​
​
/bin/cp -f redis_sample.conf /etc/redis_6379.conf 
/bin/cp -f redis_sample.conf /etc/redis_6479.conf 
​
#设置日志文件路径
sed -i s@logfile\ ""@logfile\ "/opt/logs/redis_6379.log"@g /etc/redis_6379.conf 
sed -i s@logfile\ ""@logfile\ "/opt/logs/redis_6479.log"@g /etc/redis_6479.conf 
​
#设置数据目录
sed -i s@dir\ ./@dir\ /var/lib/redis_6379@g /etc/redis_6379.conf 
sed -i s@dir\ ./@dir\ /var/lib/redis_6479@g /etc/redis_6479.conf 
​
# 修改port
sed -i 's/port 6379/port 6379/g' /etc/redis_6379.conf 
sed -i 's/port 6379/port 6479/g' /etc/redis_6479.conf 
​
mkdir  /var/lib/redis_6379
mkdir  /var/lib/redis_6479
mkdir  /opt/logs
​
cat <<EOF > /etc/systemd/system/redis_6379.service 
[Unit]
Description=The redis-server Process Manager
After=syslog.target network.target

[Service]
Type=forking
ExecStart=/usr/local/redis/bin/redis-server /etc/redis_6379.conf
#ExecStop=/usr/local/redis/bin/redis-shutdown

[Install]
WantedBy=multi-user.target
EOF

cat <<EOF > /etc/systemd/system/redis_6479.service 
[Unit]
Description=The redis-server Process Manager
After=syslog.target network.target

[Service]
Type=forking
ExecStart=/usr/local/redis/bin/redis-server /etc/redis_6479.conf
#ExecStop=/usr/local/redis/bin/redis-shutdown

[Install]
WantedBy=multi-user.target
EOF
​
systemctl daemon-reload
#设置redis开机自启
systemctl enable redis_6379
systemctl enable redis_6479
#启动redis
systemctl restart redis_6379
systemctl restart redis_6479
#查看redis状态
systemctl status redis_6379
systemctl status redis_6479

安装grafana及配置

rpm 安装grafana 7

# 地址 https://grafana.com/grafana/download
wget -O /opt/tgzs/grafana-7.5.1-1.x86_64.rpm https://dl.grafana.com/oss/release/grafana-7.5.1-1.x86_64.rpm
sudo yum install grafana-7.5.1-1.x86_64.rpm
​

mysql中创建数据库

CREATE DATABASE IF NOT EXISTS grafana DEFAULT CHARSET utf8 COLLATE utf8_general_ci;

修改配置文件 填写mysql路径等

vim /etc/grafana/grafana.ini
​
##
type: 改成mysql,不用sqlite3
name: grafana
user: root
密码:123123

启动服务

systemctl start grafana-server
systemctl enable grafana-server
systemctl status grafana-server

查看日志 有无报错

tail -f /var/log/grafana/grafana.log

笔记本设置hosts硬解析(在hosts文件中添加域名映射)

# windows 
C:\Windows\System32\drivers\etc\hosts
192.168.0.112 grafana.prome.me
​

笔记本浏览器访问

http://grafana.prome.me:3000/?orgId=1
默认 用户密码 :admin/admin

谷歌浏览器不能编辑问题

安装Ansible并批量安装node_exporter

节点主机名写入hosts

echo "192.168.0.112   prome-master01" >> /etc/hosts
echo "192.168.0.113   prome-node01" >> /etc/hosts

master上生成ssh key 并拷贝到node上

ssh-keygen
ssh-copy-id prome-node01
ssh-copy-id prome-master01
# 测试ssh联通
​
ssh prome-node01
​

master 上安装ansible

yum install -y ansible
​
# 关闭hostcheck 
vim /etc/ansible/ansible.cfg
​
ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no

playbook执行时需要设置机器文件

cat <<EOF > /opt/tgzs/host_file
prome-master01
prome-node01
EOF
# ansible -i host_file all -m ping  测试是否连通

设置syslog 和logrotate服务

ansible-playbook -i host_file init_syslog_logrotate.yaml

编写ansible 发布服务脚本

ansible-playbook -i host_file  service_deploy.yaml  -e "tgz=node_exporter-1.1.2.linux-amd64.tar.gz" -e "app=node_exporter"

检查node_exporter服务状态

ansible -i host_file all -m shell -a " ps -ef |grep node_exporter|grep -v grep "
​

浏览器访问 9100/metrics

node01.prome.me:9100/metrics
master01.prome.me:9100/metrics

安装prometheus并配置

使用ansible部署prometheus

ansible-playbook -i host_file  service_deploy.yaml  -e "tgz=prometheus-2.25.2.linux-amd64.tar.gz" -e "app=prometheus"

查看页面

http://master01.prome.me:9090/
​

prometheus配置文件 解析

# 全局配置段
global:
  # 采集间隔 
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  # 计算报警和预聚合间隔
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # 采集超时时间
  scrape_timeout: 10s 
  # 查询日志,包含各阶段耗时统计
  query_log_file: /opt/logs/prometheus_query_log
  # 全局标签组
  # 通过本实例采集的数据都会叠加下面的标签
  external_labels:
    account: 'huawei-main'
    region: 'beijng-01'


# Alertmanager信息段
alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "localhost:9093"

# 告警、预聚合配置文件段
rule_files:
    - /etc/prometheus/rules/record.yml
    - /etc/prometheus/rules/alert.yml

# 采集配置段
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']

# 远程查询段
remote_read:
  # prometheus 
  - url: http://prometheus/v1/read
    read_recent: true

  # m3db 
  - url: "http://m3coordinator-read:7201/api/v1/prom/remote/read"
    read_recent: true

# 远程写入段
remote_write:
  - url: "http://m3coordinator-write:7201/api/v1/prom/remote/write"
    queue_config:
      capacity: 10000
      max_samples_per_send: 60000
    write_relabel_configs:
      - source_labels: [__name__]
        separator: ;
        # 标签key前缀匹配到的drop
        regex: '(kubelet_|apiserver_|container_fs_).*'
        replacement: $1
        action: drop
  • 所以prometheus实例可以用来做下列用途

| 对应的配置段 | 用途 |
| --- | --- |
| 采集配置段 | 做采集器,数据保存在本地 |
| 采集配置段 + 远程写入段 | 做采集器+传输器,数据保存在本地+远端存储 |
| 远程查询段 | 做查询器,查询远端存储数据 |
| 采集配置段 + 远程查询段 | 做采集器+查询器,查询本地数据+远端存储数据 |
| 采集配置段 + Alertmanager信息段 + 告警配置文件段 | 做采集器+告警触发器,查询本地数据生成报警发往Alertmanager |
| 远程查询段 + Alertmanager信息段 + 告警配置文件段 | 做远程告警触发器,查询远端数据生成报警发往Alertmanager |
| 远程查询段 + 远程写入段 + 预聚合配置文件段 | 做预聚合指标,生成的结果集指标写入远端存储 |

准备prometheus配置文件,配置采集两个node_exporter

global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
alerting:
  alertmanagers:
  - scheme: http
    timeout: 10s
    api_version: v1
    static_configs:
    - targets: []
scrape_configs:
- job_name: prometheus
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - 192.168.26.112:9100
    - 192.168.26.113:9100

热更新配置文件

# 需要prometheus启动时在命令行加上 --web.enable-lifecycle,才能使用下面的reload接口
curl -X POST http://localhost:9090/-/reload 
​

页面查看targets up情况

解说targets页面

  • job 分组情况
  • endpoint 实例地址
  • state 采集是否成功
  • label 标签组
  • Last Scrape 上次采集到现在的间隔时间
  • Scrape Duration 上次采集耗时
  • Error 采集错误

通过api获取targets 详情

  • 运行008_get_targets_from_prome.py
状态:正常 num:1/2 endpoint:http://172.20.70.205:9100/metrics state:up labels:{'instance': '192.168.26.112:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:20:04.304025213+08:00 lastScrapeDuration:0.011969003 lastError:
状态:正常 num:2/2 endpoint:http://172.20.70.215:9100/metrics state:up labels:{'instance': '192.168.26.113:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:20:06.845862504+08:00 lastScrapeDuration:0.012705335 lastError:
​
  • 随便填几个错误的target测试一下 比如 abc:9100
状态:异常 num:1/3 endpoint:http://abc:9100/metrics state:down labels:{'instance': 'abc:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:24:08.365229831+08:00 lastScrapeDuration:0.487732313 lastError:Get "http://abc:9100/metrics": dial tcp: lookup abc on 114.114.114.114:53: no such host
状态:正常 num:2/3 endpoint:http://192.168.26.112:9100/metrics state:up labels:{'instance': '192.168.26.112:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:24:19.304044469+08:00 lastScrapeDuration:0.012483866 lastError:
状态:正常 num:3/3 endpoint:http://192.168.26.113:9100/metrics state:up labels:{'instance': '192.168.26.113:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:24:21.845860017+08:00 lastScrapeDuration:0.010381262 lastError:
​
​
  • 可以用来算target采集成功率
  • up metrics
采集prometheus自身的指标
    - 192.168.26.112:9090
    - 192.168.26.113:9090