Prepare the environment
Software and environment to deploy
- Download page: prometheus.io/download/
- redis, mysql, grafana, ansible, exporters, prometheus
Set the hostname
hostnamectl set-hostname prome-master01
Set the timezone
timedatectl
[root@prometheus_master01 ~]# timedatectl
Local time: Sat 2021-03-27 22:39:41 CST
Universal time: Sat 2021-03-27 14:39:41 UTC
RTC time: Sat 2021-03-27 14:39:41
Time zone: Asia/Shanghai (CST, +0800)
NTP enabled: yes
NTP synchronized: yes
RTC in local TZ: no
DST active: n/a
timedatectl set-timezone Asia/Shanghai
Disable the firewall and SELinux
systemctl stop firewalld
systemctl disable firewalld
systemctl status firewalld
setenforce 0
sed -i '/^SELINUX/s/enforcing/disabled/' /etc/selinux/config
getenforce
Disable sshd DNS reverse lookups
sed -i 's/^#UseDNS yes/UseDNS no/' /etc/ssh/sshd_config
systemctl restart sshd
Setup phase
Configure China-local yum mirrors
mkdir /tmp/yum_repo_bk
/bin/mv -f /etc/yum.repos.d/* /tmp/yum_repo_bk
# Aliyun base repo
wget -O /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
# EPEL repo
wget -O /etc/yum.repos.d/epel-7.repo https://mirrors.aliyun.com/repo/epel-7.repo
yum makecache
Install essential tools
# lrzsz (rz/sz file transfer) and yum-utils
yum -y install lrzsz yum-utils
Prepare data directories
# directory for downloaded packages
mkdir -pv /opt/tgzs
# directory for application binaries
mkdir -pv /opt/app
Set shell history limits and ulimit
cat <<EOF >> /etc/profile
export HISTFILESIZE=
export HISTSIZE=
EOF
source /etc/profile
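Setting HISTFILESIZE and HISTSIZE to empty values is a common way to lift bash's history limits. The heading also mentions ulimit, which the commands above don't touch; a minimal sketch of raising the open-file limit (the file and values below are assumptions, adjust to your environment):
cat <<EOF >> /etc/security/limits.conf
# assumed values: raise the open-file limit for all users
* soft nofile 65535
* hard nofile 65535
EOF
ulimit -n 65535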
Download the latest Prometheus component packages
# release page
# https://github.com/prometheus/prometheus/releases/tag/v2.25.2
# prometheus
wget -O /opt/tgzs/prometheus-2.25.2.linux-amd64.tar.gz https://github.com/prometheus/prometheus/releases/download/v2.25.2/prometheus-2.25.2.linux-amd64.tar.gz
# node_exporter
wget -O /opt/tgzs/node_exporter-1.1.2.linux-amd64.tar.gz https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz
# alertmanager
wget -O /opt/tgzs/alertmanager-0.21.0.linux-amd64.tar.gz https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz
# pushgateway
wget -O /opt/tgzs/pushgateway-1.4.0.linux-amd64.tar.gz https://github.com/prometheus/pushgateway/releases/download/v1.4.0/pushgateway-1.4.0.linux-amd64.tar.gz
# process-exporter
wget -O /opt/tgzs/process-exporter-0.7.5.linux-amd64.tar.gz https://github.com/ncabatoff/process-exporter/releases/download/v0.7.5/process-exporter-0.7.5.linux-amd64.tar.gz
# blackbox_exporter
wget -O /opt/tgzs/blackbox_exporter-0.18.0.linux-amd64.tar.gz https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
# redis_exporter
wget -O /opt/tgzs/redis_exporter-v1.20.0.linux-amd64.tar.gz https://github.com/oliver006/redis_exporter/releases/download/v1.20.0/redis_exporter-v1.20.0.linux-amd64.tar.gz
# mysql_exporter
wget -O /opt/tgzs/mysqld_exporter-0.12.1.linux-amd64.tar.gz https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz
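A quick sanity check that every tarball landed in /opt/tgzs:
ls -lh /opt/tgzs/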
Install and configure MySQL
# download the MySQL yum repo rpm
wget http://dev.mysql.com/get/mysql57-community-release-el7-8.noarch.rpm
# install the MySQL repo
yum localinstall mysql57-community-release-el7-8.noarch.rpm -y
# check that the MySQL repo was installed successfully
yum repolist enabled | grep "mysql.*-community.*"
# install MySQL
yum install mysql-community-server -y
# start the MySQL service
systemctl start mysqld
# check MySQL's status
systemctl status mysqld
# enable start on boot
systemctl enable mysqld
systemctl daemon-reload
# change the root password for local login
# After installation, MySQL writes a temporary root password to /var/log/mysqld.log. Find it as shown below, then log in and change it:
grep 'temporary password' /var/log/mysqld.log
mysql -uroot -p
# MySQL 5.7 installs the validate_password plugin by default. Its policy requires upper- and lower-case
# letters, digits and special characters, with a minimum length of 8; otherwise you get
# ERROR 1819 (HY000): Your password does not satisfy the current policy requirements.
# If you don't need the password policy, disable it by adding the configuration below to my.cnf.
# Also set the default character set to utf8
# and disable client DNS reverse lookups
echo -e "validate_password = off\ncharacter_set_server=utf8\ninit_connect='SET NAMES utf8'\nskip-name-resolve\n" >> /etc/my.cnf
systemctl restart mysqld
mysql -uroot -p
## grant privileges
alter user 'root'@'localhost' identified by '123123';
grant all privileges on *.* to root@'%' identified by '123123' with grant option;
flush privileges;
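To confirm the new password and the remote grant took effect, log back in non-interactively (password as set above) and list the root accounts:
mysql -uroot -p'123123' -e "select user,host from mysql.user where user='root';"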
Install and configure Redis
# quick alternative: install the distro package from yum
# yum -y install redis
Build and install redis-6.2.1 from source (used below)
# build dependencies
yum install -y gcc gcc-c++ tcl
wget -O /opt/tgzs/redis-6.2.1.tar.gz https://download.redis.io/releases/redis-6.2.1.tar.gz
cd /opt/tgzs/
# unpack redis
tar xf redis-6.2.1.tar.gz
# enter the unpacked directory
cd redis-6.2.1
# Redis chooses its memory allocator at build time via the MALLOC variable. On Linux the default
# is jemalloc, which has proven to have fewer fragmentation problems than libc, but if jemalloc is
# not available the build fails, so force libc explicitly:
make MALLOC=libc -j 20
# plain build with the default allocator:
# make -j 20
# create the redis install directory, mainly for the redis binaries
mkdir -p /usr/local/redis
# install redis into the specified directory
make PREFIX=/usr/local/redis/ install
# add redis to PATH: append the export line below to /etc/profile
vim /etc/profile
export PATH=$PATH:/usr/local/redis/bin
source /etc/profile
# generate a trimmed config (drop blank and comment lines) from the default redis.conf
egrep -v "^$|#" redis.conf > redis_sample.conf
# listen on 0.0.0.0 instead of 127.0.0.1, otherwise only local logins are possible
sed -i 's/bind 127.0.0.1/bind 0.0.0.0/g' redis_sample.conf
# run as a daemon in the background
sed -i 's/daemonize no/daemonize yes/g' redis_sample.conf
# copy the trimmed config to /etc for the two instances
/bin/cp -f redis_sample.conf /etc/redis_6379.conf
/bin/cp -f redis_sample.conf /etc/redis_6479.conf
# set the log file paths
sed -i 's@logfile ""@logfile "/opt/logs/redis_6379.log"@g' /etc/redis_6379.conf
sed -i 's@logfile ""@logfile "/opt/logs/redis_6479.log"@g' /etc/redis_6479.conf
# set the data directories
sed -i 's@dir ./@dir /var/lib/redis_6379@g' /etc/redis_6379.conf
sed -i 's@dir ./@dir /var/lib/redis_6479@g' /etc/redis_6479.conf
# set the ports (the first instance keeps the default 6379, the second uses 6479)
sed -i 's/port 6379/port 6379/g' /etc/redis_6379.conf
sed -i 's/port 6379/port 6479/g' /etc/redis_6479.conf
mkdir /var/lib/redis_6379
mkdir /var/lib/redis_6479
mkdir /opt/logs
cat <<EOF > /etc/systemd/system/redis_6379.service
[Unit]
Description=The redis-server Process Manager
After=syslog.target network.target
[Service]
Type=forking
ExecStart=/usr/local/redis/bin/redis-server /etc/redis_6379.conf
#ExecStop=/usr/local/redis/bin/redis-shutdown
[Install]
WantedBy=multi-user.target
EOF
cat <<EOF > /etc/systemd/system/redis_6479.service
[Unit]
Description=The redis-server Process Manager
After=syslog.target network.target
[Service]
Type=forking
ExecStart=/usr/local/redis/bin/redis-server /etc/redis_6479.conf
#ExecStop=/usr/local/redis/bin/redis-shutdown
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
# enable the redis instances on boot
systemctl enable redis_6379
systemctl enable redis_6479
# start the redis instances
systemctl restart redis_6379
systemctl restart redis_6479
# check the redis instances' status
systemctl status redis_6379
systemctl status redis_6479
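Quick check that both instances respond (redis-cli lives in /usr/local/redis/bin, which was added to PATH above):
redis-cli -p 6379 ping
redis-cli -p 6479 ping
# both should reply PONG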
Install and configure Grafana
Install Grafana 7 from the rpm package
# download page: https://grafana.com/grafana/download
wget -O /opt/tgzs/grafana-7.5.1-1.x86_64.rpm https://dl.grafana.com/oss/release/grafana-7.5.1-1.x86_64.rpm
yum install -y /opt/tgzs/grafana-7.5.1-1.x86_64.rpm
Create the Grafana database in MySQL
CREATE DATABASE IF NOT EXISTS grafana DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
Edit the configuration file and fill in the MySQL connection settings
vim /etc/grafana/grafana.ini
## in the [database] section:
type: change to mysql instead of the default sqlite3
name: grafana
user: root
password: 123123
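For reference, a minimal sketch of the resulting [database] section in /etc/grafana/grafana.ini; the host value assumes MySQL runs locally on the default port 3306:
[database]
type = mysql
host = 127.0.0.1:3306
name = grafana
user = root
password = 123123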
Start the service
systemctl start grafana-server
systemctl enable grafana-server
systemctl status grafana-server
Check the log for errors
tail -f /var/log/grafana/grafana.log
Set up hosts resolution on your laptop
# Windows
C:\Windows\System32\drivers\etc\hosts
192.168.0.112 grafana.prome.me
Access from the laptop browser
http://grafana.prome.me:3000/?orgId=1
Default username/password: admin/admin
Issue: panels cannot be edited in Chrome
- issue github.com/grafana/gra…
Install Ansible and batch-deploy node_exporter
Write the node hostnames into /etc/hosts
echo "192.168.0.112 prome-master01" >> /etc/hosts
echo "192.168.0.113 prome-node01" >> /etc/hosts
Generate an ssh key on the master and copy it to the nodes
ssh-keygen
ssh-copy-id prome-node01
ssh-copy-id prome-master01
# test ssh connectivity
ssh prome-node01
Install Ansible on the master
yum install -y ansible
# disable ssh host key checking: set ssh_args in /etc/ansible/ansible.cfg as below
vim /etc/ansible/ansible.cfg
ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no
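An equivalent Ansible-level switch, if you prefer it over raw ssh args, is the following setting under [defaults] in ansible.cfg:
host_key_checking = False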
Playbook runs need an inventory (host) file
cat <<EOF > /opt/tgzs/host_file
prome-master01
prome-node01
EOF
# test connectivity: ansible -i host_file all -m ping
Configure the syslog and logrotate services
ansible-playbook -i host_file init_syslog_logrotate.yaml
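The playbook itself is not shown in this section. A rough sketch of what init_syslog_logrotate.yaml might contain, assuming it only makes sure rsyslog and logrotate are installed and rsyslog is running (the real playbook may also drop in custom rsyslog/logrotate rules):
- hosts: all
  become: true
  tasks:
    - name: install rsyslog and logrotate
      yum:
        name:
          - rsyslog
          - logrotate
        state: present
    - name: ensure rsyslog is running and enabled
      systemd:
        name: rsyslog
        state: started
        enabled: yes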
Write the Ansible service-deployment playbook
ansible-playbook -i host_file service_deploy.yaml -e "tgz=node_exporter-1.1.2.linux-amd64.tar.gz" -e "app=node_exporter"
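service_deploy.yaml is not shown here either. A rough sketch of the idea, assuming it unpacks the given tarball under /opt/app and (re)starts a systemd service named after the app; the real playbook presumably also renders the unit file and a version-independent symlink:
- hosts: all
  become: true
  tasks:
    - name: unpack {{ tgz }} from the control node into /opt/app
      unarchive:
        src: /opt/tgzs/{{ tgz }}
        dest: /opt/app/
    - name: restart {{ app }}
      systemd:
        name: "{{ app }}"
        state: restarted
        daemon_reload: yes
        enabled: yes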
Check the node_exporter process status
ansible -i host_file all -m shell -a " ps -ef |grep node_exporter|grep -v grep "
Access :9100/metrics in the browser
node01.prome.me:9100/metrics
master01.prome.me:9100/metrics
Install and configure Prometheus
Deploy Prometheus with Ansible
ansible-playbook -i host_file service_deploy.yaml -e "tgz=prometheus-2.25.2.linux-amd64.tar.gz" -e "app=prometheus"
Open the web UI
http://master01.prome.me:9090/
Prometheus configuration file walkthrough
# Global configuration section
global:
  # scrape interval
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  # interval for evaluating alerting and recording (pre-aggregation) rules
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape timeout
  scrape_timeout: 10s
  # query log, including per-phase timing statistics
  query_log_file: /opt/logs/prometheus_query_log
  # global label set
  # these labels are attached to every series collected by this instance
  external_labels:
    account: 'huawei-main'
    region: 'beijng-01'
# Alertmanager section
alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "localhost:9093"
# alerting / recording (pre-aggregation) rule files section
rule_files:
  - /etc/prometheus/rules/record.yml
  - /etc/prometheus/rules/alert.yml
# scrape configuration section
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['localhost:9090']
# remote read section
remote_read:
  # prometheus
  - url: http://prometheus/v1/read
    read_recent: true
  # m3db
  - url: "http://m3coordinator-read:7201/api/v1/prom/remote/read"
    read_recent: true
# remote write section
remote_write:
  - url: "http://m3coordinator-write:7201/api/v1/prom/remote/write"
    queue_config:
      capacity: 10000
      max_samples_per_send: 60000
    write_relabel_configs:
    - source_labels: [__name__]
      separator: ;
      # drop series whose metric name matches these prefixes
      regex: '(kubelet_|apiserver_|container_fs_).*'
      replacement: $1
      action: drop
- So a Prometheus instance can be used in the following ways (the section names refer to the config blocks above)

Configuration sections used | Purpose |
---|---|
scrape section | collector: data stored locally |
scrape section + remote write section | collector + shipper: data stored locally and in remote storage |
remote read section | querier: query data from remote storage |
scrape section + remote read section | collector + querier: query local data and remote-storage data |
scrape section + Alertmanager section + alerting rule files | collector + alert trigger: query local data, generate alerts and send them to Alertmanager |
remote read section + Alertmanager section + alerting rule files | remote alert trigger: query remote data, generate alerts and send them to Alertmanager |
remote read section + remote write section + recording rule files | pre-aggregation: write recording-rule result series to remote storage |
Prepare a Prometheus configuration that scrapes the two node_exporter instances
global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
alerting:
  alertmanagers:
  - scheme: http
    timeout: 10s
    api_version: v1
    static_configs:
    - targets: []
scrape_configs:
- job_name: prometheus
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - 192.168.26.112:9100
    - 192.168.26.113:9100
Hot-reload the configuration file
# requires Prometheus to be started with --web.enable-lifecycle
curl -X POST http://localhost:9090/-/reload
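The reload endpoint only works if Prometheus was started with the lifecycle flag; a sketch of the start command (the paths are assumptions based on the /opt/app layout used above):
/opt/app/prometheus/prometheus \
  --config.file=/opt/app/prometheus/prometheus.yml \
  --storage.tsdb.path=/opt/app/prometheus/data \
  --web.enable-lifecycle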
Check the targets' up status in the web UI
The targets page explained
- Job: which scrape job the target belongs to
- Endpoint: the instance address
- State: whether the scrape succeeded
- Labels: the label set
- Last Scrape: time since the last scrape
- Scrape Duration: how long the last scrape took
- Error: the scrape error, if any
Get target details via the API
- Run 008_get_targets_from_prome.py (a curl alternative is sketched after this list)
status: normal num:1/2 endpoint:http://172.20.70.205:9100/metrics state:up labels:{'instance': '192.168.26.112:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:20:04.304025213+08:00 lastScrapeDuration:0.011969003 lastError:
status: normal num:2/2 endpoint:http://172.20.70.215:9100/metrics state:up labels:{'instance': '192.168.26.113:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:20:06.845862504+08:00 lastScrapeDuration:0.012705335 lastError:
- Add a few non-existent targets to test, e.g.
abc:9100
status: abnormal num:1/3 endpoint:http://abc:9100/metrics state:down labels:{'instance': 'abc:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:24:08.365229831+08:00 lastScrapeDuration:0.487732313 lastError:Get "http://abc:9100/metrics": dial tcp: lookup abc on 114.114.114.114:53: no such host
status: normal num:2/3 endpoint:http://192.168.26.112:9100/metrics state:up labels:{'instance': '192.168.26.112:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:24:19.304044469+08:00 lastScrapeDuration:0.012483866 lastError:
status: normal num:3/3 endpoint:http://192.168.26.113:9100/metrics state:up labels:{'instance': '192.168.26.113:9100', 'job': 'prometheus'} lastScrape:2021-03-29T18:24:21.845860017+08:00 lastScrapeDuration:0.010381262 lastError:
- This can be used to calculate the target scrape success rate
- See also the up metric
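If you don't want to run the Python script, the same target details are available directly from the Prometheus HTTP API:
curl -s http://localhost:9090/api/v1/targets | python -m json.tool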
Scrape Prometheus's own metrics
- 192.168.26.112:9090
- 192.168.26.113:9090
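A sketch of the extra job to add under scrape_configs, following the same static_configs pattern as above (the job name prometheus_self is an assumption):
- job_name: prometheus_self
  static_configs:
  - targets:
    - 192.168.26.112:9090
    - 192.168.26.113:9090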