一、背景
大数据cdh的元数据库使用的是mysql,版本为5.7.27,目前是单点状态,目标实现主从并且支持HA能力。
由于DBA的mysql PaaS与CDH的不兼容,所以需要手工部署mysql和配置HA。在DBA属于非标mysql,不能接入DMP管理平台
目的: 实现CDH 元数据库mysql的HA能力。
由于CDH只是集群的元数据,数据量不超过10G,该方案只为实现HA能力,不带读写分离。
功能:实现HA,不带读写分离,从库的修复需要手工。
注意:
· 主从mysql启动或者重启后默认只有 read only。
· mysql启动后需要手动: set global read_only =0; 之后才可以写,目的防止从库出现脑裂。
二、架构与方案
复制架构:部署异步复制的1主1从的
HA能力:主从监测脚本+keepalived
说明:
-
主从监测脚本:实现主从状态的监测以及故障是的Failover。
-
keepalived使用VIP的漂移。
-
从库必须设置read only状态,避免发生脑裂。
暂时无法在飞书文档外展示此内容
三、部署mysql
主库
- 部署mysql服务
- 配置my.cnf
cat > /etc/my.cnf << EOF
[mysqld]
basedir = /data/mysql
datadir = /data/mysql/data
socket = /tmp/mysql.sock
pid-file = /data/mysql/data/mysqld.pid
super_read_only = ON
user = mysql
port = 3306
# 二进制日志配置
server-id = 1
log-bin = mysql-bin
binlog-format = row
expire_logs_days = 5
max_binlog_size = 500M
gtid-mode = ON
enforce-gtid-consistency=ON
# 主从复制配置
log-slave-updates = 1
read-only = 0
EOF
- 配置主库mysql的systemd
cat > /etc/systemd/system/mysqld.service << EOF
[Unit]
Description=MySQL 5.7 Database Server
After=network.target
[Service]
User=mysql
Group=mysql
ExecStart=/data/mysql/mysql/bin/mysqld --defaults-file=/etc/my.cnf
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl start mysqld && systemctl enable mysqld
# 配置环境变量
echo 'export PATH="/data/mysql/mysql/bin:$PATH"' >> /etc/profile.d/mysql.sh
source /etc/profile
- 修改root账号
ALTER USER 'root'@'localhost' IDENTIFIED BY '123';
FLUSH PRIVILEGES;
EXIT;
create user 'root'@'%' identified by '123';
grant all on *.* to root with grant option;
flush privileges;
show grants for 'root'@'%';
- 创建业务账号
# 主库上创建复制用的账号
# 创建复制账号
CREATE USER 'replica'@'%' identified WITH mysql_native_password by '1234';
# 复制账号权限配置
GRANT REPLICATION SLAVE ON *.* TO 'replica'@'%';
GRANT REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'replica'@'%';
# 刷新
flush privileges;
grant replication slave on *.* to replica@"%" identified by "1234";
-
创建复制账号
-
查看master信息
从库
- 部署mysql
- 配置my.cnf
cat > /etc/my.cnf << EOF
[mysqld]
basedir = /data/mysql/mysql
datadir = /data/mysql/data
socket = /tmp/mysql.sock
pid-file = /data/mysql/data/mysqld.pid
read_only = ON
user = mysql
port = 3306
# 二进制日志配置
server-id = 2
log-bin = mysql-bin
binlog-format = row
expire_logs_days=5
max_binlog_size = 500M
# 主从复制配置
relay-log=mysql-relay-bin
log-slave-updates=ON
gtid-mode=ON
enforce-gtid-consistency=ON
slave-parallel-type=LOGICAL_CLOCK
slave-parallel-workers=4
slave_preserve_commit_order=on
read-only = 0
EOF
- 配置mysql的systemd
cat > /etc/systemd/system/mysqld.service << EOF
[Unit]
Description=MySQL 5.7 Database Server
After=network.target
[Service]
User=mysql
Group=mysql
ExecStart=/data/mysql/mysql/bin/mysqld --defaults-file=/etc/my.cnf
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl start mysqld && systemctl enable mysqld
# 环境变量
echo 'export PATH="/data/mysql/mysql/bin:$PATH"' >> /etc/profile.d/mysql.sh
source /etc/profile
- 修改root账号
> 使用临时密码登录MySQL,修改 root 密码
ALTER USER 'root'@'localhost' IDENTIFIED BY '123';
FLUSH PRIVILEGES;
EXIT;
- 配置slave信息
# 从库上配置复制信息
# 备库停止slave进程
stop slave;
CHANGE MASTER TO MASTER_HOST='172.21.27.30' ,MASTER_USER='replica' ,MASTER_PASSWORD='1234',master_auto_position=1;
- 启动复制
#
start slave;
# 查看slave状态
show slave status;
- 观察主从复制状态,并设置从库只读
# 将slave 设置为只读
SHOW GLOBAL VARIABLES LIKE '%read_only';
set global super_read_only =1;
-
完成
四、keepalived部署与配置
- 2个db主机安装keepalived
- 配置2个主机的 /etc/keepalived/keepalived.conf
主节点配置
! Configuration File for keepalived
vrrp_script chk_haproxy {
script "/etc/keepalived/check_haproxy.sh"
interval 2
}
#router_id需要根据具体业务命名,以便于不同套作区分,同套keepalived的2台此参数名相同
global_defs {
router_id ks_pqsql
}
vrrp_instance VI_1 {
state BACKUP
#网口名称需要确认是否为ens33,如果不是需要修改为对应网口信息
interface ens33
#virtual_router_id 取值范围为0~255,同vip网段需要确认不能和其他套keepalived共用一个值,同套keepalived的2台此参数名相同
virtual_router_id 245
#priority取值自定义,区别2台keepalived的权重,一台值偏小
priority 100
advert_int 1
nopreempt
#auth_pass需要一套环境配置为相同信息
authentication {
auth_type PASS
auth_pass Yumc1234
}
track_script {
#chk_haproxy
}
#172.xx.xx.xx/xx需要修改为真实vip的网段ip信息,网口名称需要确认是否为ens33,如果不是需要修改为对应网口信息
virtual_ipaddress {
192.168.3.250/24 dev ens33
}
#172.25.89.xx改为实际vip地址,172.25.88.1改为实际网关地址
notify_master "/usr/sbin/arping -I ens33 -c 2 -s 192.168.3.250 192.168.3.1"
}
从节点配置
! Configuration File for keepalived
vrrp_script chk_haproxy {
script "/etc/keepalived/check_haproxy.sh"
interval 2
}
#router_id需要根据具体业务命名,以便于不同套作区分,同套keepalived的2台此参数名相同
global_defs {
router_id ks_pqsql
}
vrrp_instance VI_1 {
state BACKUP
#网口名称需要确认是否为ens33,如果不是需要修改为对应网口信息
interface ens33
#virtual_router_id 取值范围为0~255,同vip网段需要确认不能和其他套keepalived共用一个值,同套keepalived的2台此参数名相同
virtual_router_id 245
#priority取值自定义,区别2台keepalived的权重,一台值偏小
priority 50
advert_int 1
nopreempt
#auth_pass需要一套环境配置为相同信息
authentication {
auth_type PASS
auth_pass Yumc1234
}
track_script {
#chk_haproxy
}
#172.xx.xx.xx/xx需要修改为真实vip的网段ip信息,网口名称需要确认是否为ens33,如果不是需要修改为对应网口信息
virtual_ipaddress {
192.168.3.250/24 dev ens33
}
#172.25.89.xx改为实际vip地址,172.25.88.1改为实际网关地址
notify_master "/usr/sbin/arping -I ens33 -c 2 -s 192.168.3.250/24 192.168.3.1"
}
五、自动主从状态监测与HA能力实现
直接执行 sh check_mysql_master.sh ,可以放Jenkins中执行,执行频率 30s 一次(可以适当缩短间隔时间)。
vim check_mysql_master.sh
#!/bin/bash
# 定义主从库的地址和端口
MASTER_HOST="172.21.27.30"
MASTER_PORT="3306"
SLAVE_HOST="172.21.27.29"
SLAVE_PORT="3306"
time=$(date)
# 定义MySQL用户和密码
MYSQL_USER="root"
MYSQL_PASSWORD="123"
# 定义日志文件
LOG_FILE="/var/log/mysql_failover.log"
# 检查MySQL服务状态
check_mysql_status() {
local host=$1
local port=$2
local status
status=$(ssh $host systemctl is-active mysqld)
if [ "$status" == "active" ]; then
echo "MySQL service on $host is running."
return 0
else
echo "MySQL service on $host is not running."
return 1
fi
}
# 检查从库复制状态
check_slave_status() {
local host=$1
local port=$2
local status
status=$(mysql -h$host -P$port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "SHOW SLAVE STATUS\G" | grep "Slave_IO_Running" | awk '{print $2}')
if [ "$status" == "Yes" ]; then
echo "Slave IO on $host is running."
return 0
else
echo "Slave IO on $host is not running."
return 1
fi
}
# 切换主从
promote_slave_to_master() {
local slave_host=$1
local slave_port=$2
local master_host=$3
local master_port=$4
echo "Promoting $slave_host to master..."
# 停止从库复制
mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "STOP SLAVE;"
# 清除从库的复制信息
mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "RESET SLAVE ALL;"
# 设置从库为只读模式
mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "SET GLOBAL read_only = OFF;"
# 设置从库为新的主库
# mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "CHANGE MASTER TO MASTER_HOST='$master_host', MASTER_PORT=$master_port, MASTER_USER='$MYSQL_USER', MASTER_PASSWORD='$MYSQL_PASSWORD', MASTER_AUTO_POSITION=1;"
echo "$slave_host has been promoted to master."
}
# 主函数
main() {
while true; do
# 检查主库状态
if ! check_mysql_status $MASTER_HOST; then
echo "$time Master MySQL service is down. Checking slave status..."
# 检查从库状态
if check_mysql_status $SLAVE_HOST && check_slave_status $SLAVE_HOST $SLAVE_PORT; then
echo "$time Slave MySQL service is running. Promoting slave to master..."
# 切换主从
promote_slave_to_master $SLAVE_HOST $SLAVE_PORT $MASTER_HOST $MASTER_PORT
# 更新主从地址
MASTER_HOST=$SLAVE_HOST
SLAVE_HOST="172.21.27.30"
echo "$time Failover completed. New master is $MASTER_HOST."
else
echo "$time Slave MySQL service is down. Cannot perform failover."
fi
else
echo "$time Master MySQL service is running."
fi
sleep 30
time=$(date)
done
}
# 执行主函数
main >> $LOG_FILE 2>&1
六、手工切换主从工具
#!/bin/bash
ip a | grep 172
echo ${master_ip}
echo ${slave_ip}
echo ${db_port}
echo ${MYSQL_USER}
echo ${MYSQL_PASSWORD}
# 定义主从库的地址和端口
MASTER_HOST=${master_ip}
MASTER_PORT=${db_port}
SLAVE_HOST=${slave_ip}
SLAVE_PORT=${db_port}
time=$(date)
# 检查MySQL服务状态
check_mysql_status() {
local host=$1
local port=$2
local status
status=$(ssh $host systemctl is-active mysqld)
if [ "$status" == "active" ]; then
echo "MySQL service on $host is running."
return 0
else
echo "MySQL service on $host is not running."
return 1
fi
}
# 检查从库复制状态
check_slave_status() {
local host=$1
local port=$2
local status
status=$(mysql -h$host -P$port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "SHOW SLAVE STATUS\G" | grep "Slave_IO_Running" | awk '{print $2}')
if [ "$status" == "Yes" ]; then
echo "Slave IO on $host is running."
return 0
else
echo "Slave IO on $host is not running."
return 1
fi
}
# 切换主从
promote_slave_to_master() {
local slave_host=$1
local slave_port=$2
local master_host=$3
local master_port=$4
echo "Promoting $slave_host to master..."
# 旧主设置为只读
mysql -h$master_host -P$master_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "SET GLOBAL super_read_only = 1;"
# 停止从库复制
mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "STOP SLAVE;"
# 清除从库的复制信息
mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "RESET SLAVE ALL;"
# 关闭新主的只读
mysql -h$slave_host -P$slave_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "SET GLOBAL read_only = OFF;"
# 设置旧主库为新从库
echo "CHANGE MASTER TO MASTER_HOST='$slave_host', MASTER_USER='replica', MASTER_PASSWORD='1234', MASTER_AUTO_POSITION=1;start slave;"
mysql -h$master_host -P$master_port -u$MYSQL_USER -p$MYSQL_PASSWORD -e "CHANGE MASTER TO MASTER_HOST='$slave_host',MASTER_USER='$MYSQL_USER', MASTER_PASSWORD='$MYSQL_PASSWORD', MASTER_AUTO_POSITION=1;start slave;"
echo "$slave_host has been promoted to master."
}
main() {
check_mysql_status $SLAVE_HOST $SLAVE_PORT
mysql_status=$?
echo mysql_status
check_slave_status $SLAVE_HOST $SLAVE_PORT
slave_status=$?
echo slave_status
if [ $mysql_status -eq 0 ] && [ $slave_status -eq 0 ] ;then
echo "$time slave MySQL service is right... and promote slave to master..."
promote_slave_to_master $SLAVE_HOST $SLAVE_PORT $MASTER_HOST $MASTER_PORT
echo "$time Failover completed. New master is $SLAVE_HOST."
else
echo "$time slave MySQL service is down... do not promote slave to master..."
fi
}
main > ./check_info.log 2>&1
cat ./check_info.log
jenkins工具: http://172.21.141.48:8080/job/MongoDB/job/switch.EC_CDH_mysql.master/
七、监控与告警
监听工具
逻辑:
-
主mysql服务异常,则自动切主从 (频率30S一次)
-
VIP自动切换,有个监测工具,如果VIP所在mysql不是master,则重启VIP所在主机的keepalived服务,由于只是1主1从,所以VIP会从slave漂到master上。 (频率60s一次)
监控屏暂无
八、补充
无