TiDB 生产部署与监控完整指南
一、生产部署详细步骤
第1步:环境准备与检查
1.1 硬件规格要求
# 最低生产配置(9节点)
├── PD节点 ×3 (4核8G, 200G SSD)
├── TiDB节点 ×2 (8核16G, 200G SSD)
├── TiKV节点 ×3 (16核32G, 1TB NVMe SSD) *必须NVMe*
└── 监控节点 ×1 (8核16G, 500G SSD)
# 推荐配置(高可用)
├── PD节点 ×3 (8核16G, 500G SSD)
├── TiDB节点 ×3 (16核32G, 500G SSD)
├── TiKV节点 ×5 (32核64G, 2TB NVMe SSD)
└── 监控节点 ×2 (8核16G, 1TB SSD) *主备*
1.2 系统配置(所有节点执行)
# 1. Disable the firewall and SELinux (expected by TiDB environment checks;
#    in hardened environments open the required ports instead of disabling).
systemctl stop firewalld && systemctl disable firewalld
setenforce 0
# NOTE(review): this sed only rewrites SELINUX=enforcing; a file already set
# to "permissive" is left unchanged — verify /etc/selinux/config afterwards.
sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config
# 2. Kernel parameter tuning (drop-in file, applied by sysctl -p below).
cat > /etc/sysctl.d/tidb.conf << EOF
# 网络优化
net.core.somaxconn = 32768
net.ipv4.tcp_syncookies = 0
net.ipv4.tcp_max_syn_backlog = 65536
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2
# 内存优化
vm.swappiness = 0
vm.dirty_ratio = 20
vm.dirty_background_ratio = 10
vm.overcommit_memory = 1
# 文件系统优化
fs.file-max = 1000000
fs.nr_open = 1000000
EOF
sysctl -p /etc/sysctl.d/tidb.conf
# 3. Raise file-handle and process limits for the tidb OS user.
cat > /etc/security/limits.d/tidb.conf << EOF
tidb soft nofile 1000000
tidb hard nofile 1000000
tidb soft nproc 65535
tidb hard nproc 65535
tidb soft stack 10240
EOF
# 4. Disable transparent hugepages and NUMA. The echo calls take effect
#    immediately; grubby makes the setting persist across reboots.
#    NOTE(review): numa=off disables NUMA entirely — TiDB docs generally
#    recommend numactl core binding rather than kernel-level numa=off; confirm.
echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag
grubby --update-kernel=ALL --args="transparent_hugepage=never numa=off"
# 5. Create the tidb user with passwordless sudo (required by tiup deploy).
groupadd -g 1001 tidb
useradd -m -u 1001 -g tidb -s /bin/bash tidb
echo "tidb ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/tidb
# 6. Clock synchronization (mandatory — TiDB's TSO allocation depends on it).
yum install -y chrony
systemctl enable chronyd && systemctl start chronyd
chronyc sources -v # verify sync status
1.3 磁盘规划
# Example TiKV node disk layout — data on NVMe, logs/deploy dir on SATA.
/dev/nvme0n1 # data disk 1 → /data1/tidb-data
/dev/nvme1n1 # data disk 2 → /data2/tidb-data
/dev/sdb # log disk → /data1/tidb-log
/dev/sdc # deploy directory → /tidb-deploy
# Format and mount the first data disk (repeat per data disk).
# noatime skips the inode update on every read (nodiratime is implied by
# noatime on Linux). NOTE(review): prefer UUID=... in fstab — /dev/nvme*
# device names can change across reboots.
mkfs.ext4 /dev/nvme0n1
mkdir -p /data1/tidb-data
echo "/dev/nvme0n1 /data1/tidb-data ext4 defaults,noatime,nodiratime 0 0" >> /etc/fstab
mount -a
chown -R tidb:tidb /data1/tidb-data
第2步:安装部署工具
2.1 安装TiUP(中控机)
# 1. Install TiUP (run on the control machine).
curl --proto '=https' --tlsv1.2 -sSf https://tiup-mirrors.pingcap.com/install.sh | sh
# 2. Put TiUP on PATH for future shells, then load it into the current one.
echo 'export PATH=$HOME/.tiup/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
# 3. Install the cluster component (downloaded on first invocation).
tiup cluster
# 4. Set the mirror. NOTE(review): this URL is the default official mirror,
#    not a separate in-country accelerator — point at a local mirror if one exists.
tiup mirror set https://tiup-mirrors.pingcap.com
2.2 配置SSH互信
# 1. Generate a key pair on the control machine.
ssh-keygen -t rsa -b 4096 -C "tidb-deploy"
# 2. Copy the public key to every node in the topology.
for host in 10.0.1.{11..13} 10.0.2.{11..12} 10.0.3.{11..13} 10.0.4.11; do
ssh-copy-id -i ~/.ssh/id_rsa.pub tidb@$host
done
# 3. Verify the environment. This is a full pre-deploy check (it also
#    exercises passwordless SSH); --apply additionally auto-fixes issues.
tiup cluster check --apply --user tidb topology.yaml
第3步:编写拓扑配置
3.1 完整拓扑文件(topology.yaml)
global:
user: "tidb"
ssh_port: 22
deploy_dir: "/tidb-deploy"
data_dir: "/tidb-data"
arch: "amd64"
os: "linux"
pd_servers:
- host: 10.0.1.11
name: "pd-1"
client_port: 2379
peer_port: 2380
deploy_dir: "/tidb-deploy/pd-2379"
data_dir: "/tidb-data/pd-2379"
log_dir: "/tidb-deploy/pd-2379/log"
- host: 10.0.1.12
name: "pd-2"
client_port: 2379
peer_port: 2380
- host: 10.0.1.13
name: "pd-3"
client_port: 2379
peer_port: 2380
tidb_servers:
- host: 10.0.2.11
port: 4000
status_port: 10080
deploy_dir: "/tidb-deploy/tidb-4000"
log_dir: "/tidb-deploy/tidb-4000/log"
- host: 10.0.2.12
port: 4000
status_port: 10080
tikv_servers:
- host: 10.0.3.11
port: 20160
status_port: 20180
deploy_dir: "/tidb-deploy/tikv-20160"
data_dir:
- "/data1/tidb-data/tikv-20160"
- "/data2/tidb-data/tikv-20160"
log_dir: "/tidb-deploy/tikv-20160/log"
config:
server.labels:
zone: "zone1"
host: "tikv-1"
- host: 10.0.3.12
port: 20160
status_port: 20180
config:
server.labels:
zone: "zone1"
host: "tikv-2"
- host: 10.0.3.13
port: 20160
status_port: 20180
config:
server.labels:
zone: "zone2"
host: "tikv-3"
monitoring_servers:
- host: 10.0.4.11
port: 9090
deploy_dir: "/tidb-deploy/prometheus-9090"
data_dir: "/tidb-data/prometheus-9090"
log_dir: "/tidb-deploy/prometheus-9090/log"
grafana_servers:
- host: 10.0.4.11
port: 3000
deploy_dir: "/tidb-deploy/grafana-3000"
alertmanager_servers:
- host: 10.0.4.11
web_port: 9093
cluster_port: 9094
deploy_dir: "/tidb-deploy/alertmanager-9093"
data_dir: "/tidb-data/alertmanager-9093"
log_dir: "/tidb-deploy/alertmanager-9093/log"
3.2 高级配置(server_configs)
server_configs:
pd:
schedule.max-merge-region-size: 20
schedule.max-merge-region-keys: 200000
schedule.split-merge-interval: 1h
replication.location-labels: ["zone", "host"]
replication.max-replicas: 3
log.level: "info"
tikv:
readpool.storage.use-unified-pool: true
readpool.coprocessor.use-unified-pool: true
storage.block-cache.capacity: "10GB"
raftstore.apply-pool-size: 4
raftstore.store-pool-size: 4
rocksdb.max-background-jobs: 8
rocksdb.max-sub-compactions: 4
rocksdb.defaultcf.block-cache-size: "6GB"
rocksdb.writecf.block-cache-size: "2GB"
server.grpc-concurrency: 8
tidb:
performance.max-procs: 0
performance.txn-total-size-limit: 10737418240
log.slow-threshold: 300
prepared-plan-cache.enabled: true
prepared-plan-cache.capacity: 100
tikv-client.max-batch-size: 128
第4步:部署集群
4.1 环境检查
# 1. 检查操作系统和硬件
tiup cluster check topology.yaml --user tidb
# 2. 自动修复检查出的问题
tiup cluster check topology.yaml --apply --user tidb
# 3. 手动检查项
# - 时钟同步:chronyc sources
# - 磁盘挂载:df -h
# - 内存检查:free -h
# - CPU检查:lscpu
# - 网络检查:ping/ping6
4.2 执行部署
# 1. 部署集群(指定版本)
tiup cluster deploy tidb-prod v7.5.0 topology.yaml \
--user tidb \
--ssh-timeout 300 \
--wait-timeout 600
# 2. 启动集群
tiup cluster start tidb-prod
# 3. 查看状态
tiup cluster display tidb-prod
# 4. 验证集群
tiup cluster audit tidb-prod
4.3 初始化配置
-- 1. 连接TiDB
mysql -h 10.0.2.11 -P 4000 -u root
-- 2. 修改root密码
ALTER USER 'root'@'%' IDENTIFIED BY 'StrongPass123!';
FLUSH PRIVILEGES;
-- 3. 创建业务数据库和用户
CREATE DATABASE IF NOT EXISTS app_db;
CREATE USER 'app_user'@'10.0.%' IDENTIFIED BY 'AppPass123!';
GRANT ALL PRIVILEGES ON app_db.* TO 'app_user'@'10.0.%';
-- 4. 设置全局参数
SET GLOBAL tidb_mem_quota_query = 34359738368; -- 32GB
SET GLOBAL tidb_enable_1pc = ON; -- 一阶段提交
SET GLOBAL tidb_enable_fast_analyze = ON;
第5步:监控系统部署与配置
5.1 监控架构
                 ┌──查询──→ Grafana (展示)
Prometheus (采集)─┼──触发告警──→ Alertmanager (告警通知)
                 ├──抓取←── Pushgateway (自定义指标)
                 └──抓取←── Blackbox Exporter (网络探测)
5.2 Prometheus配置优化
# 编辑Prometheus配置
tiup cluster edit-config tidb-prod
# 添加以下配置
monitoring_servers:
prometheus:
config:
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_timeout: 10s
# 存储配置
storage:
tsdb:
retention: 30d # 保留30天
path: /tidb-data/prometheus-9090/data
# 告警规则
rule_files:
- /tidb-deploy/prometheus-9090/conf/*.rules.yml
# 抓取目标
scrape_configs:
- job_name: 'tidb'
static_configs:
- targets: ['10.0.2.11:10080', '10.0.2.12:10080']
- job_name: 'tikv'
static_configs:
- targets: ['10.0.3.11:20180', '10.0.3.12:20180', '10.0.3.13:20180']
- job_name: 'pd'
static_configs:
- targets: ['10.0.1.11:2379', '10.0.1.12:2379', '10.0.1.13:2379']
- job_name: 'node'
static_configs:
- targets: ['10.0.1.11:9100', '10.0.1.12:9100', '10.0.1.13:9100',
'10.0.2.11:9100', '10.0.2.12:9100',
'10.0.3.11:9100', '10.0.3.12:9100', '10.0.3.13:9100',
'10.0.4.11:9100']
5.3 Grafana配置
# 1. 导入官方仪表板
# 访问 http://10.0.4.11:3000
# 用户名/密码:admin/admin
# 2. 导入的仪表板ID:
# - TiDB集群概览:12599
# - TiDB详细指标:12600
# - TiKV详细指标:12603
# - PD详细指标:12605
# - Node Exporter:11074
# 3. 配置数据源
# 数据源类型:Prometheus
# URL:http://10.0.4.11:9090
# Access:Server (Default)
# 4. 创建自定义仪表板
# 关键指标面板:
# - QPS/TPS实时监控
# - 查询延迟P95/P99
# - 连接数监控
# - 内存使用率
# - 磁盘IOPS/吞吐量
# - Region分布
# - 慢查询统计
5.4 Alertmanager告警配置
# alertmanager.yml
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'tidb-alert@example.com'
smtp_auth_username: 'alert@example.com'
smtp_auth_password: 'your-password'
smtp_require_tls: true
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'email-alerts'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
continue: true
- match:
severity: warning
receiver: 'warning-alerts'
continue: true
receivers:
- name: 'email-alerts'
email_configs:
- to: 'dba-team@example.com'
send_resolved: true
- name: 'critical-alerts'
email_configs:
- to: 'dba-oncall@example.com'
webhook_configs:
- url: 'http://alert-webhook.example.com/critical'
- name: 'warning-alerts'
email_configs:
- to: 'dba-team@example.com'
slack_configs:
- api_url: 'https://hooks.slack.com/services/xxx'
channel: '#tidb-alerts'
title: 'TiDB告警'
text: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'cluster', 'instance']
5.5 关键告警规则
# critical.rules.yml
groups:
- name: tidb-critical
rules:
# TiDB服务宕机
- alert: TiDBInstanceDown
expr: up{job="tidb"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "TiDB实例 {{ $labels.instance }} 宕机"
description: "TiDB实例 {{ $labels.instance }} 已宕机超过1分钟"
# TiKV存储空间不足
- alert: TiKVDiskFull
expr: (node_filesystem_avail_bytes{mountpoint="/data1"} / node_filesystem_size_bytes{mountpoint="/data1"} * 100) < 10
for: 5m
labels:
severity: critical
annotations:
summary: "TiKV磁盘空间不足 {{ $labels.instance }}"
description: "TiKV实例 {{ $labels.instance }} 磁盘使用率超过90%"
# 高延迟查询
- alert: HighQueryLatency
expr: histogram_quantile(0.99, rate(tidb_server_handle_query_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "查询延迟过高"
description: "P99查询延迟超过1秒"
# Region不健康
- alert: RegionUnhealthy
expr: sum(pd_regions_state{type="miss-peer"}) by (instance) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Region副本缺失"
description: "有Region副本缺失,需要检查"
# 内存不足
- alert: TiDBOutOfMemory
expr: process_resident_memory_bytes{job="tidb"} / 1024 / 1024 / 1024 > 28
for: 2m
labels:
severity: warning
annotations:
summary: "TiDB内存使用过高"
description: "TiDB实例 {{ $labels.instance }} 内存使用超过28GB"
第6步:监控指标详解
6.1 核心监控指标
-- 1. 集群健康状态
SELECT * FROM information_schema.cluster_info;
-- 2. 慢查询监控
SELECT * FROM information_schema.slow_query
WHERE time > NOW() - INTERVAL 1 HOUR
ORDER BY time DESC
LIMIT 10;
-- 3. 锁冲突监控
SELECT * FROM information_schema.deadlocks;
-- 4. 连接数监控
SHOW PROCESSLIST;
SELECT COUNT(*) FROM information_schema.processlist;
-- 5. 存储使用
SELECT
STORE_ID,
CAPACITY,
AVAILABLE,
USED_SIZE
FROM information_schema.tikv_store_status;
6.2 Prometheus关键查询
# 1. QPS监控
sum(rate(tidb_executor_statement_total[1m])) by (type)
# 2. 查询延迟P95/P99
histogram_quantile(0.95, rate(tidb_server_handle_query_duration_seconds_bucket[5m]))
histogram_quantile(0.99, rate(tidb_server_handle_query_duration_seconds_bucket[5m]))
# 3. 连接数
tidb_server_connections
# 4. TiKV存储使用
sum(tikv_store_size_bytes) by (instance)
sum(tikv_region_size_bytes) by (instance)
# 5. PD调度
rate(pd_schedule_operator_total[5m])
# 6. 节点资源
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100
rate(node_cpu_seconds_total{mode="idle"}[5m]) * 100
node_filesystem_avail_bytes / node_filesystem_size_bytes * 100
6.3 自定义监控脚本
#!/bin/bash
# monitor_tidb.sh — one-shot TiDB cluster monitoring report.
# Prints topology status, recent slow queries, per-store Region
# distribution, and local backup freshness to stdout.
# Requires: tiup (cluster component) and the mysql client.

# Connection settings for the TiDB SQL endpoint.
readonly TIDB_HOST="10.0.2.11"
readonly TIDB_PORT="4000"
readonly TIDB_USER="root"
# Export the password instead of passing -p'...' on the command line so it
# is not visible to other users via `ps`. Replace the placeholder before use.
export MYSQL_PWD='密码'

# run_sql <sql> — execute a SQL string against the endpoint above.
run_sql() {
  mysql -h "$TIDB_HOST" -P "$TIDB_PORT" -u "$TIDB_USER" -e "$1"
}

# Cluster status: tiup topology view plus per-instance info from TiDB.
check_cluster_status() {
  echo "=== 集群状态检查 ==="
  tiup cluster display tidb-prod
  echo -e "\n=== 节点状态 ==="
  run_sql "
SELECT INSTANCE, TYPE, STATUS, VERSION, UPTIME
FROM information_schema.cluster_info;
"
}

# Slow queries from the last hour, worst first.
# NOTE(review): avg_process_time / exec_count look like columns of
# information_schema.statements_summary rather than slow_query —
# verify this query against the deployed TiDB version.
check_slow_queries() {
  echo -e "\n=== 慢查询检查(最近1小时)==="
  run_sql "
SELECT
query,
digest,
avg_process_time,
max_process_time,
exec_count
FROM information_schema.slow_query
WHERE time > NOW() - INTERVAL 1 HOUR
ORDER BY max_process_time DESC
LIMIT 5;
"
}

# Region count / approximate size / keys per TiKV store.
check_region_distribution() {
  echo -e "\n=== Region分布检查 ==="
  run_sql "
SELECT
STORE_ID,
COUNT(*) as region_count,
SUM(APPROXIMATE_SIZE) as total_size_mb,
SUM(APPROXIMATE_KEYS) as total_keys
FROM information_schema.tikv_region_status
GROUP BY STORE_ID
ORDER BY region_count DESC;
"
}

# Backup freshness: list the backup dir and logs modified in the last 24h.
check_backup_status() {
  echo -e "\n=== 备份状态检查 ==="
  ls -lh /backup/tidb/ 2>/dev/null || echo "备份目录不存在"
  find /backup/tidb/ -name "*.log" -mtime -1 2>/dev/null | head -5
}

main() {
  check_cluster_status
  check_slow_queries
  check_region_distribution
  check_backup_status
}

main "$@"
第7步:告警处理流程
7.1 告警分级与响应
# 告警级别定义
告警级别:
P0-紧急:
- TiDB/TiKV/PD实例宕机
- 磁盘空间不足(<5%)
- Region副本大量缺失
- 响应: 立即处理,15分钟内响应
P1-严重:
- 查询延迟>5秒
- 内存使用>90%
- CPU使用>90%
- 响应: 1小时内处理
P2-警告:
- 慢查询增多
- 连接数接近上限
- 磁盘使用>80%
- 响应: 4小时内处理
P3-提示:
- 备份失败
- 监控数据异常
- 日志错误增多
- 响应: 24小时内处理
7.2 常见告警处理
# 1. TiDB实例宕机
tiup cluster start tidb-prod -N 10.0.2.11:4000
# 检查日志
tail -f /tidb-deploy/tidb-4000/log/tidb.log
# 2. 磁盘空间不足
# 清理日志
find /tidb-deploy -name "*.log" -mtime +7 -delete
# 清理慢查询日志
mysql -h 10.0.2.11 -P 4000 -u root -p -e "ADMIN CLEANUP SLOW QUERY BY '2024-01-01 00:00:00';"
# 3. 高延迟查询
# 查看当前慢查询
SELECT * FROM information_schema.processlist WHERE TIME > 60;
# 终止问题查询
KILL QUERY [query_id];
# 4. Region不均衡
# 手动调度
tiup ctl:v7.5.0 pd -u http://10.0.1.11:2379 operator add scatter-region [region_id]
第8步:备份与恢复
8.1 备份策略
#!/bin/bash
# backup_tidb.sh — TiDB backup driver.
# Usage: backup_tidb.sh {full|inc|log}
#   full : full backup (e.g. weekly Sunday), keeps the 4 most recent
#   inc  : incremental backup since the last full (e.g. daily), 7-day retention
#   log  : log backup
# Requires: tiup with the br component. Backups land under /backup/tidb.

readonly PD_ADDR="10.0.1.11:2379"
readonly BACKUP_ROOT="/backup/tidb"

# Full backup; prunes all but the 4 newest full_* directories.
full_backup() {
  local backup_dir="$BACKUP_ROOT/full_$(date +%Y%m%d)"
  mkdir -p "$backup_dir"
  tiup br backup full \
    --pd "$PD_ADDR" \
    --storage "local://$backup_dir" \
    --ratelimit 128 \
    --log-file "$backup_dir/backup.log"
  # Keep only the 4 most recent full backups.
  # xargs -r: do not run rm at all when there is nothing to prune.
  ls -dt "$BACKUP_ROOT"/full_* | tail -n +5 | xargs -r rm -rf --
}

# Incremental backup since the most recent full backup.
incremental_backup() {
  local last_backup
  last_backup=$(ls -dt "$BACKUP_ROOT"/full_* 2>/dev/null | head -1)
  if [[ -z "$last_backup" ]]; then
    echo "no full backup found under $BACKUP_ROOT — run '$0 full' first" >&2
    return 1
  fi
  local backup_dir="$BACKUP_ROOT/inc_$(date +%Y%m%d_%H%M%S)"
  mkdir -p "$backup_dir"
  # Extract the end timestamp of the last full backup. backupmeta is a
  # binary protobuf, so grep-ing it is unreliable; `br validate decode`
  # is the documented way to read end-version.
  local last_ts
  last_ts=$(tiup br validate decode --field="end-version" \
    --storage "local://$last_backup" 2>/dev/null | tail -n1)
  # BR performs incremental backups via `backup full --lastbackupts`
  # (there is no `backup incremental` subcommand in current BR).
  tiup br backup full \
    --pd "$PD_ADDR" \
    --storage "local://$backup_dir" \
    --lastbackupts "$last_ts" \
    --ratelimit 256 \
    --log-file "$backup_dir/backup.log"
  # Drop incremental backups older than 7 days.
  find "$BACKUP_ROOT" -maxdepth 1 -type d -name 'inc_*' -mtime +7 -exec rm -rf -- {} +
}

# Log backup.
# NOTE(review): in current BR, log backup is a continuous task started once
# with `tiup br log start --task-name=...`, not an hourly point-in-time run,
# and --start-ts expects a TSO, not a unix timestamp in milliseconds —
# verify against the BR documentation before scheduling this.
log_backup() {
  tiup br log backup \
    --pd "$PD_ADDR" \
    --storage "local://$BACKUP_ROOT/log" \
    --start-ts "$(date -d '1 hour ago' +%s)000"
}

# Dispatch on the first argument.
case "$1" in
  "full") full_backup ;;
  "inc") incremental_backup ;;
  "log") log_backup ;;
  *) echo "Usage: $0 {full|inc|log}" ;;
esac
8.2 恢复测试
# 1. 创建测试环境
tiup cluster deploy tidb-test v7.5.0 topology-test.yaml
# 2. 恢复全量备份
tiup br restore full \
--pd "10.0.1.21:2379" \
--storage "local:///backup/tidb/full_20240101" \
--ratelimit 128
# 3. 恢复增量备份
tiup br restore incremental \
--pd "10.0.1.21:2379" \
--storage "local:///backup/tidb/inc_20240102" \
--full-backup-storage "local:///backup/tidb/full_20240101"
# 4. 验证数据
mysql -h 10.0.2.21 -P 4000 -u root -p -e "
SELECT COUNT(*) FROM app_db.important_table;
CHECKSUM TABLE app_db.important_table;
"
第9步:性能调优
9.1 参数调优
-- TiDB参数调优
SET GLOBAL tidb_mem_quota_query = 8589934592; -- 8GB
SET GLOBAL tidb_enable_1pc = ON;
SET GLOBAL tidb_enable_fast_analyze = ON;
SET GLOBAL tidb_txn_mode = 'optimistic';
-- TiKV参数调优(通过tiup修改)
tiup cluster edit-config tidb-prod
# 添加
server_configs:
tikv:
readpool.unified.max-thread-count: 12
storage.block-cache.capacity: "12GB"
raftstore.apply-pool-size: 6
raftstore.store-pool-size: 6
coprocessor.region-max-size: "256MB"
coprocessor.region-max-keys: 2880000
9.2 索引优化
-- 1. 查看缺失索引
SELECT * FROM information_schema.tidb_index_usage
WHERE LAST_USED_TIME < DATE_SUB(NOW(), INTERVAL 7 DAY);
-- 2. 查看冗余索引
SELECT
table_schema,
table_name,
index_name,
column_name,
seq_in_index
FROM information_schema.statistics
WHERE table_schema NOT IN ('mysql', 'information_schema', 'performance_schema')
GROUP BY table_schema, table_name, index_name, column_name, seq_in_index
HAVING COUNT(*) > 1;
-- 3. 自动分析
SET GLOBAL tidb_enable_auto_analyze = ON;
SET GLOBAL tidb_auto_analyze_ratio = 0.5;
第10步:日常运维
10.1 健康检查脚本
#!/bin/bash
# health_check.sh — one-shot health snapshot of the tidb-prod cluster.
# Each entry below is "label:command": everything after the FIRST colon is
# the command, so labels must not contain a colon themselves (URLs inside
# the command part are fine).
# Fix vs. original: the awk field references were written as $5 / $3 / $2
# inside a double-quoted string, so the shell expanded them to empty at
# array-definition time — they must be escaped (\$5, ...). Also use plain
# `free` (KiB) instead of `free -h`, whose mixed human units (Gi/Mi) make
# the $3/$2 ratio meaningless.
check_items=(
  "集群状态:tiup cluster display tidb-prod | grep -E '(Up|Healthy)' | wc -l"
  "PD Leader:curl -s http://10.0.1.11:2379/pd/api/v1/leader | jq .name"
  "TiDB连接:mysql -h 10.0.2.11 -P 4000 -u root -p密码 -e 'SELECT 1' 2>/dev/null || echo 'FAIL'"
  "Region状态:curl -s http://10.0.1.11:2379/pd/api/v1/regions | jq '.count'"
  "存储空间:df -h /data1 | awk 'NR==2{print \$5}'"
  "内存使用:free | awk '/Mem:/{print \$3/\$2 * 100}'"
  "CPU负载:uptime | awk -F'load average:' '{print \$2}'"
)

echo "=== TiDB集群健康检查 ==="
echo "检查时间: $(date)"
echo ""

for item in "${check_items[@]}"; do
  name="${item%%:*}"
  cmd="${item#*:}"
  result=$(eval "$cmd" 2>/dev/null)
  status="✓"
  # A probe fails when it printed FAIL, produced no output at all, or
  # yielded a numeric 0 where a positive count was expected. The original
  # `[[ $result -eq 0 ]]` raised an arithmetic error on non-numeric output
  # such as "66.6%"; test with a regex instead.
  if [[ "$result" == *FAIL* || -z "$result" ]]; then
    status="✗"
  elif [[ "$result" =~ ^0+$ ]]; then
    status="✗"
  fi
  printf "%-20s %-5s %s\n" "$name" "$status" "$result"
done
10.2 扩容操作
# 1. 编辑拓扑文件添加新节点
# 2. 扩容TiKV节点
tiup cluster scale-out tidb-prod scale-tikv.yaml
# 3. 扩容TiDB节点
tiup cluster scale-out tidb-prod scale-tidb.yaml
# 4. 验证扩容
tiup cluster display tidb-prod
10.3 升级操作
# 1. 检查升级兼容性
tiup cluster check tidb-prod --cluster-version v7.5.0 --upgrade
# 2. 备份集群
tiup br backup full --pd "10.0.1.11:2379" --storage "local:///backup/pre-upgrade"
# 3. 滚动升级
tiup cluster upgrade tidb-prod v7.6.0
# 4. 验证升级
tiup cluster display tidb-prod
mysql -h 10.0.2.11 -P 4000 -u root -p -e "SELECT VERSION();"
监控大屏关键指标
必须监控的10个核心指标:
- QPS/TPS:每秒查询/事务数
- 查询延迟P95/P99:响应时间分布
- 连接数:当前活跃连接
- Region健康度:副本状态
- 节点资源:CPU/内存/磁盘
- TiKV存储:空间使用率
- GC耗时:垃圾回收时间
- 慢查询数:超过阈值的查询
- 错误率:SQL错误比例
- 备份状态:最后一次备份时间
告警阈值建议:
- CPU使用率 > 80% (警告), > 90% (严重)
- 内存使用率 > 85% (警告), > 95% (严重)
- 磁盘使用率 > 80% (警告), > 90% (严重)
- 查询延迟P99 > 1秒 (警告), > 3秒 (严重)
- Region副本缺失 > 0 (立即告警)
总结要点
- 部署前:硬件检查、系统优化、网络规划
- 部署中:拓扑设计、参数调优、安全配置
- 部署后:监控配置、备份策略、性能测试
- 运维中:日常巡检、告警处理、定期优化
- 故障时:快速定位、按预案处理、事后复盘
记住:监控是眼睛,备份是生命线,文档是传承。做好这三点,TiDB生产环境就能稳定运行。