TiDB 生产部署与监控完整指南

阅读时间:约 8 分钟

TiDB 生产部署与监控完整指南

一、生产部署详细步骤

第1步:环境准备与检查

1.1 硬件规格要求
# 最低生产配置(6节点)
├── PD节点 ×3 (4核8G, 200G SSD)
├── TiDB节点 ×2 (8核16G, 200G SSD)
├── TiKV节点 ×3 (16核32G, 1TB NVMe SSD) *必须NVMe*
└── 监控节点 ×1 (8核16G, 500G SSD)

# 推荐配置(高可用)
├── PD节点 ×3 (8核16G, 500G SSD)
├── TiDB节点 ×3 (16核32G, 500G SSD)
├── TiKV节点 ×5 (32核64G, 2TB NVMe SSD)
└── 监控节点 ×2 (8核16G, 1TB SSD) *主备*
1.2 系统配置(所有节点执行)
# 1. 关闭防火墙和SELinux
systemctl stop firewalld && systemctl disable firewalld
setenforce 0
sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config

# 2. 优化内核参数
cat > /etc/sysctl.d/tidb.conf << EOF
# 网络优化
net.core.somaxconn = 32768
net.ipv4.tcp_syncookies = 0
net.ipv4.tcp_max_syn_backlog = 65536
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2

# 内存优化
vm.swappiness = 0
vm.dirty_ratio = 20
vm.dirty_background_ratio = 10
vm.overcommit_memory = 1

# 文件系统优化
fs.file-max = 1000000
fs.nr_open = 1000000
EOF
sysctl -p /etc/sysctl.d/tidb.conf

# 3. 调整文件句柄和进程数
cat > /etc/security/limits.d/tidb.conf << EOF
tidb soft nofile 1000000
tidb hard nofile 1000000
tidb soft nproc 65535
tidb hard nproc 65535
tidb soft stack 10240
EOF

# 4. 禁用透明大页和NUMA
echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag
grubby --update-kernel=ALL --args="transparent_hugepage=never numa=off"

# 5. 创建tidb用户
groupadd -g 1001 tidb
useradd -m -u 1001 -g tidb -s /bin/bash tidb
echo "tidb ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/tidb

# 6. 时钟同步(必须!)
yum install -y chrony
systemctl enable chronyd && systemctl start chronyd
chronyc sources -v  # 验证同步状态
1.3 磁盘规划
# TiKV节点磁盘布局(示例)
/dev/nvme0n1  # 数据盘1 → /data1/tidb-data
/dev/nvme1n1  # 数据盘2 → /data2/tidb-data
/dev/sdb      # 日志盘 → /data1/tidb-log
/dev/sdc      # 部署目录 → /tidb-deploy

# 格式化并挂载
mkfs.ext4 /dev/nvme0n1
mkdir -p /data1/tidb-data
echo "/dev/nvme0n1 /data1/tidb-data ext4 defaults,noatime,nodiratime 0 0" >> /etc/fstab
mount -a
chown -R tidb:tidb /data1/tidb-data

第2步:安装部署工具

2.1 安装TiUP(中控机)
# 1. 安装TiUP
curl --proto '=https' --tlsv1.2 -sSf https://tiup-mirrors.pingcap.com/install.sh | sh

# 2. 设置环境变量
echo 'export PATH=$HOME/.tiup/bin:$PATH' >> ~/.bashrc
source ~/.bashrc

# 3. 安装cluster组件
tiup cluster

# 4. 更新镜像(国内加速)
tiup mirror set https://tiup-mirrors.pingcap.com
2.2 配置SSH互信
# 1. 生成密钥(中控机)
ssh-keygen -t rsa -b 4096 -C "tidb-deploy"

# 2. 复制公钥到所有节点
for host in 10.0.1.{11..13} 10.0.2.{11..12} 10.0.3.{11..13} 10.0.4.11; do
    ssh-copy-id -i ~/.ssh/id_rsa.pub tidb@$host
done

# 3. 测试免密登录
tiup cluster check topology.yaml --apply --user tidb

第3步:编写拓扑配置

3.1 完整拓扑文件(topology.yaml)
global:
  user: "tidb"
  ssh_port: 22
  deploy_dir: "/tidb-deploy"
  data_dir: "/tidb-data"
  arch: "amd64"
  os: "linux"

pd_servers:
  - host: 10.0.1.11
    name: "pd-1"
    client_port: 2379
    peer_port: 2380
    deploy_dir: "/tidb-deploy/pd-2379"
    data_dir: "/tidb-data/pd-2379"
    log_dir: "/tidb-deploy/pd-2379/log"
  - host: 10.0.1.12
    name: "pd-2"
    client_port: 2379
    peer_port: 2380
  - host: 10.0.1.13
    name: "pd-3"
    client_port: 2379
    peer_port: 2380

tidb_servers:
  - host: 10.0.2.11
    port: 4000
    status_port: 10080
    deploy_dir: "/tidb-deploy/tidb-4000"
    log_dir: "/tidb-deploy/tidb-4000/log"
  - host: 10.0.2.12
    port: 4000
    status_port: 10080

tikv_servers:
  - host: 10.0.3.11
    port: 20160
    status_port: 20180
    deploy_dir: "/tidb-deploy/tikv-20160"
    data_dir: 
      - "/data1/tidb-data/tikv-20160"
      - "/data2/tidb-data/tikv-20160"
    log_dir: "/tidb-deploy/tikv-20160/log"
    config:
      server.labels: 
        zone: "zone1"
        host: "tikv-1"
  - host: 10.0.3.12
    port: 20160
    status_port: 20180
    config:
      server.labels: 
        zone: "zone1"
        host: "tikv-2"
  - host: 10.0.3.13
    port: 20160
    status_port: 20180
    config:
      server.labels: 
        zone: "zone2"
        host: "tikv-3"

monitoring_servers:
  - host: 10.0.4.11
    port: 9090
    deploy_dir: "/tidb-deploy/prometheus-9090"
    data_dir: "/tidb-data/prometheus-9090"
    log_dir: "/tidb-deploy/prometheus-9090/log"

grafana_servers:
  - host: 10.0.4.11
    port: 3000
    deploy_dir: "/tidb-deploy/grafana-3000"

alertmanager_servers:
  - host: 10.0.4.11
    web_port: 9093
    cluster_port: 9094
    deploy_dir: "/tidb-deploy/alertmanager-9093"
    data_dir: "/tidb-data/alertmanager-9093"
    log_dir: "/tidb-deploy/alertmanager-9093/log"
3.2 高级配置(server_configs)
server_configs:
  pd:
    schedule.max-merge-region-size: 20
    schedule.max-merge-region-keys: 200000
    schedule.split-merge-interval: 1h
    replication.location-labels: ["zone", "host"]
    replication.max-replicas: 3
    log.level: "info"
    
  tikv:
    readpool.storage.use-unified-pool: true
    readpool.coprocessor.use-unified-pool: true
    storage.block-cache.capacity: "10GB"
    raftstore.apply-pool-size: 4
    raftstore.store-pool-size: 4
    rocksdb.max-background-jobs: 8
    rocksdb.max-sub-compactions: 4
    rocksdb.defaultcf.block-cache-size: "6GB"
    rocksdb.writecf.block-cache-size: "2GB"
    server.grpc-concurrency: 8
    
  tidb:
    performance.max-procs: 0
    performance.txn-total-size-limit: 10737418240
    log.slow-threshold: 300
    prepared-plan-cache.enabled: true
    prepared-plan-cache.capacity: 100
    tikv-client.max-batch-size: 128

第4步:部署集群

4.1 环境检查
# 1. 检查操作系统和硬件
tiup cluster check topology.yaml --user tidb

# 2. 自动修复检查出的问题
tiup cluster check topology.yaml --apply --user tidb

# 3. 手动检查项
# - 时钟同步:chronyc sources
# - 磁盘挂载:df -h
# - 内存检查:free -h
# - CPU检查:lscpu
# - 网络检查:ping/ping6
4.2 执行部署
# 1. 部署集群(指定版本)
tiup cluster deploy tidb-prod v7.5.0 topology.yaml \
  --user tidb \
  --ssh-timeout 300 \
  --wait-timeout 600

# 2. 启动集群
tiup cluster start tidb-prod

# 3. 查看状态
tiup cluster display tidb-prod

# 4. 验证集群
tiup cluster audit tidb-prod
4.3 初始化配置
-- 1. 连接TiDB
mysql -h 10.0.2.11 -P 4000 -u root

-- 2. 修改root密码
ALTER USER 'root'@'%' IDENTIFIED BY 'StrongPass123!';
FLUSH PRIVILEGES;

-- 3. 创建业务数据库和用户
CREATE DATABASE IF NOT EXISTS app_db;
CREATE USER 'app_user'@'10.0.%' IDENTIFIED BY 'AppPass123!';
GRANT ALL PRIVILEGES ON app_db.* TO 'app_user'@'10.0.%';

-- 4. 设置全局参数
SET GLOBAL tidb_mem_quota_query = 34359738368;  -- 32GB
SET GLOBAL tidb_enable_1pc = ON;  -- 一阶段提交
SET GLOBAL tidb_enable_fast_analyze = ON;

第5步:监控系统部署与配置

5.1 监控架构
Prometheus (采集) → Grafana (展示) → Alertmanager (告警)
      ↓
   Pushgateway (自定义指标)
      ↓
   Blackbox Exporter (网络探测)
5.2 Prometheus配置优化
# 编辑Prometheus配置
tiup cluster edit-config tidb-prod

# 添加以下配置
monitoring_servers:
  prometheus:
    config:
      global:
        scrape_interval: 15s
        evaluation_interval: 15s
        scrape_timeout: 10s
      
      # 存储配置
      storage:
        tsdb:
          retention: 30d  # 保留30天
          path: /tidb-data/prometheus-9090/data
      
      # 告警规则
      rule_files:
        - /tidb-deploy/prometheus-9090/conf/*.rules.yml
      
      # 抓取目标
      scrape_configs:
        - job_name: 'tidb'
          static_configs:
            - targets: ['10.0.2.11:10080', '10.0.2.12:10080']
          
        - job_name: 'tikv'
          static_configs:
            - targets: ['10.0.3.11:20180', '10.0.3.12:20180', '10.0.3.13:20180']
          
        - job_name: 'pd'
          static_configs:
            - targets: ['10.0.1.11:2379', '10.0.1.12:2379', '10.0.1.13:2379']
          
        - job_name: 'node'
          static_configs:
            - targets: ['10.0.1.11:9100', '10.0.1.12:9100', '10.0.1.13:9100',
                       '10.0.2.11:9100', '10.0.2.12:9100',
                       '10.0.3.11:9100', '10.0.3.12:9100', '10.0.3.13:9100',
                       '10.0.4.11:9100']
5.3 Grafana配置
# 1. 导入官方仪表板
# 访问 http://10.0.4.11:3000
# 用户名/密码:admin/admin

# 2. 导入的仪表板ID:
# - TiDB集群概览:12599
# - TiDB详细指标:12600
# - TiKV详细指标:12603
# - PD详细指标:12605
# - Node Exporter:11074

# 3. 配置数据源
# 数据源类型:Prometheus
# URL:http://10.0.4.11:9090
# Access:Server (Default)

# 4. 创建自定义仪表板
# 关键指标面板:
# - QPS/TPS实时监控
# - 查询延迟P95/P99
# - 连接数监控
# - 内存使用率
# - 磁盘IOPS/吞吐量
# - Region分布
# - 慢查询统计
5.4 Alertmanager告警配置
# alertmanager.yml
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'tidb-alert@example.com'
  smtp_auth_username: 'alert@example.com'
  smtp_auth_password: 'your-password'
  smtp_require_tls: true

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'email-alerts'
  
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: true
    - match:
        severity: warning
      receiver: 'warning-alerts'
      continue: true

receivers:
  - name: 'email-alerts'
    email_configs:
      - to: 'dba-team@example.com'
        send_resolved: true
        
  - name: 'critical-alerts'
    email_configs:
      - to: 'dba-oncall@example.com'
    webhook_configs:
      - url: 'http://alert-webhook.example.com/critical'
        
  - name: 'warning-alerts'
    email_configs:
      - to: 'dba-team@example.com'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/xxx'
        channel: '#tidb-alerts'
        title: 'TiDB告警'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'instance']
5.5 关键告警规则
# critical.rules.yml
groups:
  - name: tidb-critical
    rules:
      # TiDB服务宕机
      - alert: TiDBInstanceDown
        expr: up{job="tidb"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TiDB实例 {{ $labels.instance }} 宕机"
          description: "TiDB实例 {{ $labels.instance }} 已宕机超过1分钟"
          
      # TiKV存储空间不足
      - alert: TiKVDiskFull
        expr: (node_filesystem_avail_bytes{mountpoint="/data1"} / node_filesystem_size_bytes{mountpoint="/data1"} * 100) < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "TiKV磁盘空间不足 {{ $labels.instance }}"
          description: "TiKV实例 {{ $labels.instance }} 磁盘使用率超过90%"
          
      # 高延迟查询
      - alert: HighQueryLatency
        expr: histogram_quantile(0.99, rate(tidb_server_handle_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "查询延迟过高"
          description: "P99查询延迟超过1秒"
          
      # Region不健康
      - alert: RegionUnhealthy
        expr: sum(pd_regions_state{type="miss-peer"}) by (instance) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Region副本缺失"
          description: "有Region副本缺失,需要检查"
          
      # 内存不足
      - alert: TiDBOutOfMemory
        expr: process_resident_memory_bytes{job="tidb"} / 1024 / 1024 / 1024 > 28
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "TiDB内存使用过高"
          description: "TiDB实例 {{ $labels.instance }} 内存使用超过28GB"

第6步:监控指标详解

6.1 核心监控指标
-- 1. 集群健康状态
SELECT * FROM information_schema.cluster_info;

-- 2. 慢查询监控
SELECT * FROM information_schema.slow_query 
WHERE time > NOW() - INTERVAL 1 HOUR 
ORDER BY time DESC 
LIMIT 10;

-- 3. 锁冲突监控
SELECT * FROM information_schema.deadlocks;

-- 4. 连接数监控
SHOW PROCESSLIST;
SELECT COUNT(*) FROM information_schema.processlist;

-- 5. 存储使用
SELECT 
    STORE_ID,
    CAPACITY,
    AVAILABLE,
    USED_SIZE
FROM information_schema.tikv_store_status;
6.2 Prometheus关键查询
# 1. QPS监控
sum(rate(tidb_executor_statement_total[1m])) by (type)

# 2. 查询延迟P95/P99
histogram_quantile(0.95, rate(tidb_server_handle_query_duration_seconds_bucket[5m]))
histogram_quantile(0.99, rate(tidb_server_handle_query_duration_seconds_bucket[5m]))

# 3. 连接数
tidb_server_connections

# 4. TiKV存储使用
sum(tikv_store_size_bytes) by (instance)
sum(tikv_region_size_bytes) by (instance)

# 5. PD调度
rate(pd_schedule_operator_total[5m])

# 6. 节点资源
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100
rate(node_cpu_seconds_total{mode="idle"}[5m]) * 100
node_filesystem_avail_bytes / node_filesystem_size_bytes * 100
6.3 自定义监控脚本
#!/bin/bash
# monitor_tidb.sh
#
# One-shot TiDB cluster monitoring report: cluster topology status, recent
# slow queries, region distribution per TiKV store, and backup freshness.
# Checks are best-effort: a failing section prints its error and the script
# continues with the next one.
#
# Connection settings can be overridden via environment variables, e.g.:
#   TIDB_HOST=10.0.2.12 TIDB_PASS=secret ./monitor_tidb.sh

readonly TIDB_HOST="${TIDB_HOST:-10.0.2.11}"
readonly TIDB_PORT="${TIDB_PORT:-4000}"
readonly TIDB_USER="${TIDB_USER:-root}"
readonly TIDB_PASS="${TIDB_PASS:-密码}"

# run_sql <statement> — execute a SQL statement against TiDB.
# The password is passed via the MYSQL_PWD environment variable instead of
# -p'...' on the command line, so it is not exposed in `ps` output.
run_sql() {
    MYSQL_PWD="$TIDB_PASS" mysql -h "$TIDB_HOST" -P "$TIDB_PORT" -u "$TIDB_USER" -e "$1"
}

# Overall cluster status: tiup topology view plus per-node status from
# information_schema.cluster_info.
check_cluster_status() {
    echo "=== 集群状态检查 ==="
    tiup cluster display tidb-prod

    echo -e "\n=== 节点状态 ==="
    run_sql "
        SELECT INSTANCE, TYPE, STATUS, VERSION, UPTIME 
        FROM information_schema.cluster_info;
    "
}

# Top 5 slow queries from the last hour, ordered by worst processing time.
check_slow_queries() {
    echo -e "\n=== 慢查询检查(最近1小时)==="
    run_sql "
        SELECT 
            query,
            digest,
            avg_process_time,
            max_process_time,
            exec_count
        FROM information_schema.slow_query
        WHERE time > NOW() - INTERVAL 1 HOUR
        ORDER BY max_process_time DESC
        LIMIT 5;
    "
}

# Region count, approximate size and key count aggregated per TiKV store —
# useful for spotting hot or unbalanced stores.
check_region_distribution() {
    echo -e "\n=== Region分布检查 ==="
    run_sql "
        SELECT 
            STORE_ID,
            COUNT(*) as region_count,
            SUM(APPROXIMATE_SIZE) as total_size_mb,
            SUM(APPROXIMATE_KEYS) as total_keys
        FROM information_schema.tikv_region_status
        GROUP BY STORE_ID
        ORDER BY region_count DESC;
    "
}

# Backup freshness: list the backup directory and any backup logs written
# within the last 24 hours.
check_backup_status() {
    echo -e "\n=== 备份状态检查 ==="
    ls -lh /backup/tidb/ 2>/dev/null || echo "备份目录不存在"

    # Most recently modified backup logs (last day)
    find /backup/tidb/ -name "*.log" -mtime -1 2>/dev/null | head -5
}

# Run every check section in order.
main() {
    check_cluster_status
    check_slow_queries
    check_region_distribution
    check_backup_status
}

main "$@"

第7步:告警处理流程

7.1 告警分级与响应
# 告警级别定义
告警级别:
  P0-紧急:
    - TiDB/TiKV/PD实例宕机
    - 磁盘空间不足(<5%)
    - Region副本大量缺失
    - 响应: 立即处理,15分钟内响应
    
  P1-严重:
    - 查询延迟>5秒
    - 内存使用>90%
    - CPU使用>90%
    - 响应: 1小时内处理
    
  P2-警告:
    - 慢查询增多
    - 连接数接近上限
    - 磁盘使用>80%
    - 响应: 4小时内处理
    
  P3-提示:
    - 备份失败
    - 监控数据异常
    - 日志错误增多
    - 响应: 24小时内处理
7.2 常见告警处理
# 1. TiDB实例宕机
tiup cluster start tidb-prod -N 10.0.2.11:4000
# 检查日志
tail -f /tidb-deploy/tidb-4000/log/tidb.log

# 2. 磁盘空间不足
# 清理日志
find /tidb-deploy -name "*.log" -mtime +7 -delete
# 清理慢查询日志
mysql -h 10.0.2.11 -P 4000 -u root -p -e "ADMIN CLEANUP SLOW QUERY BY '2024-01-01 00:00:00';"

# 3. 高延迟查询
# 查看当前慢查询
SELECT * FROM information_schema.processlist WHERE TIME > 60;
# 终止问题查询
KILL QUERY [query_id];

# 4. Region不均衡
# 手动调度
tiup ctl:v7.5.0 pd -u http://10.0.1.11:2379 operator add scatter-region [region_id]

第8步:备份与恢复

8.1 备份策略
#!/bin/bash
# backup_tidb.sh
#
# TiDB backup driver using BR.
# Usage: backup_tidb.sh {full|inc|log}
#   full — weekly full backup; keeps the 4 most recent full backup sets
#   inc  — daily incremental backup on top of the latest full set; 7-day retention
#   log  — hourly log backup covering the last hour
set -u

readonly PD_ADDR="10.0.1.11:2379"
readonly BACKUP_ROOT="/backup/tidb"

# Weekly full backup; afterwards prunes all but the 4 newest full sets.
full_backup() {
    local backup_dir
    backup_dir="$BACKUP_ROOT/full_$(date +%Y%m%d)"
    mkdir -p "$backup_dir"

    tiup br backup full \
        --pd "$PD_ADDR" \
        --storage "local://$backup_dir" \
        --ratelimit 128 \
        --log-file "$backup_dir/backup.log"

    # Keep only the 4 most recent full backups; `xargs -r` is a no-op when
    # there are fewer than 5 sets.
    ls -dt "$BACKUP_ROOT"/full_* 2>/dev/null | tail -n +5 | xargs -r rm -rf --
}

# Daily incremental backup anchored at the newest full backup's end
# timestamp (read from its backupmeta).
incremental_backup() {
    local last_backup last_ts backup_dir
    last_backup=$(ls -dt "$BACKUP_ROOT"/full_* 2>/dev/null | head -1)
    if [[ -z "$last_backup" ]]; then
        echo "error: no full backup found under $BACKUP_ROOT" >&2
        return 1
    fi

    last_ts=$(grep -o '"end-ts":[0-9]*' "$last_backup/backupmeta" | cut -d: -f2)
    if [[ -z "$last_ts" ]]; then
        echo "error: cannot read end-ts from $last_backup/backupmeta" >&2
        return 1
    fi

    backup_dir="$BACKUP_ROOT/inc_$(date +%Y%m%d_%H%M%S)"
    mkdir -p "$backup_dir"

    tiup br backup incremental \
        --pd "$PD_ADDR" \
        --storage "local://$backup_dir" \
        --lastbackupts "$last_ts" \
        --ratelimit 256 \
        --log-file "$backup_dir/backup.log"

    # 7-day retention for incrementals. The original used `-exec rm -rf {} ;`
    # with an unescaped ';', which find rejects, so pruning never ran;
    # `{} +` also batches the deletions.
    find "$BACKUP_ROOT" -maxdepth 1 -type d -name 'inc_*' -mtime +7 -exec rm -rf -- {} +
}

# Hourly log backup starting one hour ago (TSO-style ms-padded timestamp).
log_backup() {
    tiup br log backup \
        --pd "$PD_ADDR" \
        --storage "local://$BACKUP_ROOT/log" \
        --start-ts "$(date -d '1 hour ago' +%s)000"
}

# Dispatch on the first argument; ${1:-} keeps `set -u` happy when the
# script is invoked without arguments.
case "${1:-}" in
    "full") full_backup ;;
    "inc") incremental_backup ;;
    "log") log_backup ;;
    *) echo "Usage: $0 {full|inc|log}" ;;
esac
8.2 恢复测试
# 1. 创建测试环境
tiup cluster deploy tidb-test v7.5.0 topology-test.yaml

# 2. 恢复全量备份
tiup br restore full \
    --pd "10.0.1.21:2379" \
    --storage "local:///backup/tidb/full_20240101" \
    --ratelimit 128

# 3. 恢复增量备份
tiup br restore incremental \
    --pd "10.0.1.21:2379" \
    --storage "local:///backup/tidb/inc_20240102" \
    --full-backup-storage "local:///backup/tidb/full_20240101"

# 4. 验证数据
mysql -h 10.0.2.21 -P 4000 -u root -p -e "
    SELECT COUNT(*) FROM app_db.important_table;
    CHECKSUM TABLE app_db.important_table;
"

第9步:性能调优

9.1 参数调优
-- TiDB参数调优
SET GLOBAL tidb_mem_quota_query = 8589934592;  -- 8GB
SET GLOBAL tidb_enable_1pc = ON;
SET GLOBAL tidb_enable_fast_analyze = ON;
SET GLOBAL tidb_txn_mode = 'optimistic';

-- TiKV参数调优(通过tiup修改)
tiup cluster edit-config tidb-prod
# 添加
server_configs:
  tikv:
    readpool.unified.max-thread-count: 12
    storage.block-cache.capacity: "12GB"
    raftstore.apply-pool-size: 6
    raftstore.store-pool-size: 6
    coprocessor.region-max-size: "256MB"
    coprocessor.region-max-keys: 2880000
9.2 索引优化
-- 1. 查看缺失索引
SELECT * FROM information_schema.tidb_index_usage 
WHERE LAST_USED_TIME < DATE_SUB(NOW(), INTERVAL 7 DAY);

-- 2. 查看冗余索引
SELECT 
    table_schema,
    table_name,
    index_name,
    column_name,
    seq_in_index
FROM information_schema.statistics
WHERE table_schema NOT IN ('mysql', 'information_schema', 'performance_schema')
GROUP BY table_schema, table_name, index_name, column_name, seq_in_index
HAVING COUNT(*) > 1;

-- 3. 自动分析
SET GLOBAL tidb_enable_auto_analyze = ON;
SET GLOBAL tidb_auto_analyze_ratio = 0.5;

第10步:日常运维

10.1 健康检查脚本
#!/bin/bash
# health_check.sh
#
# Quick TiDB cluster health snapshot: runs a fixed list of checks and prints
# a three-column report (name, pass/fail mark, raw result).
# Each entry in check_items is a "label:command" pair; only the FIRST colon
# separates the label from the command, so the command part may itself
# contain colons (URLs, port numbers, awk field separators, ...).

check_items=(
    "集群状态:tiup cluster display tidb-prod | grep -E '(Up|Healthy)' | wc -l"
    "PD Leader:curl -s http://10.0.1.11:2379/pd/api/v1/leader | jq .name"
    "TiDB连接:mysql -h 10.0.2.11 -P 4000 -u root -p密码 -e 'SELECT 1' 2>/dev/null || echo 'FAIL'"
    "Region状态:curl -s http://10.0.1.11:2379/pd/api/v1/regions | jq '.count'"
    "存储空间:df -h /data1 | awk 'NR==2{print $5}'"
    "内存使用:free -h | awk '/Mem:/{print $3/$2 * 100}'"
    "CPU负载:uptime | awk -F'load average:' '{print $2}'"
)

echo "=== TiDB集群健康检查 ==="
echo "检查时间: $(date)"
echo ""

for item in "${check_items[@]}"; do
    name="${item%%:*}"   # label: text before the first colon
    cmd="${item#*:}"     # command: everything after the first colon

    # Run the check; a command that fails with no output is recorded as an
    # explicit FAIL marker so the status logic has something to match on.
    if ! result=$(eval "$cmd" 2>/dev/null); then
        result="${result:-FAIL}"
    fi

    status="✓"
    # Unhealthy when: the command reported FAIL, produced no output, or a
    # purely numeric result equals zero (e.g. zero healthy nodes).
    # The original `[[ $result -eq 0 ]]` raised an arithmetic error on
    # non-numeric output such as "85%"; the regex guard restricts the
    # numeric comparison to all-digit results, and 10# forces base 10 so
    # leading zeros are not parsed as octal.
    if [[ "$result" == *FAIL* || -z "$result" ]]; then
        status="✗"
    elif [[ "$result" =~ ^[0-9]+$ ]] && (( 10#$result == 0 )); then
        status="✗"
    fi

    printf "%-20s %-5s %s\n" "$name" "$status" "$result"
done
10.2 扩容操作
# 1. 编辑拓扑文件添加新节点
# 2. 扩容TiKV节点
tiup cluster scale-out tidb-prod scale-tikv.yaml

# 3. 扩容TiDB节点
tiup cluster scale-out tidb-prod scale-tidb.yaml

# 4. 验证扩容
tiup cluster display tidb-prod
10.3 升级操作
# 1. 检查升级兼容性
tiup cluster check tidb-prod --cluster-version v7.5.0 --upgrade

# 2. 备份集群
tiup br backup full --pd "10.0.1.11:2379" --storage "local:///backup/pre-upgrade"

# 3. 滚动升级
tiup cluster upgrade tidb-prod v7.6.0

# 4. 验证升级
tiup cluster display tidb-prod
mysql -h 10.0.2.11 -P 4000 -u root -p -e "SELECT VERSION();"

监控大屏关键指标

必须监控的10个核心指标:

  1. QPS/TPS:每秒查询/事务数
  2. 查询延迟P95/P99:响应时间分布
  3. 连接数:当前活跃连接
  4. Region健康度:副本状态
  5. 节点资源:CPU/内存/磁盘
  6. TiKV存储:空间使用率
  7. GC耗时:垃圾回收时间
  8. 慢查询数:超过阈值的查询
  9. 错误率:SQL错误比例
  10. 备份状态:最后一次备份时间

告警阈值建议:

  • CPU使用率 > 80% (警告), > 90% (严重)
  • 内存使用率 > 85% (警告), > 95% (严重)
  • 磁盘使用率 > 80% (警告), > 90% (严重)
  • 查询延迟P99 > 1秒 (警告), > 3秒 (严重)
  • Region副本缺失 > 0 (立即告警)

总结要点

  1. 部署前:硬件检查、系统优化、网络规划
  2. 部署中:拓扑设计、参数调优、安全配置
  3. 部署后:监控配置、备份策略、性能测试
  4. 运维中:日常巡检、告警处理、定期优化
  5. 故障时:快速定位、按预案处理、事后复盘

记住:监控是眼睛,备份是生命线,文档是传承。做好这三点,TiDB生产环境就能稳定运行。