服务器操作卡顿问题解决

9 阅读5分钟

1、目前使用时发现操作明显有卡顿感。常用的排查方法(从进程角度入手):

# 综合视图,按 CPU 排序
ps aux --sort=-%cpu | head -20

# 按内存排序
ps aux --sort=-%mem | head -20

# 线程级查看(找多线程程序的瓶颈线程)
top -H -p <PID>  # 然后按 P 排序看 CPU,按 M 排序看内存

# 进程树,看父子关系
pstree -p <PID>
hanwang@k8s-master-node:~/work/ai-coding$ ps aux --sort=-%mem | head -20
USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
libvirt+    4186 11.4 28.3 15033916 8392752 ?    Sl   3月26 406:23 /usr/bin/qemu-system-x86_64 -name guest=devstack-vm,debug-threads=on -S -object {"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-1-devstack-vm/master-key.aes"} -machine pc-q35-8.2,usb=off,dump-guest-core=off,memory-backend=pc.ram,hpet=off,acpi=on -accel kvm -cpu host,migratable=on -m size=8388608k -object {"qom-type":"memory-backend-ram","id":"pc.ram","size":8589934592} -overcommit mem-lock=off -smp 4,sockets=4,cores=1,threads=1 -uuid 5e6f16c3-0a69-412e-a303-ac90d506e8f4 -display none -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=32,server=on,wait=off -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-shutdown -global ICH9-LPC.disable_s3=1 -global ICH9-LPC.disable_s4=1 -boot strict=on -device {"driver":"pcie-root-port","port":8,"chassis":1,"id":"pci.1","bus":"pcie.0","multifunction":true,"addr":"0x1"} -device {"driver":"pcie-root-port","port":9,"chassis":2,"id":"pci.2","bus":"pcie.0","addr":"0x1.0x1"} -device {"driver":"pcie-root-port","port":10,"chassis":3,"id":"pci.3","bus":"pcie.0","addr":"0x1.0x2"} -device {"driver":"pcie-root-port","port":11,"chassis":4,"id":"pci.4","bus":"pcie.0","addr":"0x1.0x3"} -device {"driver":"pcie-root-port","port":12,"chassis":5,"id":"pci.5","bus":"pcie.0","addr":"0x1.0x4"} -device {"driver":"pcie-root-port","port":13,"chassis":6,"id":"pci.6","bus":"pcie.0","addr":"0x1.0x5"} -device {"driver":"pcie-root-port","port":14,"chassis":7,"id":"pci.7","bus":"pcie.0","addr":"0x1.0x6"} -device {"driver":"pcie-root-port","port":15,"chassis":8,"id":"pci.8","bus":"pcie.0","addr":"0x1.0x7"} -device {"driver":"pcie-root-port","port":16,"chassis":9,"id":"pci.9","bus":"pcie.0","multifunction":true,"addr":"0x2"} -device {"driver":"pcie-root-port","port":17,"chassis":10,"id":"pci.10","bus":"pcie.0","addr":"0x2.0x1"} -device 
{"driver":"pcie-root-port","port":18,"chassis":11,"id":"pci.11","bus":"pcie.0","addr":"0x2.0x2"} -device {"driver":"pcie-root-port","port":19,"chassis":12,"id":"pci.12","bus":"pcie.0","addr":"0x2.0x3"} -device {"driver":"pcie-root-port","port":20,"chassis":13,"id":"pci.13","bus":"pcie.0","addr":"0x2.0x4"} -device {"driver":"pcie-root-port","port":21,"chassis":14,"id":"pci.14","bus":"pcie.0","addr":"0x2.0x5"} -device {"driver":"qemu-xhci","p2":15,"p3":15,"id":"usb","bus":"pci.2","addr":"0x0"} -device {"driver":"virtio-serial-pci","id":"virtio-serial0","bus":"pci.3","addr":"0x0"} -blockdev {"driver":"file","filename":"/var/lib/libvirt/images/devstack/base.img","node-name":"libvirt-3-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-3-format","read-only":true,"discard":"unmap","driver":"qcow2","file":"libvirt-3-storage","backing":null} -blockdev {"driver":"file","filename":"/var/lib/libvirt/images/devstack-vm-1.qcow2","node-name":"libvirt-2-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-2-format","read-only":false,"discard":"unmap","driver":"qcow2","file":"libvirt-2-storage","backing":"libvirt-3-format"} -device {"driver":"virtio-blk-pci","bus":"pci.4","addr":"0x0","drive":"libvirt-2-format","id":"virtio-disk0","bootindex":1} -blockdev {"driver":"file","filename":"/var/lib/libvirt/images/devstack/seed.iso","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-1-format","read-only":true,"driver":"raw","file":"libvirt-1-storage"} -device {"driver":"ide-cd","bus":"ide.0","drive":"libvirt-1-format","id":"sata0-0-0"} -netdev {"type":"tap","fd":"30","vhost":true,"vhostfd":"36","id":"hostnet0"} -device {"driver":"virtio-net-pci","netdev":"hostnet0","id":"net0","mac":"52:54:00:f6:f2:55","bus":"pci.1","addr":"0x0"} -chardev pty,id=charserial0 -device {"driver":"isa-serial","chardev":"charserial0","id":"serial0","index":0} -chardev 
socket,id=charchannel0,fd=31,server=on,wait=off -device {"driver":"virtserialport","bus":"virtio-serial0.0","nr":1,"chardev":"charchannel0","id":"channel0","name":"org.qemu.guest_agent.0"} -audiodev {"id":"audio1","driver":"none"} -global ICH9-LPC.noreboot=off -watchdog-action reset -device {"driver":"virtio-balloon-pci","id":"balloon0","bus":"pci.5","addr":"0x0"} -object {"qom-type":"rng-random","id":"objrng0","filename":"/dev/urandom"} -device {"driver":"virtio-rng-pci","rng":"objrng0","id":"rng0","bus":"pci.6","addr":"0x0"} -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
hanwang     8557  0.2 15.3 14077228 4546868 ?    Sl   3月26   7:47 /usr/share/elasticsearch/jdk/bin/java -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -Djava.security.manager=allow -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Dlog4j2.formatMsgNoLookups=true -Djava.locale.providers=SPI,COMPAT --add-opens=java.base/java.io=org.elasticsearch.preallocate -Des.cgroups.hierarchy.override=/ -XX:+UseG1GC -Djava.io.tmpdir=/tmp/elasticsearch-14664739578300384182 --add-modules=jdk.incubator.vector -XX:+HeapDumpOnOutOfMemoryError -XX:+ExitOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,level,pid,tags:filecount=32,filesize=64m -Xms3849m -Xmx3849m -XX:MaxDirectMemorySize=2018508800 -XX:G1HeapRegionSize=4m -XX:InitiatingHeapOccupancyPercent=30 -XX:G1ReservePercent=15 -Des.distribution.type=docker --module-path /usr/share/elasticsearch/lib --add-modules=jdk.net --add-modules=ALL-MODULE-PATH -m org.elasticsearch.server/org.elasticsearch.bootstrap.Elasticsearch
hanwang   625102 13.3  2.0 74796540 616464 pts/5 Tl   09:50   0:11 /usr/lib/node_modules/opencode-ai/bin/.opencode
hanwang     7571  0.9  1.9 726924 585612 ?       Ss   3月26  32:17 /usr/bin/Xvfb :1 -screen 0 15360x8640x24 -dpi 96 +extension COMPOSITE +extension DAMAGE +extension GLX +extension RANDR +extension RENDER +extension MIT-SHM +extension XFIXES +extension XTEST +iglx +render -nolisten tcp -ac -noreset -shmem
root        8339  4.0  1.8 1320308 555536 ?      Ssl  3月26 143:50 kube-apiserver --advertise-address=192.168.18.133 --allow-privileged=true --audit-log-format=json --audit-log-maxage=7 --audit-log-maxbackup=10 --audit-log-maxsize=100 --audit-log-path=/var/log/kubernetes/audit.log --audit-policy-file=/etc/kubernetes/audit-policy.yml --authorization-mode=Node,RBAC --client-ca-file=/etc/kubernetes/pki/ca.crt --enable-admission-plugins=NodeRestriction --enable-aggregator-routing=true --enable-bootstrap-token-auth=true --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key --etcd-servers=https://127.0.0.1:2379 --feature-gates= --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key --requestheader-allowed-names=front-proxy-client --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt --requestheader-extra-headers-prefix=X-Remote-Extra- --requestheader-group-headers=X-Remote-Group --requestheader-username-headers=X-Remote-User --secure-port=6443 --service-account-issuer=https://kubernetes.default.svc.cluster.local --service-account-key-file=/etc/kubernetes/pki/sa.pub --service-account-signing-key-file=/etc/kubernetes/pki/sa.key --service-cluster-ip-range=10.96.0.0/22 --tls-cert-file=/etc/kubernetes/pki/apiserver.crt --tls-private-key-file=/etc/kubernetes/pki/apiserver.key
hanwang  3823945  0.4  1.6 7008400 486596 pts/4  Sl+  3月28   2:45 /home/hanwang/work/cowpaw/venv/bin/python3 /home/hanwang/work/cowpaw/venv/bin/copaw app --host 0.0.0.0
pcp         5294  0.4  1.5 2221828 469296 ?      Ssl  3月26  15:49 mysqld
pcp        13524  0.4  1.5 2353940 467044 ?      Ssl  3月26  16:50 mysqld
hanwang     7963 38.0  1.4 3689060 427180 pts/0  SLsl+ 3月26 1346:21 /usr/bin/retroarch -f
pcp         5574  0.5  1.3 2331232 388364 ?      Ssl  3月26  19:10 mysqld --max_connections=1000 --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci --default-authentication-plugin=mysql_native_password --tls_version=TLSv1.2,TLSv1.3 --init-file /data/application/init.sql --binlog_expire_logs_seconds=604800
hanwang  2764530  3.4  1.1 34785652 333112 ?     Sl   3月28  49:18 /home/hanwang/.qoder-server/bin/16c1dec9a3994f378353c6947b77c2dcf96e7c49/node --dns-result-order=ipv4first /home/hanwang/.qoder-server/bin/16c1dec9a3994f378353c6947b77c2dcf96e7c49/out/bootstrap-fork --type=extensionHost --transformURIs --useHostProxy=false
hanwang     6566  0.0  0.9 6587772 292244 ?      Sl   3月26   0:14 /snap/snap-store/1216/usr/bin/snap-store --gapplication-service
hanwang     4359  0.0  0.9 5169488 275436 ?      Ssl  3月26   2:57 /usr/bin/gnome-shell
root        5253  0.0  0.8 273463132 264468 ?    Ssl  3月26   0:47 /jellyfin/jellyfin
hanwang    17060  0.0  0.8 879348 253756 ?       SNl  3月26   2:42 /usr/bin/python3 /usr/bin/update-manager --no-update --no-focus-on-map
472         5575  0.3  0.8 1804476 244240 ?      Ssl  3月26  13:32 grafana server --homepath=/usr/share/grafana --config=/etc/grafana/grafana.ini --packaging=docker cfg:default.log.mode=console cfg:default.paths.data=/var/lib/grafana cfg:default.paths.logs=/var/log/grafana cfg:default.paths.plugins=/var/lib/grafana/plugins cfg:default.paths.provisioning=/etc/grafana/provisioning
hanwang  2764730  0.3  0.7 5805520 216720 ?      Sl   3月28   5:22 /home/hanwang/.qoder-server/bin/16c1dec9a3994f378353c6947b77c2dcf96e7c49/extensions/aicoding-agent/bin/x86_64_linux/Qoder start --workDir /home/hanwang/.config/Qoder/4fd368a50cab7edf1f493950ad5427e90b9ac12fca180ac55f52481aaf7b6541/SharedClientCache
root        1078  0.2  0.6 1638684 199424 ?      S<Lsl 3月26   7:45 ovs-vswitchd unix:/var/run/openvswitch/db.sock -vconsole:emer -vsyslog:err -vfile:info --mlockall --no-chdir --log-file=/var/log/openvswitch/ovs-vswitchd.log --pidfile=/var/run/openvswitch/ovs-vswitchd.pid --detach
root         424  0.0  0.6 252252 178004 ?       S<s  3月26   0:30 /usr/lib/systemd/systemd-journald

2 问题发现

| 进程 | PID | %MEM | RSS (实际内存) | 说明 |
| --- | --- | --- | --- | --- |
| qemu-system-x86_64 (devstack-vm) | 4186 | 28.3% | 8.0 GB | KVM 虚拟机 |
| Elasticsearch | 8557 | 15.3% | 4.3 GB | 搜索引擎 |
| opencode-ai | 625102 | 2.0% | 616 MB | AI 编码助手 |
| Xvfb | 7571 | 1.9% | 585 MB | 虚拟显示服务器 |
| kube-apiserver | 8339 | 1.8% | 555 MB | K8s API 服务器 |
| cowpaw (Python) | 3823945 | 1.6% | 486 MB | Python 应用 |
| MySQL (3个实例) | 5294/13524/5574 | 4.3% | 共 ~1.3 GB | 数据库 |
| retroarch | 7963 | 1.4% | 427 MB | 游戏模拟器 |
| qoder-server (Node) | 2764530 | 1.1% | 333 MB | 代码服务器 |

3、目前前两位服务暂时不用,考虑关掉

sudo virsh shutdown devstack-vm  # 直接关机释放 8GB,需要等待一段时间

4、总结了一套诊断脚本

#!/bin/bash
#===============================================================================
#
#          FILE: system_diagnosis.sh
#
#         USAGE: ./system_diagnosis.sh [options]
#
#   DESCRIPTION: 服务器卡顿分层排查脚本
#                1. 快速概览层 (1分钟)
#                2. 系统资源层 (CPU/内存/磁盘/网络)
#                3. 进程服务层 (TOP进程/僵尸进程/服务状态)
#                4. 深度分析层 (IO延迟/系统调用/硬件错误)
#
#       OPTIONS: -q (快速模式), -d (深度模式), -o <file> (输出到文件)
#  REQUIREMENTS: root权限(部分功能), sysstat/iperf3等工具(可选)
#          BUGS: 报告至 hanwang@example.com
#         NOTES: 建议在卡顿期间运行,多次采样更准确
#        AUTHOR: Han Wang, Cloud Platform Architect
#       VERSION: 2.0
#       CREATED: 2026-03-29
#===============================================================================

set -o pipefail

# Color definitions: ANSI escape sequences, interpreted by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Global variables (the first three are overridden by flags parsed in main()).
REPORT_FILE=""      # -o: file to tee all output into; empty = stdout only
QUICK_MODE=false    # -q: run only layer 1
DEEP_MODE=false     # -d: additionally run layer 4
SAMPLE_INTERVAL=1   # sampling interval (seconds) for mpstat/iostat
SAMPLE_COUNT=3      # NOTE(review): declared but never referenced below -- confirm intent
START_TIME=$(date +%s)  # epoch seconds at startup; used for the duration report

# 日志函数
log_info() {
    # Print an informational message tagged with a green [INFO] prefix.
    local msg="$1"
    echo -e "${GREEN}[INFO]${NC} ${msg}"
}

log_warn() {
    # Print a warning message tagged with a yellow [WARN] prefix.
    local msg="$1"
    echo -e "${YELLOW}[WARN]${NC} ${msg}"
}

log_error() {
    # Print an error message tagged with a red [ERROR] prefix.
    local msg="$1"
    echo -e "${RED}[ERROR]${NC} ${msg}"
}

log_section() {
    # Render a section banner: blank line, cyan rule, indented title,
    # cyan rule, blank line.
    local title="$1"
    local rule="========================================"
    echo -e "\n${CYAN}${rule}${NC}"
    echo -e "${CYAN}  ${title}${NC}"
    echo -e "${CYAN}${rule}${NC}\n"
}

# 检查root权限
check_root() {
    # Warn (without aborting) when not running as root: several probes
    # below return partial data for unprivileged users.
    if (( EUID != 0 )); then
        log_warn "当前非root用户,部分诊断信息可能无法获取"
        log_warn "建议: sudo ./system_diagnosis.sh"
        echo ""
    fi
}

# 检查依赖工具
check_dependencies() {
    # Verify the optional diagnostic tools are installed and print an
    # install hint for any that are missing.  Missing tools only degrade
    # the report (their sections are skipped); they never abort the run.
    local missing_tools=()
    local tool

    for tool in vmstat iostat mpstat pidstat ss; do
        # Quote the expansion (SC2086) -- safe even though these names
        # contain no spaces today.
        if ! command -v "$tool" &> /dev/null; then
            missing_tools+=("$tool")
        fi
    done

    if [[ ${#missing_tools[@]} -gt 0 ]]; then
        log_warn "缺少工具: ${missing_tools[*]}"
        log_info "安装: sudo apt-get install -y sysstat iproute2"
        echo ""
    fi
}

# 打印报告头部
print_header() {
    # Print the ASCII-art banner followed by basic host facts
    # (start time, hostname, distro, kernel, architecture) via log_info.
    cat << 'EOF'
    ____  _   _ _____    _     _       _             
   / ___|| | | | ____|  | |   (_)_ __ | |_ ___  _ __ 
   \___ \| |_| |  _|    | |   | | '_ \| __/ _ \| '__|
    ___) |  _  | |___   | |___| | | | | || (_) | |   
   |____/|_| |_|_____|  |_____|_|_| |_|\__\___/|_|   
                                                     
   服务器卡顿分层排查脚本 v2.0
   Author: Han Wang | Cloud Platform Architect
EOF
    echo ""
    log_info "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
    log_info "主机名: $(hostname)"
    # Prefer lsb_release; fall back to PRETTY_NAME from /etc/os-release.
    # grep reads the file directly (no useless `cat | grep`).
    log_info "系统: $(lsb_release -ds 2>/dev/null || grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
    log_info "内核: $(uname -r)"
    log_info "架构: $(uname -m)"
    echo ""
}

#===============================================================================
# 第一层:快速概览 (1分钟健康检查)
#===============================================================================
layer1_quick_overview() {
    # Layer 1: one-minute health check.  Surveys load average, memory,
    # swap and disk usage, flagging each against coarse thresholds.
    log_section "第一层:快速概览 (1分钟健康检查)"
    
    # System load
    echo -e "${BLUE}[系统负载]${NC}"
    uptime_info=$(uptime)
    echo "  $uptime_info"
    
    # Parse the 1-minute load average out of the uptime line.
    load_1min=$(echo "$uptime_info" | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | tr -d ' ')
    cpu_cores=$(nproc)
    # Overload threshold = cores x 2; the $(( )) fallback covers hosts
    # without bc installed (integer math is fine for the threshold).
    load_threshold=$(echo "$cpu_cores * 2" | bc -l 2>/dev/null || echo "$((cpu_cores * 2))")
    
    if [[ $(echo "$load_1min > $load_threshold" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
        log_error "1分钟负载 $load_1min 超过阈值 $load_threshold (CPU核数×2),系统过载!"
    elif [[ $(echo "$load_1min > $cpu_cores" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
        log_warn "1分钟负载 $load_1min 超过CPU核数 $cpu_cores,负载较高"
    else
        log_info "负载正常: $load_1min / $cpu_cores"
    fi
    echo ""
    
    # Memory overview
    echo -e "${BLUE}[内存概览]${NC}"
    free -h | grep -E "(Mem|Swap)" | while read line; do
        echo "  $line"
    done
    
    mem_info=$(free | grep Mem)
    total_mem=$(echo $mem_info | awk '{print $2}')
    used_mem=$(echo $mem_info | awk '{print $3}')
    # NOTE(review): avail_mem ($7 = "available" column) is captured but
    # never used below -- confirm whether a MemAvailable check was intended.
    avail_mem=$(echo $mem_info | awk '{print $7}')
    mem_usage=$(echo "scale=1; $used_mem * 100 / $total_mem" | bc -l 2>/dev/null || echo "0")
    
    if [[ $(echo "$mem_usage > 90" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
        log_error "内存使用率 ${mem_usage}%,严重不足!"
    elif [[ $(echo "$mem_usage > 70" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
        log_warn "内存使用率 ${mem_usage}%,压力较大"
    else
        log_info "内存使用率 ${mem_usage}%,状态良好"
    fi
    
    # Swap check: only evaluated when swap exists and some of it is used.
    swap_info=$(free | grep Swap)
    swap_total=$(echo $swap_info | awk '{print $2}')
    swap_used=$(echo $swap_info | awk '{print $3}')
    if [[ $swap_total -gt 0 && $swap_used -gt 0 ]]; then
        swap_usage=$(echo "scale=1; $swap_used * 100 / $swap_total" | bc -l 2>/dev/null || echo "0")
        if [[ $(echo "$swap_usage > 50" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            log_error "Swap使用率 ${swap_usage}%,频繁换页导致卡顿!"
        else
            log_warn "Swap使用率 ${swap_usage}%,注意内存压力"
        fi
    fi
    echo ""
    
    # Disk overview: color-code each /dev-backed mount by usage percent.
    echo -e "${BLUE}[磁盘概览]${NC}"
    df -h | grep -E "^/dev" | while read line; do
        usage=$(echo $line | awk '{print $5}' | tr -d '%')
        # NOTE(review): mount is extracted but unused below -- confirm.
        mount=$(echo $line | awk '{print $6}')
        if [[ $usage -gt 90 ]]; then
            echo -e "  ${RED}[警告]${NC} $line"
        elif [[ $usage -gt 80 ]]; then
            echo -e "  ${YELLOW}[注意]${NC} $line"
        else
            echo "  $line"
        fi
    done
    echo ""
}

#===============================================================================
# 第二层:系统资源层详细检查
#===============================================================================
layer2_system_resources() {
    # Layer 2: detailed resource inspection -- per-core CPU load, steal
    # time, memory pressure, disk IO utilisation/latency, and TCP state
    # distribution.  Uses the optional sysstat tools when present.
    log_section "第二层:系统资源层详细检查"
    
    # CPU detail
    echo -e "${BLUE}[CPU 详细分析]${NC}"
    echo "  物理核数: $(nproc)"
    echo "  型号: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d':' -f2 | sed 's/^[ \t]*//')"
    
    if command -v mpstat &> /dev/null; then
        echo ""
        echo "  各核负载分布 (mpstat -P ALL):"
        mpstat -P ALL $SAMPLE_INTERVAL 1 | tail -n +4 | head -n 20 | while read line; do
            echo "    $line"
        done
        
        # Single-core bottleneck heuristic: highest per-core %usr sample.
        max_cpu=$(mpstat -P ALL $SAMPLE_INTERVAL 1 | tail -n +4 | awk '{print $4}' | sort -rn | head -1)
        if [[ $(echo "$max_cpu > 80" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            log_warn "检测到单核使用率超过80%,可能存在单线程瓶颈"
        fi
    fi
    
    # Steal time check (relevant when this host is itself a VM guest).
    # Fix: the old `grep -q "st" /proc/stat` guard never matched (that file
    # contains no "st" substring), so the check was dead code.  Parse the
    # value straight from top and guard on non-empty instead.
    steal_time=$(top -bn1 | grep "Cpu(s)" | grep -oP '\d+\.\d+(?=\s*st)' | head -1)
    if [[ -n "$steal_time" && $(echo "$steal_time > 5" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
        log_error "CPU Steal Time: ${steal_time}%,宿主机超售严重!"
    fi
    echo ""
    
    # Memory pressure
    echo -e "${BLUE}[内存压力检测]${NC}"
    
    # Key /proc/meminfo metrics
    echo "  关键指标:"
    for key in MemTotal MemFree MemAvailable Buffers Cached Active Inactive Dirty Writeback AnonPages; do
        value=$(grep "^$key:" /proc/meminfo | awk '{print $2, $3}')
        [[ -n "$value" ]] && echo "    $key: $value"
    done
    
    # A large Dirty page volume means a writeback burst (IO storm) is imminent.
    dirty=$(grep "^Dirty:" /proc/meminfo | awk '{print $2}')
    if [[ $dirty -gt 102400 ]]; then  # > 100MB
        log_warn "Dirty内存 ${dirty}KB,大量数据待写入磁盘,即将触发IO风暴"
    fi
    
    # OOM-killer history in the kernel ring buffer
    if dmesg 2>/dev/null | grep -i "out of memory" | tail -5 | grep -q "oom"; then
        log_error "检测到历史 OOM 事件!"
        echo "  最近5条:"
        dmesg 2>/dev/null | grep -i "out of memory" | tail -5 | sed 's/^/    /'
    fi
    echo ""
    
    # Disk IO analysis
    echo -e "${BLUE}[磁盘 IO 分析]${NC}"
    if command -v iostat &> /dev/null; then
        echo "  IO统计 (iostat -xz):"
        iostat -xz $SAMPLE_INTERVAL 1 | tail -n +4 | head -20 | while read line; do
            echo "    $line"
        done
        
        # NOTE(review): the getline prints the last field of the line AFTER
        # each matching device line -- verify against the local iostat
        # output layout before trusting this %util extraction.
        disk_util=$(iostat -xz $SAMPLE_INTERVAL 1 2>/dev/null | awk '/^nvme|^sd|^vd/{getline; print $NF}' | sort -rn | head -1)
        if [[ -n "$disk_util" && $(echo "$disk_util > 90" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            log_error "磁盘利用率 ${disk_util}%,IO 饱和!"
        elif [[ -n "$disk_util" && $(echo "$disk_util > 70" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            log_warn "磁盘利用率 ${disk_util}%,IO 压力较大"
        fi
    else
        log_warn "未安装 sysstat,无法获取 IO 统计"
    fi
    
    # Average IO latency from /proc/diskstats.
    # Fix: per the kernel iostats documentation the per-device fields are
    #   $4 = reads completed, $7 = ms spent reading,
    #   $8 = writes completed, $11 = ms spent writing.
    # The old code divided the times by $6/$10 (sectors read/written),
    # which produced bogus latency figures.
    echo ""
    echo "  磁盘延迟估算 (读取/写入 ms):"
    grep -E "nvme|sd[a-z] " /proc/diskstats | while read line; do
        device=$(echo $line | awk '{print $3}')
        reads=$(echo $line | awk '{print $4}')
        read_ms=$(echo $line | awk '{print $7}')
        writes=$(echo $line | awk '{print $8}')
        write_ms=$(echo $line | awk '{print $11}')
        
        if [[ $reads -gt 0 ]]; then
            avg_read_latency=$(echo "scale=2; $read_ms / $reads" | bc -l 2>/dev/null || echo "N/A")
        else
            avg_read_latency="N/A"
        fi
        
        if [[ $writes -gt 0 ]]; then
            avg_write_latency=$(echo "scale=2; $write_ms / $writes" | bc -l 2>/dev/null || echo "N/A")
        else
            avg_write_latency="N/A"
        fi
        
        echo "    $device: 读延迟=${avg_read_latency}ms, 写延迟=${avg_write_latency}ms"
    done
    echo ""
    
    # Network connection analysis
    echo -e "${BLUE}[网络连接分析]${NC}"
    echo "  连接状态统计:"
    ss -s 2>/dev/null | sed 's/^/    /'
    
    echo ""
    echo "  TCP 状态分布:"
    ss -ant | awk 'NR>1 {++S[$1]} END {for(a in S) print "    " a, S[a]}' | sort -rn -k2
    
    # Fix: ss reports states with hyphens (TIME-WAIT / CLOSE-WAIT), so the
    # old TIME_WAIT/CLOSE_WAIT patterns never matched.  Also dropped the
    # `|| echo "0"` fallback: grep -c already prints 0 (while exiting 1) on
    # no match, so the fallback produced "0<newline>0" and broke the -gt
    # comparisons below.
    time_wait=$(ss -ant 2>/dev/null | grep -c 'TIME-WAIT' || true)
    close_wait=$(ss -ant 2>/dev/null | grep -c 'CLOSE-WAIT' || true)
    
    if [[ $close_wait -gt 100 ]]; then
        log_error "CLOSE_WAIT 连接数 $close_wait,存在连接泄漏!"
    fi
    
    if [[ $time_wait -gt 10000 ]]; then
        log_warn "TIME_WAIT 连接数 $time_wait,端口可能耗尽"
    fi
    echo ""
}

#===============================================================================
# 第三层:进程服务层检查
#===============================================================================
layer3_process_services() {
    # Layer 3: process and service level checks -- top CPU/memory
    # consumers, zombie and D-state processes, key systemd services,
    # and configured cron jobs.
    log_section "第三层:进程服务层检查"
    
    # Top resource-consuming processes
    echo -e "${BLUE}[TOP 资源消耗进程]${NC}"
    
    echo "  CPU 占用 TOP 10:"
    # `nl` prepends a rank column ("N. "), so the ps fields shift right by
    # one: $3 = PID, $4 = %CPU, $5 = %MEM.
    ps aux --sort=-%cpu | head -11 | tail -10 | nl -w 2 -s '. ' | while read line; do
        pid=$(echo $line | awk '{print $3}')
        cpu=$(echo $line | awk '{print $4}')
        mem=$(echo $line | awk '{print $5}')
        # NOTE(review): pid/mem/cmd are extracted but only cpu drives the
        # coloring below -- confirm whether they were meant to be printed.
        cmd=$(echo $line | cut -d' ' -f12-)
        
        if [[ $(echo "$cpu > 50" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            echo -e "    ${RED}$line${NC}"
        elif [[ $(echo "$cpu > 20" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            echo -e "    ${YELLOW}$line${NC}"
        else
            echo "    $line"
        fi
    done
    
    echo ""
    echo "  内存占用 TOP 10:"
    ps aux --sort=-%mem | head -11 | tail -10 | nl -w 2 -s '. ' | while read line; do
        mem=$(echo $line | awk '{print $5}')
        if [[ $(echo "$mem > 10" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            echo -e "    ${RED}$line${NC}"
        elif [[ $(echo "$mem > 5" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
            echo -e "    ${YELLOW}$line${NC}"
        else
            echo "    $line"
        fi
    done
    echo ""
    
    # Zombie process check (state "Z" in ps column 8)
    echo -e "${BLUE}[僵尸进程检查]${NC}"
    zombie_count=$(ps aux | awk '$8=="Z" {count++} END {print count+0}')
    if [[ $zombie_count -gt 0 ]]; then
        log_error "发现 $zombie_count 个僵尸进程!"
        echo "  僵尸进程列表:"
        ps aux | awk '$8=="Z" {print "    PID: " $2, "PPID: " $3, "CMD: " $11}' | head -10
        echo "  父进程:"
        # Resolve each zombie's parent: reaping requires the parent to
        # wait() on the child or be restarted.
        ps aux | awk '$8=="Z" {print $3}' | sort -u | xargs -I {} ps -p {} -o pid,comm 2>/dev/null
    else
        log_info "未发现僵尸进程"
    fi
    echo ""
    
    # D-state processes (uninterruptible sleep, usually blocked on IO)
    echo -e "${BLUE}[D 状态进程检查 (不可中断睡眠)]${NC}"
    d_state_procs=$(ps aux | awk '$8=="D" {print $0}')
    if [[ -n "$d_state_procs" ]]; then
        log_error "发现 D 状态进程(IO 等待/死锁):"
        echo "$d_state_procs" | head -5 | sed 's/^/    /'
    else
        log_info "未发现 D 状态进程"
    fi
    echo ""
    
    # Key service status via systemctl
    echo -e "${BLUE}[关键服务状态]${NC}"
    critical_services=("sshd" "systemd-journald" "cron" "networking" "docker" "containerd" "kubelet")
    for service in "${critical_services[@]}"; do
        if systemctl is-active --quiet $service 2>/dev/null; then
            echo "  [运行中] $service"
        elif systemctl is-failed --quiet $service 2>/dev/null; then
            echo -e "  ${RED}[失败] $service${NC}"
        else
            echo "  [未运行/未安装] $service"
        fi
    done
    echo ""
    
    # Cron job check
    echo -e "${BLUE}[定时任务检查]${NC}"
    echo "  当前时间: $(date '+%H:%M')"
    # NOTE(review): the header claims "last 5 minutes" but the listing below
    # simply dumps every non-comment crontab entry and /etc/cron.d file --
    # no time filtering is applied.
    echo "  最近5分钟内可能执行的 cron 任务:"
    crontab -l 2>/dev/null | grep -v "^#" | while read line; do
        echo "    $line"
    done
    ls -la /etc/cron.d/ 2>/dev/null | tail -n +3 | while read line; do
        echo "    /etc/cron.d/$(echo $line | awk '{print $9}')"
    done
    echo ""
}

#===============================================================================
# 第四层:深度分析(可选)
#===============================================================================
layer4_deep_analysis() {
    # Layer 4: deep analysis, only when -d was given.  PSI pressure stats,
    # syscall sampling, context-switch counters, MCE hardware errors, and
    # recent kernel log errors.
    if [[ "$DEEP_MODE" != true ]]; then
        return
    fi
    
    log_section "第四层:深度分析 (Deep Mode)"
    
    # PSI (Pressure Stall Information) -- present only on kernels built
    # with CONFIG_PSI, hence the -f guards.
    echo -e "${BLUE}[IO 压力详细分析]${NC}"
    if [[ -f /proc/pressure/io ]]; then
        echo "  PSI IO 压力统计:"
        cat /proc/pressure/io | sed 's/^/    /'
    fi
    
    if [[ -f /proc/pressure/memory ]]; then
        echo ""
        echo "  PSI 内存压力统计:"
        cat /proc/pressure/memory | sed 's/^/    /'
    fi
    echo ""
    
    # Syscall sampling
    echo -e "${BLUE}[系统调用采样 (pidstat)]${NC}"
    if command -v pidstat &> /dev/null; then
        echo "  采样 5 秒,查看系统调用频率..."
        # NOTE(review): `-S` is not a documented pidstat flag in common
        # sysstat releases -- verify; a failure here is silenced by the
        # 2>/dev/null and would just produce empty output.
        pidstat -S 1 5 2>/dev/null | tail -20 | sed 's/^/    /'
    fi
    echo ""
    
    # Context switches and interrupts
    echo -e "${BLUE}[上下文切换统计]${NC}"
    if command -v vmstat &> /dev/null; then
        echo "  (cs: 上下文切换, in: 中断, us/sy/id/wa: CPU时间分布)"
        vmstat -s | grep -E "(context switch|interrupt|CPU)" | sed 's/^/    /'
    fi
    echo ""
    
    # Machine-check (hardware) errors
    echo -e "${BLUE}[硬件错误检查]${NC}"
    if command -v mcelog &> /dev/null; then
        mcelog --client 2>/dev/null | head -10 | sed 's/^/    /'
    else
        log_info "未安装 mcelog,跳过硬件错误检查"
    fi
    
    # Recent kernel errors
    echo ""
    echo "  最近内核错误 (dmesg):"
    dmesg -T 2>/dev/null | grep -iE "(error|fail|warn|oom|killed)" | tail -10 | sed 's/^/    /'
    echo ""
}

#===============================================================================
# 诊断报告生成
#===============================================================================
generate_report() {
    # Closing summary: total elapsed time plus heuristic recommendations
    # derived from load average, memory usage and zombie/D-state counts.
    log_section "诊断报告总结"
    
    END_TIME=$(date +%s)
    DURATION=$((END_TIME - START_TIME))
    
    printf '%s\n' "排查耗时: ${DURATION} 秒"
    printf '\n'
    
    echo -e "${BLUE}[优化建议]${NC}"
    
    # Current 1-minute load average and integer memory-usage percentage.
    load=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | tr -d ' ')
    mem_usage=$(free | grep Mem | awk '{printf("%.0f", $3/$2 * 100)}')
    
    if [[ $(echo "$load > $(nproc)" | bc -l 2>/dev/null || echo "0") -eq 1 ]]; then
        printf '%s\n' "  1. [高负载] 系统负载过高,建议:"
        printf '%s\n' "     - 检查 CPU 占用最高的进程,考虑优化或迁移"
        printf '%s\n' "     - 如果是多核系统,检查是否存在单线程瓶颈"
    fi
    
    if [[ $mem_usage -gt 80 ]]; then
        printf '%s\n' "  2. [内存不足] 内存使用率 ${mem_usage}%,建议:"
        printf '%s\n' "     - 找出内存占用 TOP 进程,关闭不必要的服务"
        printf '%s\n' "     - 检查是否有内存泄漏(RSS 持续增长的进程)"
        printf '%s\n' "     - 考虑增加物理内存或启用 swap(临时方案)"
    fi
    
    # Zombie processes: state "Z" in ps column 8.
    zombie_count=$(ps aux | awk '$8=="Z" {count++} END {print count+0}')
    if (( zombie_count > 0 )); then
        printf '%s\n' "  3. [僵尸进程] 存在 $zombie_count 个僵尸进程,建议重启父进程或系统"
    fi
    
    # D-state (uninterruptible sleep) processes: usually stuck on IO.
    d_count=$(ps aux | awk '$8=="D" {count++} END {print count+0}')
    if (( d_count > 0 )); then
        printf '%s\n' "  4. [IO 阻塞] 存在 $d_count 个 D 状态进程,建议:"
        printf '%s\n' "     - 检查磁盘健康状态 (smartctl)"
        printf '%s\n' "     - 检查是否有 NFS/iSCSI 等网络存储挂死"
    fi
    
    printf '\n'
    echo -e "${GREEN}诊断完成!如需进一步分析,请查看上述详细输出。${NC}"
}

#===============================================================================
# 主函数
#===============================================================================
# Print command-line usage (shared by -h and the invalid-option path).
_usage() {
    echo "Usage: $0 [-q] [-d] [-o <output_file>]"
    echo "  -q    快速模式 (仅第一层)"
    echo "  -d    深度模式 (包含第四层)"
    echo "  -o    输出到文件"
}

main() {
    # Parse CLI flags, optionally tee everything into a report file, then
    # run the diagnostic layers in order.
    while getopts "qdo:h" opt; do
        case $opt in
            q) QUICK_MODE=true ;;
            d) DEEP_MODE=true ;;
            o) REPORT_FILE="$OPTARG" ;;
            h)
                _usage
                exit 0
                ;;
            *)
                # Bug fix: an unknown/invalid flag used to exit 0 (success)
                # through the merged h|* arm; print usage to stderr and
                # fail instead.
                _usage >&2
                exit 1
                ;;
        esac
    done
    
    # Mirror stdout and stderr into the report file when -o was given.
    if [[ -n "$REPORT_FILE" ]]; then
        exec > >(tee -a "$REPORT_FILE")
        exec 2>&1
    fi
    
    # Environment checks
    check_root
    check_dependencies
    print_header
    
    # Layer 1 always runs; layers 2-3 are skipped in quick mode, and
    # layer 4 additionally requires -d (checked inside the function).
    layer1_quick_overview
    
    if [[ "$QUICK_MODE" != true ]]; then
        layer2_system_resources
        layer3_process_services
        layer4_deep_analysis
    fi
    
    generate_report
}

# Script entry point: forward all command-line arguments to main.
main "$@"

执行(脚本文件名与头部注释保持一致):

bash system_diagnosis.sh