监控 Java 进程内存占用,并在触发阈值时自动生成堆转储(dump)

20 阅读3分钟

监控脚本(jvm_watch.sh)

#!/bin/bash

# ----------------------------------------------
# Configuration: the Java process to monitor
# ----------------------------------------------
# Name fragment used to find the target process on its command line.
PROCESS_NAME='zhi-ai-server'
# Absolute path of the JDK installation (its bin/ tools are used below).
JAVA_HOME='/usr/java/openjdk21'

# ----------------------------------------------
# Paths derived from this script's location
# ----------------------------------------------
# Absolute directory holding this script, independent of the caller's cwd.
WATCH_SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Dumps and the monitor log go into a sibling directory of the script dir.
DUMP_DIR="$WATCH_SCRIPT_DIR/../jvm_dumps"
LOG_FILE="$DUMP_DIR/jvm_monitor.log"

# ==============================================
# Initialization and validation
# ==============================================
# Create the output directory first; the log file lives inside it.
if ! mkdir -p "$DUMP_DIR"; then
    # LOG_FILE is located in the directory that could not be created, so
    # piping through `tee -a "$LOG_FILE"` would only add a second error.
    # Report on stderr instead.
    echo "[$(date)] 错误:无法创建目录 $DUMP_DIR(权限不足)" >&2
    exit 1
fi

# The configured JDK must exist before its tools can be resolved.
if [ ! -d "$JAVA_HOME" ]; then
    echo "[$(date)] 错误:JAVA_HOME不存在 - $JAVA_HOME" | tee -a "$LOG_FILE"
    exit 1
fi

# Absolute paths of the JDK tools used by the monitor.
# Note: JMEM holds the path of jmap (the name is kept because other parts of
# the script reference it).
JSTAT="${JAVA_HOME}/bin/jstat"
JMEM="${JAVA_HOME}/bin/jmap"
if [ ! -x "$JSTAT" ] || [ ! -x "$JMEM" ]; then
    echo "[$(date)] 错误:JDK工具缺失(jstat/jmap)" | tee -a "$LOG_FILE"
    exit 1
fi

# ==============================================
# Resolve the PID of the target Java process
# ==============================================
# Globals:  PROCESS_NAME (read), LOG_FILE (appended to)
# Outputs:  the PID on stdout when exactly one matching process exists.
#           Error messages go to LOG_FILE and stderr, never to stdout, so a
#           caller using PID=$(get_target_pid) only ever captures a PID.
# Exits:    status 1 when zero or several processes match. When called
#           inside $(...), that exit only ends the subshell; the caller must
#           check the status of the command substitution.
get_target_pid() {
    local pids pid_count

    # Match only command lines containing both PROCESS_NAME and "java"
    # (either order); drop the grep process itself.
    pids=$(ps aux \
        | grep -E "$PROCESS_NAME.*java|java.*$PROCESS_NAME" \
        | grep -v grep \
        | awk '{print $2}')

    # `echo "" | wc -l` reports 1, so an empty result must be handled
    # explicitly or the "not found" case can never be detected.
    if [ -z "$pids" ]; then
        pid_count=0
    else
        pid_count=$(printf '%s\n' "$pids" | wc -l | awk '{print $1}')
    fi

    if [ "$pid_count" -eq 0 ]; then
        echo "[$(date)] 错误:未找到【$PROCESS_NAME】相关的Java进程" | tee -a "$LOG_FILE" >&2
        exit 1
    elif [ "$pid_count" -gt 1 ]; then
        # Join the PIDs with spaces so the log entry stays on one line.
        echo "[$(date)] 错误:找到多个【$PROCESS_NAME】相关的Java进程(PID: ${pids//$'\n'/ })" | tee -a "$LOG_FILE" >&2
        exit 1
    fi

    echo "$pids"
}

# ==============================================
# System available memory, in GB
# ==============================================
# Outputs: available memory in GiB on stdout, truncated to two decimals and
#          always with a leading digit (e.g. "0.87"), or "0.0" when no usable
#          value could be read from `free`.
#
# Reads column 7 of the "Mem:" line of `free -b` and falls back to column 4
# when column 7 is missing or zero. LC_ALL=C is set so the "Mem:" label is
# not affected by the user's locale.
#
# The arithmetic is done in awk rather than bc: bc prints values below 1
# without a leading zero (".87"), which leaves an empty integer part for
# callers that split on ".".
get_system_available_memory() {
    LC_ALL=C free -b | awk '
        /^Mem:/ {
            bytes = ($7 + 0 > 0) ? $7 + 0 : $4 + 0
            found = 1
        }
        END {
            if (!found || bytes <= 0) {
                print "0.0"
                exit
            }
            # Truncate (not round) to two decimals.
            printf "%.2f\n", int(bytes * 100 / (1024 * 1024 * 1024)) / 100
        }'
}

# ==============================================
# JVM heap usage, in percent
# ==============================================
# Globals:  JSTAT, PID, PROCESS_NAME (read), LOG_FILE (appended to)
# Outputs:  used/capacity of the heap as a percentage on stdout, truncated to
#           two decimals with a leading digit (e.g. "0.50", "93.17"), or
#           "0.0" when the reported capacity is not positive.
# Returns:  0 on success; 1 when jstat produced no output. In that case the
#           error is written to LOG_FILE and stderr and stdout stays empty,
#           so callers capturing the output never receive log text.
#
# Uses the last line of `jstat -gc`, columns 1-8:
#   S0C S1C S0U S1U EC EU OC OU
# NOTE(review): the capacity columns are the capacities jstat reports at
# sampling time, so the percentage is relative to those values; confirm this
# is the intended baseline rather than the maximum heap size.
get_jvm_heap_usage() {
    local gc_stats
    gc_stats=$("$JSTAT" -gc "$PID" 2>/dev/null | tail -n 1)
    if [ -z "$gc_stats" ]; then
        echo "[$(date)] 错误:无法获取JVM统计信息(进程【$PROCESS_NAME】PID: $PID)" | tee -a "$LOG_FILE" >&2
        return 1
    fi

    # One awk pass: truncate each column to whole KB, sum, then divide.
    printf '%s\n' "$gc_stats" | awk '{
        total = int($1) + int($2) + int($5) + int($7)
        used  = int($3) + int($4) + int($6) + int($8)
        if (total <= 0 || used < 0) {
            print "0.0"
            exit
        }
        # Truncate (not round) to two decimals.
        printf "%.2f\n", int(used * 10000 / total) / 100
    }'
    return 0
}

# ==============================================
# Collect diagnostics once a trigger condition is met
# ==============================================
# Globals:  JMEM (jmap path), PID, PROCESS_NAME, DUMP_DIR (read),
#           LOG_FILE (appended to)
# Effects:  writes three files into DUMP_DIR, all sharing one timestamp:
#             before_dump_*.txt  - `jmap -histo` output
#             heapdump_*.hprof   - binary heap dump
#             after_dump_*.txt   - `jmap -histo:live` output
# Exits:    always terminates the script; status 0 when every jmap step
#           succeeded, 1 when any of them failed (the failure is logged
#           instead of being reported as a successful run).
dump_and_log() {
    local timestamp
    timestamp=$(date +%Y%m%d_%H%M%S)
    local suffix="${PROCESS_NAME}_${PID}_${timestamp}"
    local before_dump="${DUMP_DIR}/before_dump_${suffix}.txt"
    local heap_dump="${DUMP_DIR}/heapdump_${suffix}.hprof"
    local after_dump="${DUMP_DIR}/after_dump_${suffix}.txt"
    local status=0

    echo "[$(date)] 满足触发条件,处理进程【$PROCESS_NAME】(PID: $PID)..." | tee -a "$LOG_FILE"

    if ! "$JMEM" -histo "$PID" > "$before_dump" 2>&1; then
        echo "[$(date)] 错误:jmap -histo 执行失败,详见 $before_dump" | tee -a "$LOG_FILE"
        status=1
    fi

    # Keep jmap's own messages in the log; the monitor's stdout may be
    # discarded by whoever launched it. PIPESTATUS[0] is jmap's status,
    # not tee's.
    "$JMEM" -dump:format=b,file="$heap_dump" "$PID" 2>&1 | tee -a "$LOG_FILE"
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
        echo "[$(date)] 错误:堆转储失败 - $heap_dump" | tee -a "$LOG_FILE"
        status=1
    fi

    if ! "$JMEM" -histo:live "$PID" > "$after_dump" 2>&1; then
        echo "[$(date)] 错误:jmap -histo:live 执行失败,详见 $after_dump" | tee -a "$LOG_FILE"
        status=1
    fi

    if [ "$status" -eq 0 ]; then
        echo "[$(date)] 处理完成,文件保存到:$DUMP_DIR" | tee -a "$LOG_FILE"
    else
        echo "[$(date)] 处理结束,但部分步骤失败,请检查:$DUMP_DIR" | tee -a "$LOG_FILE"
    fi
    exit "$status"
}

# ==============================================
# Main monitoring loop
# ==============================================
# Usage: jvm_watch.sh [interval]   (interval is passed to sleep, default 30)
#
# Resolve the target PID once. get_target_pid runs in a command-substitution
# subshell, so an `exit` inside it cannot stop this script; check both its
# status and that the captured value really is a PID.
if ! PID=$(get_target_pid) || ! [[ "$PID" =~ ^[0-9]+$ ]]; then
    echo "[$(date)] 错误:无法确定【$PROCESS_NAME】的唯一Java进程PID,监控未启动" | tee -a "$LOG_FILE"
    exit 1
fi
INTERVAL=${1:-30}

echo "[$(date)] 监控启动:进程【$PROCESS_NAME】PID=$PID,间隔=${INTERVAL}秒" | tee -a "$LOG_FILE"
echo "[$(date)] 日志目录:$DUMP_DIR" | tee -a "$LOG_FILE"

while true; do
    # Stop monitoring once the target process is gone.
    if ! ps -p "$PID" > /dev/null; then
        echo "[$(date)] 错误:进程【$PROCESS_NAME】已退出,监控停止" | tee -a "$LOG_FILE"
        exit 0
    fi

    # Trigger when system available memory drops below 1 GB.
    sys_available=$(get_system_available_memory)
    sys_available=${sys_available:-0.0}
    # Integer part of the value. A value printed without a leading zero
    # (".87", as bc does) leaves nothing before the dot and must count as 0,
    # otherwise the numeric test errors out and the trigger never fires.
    sys_available_int=${sys_available%%.*}
    sys_available_int=${sys_available_int:-0}
    if [[ "$sys_available_int" =~ ^[0-9]+$ ]] && [ "$sys_available_int" -lt 1 ]; then
        echo "[$(date)] 系统可用内存过低:$sys_available GB,触发处理" | tee -a "$LOG_FILE"
        dump_and_log
    fi

    # Trigger when JVM heap usage reaches 90% or more. On failure fall back
    # to 0.0 and discard whatever was captured, so stray text never reaches
    # the numeric comparison.
    jvm_usage=$(get_jvm_heap_usage) || jvm_usage="0.0"
    jvm_usage=${jvm_usage:-0.0}
    if awk -v u="$jvm_usage" 'BEGIN { exit !(u + 0 >= 90) }'; then
        echo "[$(date)] JVM堆使用率过高:$jvm_usage%,触发处理" | tee -a "$LOG_FILE"
        dump_and_log
    fi

    # Regular heartbeat entry.
    echo "[$(date)] 监控中:JVM使用率=$jvm_usage%,系统可用内存=$sys_available GB" | tee -a "$LOG_FILE"

    # A rejected interval would otherwise turn this into a busy loop that
    # floods the log.
    if ! sleep "$INTERVAL"; then
        echo "[$(date)] 错误:无效的监控间隔 - $INTERVAL" | tee -a "$LOG_FILE"
        exit 1
    fi
done

启动监控脚本(jvm_start.sh)

#!/bin/bash

# Absolute directory of this control script (jvm_start.sh).
CONTROL_SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Absolute path of the monitor script (jvm_watch.sh), expected alongside it.
WATCH_SCRIPT_FULL_PATH="$CONTROL_SCRIPT_DIR/jvm_watch.sh"

# ----------------------------------------------
# Dependency checks
# ----------------------------------------------
# The monitor script must exist as a regular file.
[ -f "$WATCH_SCRIPT_FULL_PATH" ] || {
    echo "错误:未找到监控脚本,路径:$WATCH_SCRIPT_FULL_PATH"
    echo "请确保jvm_start.sh与jvm_watch.sh在同一目录"
    exit 1
}

# The monitor script must be executable; try to fix that automatically.
if ! [ -x "$WATCH_SCRIPT_FULL_PATH" ]; then
    echo "警告:监控脚本缺少执行权限,正在自动添加..."
    if ! chmod +x "$WATCH_SCRIPT_FULL_PATH"; then
        echo "错误:无法添加执行权限,请手动执行 chmod +x $WATCH_SCRIPT_FULL_PATH"
        exit 1
    fi
fi

# ==============================================
# Wait for ai-server to come up, so the monitor starts with a live target
# ==============================================
# Arguments: $1 - maximum wait in seconds (optional, default 120)
#            $2 - polling interval in seconds (optional, default 5)
# Returns:   0 as soon as exactly one matching Java process exists,
#            1 when the maximum wait elapses first.
#
# The process filter mirrors the one in jvm_watch.sh: the command line must
# contain both the process name and "java". A bare 'ai-server' match would
# also count unrelated processes (e.g. a `tail` on a log file, or a script
# whose path contains the name), which makes the "exactly one" test fail
# even though the server is running.
wait_for_ai_server() {
    local MAX_WAIT=${1:-120}
    local INTERVAL=${2:-5}
    local ELAPSED=0
    # Must stay in sync with PROCESS_NAME in jvm_watch.sh.
    local target="zhi-ai-server"
    local pattern="${target}.*java|java.*${target}"
    local pids

    echo "等待ai-server进程启动(最多等待${MAX_WAIT}秒)..."
    # Print the equivalent manual check.
    echo "手动检查ai-server进程命令:ps aux | grep -E '$pattern' | grep -v grep"

    while true; do
        pids=$(ps aux | grep -E "$pattern" | grep -v grep | awk '{print $2}')
        if [ -n "$pids" ] && [ "$(printf '%s\n' "$pids" | wc -l | awk '{print $1}')" -eq 1 ]; then
            echo "ai-server进程已启动(PID: $pids)"
            return 0
        fi

        # Test the deadline after the check, so the last poll happens at
        # MAX_WAIT rather than one interval earlier.
        if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
            echo "错误:等待ai-server启动超时(超过${MAX_WAIT}秒),监控启动失败"
            return 1
        fi

        sleep "$INTERVAL"
        ELAPSED=$((ELAPSED + INTERVAL))
    done
}

# ==============================================
# Start the monitor
# ==============================================
# Succeeds when a process whose command line contains the monitor script's
# full path exists. The path is matched literally (grep -F), so characters
# such as "." are not treated as regex metacharacters.
_watch_is_running() {
    ps -ef | grep -F -- "$WATCH_SCRIPT_FULL_PATH" | grep -v grep > /dev/null
}

# Globals:  WATCH_SCRIPT_FULL_PATH (read)
# Returns:  0 when the monitor is already running or was started and
#           verified; 1 when waiting for ai-server failed or the monitor is
#           not running after launch.
#
# The manual-command hints are still printed for the operator, but the
# commands themselves are executed directly instead of through `eval`, so a
# path containing quotes or `$` cannot be re-interpreted by the shell.
start() {
    # Already running? (The equivalent manual check is printed.)
    local check_cmd="ps -ef | grep \"$WATCH_SCRIPT_FULL_PATH\" | grep -v grep > /dev/null"
    echo "手动检查监控进程是否运行:$check_cmd"
    if _watch_is_running; then
        echo "监控脚本已在运行(全路径:$WATCH_SCRIPT_FULL_PATH)"
        return 0
    fi

    # Do not launch the monitor before its target exists.
    if ! wait_for_ai_server; then
        return 1
    fi

    # Launch detached with a 60-second interval.
    echo "正在启动监控脚本(全路径:$WATCH_SCRIPT_FULL_PATH)..."
    local start_cmd="nohup \"$WATCH_SCRIPT_FULL_PATH\" 60 > /dev/null 2>&1 &"
    echo "手动启动监控命令(可直接复制执行):$start_cmd"
    nohup "$WATCH_SCRIPT_FULL_PATH" 60 > /dev/null 2>&1 &
    sleep 3  # give the monitor time to start, or to fail its own checks

    # Verify the launch and report failure through the return status.
    local verify_cmd="ps -ef | grep \"$WATCH_SCRIPT_FULL_PATH\" | grep -v grep"
    echo "手动验证监控进程(查看PID):$verify_cmd"
    if _watch_is_running; then
        echo "监控启动成功!日志和dump文件路径:$(dirname "$WATCH_SCRIPT_FULL_PATH")/../jvm_dumps"
        return 0
    fi
    echo "监控启动失败!请查看jvm_dumps目录下的jvm_monitor.log排查错误"
    return 1
}

# ==============================================
# Stop the monitor
# ==============================================
# Prints the PIDs of processes whose command line contains the monitor
# script's full path (matched literally), one per line.
_watch_pids() {
    ps -ef | grep -F -- "$WATCH_SCRIPT_FULL_PATH" | grep -v grep | awk '{print $2}'
}

# Globals:  WATCH_SCRIPT_FULL_PATH (read)
# Returns:  0 when no monitor process remains, 1 otherwise.
#
# Sends the default signal (TERM) first and escalates to KILL only for
# processes that are still present afterwards. Nothing is signalled when the
# monitor is not running. Commands run directly rather than through `eval`.
stop() {
    local pids pid

    echo "正在停止监控脚本(全路径:$WATCH_SCRIPT_FULL_PATH)..."
    # Manual fallback command for the operator (can be copied as is).
    local kill_cmd="ps -ef | grep \"$WATCH_SCRIPT_FULL_PATH\" | grep -v grep | awk '{print \$2}' | xargs -n 1 kill -9"
    echo "手动停止监控命令(可直接复制执行):$kill_cmd"

    pids=$(_watch_pids)
    if [ -n "$pids" ]; then
        # Word splitting is intended: one PID per line.
        for pid in $pids; do
            kill "$pid" 2>/dev/null
        done
        sleep 2

        # Escalate only for survivors.
        pids=$(_watch_pids)
        if [ -n "$pids" ]; then
            for pid in $pids; do
                kill -9 "$pid" 2>/dev/null
            done
            sleep 2
        fi
    fi

    # Verify the result.
    if [ -n "$(_watch_pids)" ]; then
        echo "停止失败!请手动执行上面的kill命令"
        return 1
    fi
    echo "停止成功!"
    return 0
}

# ----------------------------------------------
# Entry point: dispatch on the first argument (start / stop / restart)
# ----------------------------------------------
if [[ "$1" == "start" ]]; then
    start
elif [[ "$1" == "stop" ]]; then
    stop
elif [[ "$1" == "restart" ]]; then
    # Restart is a stop followed by a start.
    echo "===== 重启监控脚本 ====="
    stop
    start
else
    # Anything else, including no argument: show usage and fail.
    echo "用法:$0 {start|stop|restart}"
    exit 1
fi