监控脚本(jvm_watch.sh)
#!/bin/bash
# ==============================================
# Configuration: the Java process to monitor
# ==============================================
# Name fragment used to recognise the target process in the process list.
PROCESS_NAME='zhi-ai-server'
# Absolute path of the JDK installation.
JAVA_HOME='/usr/java/openjdk21'

# ==============================================
# Paths derived from the location of this script
# ==============================================
# Absolute directory that contains this script.
WATCH_SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Directory receiving histograms and heap dumps (a sibling of the script directory).
DUMP_DIR="$WATCH_SCRIPT_DIR/../jvm_dumps"
# Monitor log file, kept inside the dump directory.
LOG_FILE="$DUMP_DIR/jvm_monitor.log"
# ==============================================
# Initialization and validation
# ==============================================
# LOG_FILE lives inside DUMP_DIR, so when the directory cannot be created the
# message is written to stderr only: appending to the log would fail as well.
if ! mkdir -p "$DUMP_DIR"; then
  echo "[$(date)] 错误:无法创建目录 $DUMP_DIR(权限不足)" >&2
  exit 1
fi

# The configured JDK directory must exist.
if [ ! -d "$JAVA_HOME" ]; then
  echo "[$(date)] 错误:JAVA_HOME不存在 - $JAVA_HOME" | tee -a "$LOG_FILE"
  exit 1
fi

# Absolute paths of the JDK tools used by the monitor; both must be executable.
JSTAT="${JAVA_HOME}/bin/jstat"
JMEM="${JAVA_HOME}/bin/jmap"
if [ ! -x "$JSTAT" ] || [ ! -x "$JMEM" ]; then
  echo "[$(date)] 错误:JDK工具缺失(jstat/jmap)" | tee -a "$LOG_FILE"
  exit 1
fi
# ==============================================
# Resolve the PID of the target Java process
# ==============================================
#######################################
# Finds the single process whose `ps aux` line contains both PROCESS_NAME and
# "java" (in either order).
# Globals:   PROCESS_NAME (read), LOG_FILE (appended to)
# Outputs:   the PID on stdout; error messages on stderr and in LOG_FILE, so
#            that a caller capturing stdout never receives message text.
# Returns:   0 when exactly one process matches. Otherwise exits with status
#            1; inside $(...) that exit only ends the subshell, so the caller
#            has to check the substitution status or the captured value.
#######################################
get_target_pid() {
  local pids pid_count
  pids=$(ps aux \
    | grep -E "$PROCESS_NAME.*java|java.*$PROCESS_NAME" \
    | grep -v grep \
    | awk '{print $2}')
  # Count only lines that carry a PID. `echo "$pids" | wc -l` reports 1 for an
  # empty result, which made the "not found" branch unreachable.
  pid_count=$(printf '%s\n' "$pids" | grep -c '[0-9]')

  if [ "$pid_count" -eq 0 ]; then
    echo "[$(date)] 错误:未找到【$PROCESS_NAME】相关的Java进程" | tee -a "$LOG_FILE" >&2
    exit 1
  elif [ "$pid_count" -gt 1 ]; then
    echo "[$(date)] 错误:找到多个【$PROCESS_NAME】相关的Java进程(PID: $pids)" | tee -a "$LOG_FILE" >&2
    exit 1
  fi
  printf '%s\n' "$pids"
}
# ==============================================
# System available memory (GB)
# ==============================================
#######################################
# Prints the memory available to the system in GiB with two decimals
# (truncated, always with a leading digit, e.g. "0.93").
# The value is column 7 of the "Mem:" row of `free -b`; when that column is
# missing, non-numeric or zero, column 4 is used instead.
# Outputs:   the value on stdout, or "0.0" when neither column is usable.
# Returns:   0
#######################################
get_system_available_memory() {
  local mem_line free_col avail_col available_bytes hundredths
  # LC_ALL=C keeps the row label as "Mem:" where free's output is localized.
  mem_line=$(LC_ALL=C free -b 2>/dev/null | awk '/Mem:/ {print; exit}')
  read -r _ _ _ free_col _ _ avail_col _ <<< "$mem_line"

  # Treat anything that is not a plain non-negative integer as 0.
  case "$avail_col" in '' | *[!0-9]*) avail_col=0 ;; esac
  case "$free_col" in '' | *[!0-9]*) free_col=0 ;; esac

  available_bytes=$avail_col
  if [ "$available_bytes" -eq 0 ]; then
    available_bytes=$free_col
  fi
  if [ "$available_bytes" -eq 0 ]; then
    echo "0.0"
    return
  fi

  # Integer arithmetic in hundredths of a GiB. Unlike `bc`, this never prints
  # a bare ".93", which callers cannot split into an integer part.
  hundredths=$(( 10#$available_bytes * 100 / 1073741824 ))
  printf '%d.%02d\n' "$(( hundredths / 100 ))" "$(( hundredths % 100 ))"
}
# ==============================================
# JVM heap usage (%)
# ==============================================
#######################################
# Prints the heap usage of process PID as a percentage with two decimals
# (truncated, always with a leading digit, e.g. "48.33").
# The last line of `jstat -gc` is read with columns 1-8 taken as
# S0C S1C S0U S1U EC EU OC OU; usage = (S0U+S1U+EU+OU) / (S0C+S1C+EC+OC).
# Globals:   JSTAT, PID, PROCESS_NAME (read), LOG_FILE (appended to)
# Outputs:   the percentage on stdout ("0.0" when the total is not positive);
#            the error message goes to stderr and LOG_FILE so that a caller
#            capturing stdout only ever receives a number.
# Returns:   0 on success, 1 when jstat produced no output.
#######################################
get_jvm_heap_usage() {
  local gc_stats total_heap used_heap pct_x100
  gc_stats=$("$JSTAT" -gc "$PID" 2>/dev/null | tail -n 1)
  if [ -z "$gc_stats" ]; then
    echo "[$(date)] 错误:无法获取JVM统计信息(进程【$PROCESS_NAME】PID: $PID)" | tee -a "$LOG_FILE" >&2
    return 1
  fi

  # A single awk pass sums the capacity columns and the used columns; int()
  # drops the fractional part and maps non-numeric fields to 0.
  read -r total_heap used_heap < <(
    awk '{
      printf "%d %d\n", int($1) + int($2) + int($5) + int($7),
                        int($3) + int($4) + int($6) + int($8)
    }' <<< "$gc_stats"
  )

  if [ "${total_heap:-0}" -le 0 ] || [ "${used_heap:-0}" -lt 0 ]; then
    echo "0.0"
    return 0
  fi

  # Integer arithmetic in hundredths of a percent (truncating).
  pct_x100=$(( used_heap * 10000 / total_heap ))
  printf '%d.%02d\n' "$(( pct_x100 / 100 ))" "$(( pct_x100 % 100 ))"
  return 0
}
# ==============================================
# Collect diagnostics once a trigger condition is met
# ==============================================
#######################################
# Writes three timestamped files for process PID into DUMP_DIR, in this order:
# a class histogram (`jmap -histo`), a binary heap dump (`jmap -dump`) and a
# second histogram restricted to live objects (`jmap -histo:live`).
# Every step is attempted even when an earlier one failed.
# Globals:   JMEM, PID, PROCESS_NAME, DUMP_DIR (read), LOG_FILE (appended to)
# Outputs:   progress and failure messages on stdout and in LOG_FILE; the
#            output of the heap-dump command is copied to LOG_FILE as well.
# Returns:   never returns: terminates the script with status 0 when all
#            three jmap calls succeeded, 1 otherwise.
#######################################
dump_and_log() {
  local timestamp before_dump heap_dump after_dump dump_status
  local failed=0
  timestamp=$(date +%Y%m%d_%H%M%S)
  before_dump="${DUMP_DIR}/before_dump_${PROCESS_NAME}_${PID}_${timestamp}.txt"
  heap_dump="${DUMP_DIR}/heapdump_${PROCESS_NAME}_${PID}_${timestamp}.hprof"
  after_dump="${DUMP_DIR}/after_dump_${PROCESS_NAME}_${PID}_${timestamp}.txt"

  echo "[$(date)] 满足触发条件,处理进程【$PROCESS_NAME】(PID: $PID)..." | tee -a "$LOG_FILE"

  if ! "$JMEM" -histo "$PID" > "$before_dump" 2>&1; then
    echo "[$(date)] 错误:jmap -histo 执行失败,详见 ${before_dump}" | tee -a "$LOG_FILE"
    failed=1
  fi

  # Keep jmap's own output in the log; PIPESTATUS[0] is jmap's status, since
  # the pipeline status itself would be that of tee.
  "$JMEM" -dump:format=b,file="$heap_dump" "$PID" 2>&1 | tee -a "$LOG_FILE"
  dump_status=${PIPESTATUS[0]}
  if [ "$dump_status" -ne 0 ]; then
    echo "[$(date)] 错误:堆转储失败(jmap退出码: ${dump_status})" | tee -a "$LOG_FILE"
    failed=1
  fi

  if ! "$JMEM" -histo:live "$PID" > "$after_dump" 2>&1; then
    echo "[$(date)] 错误:jmap -histo:live 执行失败,详见 ${after_dump}" | tee -a "$LOG_FILE"
    failed=1
  fi

  if [ "$failed" -ne 0 ]; then
    echo "[$(date)] 处理结束,但部分步骤失败,输出目录:${DUMP_DIR}" | tee -a "$LOG_FILE"
    exit 1
  fi
  echo "[$(date)] 处理完成,文件保存到:$DUMP_DIR" | tee -a "$LOG_FILE"
  exit 0
}
# ==============================================
# Main monitoring loop
# ==============================================
# Resolve the target PID once. get_target_pid runs inside $(...), so an `exit`
# there cannot stop this script; the captured value is validated here instead.
PID=$(get_target_pid)
case "$PID" in
  '' | *[!0-9]*)
    echo "[$(date)] 错误:无法确定进程【${PROCESS_NAME}】的PID,监控退出" | tee -a "$LOG_FILE"
    exit 1
    ;;
esac

# Polling interval: first script argument, 30 when omitted. A value `sleep`
# would reject is refused up front, because a failing sleep would turn the
# loop below into a busy loop.
INTERVAL=${1:-30}
interval_re='^[0-9]+([.][0-9]+)?[smhd]?$'
if ! [[ "$INTERVAL" =~ $interval_re ]]; then
  echo "[$(date)] 错误:监控间隔无效 - ${INTERVAL}" | tee -a "$LOG_FILE"
  exit 1
fi

echo "[$(date)] 监控启动:进程【$PROCESS_NAME】PID=$PID,间隔=${INTERVAL}秒" | tee -a "$LOG_FILE"
echo "[$(date)] 日志目录:$DUMP_DIR" | tee -a "$LOG_FILE"

while true; do
  # Stop monitoring once the target process is gone.
  if ! ps -p "$PID" > /dev/null; then
    echo "[$(date)] 错误:进程【$PROCESS_NAME】已退出,监控停止" | tee -a "$LOG_FILE"
    exit 0
  fi

  # Trigger when less than 1 GB is available. The integer part may be empty
  # (a value printed as ".93"), which must count as 0 rather than make the
  # comparison fail. A reading of "0.0" also triggers.
  sys_available=$(get_system_available_memory)
  sys_available=${sys_available:-0.0}
  sys_available_int=${sys_available%%.*}
  sys_available_int=${sys_available_int:-0}
  if [ "$sys_available_int" -lt 1 ]; then
    echo "[$(date)] 系统可用内存过低:$sys_available GB,触发处理" | tee -a "$LOG_FILE"
    dump_and_log
  fi

  # Trigger when heap usage is at least 90%. When the reading fails, the
  # captured output is discarded and 0.0 is used, so error text can never end
  # up inside the number.
  if ! jvm_usage=$(get_jvm_heap_usage); then
    jvm_usage="0.0"
  fi
  jvm_usage=${jvm_usage:-0.0}
  # Comparing the integer part is equivalent to ">= 90" on the decimal value.
  jvm_usage_int=${jvm_usage%%.*}
  case "$jvm_usage_int" in '' | *[!0-9]*) jvm_usage_int=0 ;; esac
  if [ "$jvm_usage_int" -ge 90 ]; then
    echo "[$(date)] JVM堆使用率过高:$jvm_usage%,触发处理" | tee -a "$LOG_FILE"
    dump_and_log
  fi

  # Regular status line.
  echo "[$(date)] 监控中:JVM使用率=$jvm_usage%,系统可用内存=$sys_available GB" | tee -a "$LOG_FILE"
  sleep "$INTERVAL"
done
启动监控脚本(jvm_start.sh)
#!/bin/bash
# Absolute directory of this control script (jvm_start.sh).
CONTROL_SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Absolute path of the monitor script (jvm_watch.sh), expected in the same directory.
WATCH_SCRIPT_FULL_PATH="$CONTROL_SCRIPT_DIR/jvm_watch.sh"
# ==============================================
# Dependency checks
# ==============================================
# The monitor script has to exist as a regular file.
[ -f "$WATCH_SCRIPT_FULL_PATH" ] || {
  echo "错误:未找到监控脚本,路径:$WATCH_SCRIPT_FULL_PATH"
  echo "请确保jvm_start.sh与jvm_watch.sh在同一目录"
  exit 1
}

# Add the execute permission when it is missing; give up if that fails.
if ! [ -x "$WATCH_SCRIPT_FULL_PATH" ]; then
  echo "警告:监控脚本缺少执行权限,正在自动添加..."
  if ! chmod +x "$WATCH_SCRIPT_FULL_PATH"; then
    echo "错误:无法添加执行权限,请手动执行 chmod +x $WATCH_SCRIPT_FULL_PATH"
    exit 1
  fi
fi
# ==============================================
# Wait for ai-server (the monitor needs the process to exist when it starts)
# ==============================================
#######################################
# Polls the process list until exactly one process matches the same filter
# the monitor script applies (command line containing both "zhi-ai-server"
# and "java", in either order), or until MAX_WAIT seconds have been slept.
# Outputs:   progress messages on stdout.
# Returns:   0 once the process is found, 1 on timeout.
#######################################
wait_for_ai_server() {
  local MAX_WAIT=120 # maximum time to wait, in seconds
  local INTERVAL=5   # pause between two checks, in seconds
  local ELAPSED=0
  # Must stay identical to the filter built from PROCESS_NAME in jvm_watch.sh.
  # A looser filter (plain 'ai-server') also counts unrelated processes, and
  # any extra match makes the "exactly one" test fail until the timeout.
  local pattern='zhi-ai-server.*java|java.*zhi-ai-server'
  local pids pid_count

  echo "等待ai-server进程启动(最多等待${MAX_WAIT}秒)..."
  # Print the equivalent command for manual checks.
  echo "手动检查ai-server进程命令:ps aux | grep -E '${pattern}' | grep -v grep"

  while true; do
    pids=$(ps aux | grep -E "$pattern" | grep -v grep | awk '{print $2}')
    # Count PID lines only, so that an empty result counts as 0.
    pid_count=$(printf '%s\n' "$pids" | grep -c '[0-9]')
    if [ "$pid_count" -eq 1 ]; then
      echo "ai-server进程已启动(PID: $pids)"
      return 0
    fi

    # Give up only after MAX_WAIT seconds have really been slept.
    if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
      echo "错误:等待ai-server启动超时(超过${MAX_WAIT}秒),监控启动失败"
      return 1
    fi
    sleep "$INTERVAL"
    ELAPSED=$((ELAPSED + INTERVAL))
  done
}
# ==============================================
# Start the monitor
# ==============================================
#######################################
# Launches the monitor script in the background with a 60-second interval,
# unless a process mentioning its full path is already listed by `ps -ef`.
# Waits for ai-server first and verifies the launch three seconds later.
# The equivalent shell commands are printed so they can be run by hand; the
# commands themselves are executed directly rather than through `eval`.
# Globals:   WATCH_SCRIPT_FULL_PATH (read)
# Outputs:   progress messages on stdout.
# Returns:   0 when the monitor is (already) running, 1 when waiting for
#            ai-server failed or the monitor is not running after launch.
#######################################
start() {
  # Already running? (the printed string is for manual use only)
  local check_cmd="ps -ef | grep \"$WATCH_SCRIPT_FULL_PATH\" | grep -v grep > /dev/null"
  echo "手动检查监控进程是否运行:$check_cmd"
  if ps -ef | grep "$WATCH_SCRIPT_FULL_PATH" | grep -v grep > /dev/null; then
    echo "监控脚本已在运行(全路径:$WATCH_SCRIPT_FULL_PATH)"
    return 0
  fi

  # Do not launch the monitor when ai-server never showed up.
  if ! wait_for_ai_server; then
    return 1
  fi

  echo "正在启动监控脚本(全路径:$WATCH_SCRIPT_FULL_PATH)..."
  local start_cmd="nohup \"$WATCH_SCRIPT_FULL_PATH\" 60 > /dev/null 2>&1 &"
  echo "手动启动监控命令(可直接复制执行):$start_cmd"
  nohup "$WATCH_SCRIPT_FULL_PATH" 60 > /dev/null 2>&1 &
  sleep 3 # give the monitor time to come up (or to fail its own checks)

  local verify_cmd="ps -ef | grep \"$WATCH_SCRIPT_FULL_PATH\" | grep -v grep"
  echo "手动验证监控进程(查看PID):$verify_cmd"
  if ps -ef | grep "$WATCH_SCRIPT_FULL_PATH" | grep -v grep > /dev/null; then
    echo "监控启动成功!日志和dump文件路径:$(dirname "$WATCH_SCRIPT_FULL_PATH")/../jvm_dumps"
    return 0
  fi
  echo "监控启动失败!请查看jvm_dumps目录下的jvm_monitor.log排查错误"
  # Report the failure to the caller instead of the status of the echo above.
  return 1
}
# ==============================================
# Stop the monitor
# ==============================================
# Prints the PID column of every `ps -ef` line mentioning the monitor script.
_jvm_watch_pids() {
  ps -ef | grep "$WATCH_SCRIPT_FULL_PATH" | grep -v grep | awk '{print $2}'
}

#######################################
# Stops every process whose `ps -ef` line mentions the monitor script: first
# with the default signal of `kill`, then with -9 for whatever is still
# listed two seconds later.
# Globals:   WATCH_SCRIPT_FULL_PATH (read)
# Outputs:   progress messages and a copy-and-paste kill command on stdout.
# Returns:   0 when no such process is left, 1 otherwise.
#######################################
stop() {
  echo "正在停止监控脚本(全路径:$WATCH_SCRIPT_FULL_PATH)..."
  # Printed for manual use only; the signals below are sent directly.
  local kill_cmd="ps -ef | grep \"$WATCH_SCRIPT_FULL_PATH\" | grep -v grep | awk '{print \$2}' | xargs -n 1 kill -9"
  echo "手动停止监控命令(可直接复制执行):$kill_cmd"

  local pids pid
  pids=$(_jvm_watch_pids)
  # Word splitting is intended here: one PID per word.
  for pid in $pids; do
    kill "$pid" > /dev/null 2>&1
  done
  sleep 2

  # Escalate only for processes that ignored the first signal.
  pids=$(_jvm_watch_pids)
  if [ -n "$pids" ]; then
    for pid in $pids; do
      kill -9 "$pid" > /dev/null 2>&1
    done
    sleep 2
  fi

  if [ -n "$(_jvm_watch_pids)" ]; then
    echo "停止失败!请手动执行上面的kill命令"
    return 1
  fi
  echo "停止成功!"
  return 0
}
# ==============================================
# Entry point: dispatch on the first argument (start / stop / restart)
# ==============================================
if [ "$1" = "start" ]; then
  start
elif [ "$1" = "stop" ]; then
  stop
elif [ "$1" = "restart" ]; then
  # A restart is a stop followed by a start.
  echo "===== 重启监控脚本 ====="
  stop
  start
else
  # Anything else (including no argument): print the usage and fail.
  echo "用法:$0 {start|stop|restart}"
  exit 1
fi