从零开始:Java Agent 实现应用无侵入监控 + Python 调用链分析

0 阅读5分钟

从零开始:Java Agent 实现应用无侵入监控 + Python 调用链分析

不做概念堆砌,每一行代码都真实可运行

一、为什么需要 Agent?

先说痛点:生产环境出问题了,你怀疑某个方法执行太慢,但代码没打日志,怎么办?重新发布加日志?风险太大。

Java Agent 可以在不修改业务代码的情况下,动态增强已有类,实现:

  • 方法耗时统计
  • 全链路追踪
  • 参数/返回值拦截
  • 日志动态注入

配合 Python 脚本,还能做实时调用链分析和可视化。


二、一个能跑起来的 Agent 项目

2.1 项目结构


java-agent-demo/
├── agent/                                    # Agent 模块
│   ├── pom.xml
│   └── src/main/java/
│       └── com/agent/
│           ├── AgentMain.java                # Agent 入口
│           ├── AgentContext.java             # 线程级上下文存储
│           └── TimeMonitorTransformer.java   # 类转换器
├── target-app/                               # 被监控的应用
│   └── src/main/java/com/app/
│       └── UserService.java
└── scripts/
    ├── receiver.py                           # 日志接收服务
    ├── analyze.py                            # Python 分析脚本
    └── attach.py                             # 动态注入脚本

text

2.2 Agent 入口类

package com.agent;

import java.lang.instrument.Instrumentation;

/**
 * Entry points of the monitoring agent.
 *
 * <p>{@link #premain} runs when the agent is passed with {@code -javaagent} at JVM startup;
 * {@link #agentmain} runs when the agent is attached to an already running JVM. Neither
 * has priority over the other: which one is called depends only on how the agent is loaded.
 */
public class AgentMain {

    /** Dotted prefix of the application classes to retransform after a dynamic attach. */
    private static final String TARGET_PREFIX = "com.app.";

    /**
     * Dynamic-attach entry point (manifest attribute {@code Agent-Class}).
     *
     * <p>Registering a transformer only affects classes loaded afterwards, so classes that
     * are already loaded are explicitly retransformed here; otherwise an attach to a running
     * application would instrument nothing that is already in use.
     *
     * @param agentArgs options string passed by the attaching tool (unused)
     * @param inst      instrumentation handle supplied by the JVM
     */
    public static void agentmain(String agentArgs, Instrumentation inst) {
        System.out.println("[Agent] agentmain 已启动");
        // canRetransform=true is required for retransformClasses to invoke this transformer.
        inst.addTransformer(new TimeMonitorTransformer(), true);

        for (Class<?> loaded : inst.getAllLoadedClasses()) {
            if (!loaded.getName().startsWith(TARGET_PREFIX) || !inst.isModifiableClass(loaded)) {
                continue;
            }
            try {
                inst.retransformClasses(loaded);
            } catch (Exception | LinkageError e) {
                // One class failing must not prevent the remaining classes from being instrumented.
                System.err.println("[Agent] 重转换失败: " + loaded.getName());
                e.printStackTrace();
            }
        }
    }

    /**
     * Startup entry point (manifest attribute {@code Premain-Class}), called before the
     * application's {@code main} method.
     *
     * @param agentArgs options string from the {@code -javaagent} argument (unused)
     * @param inst      instrumentation handle supplied by the JVM
     */
    public static void premain(String agentArgs, Instrumentation inst) {
        System.out.println("[Agent] premain 已启动");
        inst.addTransformer(new TimeMonitorTransformer());
    }
}

2.3 核心 Transformer(关键代码)

java

package com.agent;

import java.lang.instrument.ClassFileTransformer;
import java.security.ProtectionDomain;
import javassist.ClassPool;
import javassist.CtClass;
import javassist.CtMethod;

public class TimeMonitorTransformer implements ClassFileTransformer {
    
    // 需要监控的包路径
    private static final String TARGET_PACKAGE = "com/app";
    
    @Override
    public byte[] transform(ClassLoader loader,
                            String className,
                            Class<?> classBeingRedefined,
                            ProtectionDomain protectionDomain,
                            byte[] classfileBuffer) {
        
        // 转换格式: com/app/UserService -> com.app.UserService
        String normalizedName = className.replace('/', '.');
        
        if (!normalizedName.startsWith(TARGET_PACKAGE)) {
            return null;  // 不处理的类直接跳过
        }
        
        try {
            ClassPool pool = ClassPool.getDefault();
            CtClass ctClass = pool.get(normalizedName);
            
            // 获取所有方法(包括父类)
            CtMethod[] methods = ctClass.getDeclaredMethods();
            
            for (CtMethod method : methods) {
                enhanceMethod(method, normalizedName);
            }
            
            return ctClass.toBytecode();
            
        } catch (Exception e) {
            System.err.println("[Agent] 增强失败: " + normalizedName);
            e.printStackTrace();
            return null;
        }
    }
    
    private void enhanceMethod(CtMethod method, String className) throws Exception {
        String methodName = method.getName();
        
        // 跳过构造方法和静态块
        if ("<init>".equals(methodName) || "<clinit>".equals(methodName)) {
            return;
        }
        
        // 在方法开头插入计时开始
        method.insertBefore(
            "long $agent_start = System.nanoTime();" +
            "java.util.Map $agent_store = com.agent.AgentContext.getStore();" +
            "$agent_store.put("" + className + "." + methodName + "_start", $agent_start);"
        );
        
        // 在方法结尾插入计时结束和输出
        method.insertAfter(
            "long $agent_end = System.nanoTime();" +
            "long $agent_startTime = (Long) com.agent.AgentContext.getStore().get("" + className + "." + methodName + "_start");" +
            "long $agent_cost = ($agent_end - $agent_startTime) / 1000000;" +
            "System.out.println("[Agent] " + "" + className + "." + methodName + "" + " 耗时: " + $agent_cost + " ms");" +
            "com.agent.AgentContext.getStore().remove("" + className + "." + methodName + "_start");",
            true  // true 表示在 finally 中执行
        );
    }
}

2.4 上下文存储(解决多线程问题)

java

package com.agent;

import java.util.HashMap;
import java.util.Map;

/**
 * Per-thread scratch storage used by the code the agent injects into target methods.
 *
 * <p>Each thread lazily gets its own {@link HashMap}, so entries written by one thread are
 * never visible to another.
 */
public class AgentContext {

    /** Holder of the calling thread's map; the map is created on first access. */
    private static final ThreadLocal<Map<String, Object>> PER_THREAD_STORE =
        new ThreadLocal<Map<String, Object>>() {
            @Override
            protected Map<String, Object> initialValue() {
                return new HashMap<>();
            }
        };

    /**
     * Returns the map that belongs to the calling thread.
     *
     * @return the current thread's mutable store, never {@code null}
     */
    public static Map<String, Object> getStore() {
        Map<String, Object> store = PER_THREAD_STORE.get();
        return store;
    }

    /** Drops the calling thread's map; the next {@link #getStore()} call creates a new one. */
    public static void clear() {
        PER_THREAD_STORE.remove();
    }
}

2.5 Maven 配置(pom.xml)

xml

<!-- Builds a single "fat" agent jar (agent classes plus their dependencies) whose
     manifest carries the attributes the JVM needs to load it as a Java agent. -->
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.3.0</version>
            <configuration>
                <descriptorRefs>
                    <!-- Predefined descriptor: bundles all dependencies into the jar. -->
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifestEntries>
                        <!-- Class whose premain() is called when started with -javaagent. -->
                        <Premain-Class>com.agent.AgentMain</Premain-Class>
                        <!-- Class whose agentmain() is called on dynamic attach. -->
                        <Agent-Class>com.agent.AgentMain</Agent-Class>
                        <Can-Redefine-Classes>true</Can-Redefine-Classes>
                        <!-- Needed because AgentMain registers a retransform-capable transformer. -->
                        <Can-Retransform-Classes>true</Can-Retransform-Classes>
                    </manifestEntries>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <!-- Run the assembly as part of `mvn package`. -->
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

打包命令:

bash

mvn clean package
# 生成 target/agent-1.0-jar-with-dependencies.jar

三、被测试的应用

java

package com.app;

/**
 * Minimal demo application that the agent instruments: every call sleeps for a
 * user-dependent time so the measured durations are easy to recognise.
 */
public class UserService {

    /**
     * Looks up a user name, simulating a slow operation by sleeping {@code userId * 10} ms.
     *
     * <p>If the thread is interrupted while sleeping, the interrupt flag is restored so
     * callers can still observe the interruption, and the name is returned immediately.
     *
     * @param userId id of the user; must not be {@code null}
     * @return the string {@code "user_" + userId}
     */
    public String getUserName(Long userId) {
        try {
            Thread.sleep(userId * 10);
        } catch (InterruptedException e) {
            // Catching InterruptedException clears the flag; set it again instead of
            // swallowing the interruption.
            Thread.currentThread().interrupt();
        }
        return "user_" + userId;
    }

    /**
     * Simulates order creation: resolves the user name and prints the order line.
     *
     * @param userId  id of the ordering user
     * @param product product description printed with the order
     */
    public void createOrder(Long userId, String product) {
        String name = getUserName(userId);
        System.out.println("创建订单: " + name + " -> " + product);
    }

    /**
     * Creates five orders (user ids 1 to 5) with a 100 ms pause after each one.
     *
     * @throws InterruptedException if the pause between orders is interrupted
     */
    public static void main(String[] args) throws InterruptedException {
        UserService service = new UserService();
        for (int i = 1; i <= 5; i++) {
            service.createOrder((long) i, "产品" + i);
            Thread.sleep(100);
        }
    }
}

使用 Agent 启动应用

bash

java -javaagent:agent/target/agent-1.0-jar-with-dependencies.jar \
     -cp target-app/target/classes \
     com.app.UserService

输出效果:

text

[Agent] premain 已启动
[Agent] com.app.UserService.getUserName 耗时: 12 ms
创建订单: user_1 -> 产品1
[Agent] com.app.UserService.createOrder 耗时: 15 ms
[Agent] com.app.UserService.getUserName 耗时: 24 ms
创建订单: user_2 -> 产品2
[Agent] com.app.UserService.createOrder 耗时: 27 ms
...

四、Python 端:实时接收并分析调用链

Agent 输出的是控制台日志,生产环境需要收集到统一平台。下面用 Python 模拟接收、解析和可视化。

4.1 模拟日志接收器(HTTP 服务)

python

# scripts/receiver.py
from flask import Flask, request, jsonify
from collections import defaultdict
from datetime import datetime
import threading
import time

app = Flask(__name__)

# In-memory store of received call records: method name -> list of
# {'cost_ms', 'timestamp', 'thread'} dicts. Kept only for the lifetime of the
# process and never trimmed.
call_data = defaultdict(list)

@app.route('/agent/log', methods=['POST'])
def receive_log():
    """Store one timing record posted by the agent side.

    Expects a JSON object with the keys ``method``, ``cost_ms`` (number),
    ``timestamp`` and ``thread``. Responds with ``{'status': 'ok'}`` on success
    and with HTTP 400 when the body is not a JSON object or a field is missing
    or malformed.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body,
    # so bad input becomes a 400 response rather than an unhandled error.
    data = request.get_json(silent=True)
    required = ('method', 'cost_ms', 'timestamp', 'thread')
    if not isinstance(data, dict) or any(key not in data for key in required):
        return jsonify({'status': 'error', 'message': 'missing required fields'}), 400
    # /stats sums and compares cost_ms, so reject non-numeric values up front.
    if not isinstance(data['cost_ms'], (int, float)):
        return jsonify({'status': 'error', 'message': 'cost_ms must be a number'}), 400

    call_data[data['method']].append({
        'cost_ms': data['cost_ms'],
        'timestamp': data['timestamp'],
        'thread': data['thread']
    })
    print(f"[RECEIVED] {data['method']}: {data['cost_ms']}ms")
    return jsonify({'status': 'ok'})

@app.route('/stats', methods=['GET'])
def get_stats():
    """Return per-method aggregates of all records received so far.

    For every method with at least one record the response holds ``avg``,
    ``max``, ``min`` and ``count`` of ``cost_ms``; ``p95`` is only computed
    when more than 20 records exist and is ``None`` otherwise.
    """
    summary = {}
    for name, entries in call_data.items():
        if entries:
            durations = [entry['cost_ms'] for entry in entries]
            total = len(durations)
            if total > 20:
                # Element at index floor(0.95 * n) of the ascending durations.
                p95 = sorted(durations)[int(total * 0.95)]
            else:
                p95 = None
            summary[name] = {
                'avg': sum(durations) / total,
                'max': max(durations),
                'min': min(durations),
                'count': total,
                'p95': p95
            }
    return jsonify(summary)

if __name__ == '__main__':
    # Start the Flask development server, listening on every network interface
    # (0.0.0.0) on port 8080.
    app.run(host='0.0.0.0', port=8080)

4.2 调用链聚合分析脚本

python

# scripts/analyze.py
import subprocess
import json
import requests
import sys
from collections import deque
import time

class CallChainAnalyzer:
    """Run the target Java application under the agent and relay its timing lines.

    The agent prints one line per finished method call
    (``[Agent] <class>.<method> 耗时: <n> ms``). This class parses those lines,
    keeps the most recent 100 records, forwards each record to the receiver
    service and prints it.

    NOTE(review): the agent log carries no call-depth information and lines
    arrive in completion order, so ``depth``/``parent`` below describe the
    position in the received-record history, not the real caller/callee
    relationship. A real call tree needs depth (or start time) in the agent
    output.
    """

    def __init__(self, agent_jar_path, target_class):
        self.agent_jar = agent_jar_path
        self.target_class = target_class
        # Bounded history: once 100 records are held, the oldest is dropped.
        self.call_stack = deque(maxlen=100)

    def start_java_app(self):
        """Start the Java process with the agent attached and return the Popen handle."""
        cmd = [
            'java',
            f'-javaagent:{self.agent_jar}',
            '-cp', './target-app/target/classes',
            self.target_class
        ]
        # Non-blocking start; stderr is merged into stdout and read line by line.
        self.process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )
        return self.process

    @staticmethod
    def _parse_agent_line(line):
        """Return ``(method, cost_ms)`` for an agent timing line, else ``None``."""
        # Expected tokens: ['[Agent]', '<class>.<method>', '耗时:', '<n>', 'ms']
        parts = line.split()
        if len(parts) < 4 or parts[0] != '[Agent]' or not parts[2].startswith('耗时'):
            return None
        try:
            return parts[1], int(parts[3])
        except ValueError:
            return None

    def _post_record(self, payload):
        """Best-effort delivery of one record to the receiver service."""
        try:
            requests.post(
                'http://localhost:8080/agent/log',
                json=payload,
                timeout=0.1
            )
        except requests.RequestException:
            # Receiver down or slow: drop the record rather than stall log reading.
            pass

    def parse_and_send_to_api(self):
        """Read the Java process output until EOF, recording and forwarding timing lines.

        Lines that are not well-formed agent timing lines (application output,
        other agent messages, truncated lines) are skipped.
        """
        for line in self.process.stdout:
            parsed = self._parse_agent_line(line)
            if parsed is None:
                continue
            method, cost = parsed

            # depth/parent are taken from the history *before* this record is added.
            self.call_stack.append({
                'method': method,
                'cost': cost,
                'depth': len(self.call_stack),
                'parent': self.call_stack[-1]['method'] if self.call_stack else None
            })

            self._post_record({
                'method': method,
                'cost_ms': cost,
                'timestamp': time.time(),
                'thread': 'main',
                'stack_depth': len(self.call_stack)
            })

            indent = '  ' * len(self.call_stack)
            print(f"{indent}└── {method} ({cost}ms)")

    def run(self):
        """Start the Java application, process its output, and reap the child process."""
        self.start_java_app()
        finished = False
        try:
            self.parse_and_send_to_api()
            finished = True
        finally:
            # On an error or Ctrl-C the child may still be running: stop it, then
            # always wait so no zombie process is left behind.
            if not finished:
                self.process.terminate()
            self.process.wait()

if __name__ == '__main__':
    # The jar path is relative, so this is meant to be run from the directory
    # that contains agent/.
    analyzer = CallChainAnalyzer(
        agent_jar_path='agent/target/agent-1.0-jar-with-dependencies.jar',
        target_class='com.app.UserService'
    )
    analyzer.run()

4.3 运行 Python 分析器

bash

# 启动接收服务(另一个终端)
python scripts/receiver.py

# 启动分析器
python scripts/analyze.py

输出效果(示意图;注意 Agent 日志按方法“结束”的先后顺序输出,且不携带调用深度,因此脚本实际打印时 getUserName 在前、createOrder 在后,缩进只随已接收的记录数递增,并不代表真实的调用层级):

text

└── com.app.UserService.createOrder (27ms)
    └── com.app.UserService.getUserName (24ms)
└── com.app.UserService.createOrder (32ms)
    └── com.app.UserService.getUserName (30ms)

五、进阶:动态 Attach 到运行中的 Java 进程

不需要重启 Java 应用就能注入 Agent:

5.1 VirtualMachine 工具类

java

// AttachMain.java
import com.sun.tools.attach.VirtualMachine;

/**
 * Command-line tool that loads an agent jar into an already running JVM.
 *
 * <p>Usage: {@code java AttachMain <pid> <agent-jar-path>}
 */
public class AttachMain {

    /**
     * Attaches to the JVM with the given pid, loads the agent and detaches again.
     *
     * @param args {@code args[0]} is the target pid, {@code args[1]} the agent jar path;
     *             with fewer than two arguments only the usage line is printed
     * @throws Exception if attaching fails or the agent cannot be loaded
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("用法: java AttachMain <pid> <agent-jar路径>");
            return;
        }
        String pid = args[0];
        // The jar is opened by the target JVM, so a relative path would be resolved
        // against that process's working directory; pass an absolute path instead.
        String agentJar = new java.io.File(args[1]).getAbsolutePath();

        VirtualMachine vm = VirtualMachine.attach(pid);
        try {
            vm.loadAgent(agentJar);
        } finally {
            // Release the attach connection even when loading the agent fails.
            vm.detach();
        }
        System.out.println("Agent 已注入到进程 " + pid);
    }
}

使用方式:

bash

# 1. Find the PID of the target Java process
jps -l

# 2. Attach the agent dynamically
# NOTE(review): tools.jar exists only on JDK 8 and earlier. On JDK 9+ the attach API
# ships in the jdk.attach module, so `java -cp . AttachMain <pid> <agent.jar>` should
# be enough there; confirm against the JDK in use.
java -cp $JAVA_HOME/lib/tools.jar:. AttachMain 12345 agent.jar

5.2 Python 自动化注入脚本

python

# scripts/attach.py
import subprocess
import os

def get_java_pid(keyword):
    """Return the PID (as a string) of the first `jps -l` entry containing `keyword`.

    The matching entry is printed; `None` is returned when no entry matches.
    """
    listing = subprocess.run(['jps', '-l'], capture_output=True, text=True).stdout
    entry = next((row for row in listing.split('\n') if keyword in row), None)
    if entry is None:
        return None
    # The first whitespace-separated field of a jps line is the PID.
    pid = entry.split()[0]
    print(f"找到进程: {entry}")
    return pid

def attach_agent(pid, agent_path):
    """Run the AttachMain helper to load the agent jar into the JVM with the given PID.

    The classpath always contains the current directory (where AttachMain is
    expected to be compiled). ``$JAVA_HOME/lib/tools.jar`` is added only when
    it actually exists, which is the case on JDK 8 and earlier. A non-zero
    exit code of the helper is reported on stdout.
    """
    classpath = ['.']
    java_home = os.environ.get('JAVA_HOME')
    if java_home:
        tools_jar = os.path.join(java_home, 'lib', 'tools.jar')
        if os.path.isfile(tools_jar):
            classpath.insert(0, tools_jar)

    cmd = [
        # os.pathsep is ':' on POSIX and ';' on Windows.
        'java', '-cp', os.pathsep.join(classpath),
        # The target JVM opens the jar itself, so hand over an absolute path.
        'AttachMain', pid, os.path.abspath(agent_path)
    ]
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"Agent 注入失败, 退出码: {result.returncode}")

if __name__ == '__main__':
    # Attach to the first JVM whose jps entry mentions 'UserService'; if none is
    # found, nothing happens.
    pid = get_java_pid('UserService')
    if pid:
        attach_agent(pid, 'agent/target/agent-1.0-jar-with-dependencies.jar')

六、生产环境建议

| 问题 | 解决方案 |
| --- | --- |
| Agent 影响性能 | 采样模式 + 异步上报 |
| 字节码增强冲突 | 使用 ByteBuddy 替代 Javassist |
| 日志量过大 | 配置采样率或只监控慢请求 |
| 多 ClassLoader 问题 | 使用 appendToBootstrapClassLoaderSearch |

性能优化版 Transformer 核心(采样)

java

private static final Random random = new Random();
private static final double SAMPLE_RATE = 0.1; // 10% 采样

/**
 * Injects sampled timing code into one method.
 *
 * <p>Sampling only decides whether a start time is recorded. The injected code must never
 * {@code return} from the target method: that would skip the business logic for every
 * unsampled call (and would not even compile for non-void methods).
 *
 * @param method    method to instrument
 * @param className dotted name of the declaring class, used in the printed label
 * @throws Exception if Javassist cannot compile the injected source
 */
private void enhanceMethod(CtMethod method, String className) throws Exception {
    String label = className + "." + method.getName();
    String startKey = "\"" + label + "_start\"";

    // Record a start time for roughly SAMPLE_RATE of the calls. Javassist does not box
    // automatically, hence the explicit Long.valueOf.
    method.insertBefore(
        "if (Math.random() <= " + SAMPLE_RATE + ") {"
            + "com.agent.AgentContext.getStore().put(" + startKey
            + ", Long.valueOf(System.nanoTime()));"
            + "}"
    );

    // Unsampled calls have no start entry, so they fall through without printing.
    method.insertAfter(
        "{"
            + "Object agentStart = com.agent.AgentContext.getStore().remove(" + startKey + ");"
            + "if (agentStart != null) {"
            + "long agentCost = (System.nanoTime() - ((Long) agentStart).longValue()) / 1000000L;"
            + "System.out.println(\"[Agent] " + label + " 耗时: \" + agentCost + \" ms\");"
            + "}"
            + "}",
        true  // asFinally: also runs when the method exits by throwing
    );
}

七、总结

整套代码完全可以本地跑通:

  1. Java Agent 无侵入增强目标类
  2. Python 实时接收、解析、聚合调用链
  3. 动态 Attach 支持生产环境热注入

你可以直接:

bash

git clone <你的仓库>
mvn package
python scripts/analyze.py

下一步扩展方向:

  • 接入 Prometheus + Grafana 做可视化
  • 支持 gRPC/Thrift 调用链
  • 集成 SkyWalking 或 OpenTelemetry