容易误解的NPE

302 阅读5分钟

问题描述

骑手交易风控金额审批通过后,结算失败,风控的同学查看堆栈后发现有两类异常,一类是正常的空指针,一类是看不懂的空指针。正常的空指针异常就不再列举了,直接来看下面这个不好理解的NPE

content:  9672833181742924 [DubboServerHandler-localhost:port-thread-199] com.....riskcontrol.trade.unit.dubbo.client.RiderTradeClient:addBalanceLogForAbnormalBack:46 --- riderTradeCostSettleOutmodedProvider#addBalanceLogForAbnormalBack异常
java.lang.NullPointerException: null
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) ~[?:1.8.0_201]
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) ~[?:1.8.0_201]
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) ~[?:1.8.0_201]
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423) ~[?:1.8.0_201]
	at com.alibaba.com.caucho.hessian.io.JavaDeserializer.instantiate(JavaDeserializer.java:305) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.JavaDeserializer.readObject(JavaDeserializer.java:194) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.SerializerFactory.readObject(SerializerFactory.java:524) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.Hessian2Input.readObjectInstance(Hessian2Input.java:2743) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.Hessian2Input.readObject(Hessian2Input.java:2683) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.com.caucho.hessian.io.Hessian2Input.readObject(Hessian2Input.java:2657) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.common.serialize.support.hessian.Hessian2ObjectInput.readObject(Hessian2ObjectInput.java:76) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DecodeableRpcResult.decode(DecodeableRpcResult.java:92) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DecodeableRpcResult.decode(DecodeableRpcResult.java:109) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DubboCodec.decodeBody(DubboCodec.java:90) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.remoting.exchange.codec.ExchangeCodec.decode(ExchangeCodec.java:119) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.remoting.exchange.codec.ExchangeCodec.decode(ExchangeCodec.java:80) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.rpc.protocol.dubbo.DubboCountCodec.decode(DubboCountCodec.java:46) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at com.alibaba.dubbo.remoting.transport.netty.NettyCodecAdapter$InternalDecoder.messageReceived(NettyCodecAdapter.java:134) ~[dubbo-2.5.6.2-D-RELEASE.jar!/:2.5.6.2-D-RELEASE]
	at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:80) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:274) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:261) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:349) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.socket.nio.NioWorker.processSelectedKeys(NioWorker.java:280) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:200) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) ~[netty-3.2.5.Final.jar!/:?]
	at org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:44) ~[netty-3.2.5.Final.jar!/:?]
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_201]
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_201]
	at java.lang.Thread.run(Thread.java:748) [?:1.8.0_201]

本地方法NPE?不能够啊!!!

问题排查

根据调用方提供的信息,查看服务提供者的日志,报错日志如下:

content:  [DubboServerHandler-localhost:port-thread-494] 
com.....common.exception.filter.dubbo.BusinessExceptionFilter:invoke:90 
Got unchecked and undeclared exception which called by remoteHost. 
service: com.....billing.settle.provider.outmoded.RiderTradeCostSettleOutmodedProvider, 
method: addBalanceLogForAbnormalBack, args:[23,"607623443302600704",6286.77], 
exception: java.lang.NullPointerException: null ===> java.lang.NullPointerException

可以看到该NPE异常是没有堆栈的,也就是说由于大量的NPE触发了JVM的堆栈优化,不再输出堆栈,该优化参数:OmitStackTraceInFastThrow。相关源码如下:

// src/share/vm/opto/graphKit.cpp:515
if (treat_throw_as_hot
  && (!StackTraceInThrowable || OmitStackTraceInFastThrow)) {
// If the throw is local, we use a pre-existing instance and
// punt on the backtrace.  This would lead to a missing backtrace
// (a repeat of 4292742) if the backtrace object is ever asked
// for its backtrace.
// Fixing this remaining case of 4292742 requires some flavor of
// escape analysis.  Leave that for the future.
ciInstance* ex_obj = NULL;
switch (reason) {
case Deoptimization::Reason_null_check:
  ex_obj = env()->NullPointerException_instance();
  break;
case Deoptimization::Reason_div0_check:
  ex_obj = env()->ArithmeticException_instance();
  break;
case Deoptimization::Reason_range_check:
  ex_obj = env()->ArrayIndexOutOfBoundsException_instance();
  break;
case Deoptimization::Reason_class_check:
  if (java_bc() == Bytecodes::_aastore) {
    ex_obj = env()->ArrayStoreException_instance();
  } else {
    ex_obj = env()->ClassCastException_instance();
  }
  break;
}
if (failing()) { stop(); return; }  // exception allocation might fail
if (ex_obj != NULL) {
  ...
  // Clear the detail message of the preallocated exception object.
  // Weblogic sometimes mutates the detail message of exceptions
  // using reflection.
  ...
}

那么客户端看到的异常堆栈从何而来?

问题推断1:dubbo客户端将客户端当前执行的堆栈填充至NPE异常

服务端故意抛出无堆栈的NPE,使用dubbo客户端调用服务端,异常不会被填充,说明不是dubbo内部自动填充的异常。那么问题在哪?因为公司封装了很多业务异常,对于dubbo服务抛出的业务异常要进行定制化处理,于是加了业务异常过滤器,那么是不是该过滤器导致的呢?

dubbo调用链简单回顾下: RegistryProtocol引用服务提供者-》ProtocolFilterWrapper将调用者Invoker使用过滤链包装-》调用者DubboInvoker

服务端调用目标方法出现异常,将异常包装为RpcResult响应客户端:com.alibaba.dubbo.rpc.protocol.AbstractInvoker#invoke

try {
    return doInvoke(invocation);
} catch (InvocationTargetException e) { // biz exception
    Throwable te = e.getTargetException();
    if (te == null) {
        return new RpcResult(e);
    } else {
        if (te instanceof RpcException) {
            ((RpcException) te).setCode(RpcException.BIZ_EXCEPTION);
        }
        return new RpcResult(te);
    }
} catch (RpcException e) {
    if (e.isBiz()) {
        return new RpcResult(e);
    } else {
        throw e;
    }
} catch (Throwable e) {
    return new RpcResult(e);
}

过滤器:服务端调用同样存在业务异常过滤器,源码与客户端类似,过滤器输出的日志结合源码可以确认绑定的异常为NPE异常 过滤器获取调用结果RpcResult进行业务处理,此时我们服务端抛出了NPE异常,也就是RpcResult绑定的异常类型为NPE,查看我们业务异常处理过滤器代码如下:

public Result invoke(Invoker<?> invoker, Invocation invocation)
        throws RpcException {
    try {
        ...
        Result result = invoker.invoke(invocation);
		...
        if (result.hasException()
                && GenericService.class != invoker.getInterface()) {
            try {
                Throwable exception = result.getException();
                // 如果是checked异常,直接抛出
                if (!(exception instanceof RuntimeException)
                        && (exception instanceof Exception)) {
                    return result;
                }
                // 在方法签名上有声明,直接抛出
                ...
				// 业务异常直接抛出
                if(exception instanceof BusinessException){
                    return result;
                }
                ...
                // 异常类和接口类在同一jar包里,直接抛出
                ...
                // 是JDK自带的异常,直接抛出
                String className = exception.getClass().getName();
                if (className.startsWith("java.")
                        || className.startsWith("javax.")) {
                    return result;
                }
                // 是Dubbo本身的异常,直接抛出
                if (exception instanceof RpcException) {
                    return result;
                }
                // 否则,包装成RuntimeException抛给客户端
                return new RpcResult(new RuntimeException(
                        StringUtils.toString(exception)));
            } catch (Throwable e) {
                ...
                return result;
            }
        }
        return result;
    } catch (RuntimeException e) {
        ...
        throw e;
    }
}

并没有填充异常堆栈的代码。那么是否日志打印输出时自动填充,同样适用自己写的案例尝试log.error输出也依然是没有堆栈,故不是该原因

问题推断2:hessian序列化时填充的当前堆栈

代码成功复现,为什么前面dubbo抛出NPE没能复现呢?因为我当时本地执行代码时使用单线程循环调用,下面案例代码如果不使用线程池调用也是无法触发JVM的异常堆栈优化,由于没有触发JVM的异常堆栈优化,所以就没能复现咯

import com.alibaba.com.caucho.hessian.io.HessianInput;
import com.alibaba.com.caucho.hessian.io.HessianOutput;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * @author 会灰翔的灰机
 * @date 2020/10/11
 */
public class NpeThief extends Thread {

    private Set<Integer> lens = new HashSet<>();
    private static int count = 0;
    private Throwable throwable;
    @Override
    public void run() {
        try{
            System.out.println(this.getClass().getSimpleName()+"--"+(++count));
            String str = null;
            // 制造空指针NPE
            System.out.println(str.length());
        }catch (Throwable e){
            lens.add(e.getStackTrace().length);
            if (throwable == null && e.getStackTrace().length==0) {
                throwable=e;
            }
        }
    }
    public static void main(String[] args) throws InterruptedException, IOException {
        NpeThief npeThief = new NpeThief();
        ExecutorService executorService = Executors.newFixedThreadPool(20);
        for (int i=0; i<8000; i++) {
            executorService.execute(npeThief);
            // 稍微sleep一下, 是为了不要让异常抛出太快, 导致控制台输出太快, 把有异常栈信息冲掉, 只留下fast throw方式抛出的异常
            Thread.sleep(2);
        }
        System.out.println(npeThief.lens);
        System.out.println(npeThief.throwable.getStackTrace().length);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        //Hessian的序列化输出
        HessianOutput ho = new HessianOutput(os);
        Throwable old = npeThief.throwable;
        ho.writeObject(old);
        old.printStackTrace();

        byte[] data = os.toByteArray();

        ByteArrayInputStream is = new ByteArrayInputStream(data);
        //Hessian的反序列化读取对象
        HessianInput hi = new HessianInput(is);
        //前面的ho.writeObject(old);必须有,不然报错HessianProtocolException: unknown code for readObject at end of file
        Throwable throwable = (Throwable) hi.readObject();
        throwable.printStackTrace();
    }
}

总结

  1. 异常的堆栈并非服务端的堆栈,因为服务端日志可以看到堆栈已经被JVM优化掉了,堆栈实际是客户端侧当时的堆栈
  2. 问题是hessian反序列化+JVM优化异常堆栈导致,简单的处理方式就是关闭JVM的异常堆栈优化功能即可:-XX:-OmitStackTraceInFastThrow