Distributed tracing
Concepts
- Trace: one complete call chain, i.e. the collection of all Segments across threads and processes. In SkyWalking a Trace is not a concrete data model; it is a logical object represented by chaining multiple Segments together.
- Segment: the collection of all operations performed in one thread of one JVM process (you can roughly read "one JVM process" as "one microservice instance").
- Span: a single concrete operation.
Trace id generation
- DistributedTraceId: the top-level abstract parent class.
- PropagatedTraceId: holds a trace id that was propagated from an upstream context; its constructor receives the existing id.
- NewDistributedTraceId: creates a brand-new trace id; its constructor calls GlobalIdGenerator.generate().
- GlobalIdGenerator: the generator invoked by the NewDistributedTraceId constructor to produce the id.
@RequiredArgsConstructor
@ToString
@EqualsAndHashCode
public abstract class DistributedTraceId {
@Getter
private final String id;
}
public class PropagatedTraceId extends DistributedTraceId {
public PropagatedTraceId(String id) {
super(id);
}
}
public class NewDistributedTraceId extends DistributedTraceId {
public NewDistributedTraceId() {
super(GlobalIdGenerator.generate());
}
}
GlobalIdGenerator
The id generator.
/**
* id 生成
*/
public final class GlobalIdGenerator {
private static final String PROCESS_ID = UUID.randomUUID().toString().replaceAll("-", "");
// 这里初始化了 IDContext
private static final ThreadLocal<IDContext> THREAD_ID_SEQUENCE = ThreadLocal.withInitial(
() -> new IDContext(System.currentTimeMillis(), (short) 0));
private GlobalIdGenerator() {
}
public static String generate() {
// 1. 应用实例id
// 2. 线程id
// 3. 有两部分,1)一个时间戳,以毫秒为单位 2)一个序列,在当前线程中,在 0(包括)和 9999(包括)之间
return StringUtil.join(
'.',
PROCESS_ID,
String.valueOf(Thread.currentThread().getId()),
String.valueOf(THREAD_ID_SEQUENCE.get().nextSeq())
);
}
private static class IDContext {
/**
* 上次生成 sequence 的时间戳
*/
private long lastTimestamp;
/**
* 线程的序列号
*/
private short threadSeq;
// Just for considering time-shift-back only.
// 时钟回拨
private long lastShiftTimestamp;
private int lastShiftValue;
private IDContext(long lastTimestamp, short threadSeq) {
this.lastTimestamp = lastTimestamp;
this.threadSeq = threadSeq;
}
/**
* 生成序号
* 有两部分,1)一个时间戳,以毫秒为单位 2)一个序列,在当前线程中,在 0(包括)和 9999(包括)之间
* @return
*/
private long nextSeq() {
return timestamp() * 10000 + nextThreadSeq();
}
private long timestamp() {
long currentTimeMillis = System.currentTimeMillis();
// 发生了时钟回拨
if (currentTimeMillis < lastTimestamp) {
// Just for considering time-shift-back by Ops or OS. @hanahmily 's suggestion.
// 只是为了考虑 Ops 或 OS 的时间倒退。
if (lastShiftTimestamp != currentTimeMillis) {
// 时钟回拨次数+1
lastShiftValue++;
lastShiftTimestamp = currentTimeMillis;
}
return lastShiftValue;
} else {
// 正常逻辑
lastTimestamp = currentTimeMillis;
return lastTimestamp;
}
}
private short nextThreadSeq() {
if (threadSeq == 10000) {
threadSeq = 0;
}
return threadSeq++;
}
}
}
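Putting the three parts together, a generated id is PROCESS_ID, the current thread id and the timestamp-based sequence joined with '.'. Roughly like this (the concrete values below are fabricated for illustration):
String traceId = GlobalIdGenerator.generate();
// e.g. "3a1b0f8e7c2d4e5f9a8b7c6d5e4f3a2b.115.17154329881230007"
//      |-------- PROCESS_ID (UUID) -------| ^thread id ^timestamp * 10000 + threadSeq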
TraceSegment
An important concept in the tracing model. As noted above, SkyWalking does not model Trace as a data structure: a Trace is only a concept, formed by chaining multiple Segments together. relatedGlobalTraceId is the id of the Trace this segment belongs to. As the code shows, a trace id is already generated in the constructor, but relatedGlobalTrace() can be called to associate this Segment with another Trace.
/**
* segment
* <p>
* Trace的组合部分。 多个segment组成一个trace
* Trace 不是一个具体的数据模型,而是多个 Segment 串起来表示的逻辑对象
*/
public class TraceSegment {
/**
* The id of this trace segment. Every segment has its unique-global-id.
* 全局唯一的 segmentId
*/
private String traceSegmentId;
/**
* 指针,指向当前segment的parent segment 的指针
*/
private TraceSegmentRef ref;
/**
* <p>
* span
*/
private List<AbstractTracingSpan> spans;
/**
*当前segment 所在 Trace 的 ID
*/
private DistributedTraceId relatedGlobalTraceId;
private boolean ignore = false;
private boolean isSizeLimited = false;
private final long createTime;
/**
* Create a default/empty trace segment, with current time as start time, and generate a new segment id.
*/
public TraceSegment() {
this.traceSegmentId = GlobalIdGenerator.generate();
this.spans = new LinkedList<>();
// 在 skywalking 中,Trace 不是一个具体的数据模型,而是多个 Segment 串起来表示的逻辑对象
// 这里在生成 Segment时,就创建了 traceId
this.relatedGlobalTraceId = new NewDistributedTraceId();
this.createTime = System.currentTimeMillis();
}
/**
* Establish the link between this segment and its parents.
*
* @param refSegment {@link TraceSegmentRef}
*/
public void ref(TraceSegmentRef refSegment) {
if (null == ref) {
this.ref = refSegment;
}
}
/**
* Establish the line between this segment and the relative global trace id.
* 将当前segment 关联到 一个Trace上
* 就是把持有的traceId给换了。(relatedGlobalTraceId)
* 但是 跨进程id才行
*/
public void relatedGlobalTrace(DistributedTraceId distributedTraceId) {
if (relatedGlobalTraceId instanceof NewDistributedTraceId) {
this.relatedGlobalTraceId = distributedTraceId;
}
}
/**
*
* 加入一个span
*/
public void archive(AbstractTracingSpan finishedSpan) {
spans.add(finishedSpan);
}
/**
* Finish this {@link TraceSegment}. <p> return this, for chaining
* 结束方法
*关闭 segment时,要调用这个方法。
*
* span 是否到达了上限,配置中的默认值 300
*
*/
public TraceSegment finish(boolean isSizeLimited) {
this.isSizeLimited = isSizeLimited;
return this;
}
}
TraceSegmentRef
The pointer to the parent Segment: an object that stores the parent Segment's basic information.
@Getter
public class TraceSegmentRef {
// 类型 跨进程、跨线程
private SegmentRefType type;
// traceId
private String traceId;
// parent 的 traceSegmentId
private String traceSegmentId;
private int spanId;
// Mall -> Order 对于Order 服务来讲,parentService 就是Mail
private String parentService;
// parentService 的具体一个实例
private String parentServiceInstance;
// 进入parentService 的那个请求
private String parentEndpoint;
// 记录的地址信息
private String addressUsedAtClient;
}
Span
A span represents a single basic operation; it is also the concept with the most related types.
AsyncSpan
The top-level span interface. It defines the basic prepareForAsync() (the prepare phase) and asyncFinish() (the finish phase).
/**
* Span could use these APIs to active and extend its lift cycle across thread.
* <p>
* This is typical used in async plugin, especially RPC plugins.
*
* // 异步span
* 最顶层的span
*/
public interface AsyncSpan {
/**
* The span finish at current tracing context, but the current span is still alive, until {@link #asyncFinish}
* called.
* <p>
* This method must be called
* <p>
* 1. In original thread(tracing context). 2. Current span is active span.
* <p>
* During alive, tags, logs and attributes of the span could be changed, in any thread.
* <p>
* The execution times of {@link #prepareForAsync} and {@link #asyncFinish()} must match.
*
* @return the current span
*
* // 准备阶段
*/
AbstractSpan prepareForAsync();
/**
* Notify the span, it could be finished.
* <p>
* The execution times of {@link #prepareForAsync} and {@link #asyncFinish()} must match.
*
* @return the current span
*
* // 结束阶段
*/
AbstractSpan asyncFinish();
}
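As a rough usage sketch (the async client and its callback are hypothetical; the flow is the one described in the javadoc above): prepareForAsync() is called in the original thread while the span is still the active one, the span is then stopped in that thread, and asyncFinish() is called later from the callback thread.
AbstractSpan span = ContextManager.createLocalSpan("async/callback");
span.prepareForAsync();               // keep the span (and its segment) alive beyond this thread
ContextManager.stopSpan(span);        // the tracing context of the main thread can finish normally
asyncClient.call(response -> {        // hypothetical async client
    // tags/logs may still be added to the span here, from any thread
    span.asyncFinish();               // really ends the span; its endTime is set now
});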
AbstractSpan
Extends AsyncSpan and defines the common methods.
layer
/**
* 指定当前Span 表示的操作所在的插件属于哪一种 skywalking 划分的类型
* - 在skywalking中,将各种插件划分为5类,=> DB(1), RPC_FRAMEWORK(2), HTTP(3), MQ(4), CACHE(5);。这个就可以理解为层
*
* @param layer 枚举
* @return
*/
AbstractSpan setLayer(SpanLayer layer);
/**
* The <code>AbstractSpan</code> represents the span's skeleton, which contains all open methods.
* <p>
* AbstractSpan表示跨度的骨架,定义了公用的方法
*/
public interface AbstractSpan extends AsyncSpan {
/**
* - ComponentsDefine 将插件定义为一个对象
* - 指定当前 Span 表示的操作发生在那个插件上
* Set the component id, which defines in {@link ComponentsDefine}
*
* @return the span for chaining.
*/
AbstractSpan setComponent(Component component);
/**
* 指定当前Span 表示的操作所在的插件属于哪一种 skywalking 划分的类型
* - 在skywalking中,将各种插件划分为5类,=> DB(1), RPC_FRAMEWORK(2), HTTP(3), MQ(4), CACHE(5);。这个就可以理解为层
*
* @param layer 枚举
* @return
*/
AbstractSpan setLayer(SpanLayer layer);
/**
* Set a key:value tag on the Span.
*
* @return this Span instance, for chaining
* @deprecated use {@link #tag(AbstractTag, String)} in companion with {@link Tags#ofKey(String)} instead
*/
@Deprecated
AbstractSpan tag(String key, String value);
/**
* 打标签
* AbstractTag 增加了一个id
*/
AbstractSpan tag(AbstractTag<?> tag, String value);
/**
* 记录当前 挂钟时间 时间戳的异常事件。
* - 挂钟时间: 本机当前时间
* Record an exception event of the current walltime timestamp.
*
* @param t any subclass of {@link Throwable}, which occurs in this span.
* @return the Span, for chaining
*/
AbstractSpan log(Throwable t);
/**
* 抽象方法,在错误发生时执行
*
* @return
*/
AbstractSpan errorOccurred();
/**
* @return true if the actual span is an entry span.
*/
boolean isEntry();
/**
* @return true if the actual span is an exit span.
*/
boolean isExit();
/**
* 在指定时间戳记录事件
* Record an event at a specific timestamp.
*
* @param timestamp The explicit timestamp for the log record.
* @param event the events
* @return the Span, for chaining
*/
AbstractSpan log(long timestamp, Map<String, ?> event);
/**
* Sets the string name for the logical operation this span represents.
* 如果当前Span的操作是
* 一个 HTTP 请求,operationName 就是 请求的URL;
* 一条 SQL 语句,operationName 就是 SQL 的类型
* 一个 Redis 操作, operationName 就是 Redis 命令
*
* @return this Span instance, for chaining
*/
AbstractSpan setOperationName(String operationName);
/**
* Start a span.
*动作开始的时候,调用这个方法
*
* @return this Span instance, for chaining
*/
AbstractSpan start();
/**
* Get the id of span
*
* @return id value.
*/
int getSpanId();
String getOperationName();
/**
* 跨 Segment 时,通过 ref 将Segment 关联起来
*
* Reference other trace segment.
*
* @param ref segment ref
*/
void ref(TraceSegmentRef ref);
AbstractSpan start(long startTime);
/**
* 什么叫 peer, 就是对端地址
* 一个请求可能跨多个进程,操作多种中间件,那么每一个RPC, 对面的服务的地址就是 remotePeer
* 每一次中间件的操作,中间件的地址就是 remotePeer
* @param remotePeer
* @return
*/
AbstractSpan setPeer(String remotePeer);
/**
* @return true if the span's owner(tracing context main thread) is been profiled.
*/
boolean isProfiling();
/**
* 设置 span 发生到OAP后,要不要进行性能分析
* Should skip analysis in the backend.
*/
void skipAnalysis();
}
AbstractTracingSpan
/**
* The <code>AbstractTracingSpan</code> represents a group of {@link AbstractSpan} implementations, which belongs a real
* distributed trace.
* <p>
* AbstractTracingSpan代表了一组AbstractSpan的实现,属于真正的分布式trace。
*/
public abstract class AbstractTracingSpan implements AbstractSpan {
/**
* Span id starts from 0.
*/
protected int spanId;
/**
* Parent span id starts from 0. -1 means no parent span.
* 从0 开始, -1 表示没有父级
*/
protected int parentSpanId;
/**
* 封装的tag
*/
protected List<TagValuePair> tags;
protected String operationName;
protected SpanLayer layer;
/**
* The span has been tagged in async mode, required async stop to finish.
* 表示当前异步操作,是否已经开始
*/
protected volatile boolean isInAsyncMode = false;
/**
* The flag represents whether the span has been async stopped
* 表示当前异步操作,是否已经结束
*/
private volatile boolean isAsyncStopped = false;
/**
* The context to which the span belongs
* span所属的上下文
* 用来管理一条链路上的 segment 和 span
*/
protected final TracingContext owner;
/**
* The start time of this Span.
*/
protected long startTime;
/**
* The end time of this Span.
*/
protected long endTime;
/**
* Error has occurred in the scope of span.
*/
protected boolean errorOccurred = false;
protected int componentId = 0;
/**
* Log is a concept from OpenTracing spec. https://github.com/opentracing/specification/blob/master/specification.md#log-structured-data
*/
protected List<LogDataEntity> logs;
/**
* The refs of parent trace segments, except the primary one. For most RPC call, {@link #refs} contains only one
* element, but if this segment is a start span of batch process, the segment faces multi parents, at this moment,
* we use this {@link #refs} to link them.
* <p>
* 用于当前 Span 指定自己的所在的 Segment 的前一个Segment, 除非这个 Span 所在的Segment 是整条链路上的第一个Segment
* - 为什么是list?
* 正常情况下,list中只有一个元素。如果 segment 是批处理的话,就会有多个
*/
protected List<TraceSegmentRef> refs;
/**
* Tracing Mode. If true means represents all spans generated in this context should skip analysis.
* 跟踪模式。如果为真,则表示在此上下文中生成的所有跨度应跳过分析。
*/
protected boolean skipAnalysis;
protected AbstractTracingSpan(int spanId, int parentSpanId, String operationName, TracingContext owner) {
this.operationName = operationName;
this.spanId = spanId;
this.parentSpanId = parentSpanId;
this.owner = owner;
}
/**
* Set a key:value tag on the Span.
* <p>
* {@inheritDoc}
*
* @return this Span instance, for chaining
*/
@Override
public AbstractTracingSpan tag(String key, String value) {
return tag(Tags.ofKey(key), value);
}
@Override
public AbstractTracingSpan tag(AbstractTag<?> tag, String value) {
if (tags == null) {
tags = new ArrayList<>(8);
}
if (tag.isCanOverwrite()) {
for (TagValuePair pair : tags) {
if (pair.sameWith(tag)) {
pair.setValue(value);
return this;
}
}
}
tags.add(new TagValuePair(tag, value));
return this;
}
/**
* Finish the active Span. When it is finished, it will be archived by the given {@link TraceSegment}, which owners
* it.
*
* span 结束时,要调用一下 finish
*
* @param owner of the Span.
*/
public boolean finish(TraceSegment owner) {
this.endTime = System.currentTimeMillis();
// 归档
owner.archive(this);
return true;
}
@Override
public AbstractTracingSpan start() {
this.startTime = System.currentTimeMillis();
return this;
}
/**
* Record an exception event of the current walltime timestamp.
*
* @param t any subclass of {@link Throwable}, which occurs in this span.
* @return the Span, for chaining
*/
@Override
public AbstractTracingSpan log(Throwable t) {
if (logs == null) {
logs = new LinkedList<>();
}
if (!errorOccurred && ServiceManager.INSTANCE.findService(StatusCheckService.class).isError(t)) {
errorOccurred();
}
logs.add(new LogDataEntity.Builder().add(new KeyValuePair("event", "error"))
.add(new KeyValuePair("error.kind", t.getClass().getName()))
.add(new KeyValuePair("message", t.getMessage()))
.add(new KeyValuePair(
"stack",
ThrowableTransformer.INSTANCE.convert2String(t, 4000)
))
.build(System.currentTimeMillis()));
return this;
}
/**
* Record a common log with multi fields, for supporting opentracing-java
*
* @return the Span, for chaining
*/
@Override
public AbstractTracingSpan log(long timestampMicroseconds, Map<String, ?> fields) {
if (logs == null) {
logs = new LinkedList<>();
}
LogDataEntity.Builder builder = new LogDataEntity.Builder();
for (Map.Entry<String, ?> entry : fields.entrySet()) {
builder.add(new KeyValuePair(entry.getKey(), entry.getValue().toString()));
}
logs.add(builder.build(timestampMicroseconds));
return this;
}
/**
* In the scope of this span tracing context, error occurred, in auto-instrumentation mechanism, almost means throw
* an exception.
*
* @return span instance, for chaining.
*/
@Override
public AbstractTracingSpan errorOccurred() {
this.errorOccurred = true;
return this;
}
/**
* Set the operation name, just because these is not compress dictionary value for this name. Use the entire string
* temporarily, the agent will compress this name in async mode.
*
* @return span instance, for chaining.
*/
@Override
public AbstractTracingSpan setOperationName(String operationName) {
this.operationName = operationName;
return this;
}
@Override
public int getSpanId() {
return spanId;
}
@Override
public String getOperationName() {
return operationName;
}
@Override
public AbstractTracingSpan setLayer(SpanLayer layer) {
this.layer = layer;
return this;
}
/**
* Set the component of this span, with internal supported. Highly recommend to use this way.
*
* @return span instance, for chaining.
*/
@Override
public AbstractTracingSpan setComponent(Component component) {
this.componentId = component.getId();
return this;
}
@Override
public AbstractSpan start(long startTime) {
this.startTime = startTime;
return this;
}
public SpanObject.Builder transform() {
SpanObject.Builder spanBuilder = SpanObject.newBuilder();
spanBuilder.setSpanId(this.spanId);
spanBuilder.setParentSpanId(parentSpanId);
spanBuilder.setStartTime(startTime);
spanBuilder.setEndTime(endTime);
spanBuilder.setOperationName(operationName);
spanBuilder.setSkipAnalysis(skipAnalysis);
if (isEntry()) {
spanBuilder.setSpanType(SpanType.Entry);
} else if (isExit()) {
spanBuilder.setSpanType(SpanType.Exit);
} else {
spanBuilder.setSpanType(SpanType.Local);
}
if (this.layer != null) {
spanBuilder.setSpanLayerValue(this.layer.getCode());
}
if (componentId != DictionaryUtil.nullValue()) {
spanBuilder.setComponentId(componentId);
}
spanBuilder.setIsError(errorOccurred);
if (this.tags != null) {
for (TagValuePair tag : this.tags) {
spanBuilder.addTags(tag.transform());
}
}
if (this.logs != null) {
for (LogDataEntity log : this.logs) {
spanBuilder.addLogs(log.transform());
}
}
if (this.refs != null) {
for (TraceSegmentRef ref : this.refs) {
spanBuilder.addRefs(ref.transform());
}
}
return spanBuilder;
}
@Override
public void ref(TraceSegmentRef ref) {
if (refs == null) {
refs = new LinkedList<>();
}
/*
* Provide the OOM protection if the entry span hosts too many references.
*/
if (refs.size() == Config.Agent.TRACE_SEGMENT_REF_LIMIT_PER_SPAN) {
return;
}
if (!refs.contains(ref)) {
refs.add(ref);
}
}
/**
* 异步开始前,要先调用这个方法
* @return
*/
@Override
public AbstractSpan prepareForAsync() {
if (isInAsyncMode) {
throw new RuntimeException("Prepare for async repeatedly. Span is already in async mode.");
}
// 等待异步完成
ContextManager.awaitFinishAsync(this);
isInAsyncMode = true;
return this;
}
/**
* 异步任务结束时,要调用这个方法
* @return
*/
@Override
public AbstractSpan asyncFinish() {
if (!isInAsyncMode) {
throw new RuntimeException("Span is not in async mode, please use '#prepareForAsync' to active.");
}
if (isAsyncStopped) {
throw new RuntimeException("Can not do async finish for the span repeatedly.");
}
this.endTime = System.currentTimeMillis();
owner.asyncStop(this);
isAsyncStopped = true;
return this;
}
@Override
public boolean isProfiling() {
return this.owner.profileStatus().isProfiling();
}
@Override
public void skipAnalysis() {
this.skipAnalysis = true;
}
}
StackBasedTracingSpan
An abstract class for stack-based spans. There is no real stack structure; the stack is simulated with the stackDepth field (the current stack depth).
/**
* The <code>StackBasedTracingSpan</code> represents a span with an inside stack construction.
* <p>
* This kind of span can start and finish multi times in a stack-like invoke line.
*
* 基于栈的span
*
*/
public abstract class StackBasedTracingSpan extends AbstractTracingSpan {
/**
* 当前栈深
*/
protected int stackDepth;
protected String peer;
protected StackBasedTracingSpan(int spanId, int parentSpanId, String operationName, TracingContext owner) {
super(spanId, parentSpanId, operationName, owner);
this.stackDepth = 0;
this.peer = null;
}
protected StackBasedTracingSpan(int spanId, int parentSpanId, String operationName, String peer,
TracingContext owner) {
super(spanId, parentSpanId, operationName, owner);
this.peer = peer;
}
@Override
public SpanObject.Builder transform() {
SpanObject.Builder spanBuilder = super.transform();
if (StringUtil.isNotEmpty(peer)) {
spanBuilder.setPeer(peer);
}
return spanBuilder;
}
@Override
public boolean finish(TraceSegment owner) {
if (--stackDepth == 0) {
return super.finish(owner);
} else {
return false;
}
}
@Override
public AbstractSpan setPeer(final String remotePeer) {
this.peer = remotePeer;
return this;
}
}
EntrySpan and ExitSpan
These two are the spans that actually do the work; everything above is abstract classes and interfaces.
Invocation logic
Even a simple API request passes through many frameworks, such as Tomcat and Spring MVC, and SkyWalking ships plugins for these frameworks. The first plugin that runs creates the EntrySpan; the plugins that run after it reuse that same EntrySpan and only overwrite some of its data. The ExitSpan, by contrast, is not reused, and one Segment may contain several of them.
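A rough sketch of that reuse (hypothetical operation names; both calls happen on the same request thread, and carrierFromHeaders stands for the ContextCarrier deserialized from the incoming request):
// The Tomcat plugin runs first and creates the EntrySpan:
AbstractSpan tomcatSpan = ContextManager.createEntrySpan("/order/{id}", carrierFromHeaders);
// The Spring MVC plugin runs later on the same thread and gets the SAME EntrySpan back;
// start() only bumps stackDepth, and operationName/component/layer are overwritten:
AbstractSpan mvcSpan = ContextManager.createEntrySpan("OrderController.detail", carrierFromHeaders);
// mvcSpan == tomcatSpan is true here.
// Each plugin calls ContextManager.stopSpan() on its way out; the EntrySpan is only
// archived into the segment once its stackDepth drops back to 0.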
/**
* The <code>EntrySpan</code> represents a service provider point, such as Tomcat server entrance.
* <p>
* It is a start point of {@link TraceSegment}, even in a complex application, there maybe have multi-layer entry point,
* the <code>EntrySpan</code> only represents the first one.
* <p>
* But with the last <code>EntrySpan</code>'s tags and logs, which have more details about a service provider.
* <p>
* Such as: Tomcat Embed - Dubbox The <code>EntrySpan</code> represents the Dubbox span.
*/
public class EntrySpan extends StackBasedTracingSpan {
// 当前最大栈深
private int currentMaxDepth;
public EntrySpan(int spanId, int parentSpanId, String operationName, TracingContext owner) {
super(spanId, parentSpanId, operationName, owner);
this.currentMaxDepth = 0;
}
/**
* Set the {@link #startTime}, when the first start, which means the first service provided.
*
* EntrySpan 只会由第一个插件创建, 但是后面的插件复用 EntrySpan 时 都要来调用一次 start() 方法
* 因为每一个插件都以为自己是第一个创建这个 EntrySpan 的
*/
@Override
public EntrySpan start() {
if ((currentMaxDepth = ++stackDepth) == 1) {
super.start();
}
clearWhenRestart();
return this;
}
@Override
public EntrySpan tag(String key, String value) {
if (stackDepth == currentMaxDepth || isInAsyncMode) {
super.tag(key, value);
}
return this;
}
@Override
public AbstractTracingSpan setLayer(SpanLayer layer) {
if (stackDepth == currentMaxDepth || isInAsyncMode) {
return super.setLayer(layer);
} else {
return this;
}
}
@Override
public AbstractTracingSpan setComponent(Component component) {
if (stackDepth == currentMaxDepth || isInAsyncMode) {
return super.setComponent(component);
} else {
return this;
}
}
@Override
public AbstractTracingSpan setOperationName(String operationName) {
if (stackDepth == currentMaxDepth || isInAsyncMode) {
return super.setOperationName(operationName);
} else {
return this;
}
}
@Override
public EntrySpan log(Throwable t) {
super.log(t);
return this;
}
@Override
public boolean isEntry() {
return true;
}
@Override
public boolean isExit() {
return false;
}
private void clearWhenRestart() {
this.componentId = DictionaryUtil.nullValue();
this.layer = null;
this.logs = null;
this.tags = null;
}
}
/**
* The <code>ExitSpan</code> represents a service consumer point, such as Feign, Okhttp client for an Http service.
* <p>
* It is an exit point or a leaf span(our old name) of trace tree. In a single rpc call, because of a combination of
* discovery libs, there maybe contain multi-layer exit point:
* <p>
* The <code>ExitSpan</code> only presents the first one.
* <p>
* Such as: Dubbox - Apache Httpcomponent - ...(Remote) The <code>ExitSpan</code> represents the Dubbox span, and ignore
* the httpcomponent span's info.
*
* 退出span 代表消费侧
* 区别就是
* EntrySpan 代表的是更靠近服务这一侧的信息
* ExitSpan 代表的是更靠近消费这一侧的信息
*
* -
* ExitSpan代表一个服务消费点,比如Feign,Okhttp客户端为一个Http服务。
* 它是跟踪树的出口点或叶子跨度(我们的旧名称)。在单个 rpc 调用中,由于发现库的组合,可能包含多层出口点:
* ExitSpan仅显示第一个。
* 如:Dubbox - Apache Httpcomponent - ...(Remote) ExitSpan代表Dubbox span,忽略httpcomponent span的信息。退出跨度
*
*/
public class ExitSpan extends StackBasedTracingSpan implements ExitTypeSpan {
public ExitSpan(int spanId, int parentSpanId, String operationName, String peer, TracingContext owner) {
super(spanId, parentSpanId, operationName, peer, owner);
}
public ExitSpan(int spanId, int parentSpanId, String operationName, TracingContext owner) {
super(spanId, parentSpanId, operationName, owner);
}
/**
* Set the {@link #startTime}, when the first start, which means the first service provided.
*/
@Override
public ExitSpan start() {
// 当前栈深时 是1 的情况下,才允许
// exitSpan 刚创建时, 栈深才会是1
if (++stackDepth == 1) {
super.start();
}
return this;
}
@Override
public ExitSpan tag(String key, String value) {
if (stackDepth == 1 || isInAsyncMode) {
super.tag(key, value);
}
return this;
}
@Override
public AbstractTracingSpan tag(AbstractTag<?> tag, String value) {
if (stackDepth == 1 || tag.isCanOverwrite() || isInAsyncMode) {
super.tag(tag, value);
}
return this;
}
@Override
public AbstractTracingSpan setLayer(SpanLayer layer) {
if (stackDepth == 1 || isInAsyncMode) {
return super.setLayer(layer);
} else {
return this;
}
}
@Override
public AbstractTracingSpan setComponent(Component component) {
if (stackDepth == 1 || isInAsyncMode) {
return super.setComponent(component);
} else {
return this;
}
}
@Override
public ExitSpan log(Throwable t) {
super.log(t);
return this;
}
@Override
public AbstractTracingSpan setOperationName(String operationName) {
if (stackDepth == 1 || isInAsyncMode) {
return super.setOperationName(operationName);
} else {
return this;
}
}
@Override
public String getPeer() {
return peer;
}
@Override
public ExitSpan inject(final ContextCarrier carrier) {
this.owner.inject(this, carrier);
return this;
}
@Override
public boolean isEntry() {
return false;
}
@Override
public boolean isExit() {
return true;
}
}
The tracing context
- AbstractTracerContext: the interface that defines the basic operations.
- TracingContext: the core tracing logic controller; it manages the current Segment and its references to the Segments before and after it.
AbstractTracerContext
Propagating data across processes
// **** Cross-process propagation: inject() packs the data, extract() unpacks it. ****
/**
 * Prepare for the cross-process propagation. How to initialize the carrier, depends on the implementation.
 *
 * Inject: put the context data into the carrier.
 * @param carrier to carry the context for crossing process.
 */
void inject(ContextCarrier carrier);
/**
 * Build the reference between this segment and a cross-process segment. How to build, depends on the
 * implementation.
 *
 * Extract: read the context data back out of the carrier.
 *
 * @param carrier carried the context from a cross-process segment.
 */
void extract(ContextCarrier carrier);
Propagating data across threads
// **** Cross-thread propagation: capture() packs the data, continued() unpacks it. ****
/**
 * Capture a snapshot for cross-thread propagation. It's a similar concept with ActiveSpan.Continuation in
 * OpenTracing-java How to build, depends on the implementation.
 *
 * @return the {@link ContextSnapshot} , which includes the reference context.
 */
ContextSnapshot capture();
/**
 * Continue from the given snapshot.
 * Build the reference between this segment and a cross-thread segment. How to build, depends on the
 * implementation.
 *
 * @param snapshot from {@link #capture()} in the parent thread.
 */
void continued(ContextSnapshot snapshot);
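A rough usage sketch of the cross-thread pair (executor stands for any thread pool; the flow mirrors how SkyWalking's cross-thread wrappers are typically used): capture() runs in the parent thread, continued() runs in the child thread after a span has been opened there.
// parent thread
final ContextSnapshot snapshot = ContextManager.capture();
executor.submit(() -> {
    // child thread: open a span first, then link the new segment back to the parent one
    AbstractSpan span = ContextManager.createLocalSpan("async-task");
    ContextManager.continued(snapshot);
    try {
        // ... business logic ...
    } finally {
        ContextManager.stopSpan();
    }
});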
TracingContext
- One TracingContext manages exactly one Segment.
- It manages the current Segment and the TraceSegmentRef references to the Segments before and after it.
- It manages all spans inside the current Segment.
activeSpanStack
activeSpanStack is an important field: the author uses a LinkedList as a stack to store spans. Every span that is created is pushed onto activeSpanStack (last in, first out), so the span on top of the stack is the current span (the activeSpan).
/**
activeSpanStack栈顶的span就是activeSpan
* @return the active span of current context, the top element of {@link #activeSpanStack}
*/
@Override
public AbstractSpan activeSpan() {
AbstractSpan span = peek();
if (span == null) {
throw new IllegalStateException("No active span.");
}
return span;
}
/**
* @return the top element of 'ActiveSpanStack' only.
*/
private AbstractSpan peek() {
if (activeSpanStack.isEmpty()) {
return null;
}
return activeSpanStack.getLast();
}
createEntrySpan()
- Limit check: if the span limit has been reached, a NoopSpan is created instead.
- Set the parent: peek the span on top of activeSpanStack (the activeSpan) and use its id as parentSpanId; if there is no parent, use -1.
- Reuse: when creating an EntrySpan or ExitSpan, if the parent span is of the same type it is reused (its data is overwritten); otherwise a new span is created and pushed. A LocalSpan skips this check and is always created and pushed.
/**
* Create an entry span
*
* @param operationName most likely a service name
* @return span instance. Ref to {@link EntrySpan}
*/
@Override
public AbstractSpan createEntrySpan(final String operationName) {
// 限制机制
// spanLimit配置项
if (isLimitMechanismWorking()) {
NoopSpan span = new NoopSpan();
return push(span);
}
AbstractSpan entrySpan;
TracingContext owner = this;
// 弹出一个span作为父级。这里的peek 不会删除元素
final AbstractSpan parentSpan = peek();
// 拿到父级span的ID,如果不存在父级,赋值为-1
final int parentSpanId = parentSpan == null ? -1 : parentSpan.getSpanId();
// 不为null 复用span,覆写信息
if (parentSpan != null && parentSpan.isEntry()) {
/*
* Only add the profiling recheck on creating entry span,
* as the operation name could be overrided.
*/
profilingRecheck(parentSpan, operationName);
parentSpan.setOperationName(operationName);
entrySpan = parentSpan;
return entrySpan.start();
} else {
// 巧了,没有父级,创建 EntrySpan。并入栈
entrySpan = new EntrySpan(
spanIdGenerator++, parentSpanId,
operationName, owner
);
entrySpan.start();
return push(entrySpan);
}
}
stopSpan()
- The span passed in must be the one on top of activeSpanStack, otherwise an exception is thrown.
- The top span is finished and popped; if it is an AbstractTracingSpan, its own finish() method is called, and it is only popped once finish() returns true.
- If the stack is now empty and the TracingContext is still in the running state:
- the current TraceSegment is closed;
- the TraceSegment is handed to the TracingContextListeners, which send it to the OAP;
- the TracingContext's running flag is set to false.
/**
*停止, 只能停止栈顶的span。
* >按照子父级的概念 要先把子级关闭,才能去关闭父级
* Stop the given span, if and only if this one is the top element of {@link #activeSpanStack}. Because the tracing
* core must make sure the span must match in a stack module, like any program did.
*
* @param span to finish
*/
@Override
public boolean stopSpan(AbstractSpan span) {
AbstractSpan lastSpan = peek();
if (lastSpan == span) {
if (lastSpan instanceof AbstractTracingSpan) {
AbstractTracingSpan toFinishSpan = (AbstractTracingSpan) lastSpan;
if (toFinishSpan.finish(segment)) {
pop();
}
} else {
pop();
}
} else {
throw new IllegalStateException("Stopping the unexpected span = " + span);
}
finish();
return activeSpanStack.isEmpty();
}
The context adapter: ContextManager
/**
* {@link ContextManager} controls the whole context of {@link TraceSegment}. Any {@link TraceSegment} relates to
* single-thread, so this context use {@link ThreadLocal} to maintain the context, and make sure, since a {@link
* TraceSegment} starts, all ChildOf spans are in the same context. <p> What is 'ChildOf'?
* https://github.com/opentracing/specification/blob/master/specification.md#references-between-spans
*
* <p> Also, {@link ContextManager} delegates to all {@link AbstractTracerContext}'s major methods.
* <p>
* ContextManager代理了AbstractTracerContext主要的方法
* TraceSegment及其所包含的Span都在同一个线程内,ContextManager使用ThreadLocal来管理TraceSegment的上下文(也就是AbstractTracerContext)
*/
public class ContextManager implements BootService {
private static final String EMPTY_TRACE_CONTEXT_ID = "N/A";
private static final ILog LOGGER = LogManager.getLogger(ContextManager.class);
private static ThreadLocal<AbstractTracerContext> CONTEXT = new ThreadLocal<AbstractTracerContext>();
private static ThreadLocal<RuntimeContext> RUNTIME_CONTEXT = new ThreadLocal<RuntimeContext>();
private static ContextManagerExtendService EXTEND_SERVICE;
private static AbstractTracerContext getOrCreate(String operationName, boolean forceSampling) {
// 从 threadLocal 中获取 AbstractTracerContext, 存在就返回,不存在就创建。
AbstractTracerContext context = CONTEXT.get();
if (context == null) {
// operationName为空创建IgnoredTracerContext
if (StringUtil.isEmpty(operationName)) {
if (LOGGER.isDebugEnable()) {
LOGGER.debug("No operation name, ignore this trace.");
}
context = new IgnoredTracerContext();
} else {
// 初始化 ContextManagerExtendService
// 调用ContextManagerExtendService的createTraceContext方法创建AbstractTracerContext,并设置到ThreadLocal中
if (EXTEND_SERVICE == null) {
EXTEND_SERVICE = ServiceManager.INSTANCE.findService(ContextManagerExtendService.class);
}
context = EXTEND_SERVICE.createTraceContext(operationName, forceSampling);
}
CONTEXT.set(context);
}
return context;
}
private static AbstractTracerContext get() {
return CONTEXT.get();
}
/**
* @return the first global trace id when tracing. Otherwise, "N/A".
*/
public static String getGlobalTraceId() {
AbstractTracerContext context = CONTEXT.get();
return Objects.nonNull(context) ? context.getReadablePrimaryTraceId() : EMPTY_TRACE_CONTEXT_ID;
}
/**
* @return the current segment id when tracing. Otherwise, "N/A".
*/
public static String getSegmentId() {
AbstractTracerContext context = CONTEXT.get();
return Objects.nonNull(context) ? context.getSegmentId() : EMPTY_TRACE_CONTEXT_ID;
}
/**
* @return the current span id when tracing. Otherwise, the value is -1.
*/
public static int getSpanId() {
AbstractTracerContext context = CONTEXT.get();
return Objects.nonNull(context) ? context.getSpanId() : -1;
}
public static AbstractSpan createEntrySpan(String operationName, ContextCarrier carrier) {
AbstractSpan span;
AbstractTracerContext context;
operationName = StringUtil.cut(operationName, OPERATION_NAME_THRESHOLD);
if (carrier != null && carrier.isValid()) {
SamplingService samplingService = ServiceManager.INSTANCE.findService(SamplingService.class);
samplingService.forceSampled();
// 一定要强制采样,因为链路中的前置TraceSegment已经存在,否则链路就可能会断开
context = getOrCreate(operationName, true);
span = context.createEntrySpan(operationName);
context.extract(carrier);
} else {
// 不需要强制采样,根据采样率来决定当前链路是否要采样
context = getOrCreate(operationName, false);
span = context.createEntrySpan(operationName);
}
return span;
}
public static AbstractSpan createLocalSpan(String operationName) {
operationName = StringUtil.cut(operationName, OPERATION_NAME_THRESHOLD);
AbstractTracerContext context = getOrCreate(operationName, false);
return context.createLocalSpan(operationName);
}
public static AbstractSpan createExitSpan(String operationName, ContextCarrier carrier, String remotePeer) {
if (carrier == null) {
throw new IllegalArgumentException("ContextCarrier can't be null.");
}
operationName = StringUtil.cut(operationName, OPERATION_NAME_THRESHOLD);
AbstractTracerContext context = getOrCreate(operationName, false);
AbstractSpan span = context.createExitSpan(operationName, remotePeer);
context.inject(carrier);
return span;
}
public static AbstractSpan createExitSpan(String operationName, String remotePeer) {
operationName = StringUtil.cut(operationName, OPERATION_NAME_THRESHOLD);
AbstractTracerContext context = getOrCreate(operationName, false);
return context.createExitSpan(operationName, remotePeer);
}
public static void inject(ContextCarrier carrier) {
get().inject(carrier);
}
public static void extract(ContextCarrier carrier) {
if (carrier == null) {
throw new IllegalArgumentException("ContextCarrier can't be null.");
}
if (carrier.isValid()) {
get().extract(carrier);
}
}
public static ContextSnapshot capture() {
return get().capture();
}
public static void continued(ContextSnapshot snapshot) {
if (snapshot == null) {
throw new IllegalArgumentException("ContextSnapshot can't be null.");
}
if (!snapshot.isFromCurrent()) {
get().continued(snapshot);
}
}
public static AbstractTracerContext awaitFinishAsync(AbstractSpan span) {
final AbstractTracerContext context = get();
AbstractSpan activeSpan = context.activeSpan();
if (span != activeSpan) {
throw new RuntimeException("Span is not the active in current context.");
}
return context.awaitFinishAsync();
}
/**
* If not sure has the active span, use this method, will be cause NPE when has no active span, use
* ContextManager::isActive method to determine whether there has the active span.
*/
public static AbstractSpan activeSpan() {
return get().activeSpan();
}
/**
* Recommend use ContextManager::stopSpan(AbstractSpan span), because in that way, the TracingContext core could
* verify this span is the active one, in order to avoid stop unexpected span. If the current span is hard to get or
* only could get by low-performance way, this stop way is still acceptable.
*/
public static void stopSpan() {
final AbstractTracerContext context = get();
stopSpan(context.activeSpan(), context);
}
public static void stopSpan(AbstractSpan span) {
stopSpan(span, get());
}
private static void stopSpan(AbstractSpan span, final AbstractTracerContext context) {
if (context.stopSpan(span)) {
CONTEXT.remove();
RUNTIME_CONTEXT.remove();
}
}
@Override
public void prepare() {
}
@Override
public void boot() {
}
@Override
public void onComplete() {
}
@Override
public void shutdown() {
}
public static boolean isActive() {
return get() != null;
}
public static RuntimeContext getRuntimeContext() {
RuntimeContext runtimeContext = RUNTIME_CONTEXT.get();
if (runtimeContext == null) {
runtimeContext = new RuntimeContext(RUNTIME_CONTEXT);
RUNTIME_CONTEXT.set(runtimeContext);
}
return runtimeContext;
}
public static CorrelationContext getCorrelationContext() {
final AbstractTracerContext tracerContext = get();
if (tracerContext == null) {
return null;
}
return tracerContext.getCorrelationContext();
}
}
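Putting the pieces together, an instrumentation plugin typically drives ContextManager roughly as follows (a simplified sketch: operation names and the peer address are made up, and the reading/writing of the carrier items from/to HTTP headers is omitted):
// Entry side: deserialize the propagated context from the request headers into a carrier,
// then open the entry span (createEntrySpan() calls extract() itself when the carrier is valid).
ContextCarrier carrier = new ContextCarrier();
AbstractSpan entry = ContextManager.createEntrySpan("GET:/order/{id}", carrier);
entry.setComponent(ComponentsDefine.TOMCAT);
entry.setLayer(SpanLayer.HTTP);
// Exit side: open the exit span; inject() fills the outgoing carrier,
// whose items are then written into the downstream request headers.
ContextCarrier outCarrier = new ContextCarrier();
AbstractSpan exit = ContextManager.createExitSpan("GET:/stock/deduct", outCarrier, "stock-service:8080");
// Spans must be stopped innermost-first; when the last one is stopped,
// the segment is finished and handed over to the TracingContextListeners.
ContextManager.stopSpan();   // stops the exit span (top of activeSpanStack)
ContextManager.stopSpan();   // stops the entry span and finishes the segment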
DataCarrier
The trace data collected by the agent is first written into a DataCarrier; consumer threads then read it out of the DataCarrier and report it to the OAP.
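A minimal round-trip sketch (the PrintConsumer class is hypothetical; the constructor and calls mirror the ones used by TraceSegmentServiceClient further down in this section):
import java.util.List;
import org.apache.skywalking.apm.commons.datacarrier.DataCarrier;
import org.apache.skywalking.apm.commons.datacarrier.buffer.BufferStrategy;
import org.apache.skywalking.apm.commons.datacarrier.consumer.IConsumer;
public class DataCarrierDemo {
    // Hypothetical consumer that just prints whatever it drains from the buffers.
    static class PrintConsumer implements IConsumer<String> {
        @Override public void init() { }
        @Override public void consume(List<String> data) { data.forEach(System.out::println); }
        @Override public void onError(List<String> data, Throwable t) { t.printStackTrace(); }
        @Override public void onExit() { }
        @Override public void nothingToConsume() { }
    }
    public static void main(String[] args) {
        // 2 channels of 100 slots each; IF_POSSIBLE drops data when a slot is occupied.
        DataCarrier<String> carrier = new DataCarrier<>(2, 100, BufferStrategy.IF_POSSIBLE);
        carrier.consume(new PrintConsumer(), 1);   // one ConsumerThread draining both buffers
        carrier.produce("segment-1");              // producer side: save into one of the buffers
        carrier.shutdownConsumers();               // drain one last time, then stop the thread
    }
}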
Diagram of the related data structures.
Basic Buffer
The underlying storage is an array.
Buffer
/**
* Self implementation ring queue.
* 自行实现环形队列。
* // jdk 知识。 jdk 9 之后
* AtomicIntegerArray 中 VarHandle 替代以往的直接使用 Unsafe, 目的是为了更安全的去操作内存,提升性能
* 屏蔽了 Unsafe 的危险性
*/
public class Buffer<T> implements QueueBuffer<T> {
// 数据的数组
private final Object[] buffer;
// 策略
private BufferStrategy strategy;
// 数组 buffer 的索引
private AtomicRangeInteger index;
Buffer(int bufferSize, BufferStrategy strategy) {
buffer = new Object[bufferSize];
this.strategy = strategy;
index = new AtomicRangeInteger(0, bufferSize);
}
@Override
public void setStrategy(BufferStrategy strategy) {
this.strategy = strategy;
}
/**
* 环状队列。
* getAndIncrement(),会为data分配下标。如果数组已经满了,会从0开始。
* 在arr[index] 的 value 不为空的情况下,根据策略来决定是否覆盖。
*
* @param data to add.
* @return
*/
@Override
public boolean save(T data) {
int i = index.getAndIncrement();
if (buffer[i] != null) {
// 策略
switch (strategy) {
case IF_POSSIBLE:
return false;
default:
}
}
buffer[i] = data;
return true;
}
@Override
public int getBufferSize() {
return buffer.length;
}
@Override
public void obtain(List<T> consumeList) {
this.obtain(consumeList, 0, buffer.length);
}
void obtain(List<T> consumeList, int start, int end) {
for (int i = start; i < end; i++) {
if (buffer[i] != null) {
consumeList.add((T) buffer[i]);
buffer[i] = null;
}
}
}
}
public enum BufferStrategy {
/**
* 阻塞,等待队列有空位置
*/
BLOCKING,
/**
* 能放就放,不能放就算了
*/
IF_POSSIBLE
}
ArrayBlockingQueueBuffer
/**
* The buffer implementation based on JDK ArrayBlockingQueue.
* <p>
* This implementation has better performance in server side. We are still trying to research whether this is suitable
* for agent side, which is more sensitive about blocks.
*
* 阻塞队列实现的 Buffer
* 作者说 在 OAP 中 使用 ArrayBlockingQueue 拥有更高的性能。就想在agent 端试试
*
*/
public class ArrayBlockingQueueBuffer<T> implements QueueBuffer<T> {
private BufferStrategy strategy;
private ArrayBlockingQueue<T> queue;
private int bufferSize;
ArrayBlockingQueueBuffer(int bufferSize, BufferStrategy strategy) {
this.strategy = strategy;
this.queue = new ArrayBlockingQueue<T>(bufferSize);
this.bufferSize = bufferSize;
}
@Override
public boolean save(T data) {
//only BufferStrategy.BLOCKING
try {
queue.put(data);
} catch (InterruptedException e) {
// Ignore the error
return false;
}
return true;
}
@Override
public void setStrategy(BufferStrategy strategy) {
this.strategy = strategy;
}
@Override
public void obtain(List<T> consumeList) {
queue.drainTo(consumeList);
}
@Override
public int getBufferSize() {
return bufferSize;
}
}
Channels
Manages a group of Buffers.
/**
* Channels of Buffer It contains all buffer data which belongs to this channel. It supports several strategy when
* buffer is full. The Default is BLOCKING <p> Created by wusheng on 2016/10/25.
*
* Buffer Channels 包含属于该通道的所有缓冲区数据。当缓冲区已满时,它支持多种策略。默认为阻塞
*/
public class Channels<T> {
// 被管理的 buffer
private final QueueBuffer<T>[] bufferChannels;
// 分区器 1. 滚动分区。2.线程id取模
private IDataPartitioner<T> dataPartitioner;
// 策略
private final BufferStrategy strategy;
// 数量
private final long size;
public Channels(int channelSize, int bufferSize, IDataPartitioner<T> partitioner, BufferStrategy strategy) {
this.dataPartitioner = partitioner;
this.strategy = strategy;
bufferChannels = new QueueBuffer[channelSize];
for (int i = 0; i < channelSize; i++) {
if (BufferStrategy.BLOCKING.equals(strategy)) {
bufferChannels[i] = new ArrayBlockingQueueBuffer<>(bufferSize, strategy);
} else {
bufferChannels[i] = new Buffer<>(bufferSize, strategy);
}
}
// noinspection PointlessArithmeticExpression
size = 1L * channelSize * bufferSize; // it's not pointless, it prevents numeric overflow before assigning an integer to a long
}
public boolean save(T data) {
// Buffer 的索引。即选择那个 Buffer 来储存数据
int index = dataPartitioner.partition(bufferChannels.length, data);
int retryCountDown = 1;
if (BufferStrategy.IF_POSSIBLE.equals(strategy)) {
int maxRetryCount = dataPartitioner.maxRetryCount();
if (maxRetryCount > 1) {
retryCountDown = maxRetryCount;
}
}
for (; retryCountDown > 0; retryCountDown--) {
if (bufferChannels[index].save(data)) {
return true;
}
}
return false;
}
public void setPartitioner(IDataPartitioner<T> dataPartitioner) {
this.dataPartitioner = dataPartitioner;
}
/**
* override the strategy at runtime. Notice, this will override several channels one by one. So, when running
* setStrategy, each channel may use different BufferStrategy
*/
public void setStrategy(BufferStrategy strategy) {
for (QueueBuffer<T> buffer : bufferChannels) {
buffer.setStrategy(strategy);
}
}
/**
* get channelSize
*/
public int getChannelSize() {
return this.bufferChannels.length;
}
public long size() {
return size;
}
public QueueBuffer<T> getBuffer(int index) {
return this.bufferChannels[index];
}
}
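The partitioner decides which Buffer a given piece of data goes to. For reference, a thread-id-modulo partitioner (the second strategy mentioned in the comment above) could look roughly like this; the IDataPartitioner method set and import path are inferred from the calls Channels makes (partition() and maxRetryCount()), so treat it as a sketch:
import org.apache.skywalking.apm.commons.datacarrier.partition.IDataPartitioner;
public class ThreadIdPartitioner<T> implements IDataPartitioner<T> {
    @Override
    public int partition(int total, T data) {
        // pick a buffer by the producing thread's id, so each thread keeps writing to the same buffer
        return (int) (Thread.currentThread().getId() % total);
    }
    @Override
    public int maxRetryCount() {
        // only consulted under IF_POSSIBLE: how many times Channels.save() retries
        return 1;
    }
}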
Consumers
A consumer reads the data out of the DataCarrier and reports it to the OAP. IConsumer is the top-level consumer interface and defines the basic methods.
ConsumerThread
A ConsumerThread holds multiple DataSources, and a DataSource wraps a Buffer. Each ConsumerThread is bound to one Consumer, which consumes the data from all of the thread's DataSources.
/**
* 一个线程,绑定一个消费者
* 一个消费者,绑定多个 Buffer
* @param <T>
*/
public class ConsumerThread<T> extends Thread {
private volatile boolean running;
private IConsumer<T> consumer;
private List<DataSource> dataSources;
// 本次消费没有取到数据时,现成 sleep 的时间
private long consumeCycle;
ConsumerThread(String threadName, IConsumer<T> consumer, long consumeCycle) {
super(threadName);
this.consumer = consumer;
running = false;
dataSources = new ArrayList<DataSource>(1);
this.consumeCycle = consumeCycle;
}
/**
* add whole buffer to consume
*/
void addDataSource(QueueBuffer<T> sourceBuffer) {
this.dataSources.add(new DataSource(sourceBuffer));
}
@Override
public void run() {
running = true;
final List<T> consumeList = new ArrayList<T>(1500);
while (running) {
// 没取到数据? 睡一会
if (!consume(consumeList)) {
try {
Thread.sleep(consumeCycle);
} catch (InterruptedException e) {
}
}
}
// consumer thread is going to stop
// consume the last time
// 在结束时,再消费一次
consume(consumeList);
consumer.onExit();
}
/**
* 将数据 放到 consumeList。 并消费
* @param consumeList
* @return
*/
private boolean consume(List<T> consumeList) {
for (DataSource dataSource : dataSources) {
dataSource.obtain(consumeList);
}
if (!consumeList.isEmpty()) {
try {
consumer.consume(consumeList);
} catch (Throwable t) {
consumer.onError(consumeList, t);
} finally {
consumeList.clear();
}
return true;
}
consumer.nothingToConsume();
return false;
}
void shutdown() {
running = false;
}
/**
* 适配器
* DataSource is a refer to {@link Buffer}.
*/
class DataSource {
private QueueBuffer<T> sourceBuffer;
DataSource(QueueBuffer<T> sourceBuffer) {
this.sourceBuffer = sourceBuffer;
}
void obtain(List<T> consumeList) {
sourceBuffer.obtain(consumeList);
}
}
}
MultipleChannelsConsumer
A single consumer thread that supports multiple Channels and their consumers.
A Group holds one Consumer and one Channels; a Channels holds multiple Buffers, and the Consumer consumes all Buffers in its Channels.
A MultipleChannelsConsumer holds multiple Groups, so it effectively manages several Consumers together with their corresponding Buffers.
package org.apache.skywalking.apm.commons.datacarrier.consumer;
import java.util.ArrayList;
import java.util.List;
import org.apache.skywalking.apm.commons.datacarrier.buffer.Channels;
import org.apache.skywalking.apm.commons.datacarrier.buffer.QueueBuffer;
/**
* MultipleChannelsConsumer represent a single consumer thread, but support multiple channels with their {@link
* IConsumer}s
* 一个单消费者线程,但支持多个channels和它们的消费者
*/
public class MultipleChannelsConsumer extends Thread {
private volatile boolean running;
private volatile ArrayList<Group> consumeTargets;
@SuppressWarnings("NonAtomicVolatileUpdate")
private volatile long size;
private final long consumeCycle;
public MultipleChannelsConsumer(String threadName, long consumeCycle) {
super(threadName);
this.consumeTargets = new ArrayList<Group>();
this.consumeCycle = consumeCycle;
}
@Override
public void run() {
running = true;
final List consumeList = new ArrayList(2000);
while (running) {
boolean hasData = false;
for (Group target : consumeTargets) {
boolean consume = consume(target, consumeList);
hasData = hasData || consume;
}
if (!hasData) {
try {
Thread.sleep(consumeCycle);
} catch (InterruptedException e) {
}
}
}
// consumer thread is going to stop
// consume the last time
for (Group target : consumeTargets) {
consume(target, consumeList);
target.consumer.onExit();
}
}
private boolean consume(Group target, List consumeList) {
// 遍历channels中的buffer,将buffer中的数据放到consumeList中,并清空buffer
for (int i = 0; i < target.channels.getChannelSize(); i++) {
QueueBuffer buffer = target.channels.getBuffer(i);
buffer.obtain(consumeList);
}
if (!consumeList.isEmpty()) {
try {
// 调用消费者的消费逻辑
target.consumer.consume(consumeList);
} catch (Throwable t) {
target.consumer.onError(consumeList, t);
} finally {
consumeList.clear();
}
return true;
}
target.consumer.nothingToConsume();
return false;
}
/**
* Add a new target channels.
*/
public void addNewTarget(Channels channels, IConsumer consumer) {
Group group = new Group(channels, consumer);
// Recreate the new list to avoid change list while the list is used in consuming.
ArrayList<Group> newList = new ArrayList<Group>();
for (Group target : consumeTargets) {
newList.add(target);
}
newList.add(group);
consumeTargets = newList;
size += channels.size();
}
public long size() {
return size;
}
void shutdown() {
running = false;
}
private static class Group {
// 一个channels对应多个buffer
private Channels channels;
// consumer会消费channels中所有的buffer
private IConsumer consumer;
public Group(Channels channels, IConsumer consumer) {
this.channels = channels;
this.consumer = consumer;
}
}
}
The consume driver
ConsumeDriver
A ConsumeDriver holds multiple ConsumerThreads.
/**
* Pool of consumers <p> Created by wusheng on 2016/10/25.
*
* 一堆消费者线程,拿着一堆 buffer , 按照 allocateBuffer2Thread() 的策略 进行分配消费。
*/
public class ConsumeDriver<T> implements IDriver {
private boolean running;
private ConsumerThread[] consumerThreads;
private Channels<T> channels;
private ReentrantLock lock;
public ConsumeDriver(String name, Channels<T> channels, Class<? extends IConsumer<T>> consumerClass, int num,
long consumeCycle) {
this(channels, num);
for (int i = 0; i < num; i++) {
consumerThreads[i] = new ConsumerThread("DataCarrier." + name + ".Consumer." + i + ".Thread", getNewConsumerInstance(consumerClass), consumeCycle);
consumerThreads[i].setDaemon(true);
}
}
public ConsumeDriver(String name, Channels<T> channels, IConsumer<T> prototype, int num, long consumeCycle) {
this(channels, num);
prototype.init();
for (int i = 0; i < num; i++) {
consumerThreads[i] = new ConsumerThread("DataCarrier." + name + ".Consumer." + i + ".Thread", prototype, consumeCycle);
consumerThreads[i].setDaemon(true);
}
}
private ConsumeDriver(Channels<T> channels, int num) {
running = false;
this.channels = channels;
consumerThreads = new ConsumerThread[num];
lock = new ReentrantLock();
}
private IConsumer<T> getNewConsumerInstance(Class<? extends IConsumer<T>> consumerClass) {
try {
IConsumer<T> inst = consumerClass.getDeclaredConstructor().newInstance();
inst.init();
return inst;
} catch (InstantiationException e) {
throw new ConsumerCannotBeCreatedException(e);
} catch (IllegalAccessException e) {
throw new ConsumerCannotBeCreatedException(e);
} catch (NoSuchMethodException e) {
throw new ConsumerCannotBeCreatedException(e);
} catch (InvocationTargetException e) {
throw new ConsumerCannotBeCreatedException(e);
}
}
@Override
public void begin(Channels channels) {
// begin只能调用一次
if (running) {
return;
}
lock.lock();
try {
this.allocateBuffer2Thread();
for (ConsumerThread consumerThread : consumerThreads) {
consumerThread.start();
}
running = true;
} finally {
lock.unlock();
}
}
@Override
public boolean isRunning(Channels channels) {
return running;
}
private void allocateBuffer2Thread() {
int channelSize = this.channels.getChannelSize();
/**
*
* 因为channels里面有多个buffer,同时这里也有多个消费者线程
* 这一步的操作就是将这些buffer分配给不同的消费者线程去消费
*
* if consumerThreads.length < channelSize
* each consumer will process several channels.
*
* if consumerThreads.length == channelSize
* each consumer will process one channel.
*
* if consumerThreads.length > channelSize
* there will be some threads do nothing.
*/
for (int channelIndex = 0; channelIndex < channelSize; channelIndex++) {
// 消费者线程索引 = buffer的下标和消费者线程数取模
int consumerIndex = channelIndex % consumerThreads.length;
consumerThreads[consumerIndex].addDataSource(channels.getBuffer(channelIndex));
}
}
@Override
public void close(Channels channels) {
lock.lock();
try {
this.running = false;
for (ConsumerThread consumerThread : consumerThreads) {
consumerThread.shutdown();
}
} finally {
lock.unlock();
}
}
}
BulkConsumePool
A BulkConsumePool holds multiple MultipleChannelsConsumers.
/**
* BulkConsumePool works for consuming data from multiple channels(DataCarrier instances), with multiple {@link
* MultipleChannelsConsumer}s.
* <p>
* In typical case, the number of {@link MultipleChannelsConsumer} should be less than the number of channels.
*
* BulkConsumePool 用于使用多个MultipleChannelsConsumer消耗来自多个通道(DataCarrier 实例)的数据。
* 在典型情况下, MultipleChannelsConsumer的数量应该小于通道的数量
*/
public class BulkConsumePool implements ConsumerPool {
private List<MultipleChannelsConsumer> allConsumers;
private volatile boolean isStarted = false;
public BulkConsumePool(String name, int size, long consumeCycle) {
size = EnvUtil.getInt(name + "_THREAD", size);
allConsumers = new ArrayList<MultipleChannelsConsumer>(size);
// 创建消费者线程
for (int i = 0; i < size; i++) {
MultipleChannelsConsumer multipleChannelsConsumer = new MultipleChannelsConsumer("DataCarrier." + name + ".BulkConsumePool." + i + ".Thread", consumeCycle);
multipleChannelsConsumer.setDaemon(true);
allConsumers.add(multipleChannelsConsumer);
}
}
@Override
synchronized public void add(String name, Channels channels, IConsumer consumer) {
// 拿到负载最低的线程
MultipleChannelsConsumer multipleChannelsConsumer = getLowestPayload();
multipleChannelsConsumer.addNewTarget(channels, consumer);
}
/**
* Get the lowest payload consumer thread based on current allocate status.
*
* @return the lowest consumer.
*/
private MultipleChannelsConsumer getLowestPayload() {
MultipleChannelsConsumer winner = allConsumers.get(0);
// 找出持有 buffer 数量最少的线程
for (int i = 1; i < allConsumers.size(); i++) {
MultipleChannelsConsumer option = allConsumers.get(i);
if (option.size() < winner.size()) {
winner = option;
}
}
return winner;
}
/**
*
*/
@Override
public boolean isRunning(Channels channels) {
return isStarted;
}
@Override
public void close(Channels channels) {
for (MultipleChannelsConsumer consumer : allConsumers) {
consumer.shutdown();
}
}
@Override
public void begin(Channels channels) {
if (isStarted) {
return;
}
for (MultipleChannelsConsumer consumer : allConsumers) {
consumer.start();
}
isStarted = true;
}
/**
* The creator for {@link BulkConsumePool}.
*/
public static class Creator implements Callable<ConsumerPool> {
private String name;
private int size;
private long consumeCycle;
public Creator(String name, int poolSize, long consumeCycle) {
this.name = name;
this.size = poolSize;
this.consumeCycle = consumeCycle;
}
@Override
public ConsumerPool call() {
return new BulkConsumePool(name, size, consumeCycle);
}
public static int recommendMaxSize() {
return Runtime.getRuntime().availableProcessors() * 2;
}
}
}
Sending the trace data to the OAP
After TracingContext.finish() closes the current TraceSegment, it calls ListenerManager.notifyFinish() with the just-closed TraceSegment. notifyFinish() iterates over all registered TracingContextListeners and calls their afterFinished() methods.
public class TracingContext implements AbstractTracerContext {
/**
* 结束TracingContext
* Finish this context, and notify all {@link TracingContextListener}s, managed by {@link
* TracingContext.ListenerManager} and {@link TracingContext.TracingThreadListenerManager}
*/
private void finish() {
if (isRunningInAsyncMode) {
asyncFinishLock.lock();
}
try {
// 栈已经空了 且 当前TracingContext还在运行状态
boolean isFinishedInMainThread = activeSpanStack.isEmpty() && running;
if (isFinishedInMainThread) {
/*
* Notify after tracing finished in the main thread.
*/
TracingThreadListenerManager.notifyFinish(this);
}
if (isFinishedInMainThread && (!isRunningInAsyncMode || asyncSpanCounter == 0)) {
// 关闭当前TraceSegment
TraceSegment finishedSegment = segment.finish(isLimitMechanismWorking());
// 将当前TraceSegment交给TracingContextListener去处理,TracingContextListener会将TraceSegment发送到OAP
TracingContext.ListenerManager.notifyFinish(finishedSegment);
// 修改当前TracingContext运行状态为false
running = false;
}
} finally {
if (isRunningInAsyncMode) {
asyncFinishLock.unlock();
}
}
}
/**
* The <code>ListenerManager</code> represents an event notify for every registered listener, which are notified
* when the <code>TracingContext</code> finished, and {@link #segment} is ready for further process.
*/
public static class ListenerManager {
private static List<TracingContextListener> LISTENERS = new LinkedList<>();
/**
* Add the given {@link TracingContextListener} to {@link #LISTENERS} list.
*
* @param listener the new listener.
*/
public static synchronized void add(TracingContextListener listener) {
LISTENERS.add(listener);
}
/**
* Notify the {@link TracingContext.ListenerManager} about the given {@link TraceSegment} have finished. And
* trigger {@link TracingContext.ListenerManager} to notify all {@link #LISTENERS} 's {@link
* TracingContextListener#afterFinished(TraceSegment)}
*
* @param finishedSegment the segment that has finished
*/
static void notifyFinish(TraceSegment finishedSegment) {
for (TracingContextListener listener : LISTENERS) {
listener.afterFinished(finishedSegment);
}
}
/**
* Clear the given {@link TracingContextListener}
*/
public static synchronized void remove(TracingContextListener listener) {
LISTENERS.remove(listener);
}
}
TraceSegmentServiceClient
TraceSegmentServiceClient registers itself as a TracingContextListener. TracingContext.finish() reaches it through that listener chain: when a Segment is being closed it is passed to afterFinished(), which puts it into the DataCarrier, where it is finally consumed and sent to the OAP.
/**
* 向OAP 发送数据
*/
@DefaultImplementor
public class TraceSegmentServiceClient implements BootService, IConsumer<TraceSegment>, TracingContextListener, GRPCChannelListener {
private static final ILog LOGGER = LogManager.getLogger(TraceSegmentServiceClient.class);
// 上一次打印传输traceSegment情况的日志的时间
private long lastLogTime;
// 成功发送的traceSegment数量
private long segmentUplinkedCounter;
// 因网络原因丢弃的traceSegment数量
private long segmentAbandonedCounter;
private volatile DataCarrier<TraceSegment> carrier;
private volatile TraceSegmentReportServiceGrpc.TraceSegmentReportServiceStub serviceStub;
private volatile GRPCChannelStatus status = GRPCChannelStatus.DISCONNECT;
@Override
public void prepare() {
ServiceManager.INSTANCE.findService(GRPCChannelManager.class).addChannelListener(this);
}
@Override
public void boot() {
lastLogTime = System.currentTimeMillis();
segmentUplinkedCounter = 0;
segmentAbandonedCounter = 0;
carrier = new DataCarrier<>(CHANNEL_SIZE, BUFFER_SIZE, BufferStrategy.IF_POSSIBLE);
carrier.consume(this, 1);
}
@Override
public void onComplete() {
TracingContext.ListenerManager.add(this);
}
@Override
public void shutdown() {
TracingContext.ListenerManager.remove(this);
carrier.shutdownConsumers();
}
@Override
public void init() {
}
@Override
public void consume(List<TraceSegment> data) {
if (CONNECTED.equals(status)) {
final GRPCStreamServiceStatus status = new GRPCStreamServiceStatus(false);
StreamObserver<SegmentObject> upstreamSegmentStreamObserver = serviceStub.withDeadlineAfter(
Config.Collector.GRPC_UPSTREAM_TIMEOUT, TimeUnit.SECONDS
).collect(new StreamObserver<Commands>() {
@Override
public void onNext(Commands commands) {
ServiceManager.INSTANCE.findService(CommandService.class)
.receiveCommand(commands);
}
@Override
public void onError(
Throwable throwable) {
status.finished();
if (LOGGER.isErrorEnable()) {
LOGGER.error(
throwable,
"Send UpstreamSegment to collector fail with a grpc internal exception."
);
}
ServiceManager.INSTANCE
.findService(GRPCChannelManager.class)
.reportError(throwable);
}
@Override
public void onCompleted() {
status.finished();
}
});
try {
for (TraceSegment segment : data) {
SegmentObject upstreamSegment = segment.transform();
// 发送到OAP
upstreamSegmentStreamObserver.onNext(upstreamSegment);
}
} catch (Throwable t) {
LOGGER.error(t, "Transform and send UpstreamSegment to collector fail.");
}
upstreamSegmentStreamObserver.onCompleted();
// 强制等待所有的traceSegment都发送完成
status.wait4Finish();
segmentUplinkedCounter += data.size();
} else {
segmentAbandonedCounter += data.size();
}
printUplinkStatus();
}
private void printUplinkStatus() {
long currentTimeMillis = System.currentTimeMillis();
if (currentTimeMillis - lastLogTime > 30 * 1000) {
lastLogTime = currentTimeMillis;
if (segmentUplinkedCounter > 0) {
LOGGER.debug("{} trace segments have been sent to collector.", segmentUplinkedCounter);
segmentUplinkedCounter = 0;
}
if (segmentAbandonedCounter > 0) {
LOGGER.debug(
"{} trace segments have been abandoned, cause by no available channel.", segmentAbandonedCounter);
segmentAbandonedCounter = 0;
}
}
}
@Override
public void onError(List<TraceSegment> data, Throwable t) {
LOGGER.error(t, "Try to send {} trace segments to collector, with unexpected exception.", data.size());
}
@Override
public void onExit() {
}
/**
* 监听方法。 TracingContext.finish() 方法 会通过监听器的逻辑,调用到这个方法。
* 即,一个Segment 要关闭的时候,会把自己传到这里,这里会将其放入carrier。最后消费
* @param traceSegment
*/
@Override
public void afterFinished(TraceSegment traceSegment) {
if (traceSegment.isIgnore()) {
return;
}
// 将traceSegment放到dataCarrier中
if (!carrier.produce(traceSegment)) {
if (LOGGER.isDebugEnable()) {
LOGGER.debug("One trace segment has been abandoned, cause by buffer is full.");
}
}
}
@Override
public void statusChanged(GRPCChannelStatus status) {
if (CONNECTED.equals(status)) {
Channel channel = ServiceManager.INSTANCE.findService(GRPCChannelManager.class).getChannel();
serviceStub = TraceSegmentReportServiceGrpc.newStub(channel);
}
this.status = status;
}
}