前言
整个反压机制不是单单一个算子去实现的,而是上下游协同操作的,因此,解析源码的时候会拆出每个单独的部分,没办法全面去协调解析,很绕,分为以下几步
- 下游解析上游发送的数据消息并占用缓冲区,等待下游消费者处理
- 下游消费者处理完,回收缓冲区,更新信用值(缓冲区)
- 下游计算信用值,并发送给上游
- 上游拿到信用值,并根据信用值去发送数据
一.下游解析上游的数据
涉及的核心类如下
- CreditBasedPartitionRequestClientHandler
- RemoteInputChannel
1.CreditBasedPartitionRequestClientHandler
(1) channelRead()
解析上游发送的消息并分发处理,调decodeMsg(msg)
/**
 * Receives one inbound message and hands it to {@code decodeMsg(msg)}.
 *
 * <p>Anything thrown while decoding is passed to {@code notifyAllChannelsOfErrorAndClose} instead
 * of being propagated to the caller.
 *
 * @param ctx the handler context supplied by the pipeline (not used here)
 * @param msg the received message object
 */
@Override
public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
    try {
        decodeMsg(msg);
    } catch (Throwable failure) {
        notifyAllChannelsOfErrorAndClose(failure);
    }
}
(2) 调用的decodeMsg()
针对三种不同的消息,进行特殊处理
- BufferResponse数据缓冲区消息:获取对应的RemoteInputChannel,然后调decodeBufferOrEvent()去处理msg
- ErrorResponse错误消息:
  - 致命错误:通知所有通道关闭
  - 非致命错误:仅通知相关通道关闭
- BacklogAnnouncement积压消息:同样,获取对应的RemoteInputChannel,然后调其onSenderBacklog()->调notifyBufferAvailable()
/**
 * Dispatches a received message according to its exact class. Three kinds are handled:
 *
 * <ul>
 *   <li>{@code BufferResponse}: forwarded to {@code decodeBufferOrEvent} for the channel
 *       registered under {@code receiverId};
 *   <li>{@code ErrorResponse}: fatal errors are reported to all channels, non-fatal ones only to
 *       the channel registered under {@code receiverId};
 *   <li>{@code BacklogAnnouncement}: the announced backlog is passed to
 *       {@code RemoteInputChannel#onSenderBacklog}.
 * </ul>
 *
 * @param msg the received message
 * @throws IllegalStateException if the message is of none of the three classes
 */
private void decodeMsg(Object msg) {
    final Class<?> messageType = msg.getClass();

    // ---- Buffer --------------------------------------------------------
    if (messageType == NettyMessage.BufferResponse.class) {
        final NettyMessage.BufferResponse response = (NettyMessage.BufferResponse) msg;
        final RemoteInputChannel target = inputChannels.get(response.receiverId);

        if (target != null && !target.isReleased()) {
            // Live channel: decode, and report any failure to that channel only.
            try {
                decodeBufferOrEvent(target, response);
            } catch (Throwable failure) {
                target.onError(failure);
            }
            return;
        }

        // Unknown or released channel: drop the buffer and cancel the request.
        response.releaseBuffer();
        cancelRequestFor(response.receiverId);
        return;
    }

    // ---- Error ---------------------------------------------------------
    if (messageType == NettyMessage.ErrorResponse.class) {
        final NettyMessage.ErrorResponse response = (NettyMessage.ErrorResponse) msg;
        final SocketAddress remoteAddr = ctx.channel().remoteAddress();

        if (response.isFatalError()) {
            notifyAllChannelsOfErrorAndClose(
                    new RemoteTransportException(
                            "Fatal error at remote task manager '"
                                    + remoteAddr
                                    + " [ "
                                    + connectionID.getResourceID().getStringWithMetadata()
                                    + " ] "
                                    + "'.",
                            remoteAddr,
                            response.cause));
            return;
        }

        final RemoteInputChannel target = inputChannels.get(response.receiverId);
        if (target == null) {
            // No channel registered for this receiver: nothing to notify.
            return;
        }
        if (response.cause.getClass() == PartitionNotFoundException.class) {
            // A missing partition has its own callback on the channel.
            target.onFailedPartitionRequest();
            return;
        }
        target.onError(
                new RemoteTransportException(
                        "Error at remote task manager '"
                                + remoteAddr
                                + " [ "
                                + connectionID.getResourceID().getStringWithMetadata()
                                + " ] "
                                + "'.",
                        remoteAddr,
                        response.cause));
        return;
    }

    // ---- Backlog announcement -------------------------------------------
    if (messageType == NettyMessage.BacklogAnnouncement.class) {
        final NettyMessage.BacklogAnnouncement announcement =
                (NettyMessage.BacklogAnnouncement) msg;
        final RemoteInputChannel target = inputChannels.get(announcement.receiverId);

        if (target != null && !target.isReleased()) {
            try {
                target.onSenderBacklog(announcement.backlog);
            } catch (Throwable failure) {
                target.onError(failure);
            }
            return;
        }

        // Unknown or released channel: only the request needs cancelling.
        cancelRequestFor(announcement.receiverId);
        return;
    }

    // ---- Anything else ---------------------------------------------------
    throw new IllegalStateException(
            "Received unknown message from producer: " + msg.getClass());
}
(3) 调用的decodeBufferOrEvent()
针对BufferResponse数据缓冲区消息又分为以下几种情况
- 空消息(可能是barrier、心跳、边界标记):调inputChannel.onEmptyBuffer()
- 有消息(带数据的):若numOfPartialBuffers > 0,则对数据进行分片,调sliceBuffer()零拷贝去处理,并且涉及自定义回收逻辑bufferOrEvent.getBuffer().recycleBuffer();否则直接把整个buffer交给inputChannel.onBuffer()
/**
 * Decodes a received {@code BufferResponse} and hands its content to the given input channel.
 *
 * <p>Three cases are distinguished: a buffer message of size 0, a message carrying a non-null
 * buffer (optionally split into partial buffers), and everything else, which is rejected.
 *
 * @param inputChannel the channel that receives the decoded buffer(s)
 * @param bufferOrEvent the received response message
 * @throws Throwable whatever the channel callbacks throw; failures inside the partial-buffer loop
 *     are logged and rethrown
 */
private void decodeBufferOrEvent(
RemoteInputChannel inputChannel, NettyMessage.BufferResponse bufferOrEvent)
throws Throwable {
// Case 1: a buffer message of size 0 - only the sequence number and backlog are forwarded.
if (bufferOrEvent.isBuffer() && bufferOrEvent.bufferSize == 0) {
inputChannel.onEmptyBuffer(bufferOrEvent.sequenceNumber, bufferOrEvent.backlog);
}
// Case 2: the message carries a buffer.
else if (bufferOrEvent.getBuffer() != null) {
// The buffer consists of several partial buffers: hand them over one slice at a time.
if (bufferOrEvent.numOfPartialBuffers > 0) {
int offset = 0; // start position of the current slice inside the original buffer
int seq = bufferOrEvent.sequenceNumber; // sequence number of the first slice
AtomicInteger waitToBeReleased =
new AtomicInteger(bufferOrEvent.numOfPartialBuffers); // slices not yet recycled; counts down
AtomicInteger processedPartialBuffers = new AtomicInteger(0); // slices handed over so far; counts up
try {
for (int i = 0; i < bufferOrEvent.numOfPartialBuffers; i++) {
int size = bufferOrEvent.getPartialBufferSizes().get(i); // size of the current slice
// 1. Count the slice before handing it over.
processedPartialBuffers.incrementAndGet();
// 2. Build the slice and pass it to the input channel.
inputChannel.onBuffer(
sliceBuffer(
bufferOrEvent,
memorySegment -> { // custom recycler: the whole underlying buffer is recycled once the last slice has been recycled
if (waitToBeReleased.decrementAndGet() == 0) {
bufferOrEvent.getBuffer().recycleBuffer();
}
},
offset,
size), // slice covering [offset, offset + size) of the original buffer
seq++, // each slice gets the next sequence number
i == bufferOrEvent.numOfPartialBuffers - 1
? bufferOrEvent.backlog
: -1, // only the last slice carries the backlog; the others pass -1
-1);
// 3. Advance to the start of the next slice.
offset += size;
}
} catch (Throwable throwable) {
LOG.error("Failed to process partial buffers.", throwable);
// Not every slice was handed over, so the per-slice recyclers cannot reach zero:
// recycle the underlying buffer here instead.
if (processedPartialBuffers.get() != bufferOrEvent.numOfPartialBuffers) {
bufferOrEvent.getBuffer().recycleBuffer();
}
throw throwable;
}
} else {
inputChannel.onBuffer(
bufferOrEvent.getBuffer(),
bufferOrEvent.sequenceNumber,
bufferOrEvent.backlog,
bufferOrEvent.subpartitionId);
}
}
// Case 3: no buffer at all - reject the message.
else {
throw new IllegalStateException(
"The read buffer is null in credit-based input channel.");
}
}
(4) 调用的sliceBuffer()
零拷贝,只引用
/**
 * Creates a {@code NetworkBuffer} over the region {@code [offset, offset + size)} of the buffer
 * carried by {@code bufferOrEvent}, using the supplied recycler.
 *
 * <p>NOTE(review): on the non-direct path the whole backing array returned by
 * {@code ByteBuffer#array()} is wrapped - confirm that this is the intended region.
 *
 * @param bufferOrEvent the response whose buffer is sliced
 * @param recycler recycler installed on the returned buffer
 * @param offset start of the slice within the original buffer
 * @param size length of the slice
 * @return a new buffer with the data type and compression flag of {@code bufferOrEvent}
 */
private static NetworkBuffer sliceBuffer(
        NettyMessage.BufferResponse bufferOrEvent,
        BufferRecycler recycler,
        int offset,
        int size) {
    // View of the requested region of the original buffer.
    final ByteBuffer view = bufferOrEvent.getBuffer().getNioBuffer(offset, size);

    // Wrap the view as a memory segment; direct and heap buffers use different factories.
    final MemorySegment wrapped =
            view.isDirect()
                    ? MemorySegmentFactory.wrapOffHeapMemory(view)
                    : MemorySegmentFactory.wrap(view.array());

    return new NetworkBuffer(
            wrapped, recycler, bufferOrEvent.dataType, bufferOrEvent.isCompressed, size);
}
好了,到这里,我们发现CreditBasedPartitionRequestClientHandler只是对消息进行分类,封装,然后具体发送处理还是调的RemoteInputChannel的一系列方法
2.RemoteInputChannel
(1) onSenderBacklog()
/**
 * Handles a backlog value received from the producer: requests floating buffers from the buffer
 * manager for {@code backlog + initialCredit} and passes the result of that request to
 * {@code notifyBufferAvailable}.
 *
 * @param backlog the backlog value reported by the producer
 * @throws IOException propagated from the buffer request or the notification
 */
public void onSenderBacklog(int backlog) throws IOException {
    final int numRequired = backlog + initialCredit;
    notifyBufferAvailable(bufferManager.requestFloatingBuffers(numRequired));
}
其实就是告诉RemoteInputChannel去尽可能增加信用值,然后通知上游去把积压的数据发送
处理积压的代码notifyBufferAvailable,详情看Flink-反压-3.源码分析-流程
(2) onBuffer()
/**
 * Accepts one buffer from the producer, enqueues it into {@code receivedBuffers} and triggers the
 * follow-up notifications.
 *
 * <p>The buffer is recycled in the {@code finally} block only if it was not enqueued: on a
 * sequence-number mismatch, when the channel is already released, or when an exception is thrown
 * before the buffer was added to a queue.
 *
 * @param buffer the received buffer
 * @param sequenceNumber sequence number of this buffer; must equal {@code expectedSequenceNumber}
 * @param backlog backlog value received with the buffer; negative values skip the backlog handling
 * @param subpartitionId subpartition id stored together with the buffer
 * @throws IOException propagated from the calls made while handling the buffer
 */
public void onBuffer(Buffer buffer, int sequenceNumber, int backlog, int subpartitionId)
throws IOException {
boolean recycleBuffer = true;
try {
// Verify ordering: an unexpected sequence number is reported as an error and the buffer is dropped.
if (expectedSequenceNumber != sequenceNumber) {
onError(new BufferReorderingException(expectedSequenceNumber, sequenceNumber));
return;
}
// Data types that block the upstream get special handling first.
if (buffer.getDataType().isBlockingUpstream()) {
onBlockingUpstream();
// Such a buffer must come with a backlog of exactly 0.
checkArgument(backlog == 0, "Illegal number of backlog: %s, should be 0.", backlog);
}
final boolean wasEmpty;
boolean firstPriorityEvent = false;
// All queue manipulation happens under the receivedBuffers lock.
synchronized (receivedBuffers) {
// Trace the received buffer.
NetworkActionsLogger.traceInput(
"RemoteInputChannel#onBuffer",
buffer,
inputGate.getOwningTaskName(),
channelInfo,
channelStatePersister,
sequenceNumber);
// Similar to notifyBufferAvailable(), make sure that we never add a buffer
// after releaseAllResources() released all buffers from receivedBuffers
// (see above for details).
// Channel already released: return without enqueueing (the finally block recycles the buffer).
if (isReleased.get()) {
return;
}
wasEmpty = receivedBuffers.isEmpty();
// Wrap the buffer together with its sequence number and subpartition id.
SequenceBuffer sequenceBuffer =
new SequenceBuffer(buffer, sequenceNumber, subpartitionId);
DataType dataType = buffer.getDataType();
// Once the buffer is enqueued, recycleBuffer is cleared so the finally block leaves it alone.
// Priority data types go through addPriorityBuffer().
if (dataType.hasPriority()) {
firstPriorityEvent = addPriorityBuffer(sequenceBuffer);
recycleBuffer = false;
} else {// Everything else is appended to receivedBuffers; an announcement is added when required.
receivedBuffers.add(sequenceBuffer);
recycleBuffer = false;
if (dataType.requiresAnnouncement()) {
firstPriorityEvent = addPriorityBuffer(announce(sequenceBuffer));
}
}
// Keep the total queued size up to date.
totalQueueSizeInBytes += buffer.getSize();
// Check whether this buffer carries a barrier.
final OptionalLong barrierId =
channelStatePersister.checkForBarrier(sequenceBuffer.buffer);
if (barrierId.isPresent() && barrierId.getAsLong() > lastBarrierId) {
// checkpoint was not yet started by task thread,
// so remember the numbers of buffers to spill for the time when
// it will be started
lastBarrierId = barrierId.getAsLong();
lastBarrierSequenceNumber = sequenceBuffer.sequenceNumber;
}
// Let the channel state persister store the buffer if it decides to.
channelStatePersister.maybePersist(buffer);
// The next buffer must carry the following sequence number.
++expectedSequenceNumber;
}
// Outside the lock: notify about a priority event if addPriorityBuffer() reported one.
if (firstPriorityEvent) {
notifyPriorityEvent(sequenceNumber);
}
// The queue was empty before this buffer: signal that the channel is non-empty now.
if (wasEmpty) {
notifyChannelNonEmpty();
}
// A non-negative backlog is passed on to the backlog handling.
if (backlog >= 0) {
onSenderBacklog(backlog);
}
} finally {
// Still true only when the buffer was never enqueued: give it back.
if (recycleBuffer) {
buffer.recycleBuffer();
}
}
}
到这,我们看得出来其实onBuffer()方法是将数据占用上缓冲区,只有特殊情况才会调buffer.recycleBuffer()回收缓冲区,那么,消费完缓冲区的数据后再回收缓冲区的一定另有其人
二.消费完缓冲区数据后,回收缓冲区,更新信用值
以StreamTask为例子
1.算子做了啥
(1) StreamTask.processInput()
其实还是调的inputProcessor.processInput()
/**
 * Processes input by delegating to {@code inputProcessor.processInput()}.
 *
 * <p>This is an abridged excerpt: the handling of the returned status is omitted.
 *
 * @param controller the mailbox controller (not used in the part shown here)
 * @throws Exception propagated from the input processor
 */
protected void processInput(MailboxDefaultAction.Controller controller) throws Exception {
DataInputStatus status = inputProcessor.processInput();
// The remaining code is not relevant here and has been omitted from this excerpt.
。。。
}
(2) StreamInputProcessor实现类的processInput()
StreamInputProcessor是一个接口其实现类如下
以StreamOneInputProcessor为例,它实现的processInput()如下
其实也是调的input.emitNext(output),input是StreamTaskInput实现类
/**
 * Emits the next element of {@code input} into {@code output} and post-processes the status.
 *
 * <ul>
 *   <li>{@code END_OF_RECOVERY}: a {@code RecoverableStreamTaskInput} is replaced by the result
 *       of {@code finishRecovery()}, and {@code MORE_AVAILABLE} is returned;
 *   <li>{@code END_OF_DATA}: {@code endOfInputAware.endInput(input.getInputIndex() + 1)} is
 *       called and {@code output} is replaced by a {@code FinishedDataOutput};
 *   <li>any other status is returned unchanged.
 * </ul>
 *
 * @return the status to report to the caller
 * @throws Exception propagated from the input
 */
@Override
public DataInputStatus processInput() throws Exception {
    final DataInputStatus emitted = input.emitNext(output);

    if (emitted == DataInputStatus.END_OF_RECOVERY) {
        if (input instanceof RecoverableStreamTaskInput) {
            input = ((RecoverableStreamTaskInput<IN>) input).finishRecovery();
        }
        return DataInputStatus.MORE_AVAILABLE;
    }

    if (emitted == DataInputStatus.END_OF_DATA) {
        endOfInputAware.endInput(input.getInputIndex() + 1);
        output = new FinishedDataOutput<>();
    }
    return emitted;
}
(3) StreamTaskInput实现类的emitNext()
<1> AbstractStreamTaskNetworkInput.emitNext()
StreamTaskInput是一个接口,其实现类如下图
以AbstractStreamTaskNetworkInput为例子,其实现的emitNext()如下
其实它又调了currentRecordDeserializer.getNextRecord(),currentRecordDeserializer是RecordDeserializer实现类
/**
 * Emits the next element(s) into {@code output}.
 *
 * <p>Each loop iteration first tries to read a record from the current deserializer (if any).
 * When no full record could be emitted, the next {@code BufferOrEvent} is polled from
 * {@code checkpointedInputGate}: a buffer is handed to {@code processBuffer}, an event to
 * {@code processEvent}.
 *
 * @param output the output that receives the emitted elements
 * @return the resulting input status
 * @throws Exception propagated from deserialization, element processing or polling
 */
@Override
public DataInputStatus emitNext(DataOutput<T> output) throws Exception {
while (true) {
// get the stream element from the deserializer
// CASE 1: a deserializer is currently set.
if (currentRecordDeserializer != null) {
RecordDeserializer.DeserializationResult result;
try {
// Ask the deserializer for the next record.
result = currentRecordDeserializer.getNextRecord(deserializationDelegate);
} catch (IOException e) {
throw new IOException(
String.format("Can't get next record for channel %s", lastChannel), e);
}
// The buffer was fully consumed: drop the reference to the deserializer.
if (result.isBufferConsumed()) {
currentRecordDeserializer = null;
}
// A complete record was read.
if (result.isFullRecord()) {
// Hand the deserialized element to processElement().
final boolean breakBatchEmitting =
processElement(deserializationDelegate.getInstance(), output);
// Keep looping only if batch emitting is allowed and processElement() did not ask to stop;
// otherwise return MORE_AVAILABLE.
if (canEmitBatchOfRecords.check() && !breakBatchEmitting) {
continue;
}
return DataInputStatus.MORE_AVAILABLE;
}
}
// CASE 2: no deserializer is set, or no full record was read above - poll the input gate.
Optional<BufferOrEvent> bufferOrEvent = checkpointedInputGate.pollNext(); // next BufferOrEvent from the checkpointed input gate, if any
// SUB-CASE 1: something was polled.
if (bufferOrEvent.isPresent()) {
// return to the mailbox after receiving a checkpoint barrier to avoid processing of
// data after the barrier before checkpoint is performed for unaligned checkpoint
// mode
// (1) A buffer: processBuffer() installs it in the channel's deserializer; the records are
// then read by CASE 1 in the next iteration.
if (bufferOrEvent.get().isBuffer()) {
processBuffer(bufferOrEvent.get());
}
// (2) An event: processEvent() yields the status; looping continues only for MORE_AVAILABLE
// when batch emitting is allowed.
else {
DataInputStatus status = processEvent(bufferOrEvent.get(), output);
if (status == DataInputStatus.MORE_AVAILABLE && canEmitBatchOfRecords.check()) {
continue;
}
return status;
}
}
// SUB-CASE 2: nothing was polled - report end of input or nothing available.
else {
if (checkpointedInputGate.isFinished()) {
checkState(
checkpointedInputGate.getAvailableFuture().isDone(),
"Finished BarrierHandler should be available");
return DataInputStatus.END_OF_INPUT;
}
return DataInputStatus.NOTHING_AVAILABLE;
}
}
}
<2> CASE-1RecordDeserializer实现类的相关方法
RecordDeserializer也是一个接口,其实现类如下
以
SpillingAdaptiveSpanningRecordDeserializer为例
《1》setNextBuffer()
作用:把buffer放入对应的wrapper中,待命,后续由getNextRecord去取数
/**
 * Installs {@code buffer} as the current buffer and feeds its bytes to one of the two wrappers so
 * that a later {@code getNextRecord} call can read from it.
 *
 * @param buffer the buffer to read records from next
 * @throws IOException propagated from the wrappers
 */
@Override
public void setNextBuffer(Buffer buffer) throws IOException {
    currentBuffer = buffer;

    final int start = buffer.getMemorySegmentOffset();
    final MemorySegment memory = buffer.getMemorySegment();
    final int length = buffer.getSize();

    // check if some spanning record deserialization is pending
    final boolean spanningInProgress = spanningWrapper.getNumGatheredBytes() > 0;
    if (!spanningInProgress) {
        // Nothing gathered so far: initialize the non-spanning wrapper over this buffer's range.
        nonSpanningWrapper.initializeFromMemorySegment(memory, start, length + start);
        return;
    }

    // The spanning wrapper already holds gathered bytes: append this buffer's bytes to it.
    spanningWrapper.addNextChunkFromMemorySegment(memory, start, length);
}
《2》getNextRecord()
/**
 * Reads the next record into {@code target} and recycles the current buffer once the result
 * reports it as consumed.
 *
 * @param target the object the record is read into
 * @return the result of {@code readNextRecord(target)}
 * @throws IOException propagated from reading the record
 */
@Override
public DeserializationResult getNextRecord(T target) throws IOException {
    // always check the non-spanning wrapper first.
    // this should be the majority of the cases for small records
    // for large records, this portion of the work is very small in comparison anyways
    final DeserializationResult outcome = readNextRecord(target);

    if (!outcome.isBufferConsumed()) {
        return outcome;
    }

    // The buffer is used up: recycle it and drop the reference.
    currentBuffer.recycleBuffer();
    currentBuffer = null;
    return outcome;
}
《3》readNextRecord()
/**
 * Tries to read one record into {@code target}, checking the non-spanning wrapper before the
 * spanning wrapper.
 *
 * <p>Result values, as described by the original notes on the result enum (declared elsewhere):
 *
 * <ul>
 *   <li>{@code PARTIAL_RECORD}: no full record, buffer consumed;
 *   <li>{@code INTERMEDIATE_RECORD_FROM_BUFFER}: full record, buffer not yet consumed;
 *   <li>{@code LAST_RECORD_FROM_BUFFER}: full record, buffer consumed.
 * </ul>
 *
 * @param target the object the record is read into
 * @return the deserialization result
 * @throws IOException propagated from reading the record
 */
private DeserializationResult readNextRecord(T target) throws IOException {
    // (1) The non-spanning wrapper reports a complete length: read the record from it.
    if (nonSpanningWrapper.hasCompleteLength()) {
        return readNonSpanningRecord(target);
    }

    // (2) The non-spanning wrapper still has bytes left: move them to the spanning wrapper.
    if (nonSpanningWrapper.hasRemaining()) {
        nonSpanningWrapper.transferTo(spanningWrapper.lengthBuffer);
        return PARTIAL_RECORD;
    }

    // (3) Neither wrapper can produce a record yet.
    if (!spanningWrapper.hasFullRecord()) {
        return PARTIAL_RECORD;
    }

    // (4) The spanning wrapper holds a full record: read it, then hand leftover bytes back to
    // the non-spanning wrapper.
    target.read(spanningWrapper.getInputView());
    spanningWrapper.transferLeftOverTo(nonSpanningWrapper);
    if (nonSpanningWrapper.hasRemaining()) {
        return INTERMEDIATE_RECORD_FROM_BUFFER;
    }
    return LAST_RECORD_FROM_BUFFER;
}
<3> CASE-1调用的processElement()
这是将数据传给算子去处理的核心方法,下面的CASE-2的processBuffer()也只是将数据待命,后续还是由这里的processElement去处理
/**
 * Processes one stream element and tells the caller whether to stop batch emitting.
 *
 * <p>Records and latency markers go to {@code output}; watermarks and watermark statuses go to
 * {@code statusWatermarkValve}; record attributes go to {@code recordAttributesCombiner}.
 *
 * @param streamElement the element to process
 * @param output the output used for emitting
 * @return {@code true} only for record attributes, {@code false} for every other supported kind
 * @throws UnsupportedOperationException if the element is of none of the supported kinds
 * @throws Exception propagated from the output, the valve or the combiner
 */
private boolean processElement(StreamElement streamElement, DataOutput<T> output)
        throws Exception {
    // Record: emit it to the output.
    if (streamElement.isRecord()) {
        output.emitRecord(streamElement.asRecord());
        return false;
    }

    // Watermark: forwarded to the valve with the flattened index of the last channel.
    if (streamElement.isWatermark()) {
        statusWatermarkValve.inputWatermark(
                streamElement.asWatermark(), flattenedChannelIndices.get(lastChannel), output);
        return false;
    }

    // Latency marker: emit it to the output.
    if (streamElement.isLatencyMarker()) {
        output.emitLatencyMarker(streamElement.asLatencyMarker());
        return false;
    }

    // Watermark status: forwarded to the valve as well.
    if (streamElement.isWatermarkStatus()) {
        statusWatermarkValve.inputWatermarkStatus(
                streamElement.asWatermarkStatus(),
                flattenedChannelIndices.get(lastChannel),
                output);
        return false;
    }

    // Record attributes: the only kind that asks the caller to stop batch emitting.
    if (streamElement.isRecordAttributes()) {
        recordAttributesCombiner.inputRecordAttributes(
                streamElement.asRecordAttributes(),
                flattenedChannelIndices.get(lastChannel),
                output);
        return true;
    }

    throw new UnsupportedOperationException("Unknown type of StreamElement");
}
<4> CASE-2的processBuffer()
/**
 * Prepares the given buffer for deserialization: remembers its channel, selects that channel's
 * deserializer and sets the buffer on it. No record is read here.
 *
 * @param bufferOrEvent the polled element; its buffer is handed to the deserializer
 * @throws IOException propagated from {@code setNextBuffer}
 */
protected void processBuffer(BufferOrEvent bufferOrEvent) throws IOException {
    // Remember which channel the buffer came from.
    lastChannel = bufferOrEvent.getChannelInfo();
    checkState(lastChannel != null);

    // Select the deserializer registered for that channel; it must still be present.
    currentRecordDeserializer = getActiveSerializer(bufferOrEvent.getChannelInfo());
    checkState(
            currentRecordDeserializer != null,
            "currentRecordDeserializer has already been released");

    // Hand the buffer over so that the deserializer can produce records from it.
    currentRecordDeserializer.setNextBuffer(bufferOrEvent.getBuffer());
}
<5> CASE-2的processEvent()
这里其实就是打标记,告诉外面while循环,是否还可以继续拉取下一个数据
/**
 * Maps a received event to the input status reported to the caller.
 *
 * <ul>
 *   <li>{@code EndOfData}: {@code END_OF_DATA} or {@code STOPPED} according to
 *       {@code checkpointedInputGate.hasReceivedEndOfData()};
 *   <li>{@code EndOfPartitionEvent}: releases the channel's deserializer and returns
 *       {@code END_OF_INPUT} when the gate is finished;
 *   <li>{@code EndOfChannelStateEvent}: {@code END_OF_RECOVERY} when all channels are recovered;
 *   <li>{@code WatermarkEvent}: passed to {@code processWatermarkEvent};
 *   <li>in every remaining situation {@code MORE_AVAILABLE} is returned.
 * </ul>
 *
 * @param bufferOrEvent the polled element carrying the event
 * @param output the output passed on to watermark processing
 * @return the resulting input status
 */
protected DataInputStatus processEvent(BufferOrEvent bufferOrEvent, DataOutput<T> output) {
    // Event received
    final AbstractEvent event = bufferOrEvent.getEvent();
    final Class<? extends AbstractEvent> eventType = event.getClass();

    if (eventType == EndOfData.class) {
        switch (checkpointedInputGate.hasReceivedEndOfData()) {
            case DRAINED:
                return DataInputStatus.END_OF_DATA;
            case STOPPED:
                return DataInputStatus.STOPPED;
            case NOT_END_OF_DATA:
                // skip
                break;
        }
        return DataInputStatus.MORE_AVAILABLE;
    }

    if (eventType == EndOfPartitionEvent.class) {
        // release the record deserializer immediately,
        // which is very valuable in case of bounded stream
        releaseDeserializer(bufferOrEvent.getChannelInfo());
        if (checkpointedInputGate.isFinished()) {
            return DataInputStatus.END_OF_INPUT;
        }
        return DataInputStatus.MORE_AVAILABLE;
    }

    if (eventType == EndOfChannelStateEvent.class) {
        if (checkpointedInputGate.allChannelsRecovered()) {
            return DataInputStatus.END_OF_RECOVERY;
        }
        return DataInputStatus.MORE_AVAILABLE;
    }

    if (eventType == WatermarkEvent.class) {
        try {
            processWatermarkEvent(
                    bufferOrEvent.getChannelInfo(), (WatermarkEvent) event, output);
        } catch (Exception e) {
            ExceptionUtils.rethrow(e);
        }
    }

    // Every other event falls through to here.
    return DataInputStatus.MORE_AVAILABLE;
}
(4) CheckpointedInputGate类的相关方法
<1> pollNext()
/**
 * Polls the next element from the wrapped input gate.
 *
 * <p>An empty poll is delegated to {@code handleEmptyBuffer()}, an event to {@code handleEvent};
 * for a buffer only its size is added to the barrier handler's processed-bytes count before the
 * polled element is returned as is.
 *
 * @return the polled element, the result of event handling, or the result of empty handling
 * @throws IOException propagated from the gate or the handlers
 * @throws InterruptedException propagated from the gate
 */
@Override
public Optional<BufferOrEvent> pollNext() throws IOException, InterruptedException {
    final Optional<BufferOrEvent> polled = inputGate.pollNext();
    if (!polled.isPresent()) {
        return handleEmptyBuffer();
    }

    final BufferOrEvent element = polled.get();
    if (element.isEvent()) {
        return handleEvent(element);
    }

    if (element.isBuffer()) {
        // Per the original note: this byte count is slightly inaccurate because bytes cached in
        // the deserializers are ignored, which is accepted for now.
        barrierHandler.addProcessedBytes(element.getBuffer().getSize());
    }
    return polled;
}
<2> 调用的handleEvent()
/**
 * Routes an event to the matching handler based on its exact class and returns the element
 * wrapped in an {@code Optional}.
 *
 * <p>Event classes not listed below are returned without further handling.
 *
 * @param bufferOrEvent the polled element carrying the event
 * @return {@code Optional.of(bufferOrEvent)}
 * @throws IOException propagated from the handlers
 */
private Optional<BufferOrEvent> handleEvent(BufferOrEvent bufferOrEvent) throws IOException {
    final AbstractEvent event = bufferOrEvent.getEvent();
    final Class<? extends AbstractEvent> eventType = event.getClass();

    if (eventType == CheckpointBarrier.class) {
        // Checkpoint barrier: handed to the barrier handler.
        final CheckpointBarrier barrier = (CheckpointBarrier) event;
        barrierHandler.processBarrier(barrier, bufferOrEvent.getChannelInfo(), false);
    } else if (eventType == CancelCheckpointMarker.class) {
        // Cancellation marker for a checkpoint.
        barrierHandler.processCancellationBarrier(
                (CancelCheckpointMarker) event, bufferOrEvent.getChannelInfo());
    } else if (eventType == EndOfData.class) {
        inputGate.acknowledgeAllRecordsProcessed(bufferOrEvent.getChannelInfo());
    } else if (eventType == EndOfPartitionEvent.class) {
        barrierHandler.processEndOfPartition(bufferOrEvent.getChannelInfo());
    } else if (eventType == EventAnnouncement.class) {
        // Announcement: only announcements of checkpoint barriers are accepted.
        final EventAnnouncement eventAnnouncement = (EventAnnouncement) event;
        final AbstractEvent announcedEvent = eventAnnouncement.getAnnouncedEvent();
        checkState(
                announcedEvent instanceof CheckpointBarrier,
                "Only CheckpointBarrier announcement are currently supported, but found [%s]",
                announcedEvent);
        barrierHandler.processBarrierAnnouncement(
                (CheckpointBarrier) announcedEvent,
                eventAnnouncement.getSequenceNumber(),
                bufferOrEvent.getChannelInfo());
    } else if (eventType == EndOfChannelStateEvent.class) {
        upstreamRecoveryTracker.handleEndOfRecovery(bufferOrEvent.getChannelInfo());
    }

    return Optional.of(bufferOrEvent);
}
<3> barrier处理的核心 -- processBarrier()
该方法在类CheckpointBarrierHandler内,是其子类实现的,如图
以CheckpointBarrierTracker为例,其触发cp的逻辑如下
- 快速通道:只有一个打开的通道(numOpenChannels == 1)且barrier最新,那么调notifyCheckpoint()->triggerCheckpointOnBarrier()直接触发CheckPoint
- 多通道情况:进一步处理,遍历待处理 Checkpoint 队列,只为查找当前屏障对应的第一个 CheckpointBarrierCount
  - 没找到和当前barrier匹配的cp,说明当前barrier是最新的且第一个到达的(前提是barrierId > latestPendingCheckpointID,否则直接忽略),则加入到pendingCheckpoints缓存中,方便后续其他通道的Barrier到达去匹配
  - 找到了当前barrier匹配的cp,那么,会进行判断是否全部对齐了,并在全部对齐时移除该cp及其之前的旧cp
    - 若全部对齐,则直接调triggerCheckpointOnAligned()
    - 非全部对齐,则不处理
对齐的原理: 来一个barrier,则alignedChannels.add(),然后返回alignedChannels.size,当alignedChannels.size == targetChannelCount,则说明全部对齐了
案例:最新 ID 的第一个 barrier到来→缓存到队列pendingCheckpoints中;后续相同ID 的barrier→从队列找CheckpointBarrierCount判断是否所有通道的barrier对齐→清理过期状态、全部对齐才触发 Checkpoint;旧 ID 的 barrier直接过滤;
/**
 * Tracks a received checkpoint barrier and triggers the checkpoint once the barrier has been seen
 * on all expected channels.
 *
 * <p>With a single open channel and a barrier id newer than {@code latestPendingCheckpointID} the
 * checkpoint is notified immediately. Otherwise the barrier is counted in
 * {@code pendingCheckpoints}.
 *
 * @param receivedBarrier the barrier that was received
 * @param channelInfo the channel the barrier arrived on
 * @param isRpcTriggered not used by this implementation
 * @throws IOException propagated from the checkpoint notification
 */
@Override
public void processBarrier(
CheckpointBarrier receivedBarrier, InputChannelInfo channelInfo, boolean isRpcTriggered)
throws IOException {
final long barrierId = receivedBarrier.getId();
// Fast path: the barrier id is newer than the latest pending one and there is exactly one open
// channel, so the checkpoint is notified directly without counting.
if (barrierId > latestPendingCheckpointID && numOpenChannels == 1) {
markAlignmentStartAndEnd(barrierId, receivedBarrier.getTimestamp());
notifyCheckpoint(receivedBarrier);
return;
}
// general path for multiple input channels
if (LOG.isDebugEnabled()) {
LOG.debug("Received barrier for checkpoint {} from channel {}", barrierId, channelInfo);
}
// find the checkpoint barrier in the queue of pending barriers
// General path: look the barrier up in the pending queue.
CheckpointBarrierCount barrierCount = null; // stays null when no pending entry has this barrier id
// An entry is added to pendingCheckpoints by the first barrier of a new checkpoint id; later
// barriers with the same id find that entry and are counted against it.
int pos = 0;
// Walk the pending queue to find the first entry with this barrier id.
for (CheckpointBarrierCount next : pendingCheckpoints) {
if (next.checkpointId() == barrierId) {
barrierCount = next;
break;
}
pos++; // position of the entry inside the queue
}
// (1) A pending entry for this checkpoint exists.
if (barrierCount != null) {
// add one to the count to that barrier and check for completion
// Mark this channel as aligned; the return value is compared with the target channel count.
int numChannelsNew = barrierCount.markChannelAligned(channelInfo);
if (numChannelsNew == barrierCount.getTargetChannelCount()) {
// checkpoint can be triggered (or is aborted and all barriers have been seen)
// first, remove this checkpoint and all all prior pending
// checkpoints (which are now subsumed)
// Remove this entry and every entry in front of it from the queue.
for (int i = 0; i <= pos; i++) {
pendingCheckpoints.pollFirst();
}
// notify the listener
// Trigger the checkpoint unless the entry was aborted.
if (!barrierCount.isAborted()) {
triggerCheckpointOnAligned(barrierCount);
}
}
}
// (2) No pending entry for this checkpoint exists.
else {
// Only a barrier id newer than latestPendingCheckpointID starts a new entry; older ids are ignored.
if (barrierId > latestPendingCheckpointID) {
// Mark the start of the alignment for this checkpoint.
markAlignmentStart(barrierId, receivedBarrier.getTimestamp());
// Remember this id as the latest pending one.
latestPendingCheckpointID = barrierId;
// Append a new entry so that later barriers with the same id can be counted against it.
pendingCheckpoints.addLast(
new CheckpointBarrierCount(receivedBarrier, channelInfo, numOpenChannels));
// make sure we do not track too many checkpoints
// Cap the queue: drop the oldest entry once the limit is exceeded.
if (pendingCheckpoints.size() > MAX_CHECKPOINTS_TO_TRACK) {
pendingCheckpoints.pollFirst();
}
}
}
}
到这,发现RecordDeserializer中调用了buffer的recycleBuffer()去回收缓冲区,那么下面看buffer是怎么做的
2.Buffer做了啥
Buffer是一个接口,其实现类如下图
以NetworkBuffer为例,其实现的recycleBuffer()如下
(1) NetworkBuffer
/**
 * Abridged excerpt of {@code NetworkBuffer}: only the members relevant to recycling are shown;
 * the {@code 。。。} lines stand for omitted code.
 */
public class NetworkBuffer extends AbstractReferenceCountedByteBuf implements Buffer {
// Recycler that receives the memory segment in deallocate().
private BufferRecycler recycler;
。。。
// Recycling this buffer is a call to release() (presumably the inherited reference-counting method of the superclass - confirm).
@Override
public void recycleBuffer() {
release();
}
。。。
// Presumably the hook that the superclass calls from handleRelease() - confirm against AbstractReferenceCountedByteBuf.
@Override
protected void deallocate() {
// Hand the memory segment back to the recycler.
recycler.recycle(memorySegment);
}
}
而AbstractReferenceCountedByteBuf是一个抽象类,其release()又调handleRelease()再调实现类的deallocate()
/**
 * Releases one reference through {@code updater} and passes the outcome to
 * {@code handleRelease}.
 *
 * @return the value returned by {@code handleRelease}
 */
public boolean release() {
    final boolean released = updater.release(this);
    return this.handleRelease(released);
}
/**
 * Releases {@code decrement} references through {@code updater} and passes the outcome to
 * {@code handleRelease}.
 *
 * @param decrement the amount handed to {@code updater.release}
 * @return the value returned by {@code handleRelease}
 */
public boolean release(int decrement) {
    final boolean released = updater.release(this, decrement);
    return this.handleRelease(released);
}
/**
 * Calls {@code deallocate()} when {@code result} is {@code true}.
 *
 * @param result the outcome reported by the updater
 * @return {@code result}, unchanged
 */
private boolean handleRelease(boolean result) {
    if (!result) {
        return false;
    }
    this.deallocate();
    return true;
}
(2) 调用的BufferRecycler.recycle() -- 重要
BufferRecycler是一个接口,其实现类如下
以BufferManager为例
<1> BufferManager.recycle() -- 信用值+1的核心
流程如下
- 若输入通道已释放:将内存段直接返回给全局缓冲区池(globalPool),无需更新信用值
- 若输入通道正常未释放:**将内存段封装成NetworkBuffer加入到专属缓冲区队列中,然后判断是否需要poll浮动缓冲区,判断的依据是当前可用的缓冲区数量(专属+浮动)是否 > 所需缓冲区数量numRequiredBuffers**
  - 若poll过浮动缓冲区:调用LocalBufferPool的recycleBuffer()回收该浮动缓冲区
  - 若没有poll浮动缓冲区:则调inputChannel.notifyBufferAvailable(1)通知上游,信用值+1
/**
 * Takes back a memory segment.
 *
 * <p>If the input channel is released, the segment goes straight back to {@code globalPool}.
 * Otherwise it is wrapped in a new {@code NetworkBuffer} and added to the exclusive buffers of
 * {@code bufferQueue}. Depending on whether that call handed back a floating buffer, either that
 * floating buffer is recycled or the input channel is notified of one available buffer.
 *
 * @param segment the memory segment being returned
 */
@Override
public void recycle(MemorySegment segment) {
@Nullable Buffer releasedFloatingBuffer = null;
synchronized (bufferQueue) {
try {
// Similar to notifyBufferAvailable(), make sure that we never add a buffer
// after channel released all buffers via releaseAllResources().
// Case 1: the input channel is already released.
if (inputChannel.isReleased()) {
// Give the segment back to the global pool and stop; no notification is sent.
globalPool.recycleUnpooledMemorySegments(Collections.singletonList(segment));
return;
}
// Case 2: the input channel is still in use.
else {
// Wrap the segment in a new NetworkBuffer that uses this object as its recycler and add it
// to the exclusive buffers. addExclusiveBuffer() may hand back a floating buffer, which is
// kept in releasedFloatingBuffer.
releasedFloatingBuffer =
bufferQueue.addExclusiveBuffer(
new NetworkBuffer(segment, this), numRequiredBuffers);
}
} catch (Throwable t) {
ExceptionUtils.rethrow(t);
} finally {
bufferQueue.notifyAll();
}
}
// A floating buffer was handed back: recycle it.
if (releasedFloatingBuffer != null) {
// NOTE(review): per the original note this ends up in LocalBufferPool's recycling logic - confirm.
releasedFloatingBuffer.recycleBuffer();
}
// No floating buffer was handed back: notify the input channel that one buffer became available
// (presumably what announces one more credit to the producer - notifyBufferAvailable is not shown here).
else {
try {
inputChannel.notifyBufferAvailable(1);
} catch (Throwable t) {
ExceptionUtils.rethrow(t);
}
}
}
<2> BufferManager#AvailableBufferQueue.addExclusiveBuffer()
这个AvailableBufferQueue是BufferManager的内部类,他的主要属性如下
static final class AvailableBufferQueue {
// 1. Queue of floating buffers (per the original note, taken from the fixed buffer pool).
final ArrayDeque<Buffer> floatingBuffers;
// 2. Queue of exclusive buffers (per the original note, taken from the global buffer pool).
final ArrayDeque<Buffer> exclusiveBuffers;
addExclusiveBuffer()流程如下
- 将新缓冲区加入到专属缓冲区队列
- 如果当前可用的缓冲区数量大于所需缓冲区数量,则释放一个浮动缓冲区
/**
 * Adds {@code buffer} to the exclusive buffers and, if more buffers are then available than
 * required, polls one floating buffer.
 *
 * <p>Only floating buffers are ever polled here; exclusive buffers stay in their queue.
 *
 * @param buffer the exclusive buffer to add
 * @param numRequiredBuffers the number of buffers currently required
 * @return the polled floating buffer, or {@code null} if nothing was polled (the poll itself
 *     yields {@code null} when the floating queue is empty)
 */
@Nullable
Buffer addExclusiveBuffer(Buffer buffer, int numRequiredBuffers) {
    exclusiveBuffers.add(buffer);

    final boolean moreThanRequired = getAvailableBufferSize() > numRequiredBuffers;
    return moreThanRequired ? floatingBuffers.poll() : null;
}
<3> BufferManager#AvailableBufferQueue.getAvailableBufferSize()
当前可用的缓冲区数量 = 浮动缓冲区数量 + 专属缓冲区数量
/**
 * Returns the number of currently available buffers: floating plus exclusive.
 *
 * @return the sum of the sizes of both queues
 */
int getAvailableBufferSize() {
    final int floating = floatingBuffers.size();
    final int exclusive = exclusiveBuffers.size();
    return floating + exclusive;
}