Flume[2] - 数据采集流程及源码解读

878 阅读5分钟

* 本文中出现的 Channel 为 MemoryChannel

执行流程图

执行流程.png

源码解读

Source存数据流程

以SpoolDirectorySource为例,忽略业务细节,做一个流程解读

 @Override
  public synchronized void start() {
    logger.info("SpoolDirectorySource source starting with directory: {}",
        spoolDirectory);
​
    // 创建一个定时的单核心线程池,用于重复执行任务
    executor = Executors.newSingleThreadScheduledExecutor();
​
    // 指定要读取的目录
    File directory = new File(spoolDirectory);
    try {
      // 业务具体细节配置
      reader = new ReliableSpoolingFileEventReader.Builder()
          .spoolDirectory(directory)
          .completedSuffix(completedSuffix)
          .includePattern(includePattern)
          .ignorePattern(ignorePattern)
          .trackerDirPath(trackerDirPath)
          .annotateFileName(fileHeader)
          .fileNameHeader(fileHeaderKey)
          .annotateBaseName(basenameHeader)
          .baseNameHeader(basenameHeaderKey)
          .deserializerType(deserializerType)
          .deserializerContext(deserializerContext)
          .deletePolicy(deletePolicy)
          .inputCharset(inputCharset)
          .decodeErrorPolicy(decodeErrorPolicy)
          .consumeOrder(consumeOrder)
          .recursiveDirectorySearch(recursiveDirectorySearch)
          .trackingPolicy(trackingPolicy)
          .sourceCounter(sourceCounter)
          .build();
    } catch (IOException ioe) {
      throw new FlumeException("Error instantiating spooling event parser",
          ioe);
    }
    // 通过配置生成Runnable对象,其run方法中为Source的具体行为,传入reader用于执行具体行为,传入source计数器用于在metrics返回状态
    Runnable runner = new SpoolDirectoryRunnable(reader, sourceCounter);
    // 按照配置,重复执行定时业务
    executor.scheduleWithFixedDelay(
        runner, 0, pollDelay, TimeUnit.MILLISECONDS);
    // 标记lifecycleState字段为START
    super.start();
    logger.debug("SpoolDirectorySource source started");
    // 初始化Source计数器
    sourceCounter.start();
  }

在AbstractSource中定义了channelProcessor字段,用于在Source中向Channel发送Event。当Source获取到指定格式的数据后,即调用getChannelProcessor().processEvent(event)或getChannelProcessor().processEventBatch(events)将Event发送到Channel(后者为批处理)。以processEvent(event)为例:

  public void processEvent(Event event) {
    // 把event过一遍拦截器链,过滤指定的event或者为event附加属性
    event = interceptorChain.intercept(event);
    if (event == null) {
      return;
    }
​
    // 获取配置文件中获取当前Source必须的Channel
    List<Channel> requiredChannels = selector.getRequiredChannels(event);
    for (Channel reqChannel : requiredChannels) {
      // 获取各个Channel的事务对象
      Transaction tx = reqChannel.getTransaction();
      Preconditions.checkNotNull(tx, "Transaction object must not be null");
      try {
        // 事务开始
        tx.begin();
        // 执行到对于Channel的doPut方法
        reqChannel.put(event);
        // 事务提交
        tx.commit();
      } catch (Throwable t) {
        // 抛出任何异常,事务回滚
        tx.rollback();
        if (t instanceof Error) {
          LOG.error("Error while writing to required channel: " + reqChannel, t);
          throw (Error) t;
        } else if (t instanceof ChannelException) {
          throw (ChannelException) t;
        } else {
          throw new ChannelException("Unable to put event on required " +
              "channel: " + reqChannel, t);
        }
      } finally {
        if (tx != null) {
          // 无论事务执行成功与否,关闭当前事务
          tx.close();
        }
      }
    }
​
    // 获取可选的Channel
    List<Channel> optionalChannels = selector.getOptionalChannels(event);
    for (Channel optChannel : optionalChannels) {
      Transaction tx = null;
      try {
        tx = optChannel.getTransaction();
        // 此处tx对象没有notNull约束
        tx.begin();
​
        optChannel.put(event);
​
        tx.commit();
      } catch (Throwable t) {
        tx.rollback();
        LOG.error("Unable to put event on optional channel: " + optChannel, t);
        if (t instanceof Error) {
          throw (Error) t;
        }
      } finally {
        if (tx != null) {
          tx.close();
        }
      }
    }
  }

Sink取数据流程

以相对简单的LoggerSink为例

// LoggerSink会重复执行此方法
public Status process() throws EventDeliveryException {
    Status result = Status.READY;
    Channel channel = getChannel();
    Transaction transaction = channel.getTransaction();
    Event event = null;
​
    try {
        // 事务开始
        transaction.begin();
        // 执行 channel 中的 take 方法取出 Event 并 log 出来
        event = channel.take();
        if (event != null) {
            if (logger.isInfoEnabled()) {
                logger.info("Event: " + EventHelper.dumpEvent(event, maxBytesToLog));
            }
        } else {
            // No event found, request back-off semantics from the sink runner
            result = Status.BACKOFF;
        }
        // log 出来后,事务提交
        transaction.commit();
    } catch (Exception ex) {
        // 异常则回滚
        transaction.rollback();
        throw new EventDeliveryException("Failed to log event: " + event, ex);
    } finally {
        // 最后关闭事务
        transaction.close();
    }
    return result;
}

Channel及其事务

以 MemoryChannel 为例,先看构造方法

    // transCapacity限制Channel中存储的最大Event数,默认100
    public MemoryTransaction(int transCapacity, ChannelCounter counter) {
      // MemoryChannel 基于 Java 中的阻塞双向队列实现
      // putList 用于暂存 Source 向 MemoryChannel 存放的 event
      putList = new LinkedBlockingDeque<Event>(transCapacity);
      // takeList 用于暂存 Sink 在MemoryChannel 取出的 event
      takeList = new LinkedBlockingDeque<Event>(transCapacity);
      channelCounter = counter;
    }

putList和takeList之所以只是"暂存"Event,是为了实现事务:put进来的Event要到commit时才会写入queue,而take出去的Event在rollback时还可以放回queue

再看看 MemoryChannel 的 configure 方法节选,了解 Channel 事务中必要的类成员变量

// Excerpt from the configure method.
// If the queue already exists, resize it to the configured capacity.
if (queue != null) {
    try {
        resizeQueue(capacity);
    } catch (InterruptedException e) {
        // Restore the interrupt flag instead of swallowing the interruption.
        Thread.currentThread().interrupt();
    }
} else {
    // First-time initialization happens here, under the queue lock.
    synchronized (queueLock) {
        // Bounded blocking deque with room for `capacity` events.
        queue = new LinkedBlockingDeque<Event>(capacity);
        // Queue semaphores: queueRemaining starts with `capacity` permits,
        // queueStored starts with none.
        queueRemaining = new Semaphore(capacity);
        queueStored = new Semaphore(0);
    }
}
// Byte-capacity semaphore: created on first use, otherwise adjusted to the new byteCapacity.
if (bytesRemaining == null) {
    bytesRemaining = new Semaphore(byteCapacity);
    lastByteCapacity = byteCapacity;
} else {
    if (byteCapacity > lastByteCapacity) {
        // Growing: hand out the additional permits.
        bytesRemaining.release(byteCapacity - lastByteCapacity);
        lastByteCapacity = byteCapacity;
    } else {
        try {
            // Shrinking: take the surplus permits back, waiting up to keepAlive seconds;
            // if they cannot be acquired the old capacity is kept.
            if (!bytesRemaining.tryAcquire(lastByteCapacity - byteCapacity, keepAlive,
                                           TimeUnit.SECONDS)) {
                LOGGER.warn("Couldn't acquire permits to downsize the byte capacity, resizing has been aborted");
            } else {
                lastByteCapacity = byteCapacity;
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing the interruption.
            Thread.currentThread().interrupt();
        }
    }
}

这里涉及到的类成员变量有

  • queue:在后面的方法介绍中我们可以得知,在Channel中真正存放Event的是queue,而非putList和takeList
  • queueRemaining:初始化一个大小为capacity的信号量,表示queue的剩余空间
  • queueStored:初始化一个大小为0的信号量,表示queue目前存储的Event数量
  • bytesRemaining:初始化一个大小为byteCapacity的信号量,表示Event字节空间的剩余量(从doPut的计算可以看出,计量时按byteCapacitySlotSize折算为槽位数,而不是直接以字节为单位)

Source将Event放入Channel,执行put方法,在doPut方法中:

/**
 * Stages an event in this transaction's put list; nothing reaches the channel queue here.
 *
 * @param event the event to stage
 * @throws ChannelException if the put list has no room left
 * @throws InterruptedException declared by the signature; nothing in this body blocks
 */
protected void doPut(Event event) throws InterruptedException {
    // Count the attempt before anything can fail.
    channelCounter.incrementEventPutAttemptCount();

    // Event size expressed in slots of byteCapacitySlotSize.
    // NOTE(review): if estimateEventSize returns an integral type, the division truncates
    // before Math.ceil is applied - confirm against its declaration.
    final int slotsNeeded = (int) Math.ceil(estimateEventSize(event) / byteCapacitySlotSize);

    final boolean staged = putList.offer(event);
    if (staged) {
        // Remember how much byte capacity this transaction must acquire at commit.
        putByteCounter += slotsNeeded;
        return;
    }

    throw new ChannelException(
        "Put queue for MemoryTransaction of capacity " +
        putList.size() + " full, consider committing more frequently, " +
        "increasing capacity or increasing thread count");
}

Sink从Channel中取出Event,执行take方法,在doTake方法中:

/**
 * Moves one event from the channel queue into this transaction's take list and returns it.
 *
 * @return the event taken, or {@code null} if no stored-event permit could be acquired within
 *         keepAlive seconds
 * @throws ChannelException if the take list is already full
 * @throws InterruptedException if interrupted while waiting for a permit or for take-list space
 */
protected Event doTake() throws InterruptedException {
    channelCounter.incrementEventTakeAttemptCount();

    // Refuse to take more than the take list can hold.
    final boolean takeListFull = takeList.remainingCapacity() == 0;
    if (takeListFull) {
        throw new ChannelException("Take list for MemoryTransaction, capacity " +
                                   takeList.size() + " full, consider committing more frequently, " +
                                   "increasing capacity, or increasing thread count");
    }

    // Wait up to keepAlive seconds for the queue to hold at least one event.
    final boolean available = queueStored.tryAcquire(keepAlive, TimeUnit.SECONDS);
    if (!available) {
        return null;
    }

    final Event taken;
    synchronized (queueLock) {
        taken = queue.poll();
    }
    // A permit was acquired, so the queue must have produced an event.
    Preconditions.checkNotNull(taken, "Queue.poll returned NULL despite semaphore " +
                               "signalling existence of entry");

    // Keep the event in the take list so a rollback can put it back.
    takeList.put(taken);

    // Track the slots this transaction will release at commit.
    takeByteCounter += (int) Math.ceil(estimateEventSize(taken) / byteCapacitySlotSize);
    return taken;
}

若无异常抛出,接着执行doCommit方法:

/**
 * Commits this transaction: reserves capacity for the staged puts, moves them into the channel
 * queue, clears both staging lists, and then updates the semaphores and counters.
 *
 * @throws ChannelException if byte capacity cannot be acquired within keepAlive seconds
 * @throws ChannelFullException if queue slots cannot be acquired within keepAlive seconds
 * @throws InterruptedException if interrupted while waiting for permits
 */
protected void doCommit() throws InterruptedException {    
    // Events taken minus events put in this transaction; negative means the queue will grow.
    int remainingChange = takeList.size() - putList.size();
    if (remainingChange < 0) {
        // Reserve the byte capacity accumulated in putByteCounter by doPut.
        if (!bytesRemaining.tryAcquire(putByteCounter, keepAlive, TimeUnit.SECONDS)) {
            throw new ChannelException("Cannot commit transaction. Byte capacity " +
                                       "allocated to store event body " + byteCapacity * byteCapacitySlotSize +
                                       "reached. Please increase heap space/byte capacity allocated to " +
                                       "the channel as the sinks may not be keeping up with the sources");
        }
        // Reserve queue slots for the net growth; give the byte permits back if that fails.
        if (!queueRemaining.tryAcquire(-remainingChange, keepAlive, TimeUnit.SECONDS)) {
            bytesRemaining.release(putByteCounter);
            throw new ChannelFullException("Space for commit to queue couldn't be acquired." +
                                           " Sinks are likely not keeping up with sources, or the buffer size is too tight");
        }
    }
    // Snapshot the list sizes before the lists are drained and cleared.
    int puts = putList.size();
    int takes = takeList.size();
    // Under the queue lock, move every staged put into the channel queue.
    synchronized (queueLock) {
        if (puts > 0) {
            while (!putList.isEmpty()) {
                if (!queue.offer(putList.removeFirst())) {
                    throw new RuntimeException("Queue add failed, this shouldn't be able to happen");
                }
            }
        }
        // Both staging lists are emptied; taken events are now permanently removed.
        putList.clear();
        takeList.clear();
    }
    
    // Return the byte capacity accumulated in takeByteCounter by doTake.
    bytesRemaining.release(takeByteCounter);
    takeByteCounter = 0;
    putByteCounter = 0;
    // Publish the newly queued events by adding `puts` permits to queueStored.
    queueStored.release(puts);
    if (remainingChange > 0) {
        // More takes than puts: give the freed queue slots back to queueRemaining.
        queueRemaining.release(remainingChange);
    }
    // Update success counters.
    if (puts > 0) {
        channelCounter.addToEventPutSuccessCount(puts);
    }
    if (takes > 0) {
        channelCounter.addToEventTakeSuccessCount(takes);
    }
    // Record the current queue size.
    channelCounter.setChannelSize(queue.size());
}

若抛出异常,则执行 doRollback 方法

/**
 * Rolls this transaction back: events already taken go back to the head of the channel queue,
 * and events staged for put are discarded.
 *
 * @throws IllegalStateException via {@code Preconditions.checkState} if the queue lacks room for
 *         the events being returned
 */
protected void doRollback() {
    // Number of events handed back; captured before the take list is drained.
    final int restored = takeList.size();
    final String noRoomMessage = "Not enough space in memory channel " +
        "queue to rollback takes. This should never happen, please report";

    synchronized (queueLock) {
        // The queue must be able to hold everything being returned.
        final boolean hasRoom = queue.remainingCapacity() >= takeList.size();
        Preconditions.checkState(hasRoom, noRoomMessage);

        // Re-insert from the most recently taken event backwards, each at the queue head.
        while (!takeList.isEmpty()) {
            final Event returned = takeList.removeLast();
            queue.addFirst(returned);
        }

        // Staged puts never reached the queue; drop them.
        putList.clear();
    }

    takeByteCounter = 0;
    putByteCounter = 0;

    // The returned events are available to take again.
    queueStored.release(restored);
    // Record the current queue size.
    channelCounter.setChannelSize(queue.size());
}

总结

  • put 事务错误时的回滚逻辑是:抛弃将要 put 的数据,由数据来源重新发送
  • take 事务错误时的回滚逻辑:将没有 take 成功的数据放回 Channel ,等待重新 take