Flume 案例:自定义 Source 和 Sink

209 阅读3分钟

要求:

  • 实现一个自定义的 Source,该 source 产生随机的 UUID
  • source 支持从配置文件定义三个配置
    • prefix:在产生的 UUID 之前加上配置的 prefix 字符串返回,prefix 与 UUID 之间的连接符为三个半角冒号,即 `:::`
    • isUpper:如果设置为 true,则返回的 UUID 字符串转为大写
    • length:设置生成的 UUID 的长度,默认为32(注意:下文示例代码中实际使用的默认值是 36,即带连字符的完整 UUID 字符串的长度)
  • 实现一个自定义的 Sink,该 sink 将 channel 中的 event 以日志的形式记录到控制台
  • sink 支持从配置文件定义两个配置
    • prefix:在数据之前添加的前缀,前缀与数据之间的连接符为三个半角冒号,即 `:::`
    • suffix:在数据之后添加的后缀,后缀与数据之间的连接符为三个半角冒号,即 `:::`

自定义 Source 代码:

package com.arc.flume.source;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;

import java.nio.charset.StandardCharsets;
import java.util.UUID;

/**
 * Custom Flume source that, on every poll, emits one event whose body is
 * {@code <prefix>:::<uuid>} encoded as UTF-8, where {@code <uuid>} is a freshly
 * generated random UUID string (optionally truncated and/or upper-cased).
 *
 * <p>Properties read from the agent configuration in {@link #configure(Context)}:
 * <ul>
 *   <li>{@code prefix}  - text placed before the UUID; default {@code mySource}</li>
 *   <li>{@code isUpper} - when {@code true} the UUID is upper-cased; default {@code false}</li>
 *   <li>{@code length}  - number of leading characters of the UUID string to keep;
 *       default 36, values outside {@code [0, 36]} are clamped into that range</li>
 * </ul>
 */
public class MySource extends AbstractSource implements Configurable, PollableSource {

    /** Separator placed between the prefix and the UUID. */
    private static final String SEPARATOR = ":::";
    /** Prefix used when the configuration does not provide one. */
    private static final String DEFAULT_PREFIX = "mySource";
    /** Length of {@code UUID.toString()}: 32 hex digits plus 4 hyphens. */
    private static final int UUID_STRING_LENGTH = 36;
    /** Pause between two generated events, in milliseconds. */
    private static final long EMIT_INTERVAL_MILLIS = 1000L;

    // Prefix prepended to every generated value
    private String prefix = DEFAULT_PREFIX;
    // Whether the UUID part is emitted in upper case
    private boolean isUpper;
    // Number of leading characters of the UUID string to keep (always within [0, 36])
    private int length = UUID_STRING_LENGTH;

    /**
     * Builds one event carrying {@code prefix + ":::" + uuid} as its UTF-8 body.
     *
     * @return a new event with no headers set by this method
     */
    private Event getSomeData() {
        // length is clamped in configure(), so this substring cannot go out of bounds
        String uuid = UUID.randomUUID().toString().substring(0, length);
        String cased = isUpper ? uuid.toUpperCase() : uuid.toLowerCase();

        Event event = new SimpleEvent();
        event.setBody((prefix + SEPARATOR + cased).getBytes(StandardCharsets.UTF_8));
        return event;
    }

    /**
     * Core method of the source: creates one event, waits one second, then hands the
     * event to the channel processor. Flume calls this method repeatedly once the
     * agent is running.
     *
     * @return {@code READY} when the event was handed over, {@code BACKOFF} when the
     *         attempt failed or the thread was interrupted
     * @throws EventDeliveryException declared by the interface; not thrown by this implementation
     */
    @Override
    public Status process() throws EventDeliveryException {
        try {
            // Produce the payload first, then throttle to roughly one event per second
            Event event = getSomeData();
            Thread.sleep(EMIT_INTERVAL_MILLIS);

            // Hand the event to the channel processor of this source's channel(s)
            getChannelProcessor().processEvent(event);
            return Status.READY;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the calling runner can observe the
            // interruption instead of having it silently consumed here.
            Thread.currentThread().interrupt();
            return Status.BACKOFF;
        } catch (Throwable t) {
            // Errors are never swallowed
            if (t instanceof Error) {
                throw (Error) t;
            }
            // NOTE(review): the failure is not logged because this class has no logger;
            // consider adding an SLF4J logger and reporting t here.
            return Status.BACKOFF;
        }
    }

    /**
     * Step, in milliseconds, by which the runner's sleep grows for every consecutive
     * {@code BACKOFF} result.
     *
     * <p>When no data could be obtained the polling thread should rest before trying
     * again. This method and {@link #getMaxBackOffSleepInterval()} control that rest
     * time. The runner code quoted by the original author is:
     * <pre>
     * if (source.process().equals(PollableSource.Status.BACKOFF)) {
     *   counterGroup.incrementAndGet("runner.backoffs");
     *   Thread.sleep(Math.min(
     *    counterGroup.incrementAndGet("runner.backoffs.consecutive") * source.getBackOffSleepIncrement(),
     *        source.getMaxBackOffSleepInterval()));
     * } else {
     *   counterGroup.set("runner.backoffs.consecutive", 0L);
     * }
     * </pre>
     * So a {@code READY} result resets {@code runner.backoffs.consecutive} to 0, while
     * consecutive {@code BACKOFF} results sleep for 1x, 2x, 3x ... this increment,
     * never longer than {@link #getMaxBackOffSleepInterval()}.
     *
     * @return the back-off increment in milliseconds
     */
    @Override
    public long getBackOffSleepIncrement() {
        return 1000;
    }

    /**
     * Upper bound, in milliseconds, for a single back-off sleep.
     *
     * @return the maximum back-off sleep in milliseconds
     */
    @Override
    public long getMaxBackOffSleepInterval() {
        return 10000;
    }

    /**
     * Reads this source's settings from the Flume configuration, e.g.
     * {@code a2.sources.r1.<property> = value} is read with
     * {@code context.getString(property)}.
     *
     * @param context configuration scoped to this source
     */
    @Override
    public void configure(Context context) {
        prefix = context.getString("prefix", DEFAULT_PREFIX);
        isUpper = context.getBoolean("isUpper", false);

        // Clamp instead of failing later: an out-of-range value would otherwise make
        // substring() throw on every poll and the source would never emit anything.
        int requested = context.getInteger("length", UUID_STRING_LENGTH);
        length = Math.max(0, Math.min(requested, UUID_STRING_LENGTH));
    }

}

自定义 Sink 代码:

package com.arc.flume.sink;

import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.Map;

/**
 * Custom Flume sink that takes events from its channel and writes them to the log
 * at INFO level in the form {@code <prefix>:::<headers>:::<body>:::<suffix>}.
 *
 * <p>Properties read from the agent configuration in {@link #configure(Context)}:
 * <ul>
 *   <li>{@code prefix} - text placed before the data; default {@code MySink-prefix}</li>
 *   <li>{@code suffix} - text placed after the data; default {@code MySink-suffix}</li>
 * </ul>
 */
public class MySink extends AbstractSink implements Configurable {

    private static final Logger LOGGER = LoggerFactory.getLogger(MySink.class);

    /** Separator placed between prefix, headers, body and suffix. */
    private static final String SEPARATOR = ":::";

    // Configured prefix
    private String prefix;
    // Configured suffix
    private String suffix;

    /**
     * Logs one event: its header map and its body decoded as UTF-8, wrapped in the
     * configured prefix and suffix.
     *
     * @param event the event to log; must not be {@code null}
     */
    private void storeSomeData(Event event) {
        Map<String, String> headers = event.getHeaders();
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        String data = prefix + SEPARATOR + headers + SEPARATOR + body + SEPARATOR + suffix;
        LOGGER.info(data);
    }

    /**
     * Core method of the sink, called in a loop by Flume: takes at most one event from
     * the channel inside a transaction and logs it.
     *
     * @return {@code READY} when an event was logged and the transaction committed;
     *         {@code BACKOFF} when the channel had no event or processing failed
     * @throws EventDeliveryException declared by the interface; not thrown by this implementation
     */
    @Override
    public Status process() throws EventDeliveryException {
        // One sink is attached to exactly one channel
        Channel ch = getChannel();
        Transaction txn = ch.getTransaction();
        txn.begin();
        try {
            Event event = ch.take();

            Status status;
            if (event == null) {
                // take() yields null when the channel is empty: nothing to log, so
                // finish the (empty) transaction normally and ask the runner to back off.
                status = Status.BACKOFF;
            } else {
                storeSomeData(event);
                status = Status.READY;
            }

            txn.commit();
            return status;
        } catch (Throwable t) {
            // Processing failed: undo the take so the event stays in the channel
            txn.rollback();
            LOGGER.error("Failed to process event, transaction rolled back", t);

            // Errors are never swallowed
            if (t instanceof Error) {
                throw (Error) t;
            }
            return Status.BACKOFF;
        } finally {
            // The transaction must always be closed, whatever the outcome
            txn.close();
        }
    }

    /**
     * Reads the sink settings and stores them for later use by {@link #process()}.
     *
     * @param context configuration scoped to this sink
     */
    @Override
    public void configure(Context context) {
        this.prefix = context.getString("prefix", "MySink-prefix");
        this.suffix = context.getString("suffix", "MySink-suffix");
    }

    /**
     * Lifecycle hook; the place to open a connection to an external repository
     * (e.g. HDFS) if this sink forwarded events to one. Currently only delegates
     * to the superclass.
     */
    @Override
    public synchronized void start() {
        super.start();
    }

    /**
     * Lifecycle hook; the place to disconnect from an external repository and
     * release resources. Currently only delegates to the superclass.
     */
    @Override
    public synchronized void stop() {
        super.stop();
    }
}

将代码打成 jar 包,上传到 flume 的 lib 目录下。

配置文件:

# Components of agent "a1": one source (r1), one channel (c1), one sink (k1)
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# Source r1: the custom MySource class; prefix / isUpper / length are the
# properties that MySource.configure() reads
a1.sources.r1.type = com.arc.flume.source.MySource
a1.sources.r1.prefix = MS
a1.sources.r1.isUpper = true
a1.sources.r1.length = 8

# Channel c1: in-memory channel (see the Flume User Guide for the exact
# meaning of capacity and transactionCapacity)
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Sink k1: the custom MySink class; prefix / suffix are the properties that
# MySink.configure() reads
a1.sinks.k1.type = com.arc.flume.sink.MySink
a1.sinks.k1.prefix = KM
a1.sinks.k1.suffix = BH

# Wiring: the source writes to c1 and the sink reads from c1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

启动 Flume:

[admin@adp-01 ~]$ flume-ng agent -n a1 -c conf -f a1.conf

2023-04-27 12:45:05,440 INFO sink.MySink: KM:::{}:::MS:::1C67ED04:::BH
2023-04-27 12:45:06,443 INFO sink.MySink: KM:::{}:::MS:::4A857A87:::BH
2023-04-27 12:45:07,445 INFO sink.MySink: KM:::{}:::MS:::2F02208E:::BH
2023-04-27 12:45:08,453 INFO sink.MySink: KM:::{}:::MS:::15348712:::BH