Kafka Sink
需求:
- 以 A-I(或 a-i)开头的数据,写入 topic t1
- 以 J-R(或 j-r)开头的数据,写入 topic t2
- 其他数据写入 topic t3
数据流:NetCat Source -> Memory Channel -> Kafka Sink
自定义拦截器:
package com.arc.flume.interceptor;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.List;
/**
 * Flume interceptor that routes events to Kafka topics by the first
 * character of the event body:
 * <ul>
 *   <li>A-I / a-i  -> header {@code topic=t1}</li>
 *   <li>J-R / j-r  -> header {@code topic=t2}</li>
 *   <li>anything else (including an empty body) -> header {@code topic=t3}</li>
 * </ul>
 * The KafkaSink honors the {@code topic} header when no static
 * {@code kafka.topic} is configured, so setting the header here selects
 * the destination topic per event.
 */
public class KafkaInterceptor implements Interceptor {

    @Override
    public void initialize() {
        // No state to set up.
    }

    /**
     * Tags a single event with the destination topic.
     *
     * @param event the event to tag; its headers map is mutated in place
     * @return the same event instance (never dropped)
     */
    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        // Guard against empty bodies: charAt(0) would throw
        // StringIndexOutOfBoundsException and abort the whole batch.
        if (body.isEmpty()) {
            event.getHeaders().put("topic", "t3");
            return event;
        }
        char c = body.charAt(0);
        // Char literals instead of raw ASCII codes (65..73 == 'A'..'I', etc.)
        // for readability; the matched ranges are identical to the original.
        if ((c >= 'A' && c <= 'I') || (c >= 'a' && c <= 'i')) {
            event.getHeaders().put("topic", "t1");
        } else if ((c >= 'J' && c <= 'R') || (c >= 'j' && c <= 'r')) {
            event.getHeaders().put("topic", "t2");
        } else {
            event.getHeaders().put("topic", "t3");
        }
        return event;
    }

    /**
     * Tags each event in the batch; events are modified in place.
     *
     * @param list the batch of events
     * @return the same list, with every event tagged
     */
    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    @Override
    public void close() {
        // No resources to release.
    }

    /** Builder referenced from the Flume config as {@code KafkaInterceptor$Builder}. */
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new KafkaInterceptor();
        }

        @Override
        public void configure(Context context) {
            // This interceptor takes no configuration.
        }
    }
}
代码打成 jar 包,上传到 Flume 的 lib 目录下。
配置文件 a1.conf:
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# 设置为自定义拦截器的全限定类名 + $Builder
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.arc.flume.interceptor.KafkaInterceptor$Builder
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = adp-01:9092,adp-02:9092,adp-03:9092
# 如果此处配置了,那么数据直接发送到指定的 topic
# 如果没有配置,但是在 event 的 header 中存在 topic 这个 key,那么数据会发送到 topic 这个 key 的 value 指定的 topic 中
# 否则使用默认的 topic:default-flume-topic
# a1.sinks.k1.kafka.topic =
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.producer.linger.ms = 1
a1.sinks.k1.kafka.producer.compression.type = snappy
# 多少条数据写一次 Kafka
a1.sinks.k1.kafka.flumeBatchSize = 20
# 发送给 Kafka 的数据中是否包含 Event 的 Header
a1.sinks.k1.useFlumeEventFormat = true
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
创建 Topic:
说明:如果 topic 不存在,且 broker 开启了 auto.create.topics.enable=true(默认开启),会使用 Kafka 默认的配置(1 分区 1 副本)自动创建;生产环境建议提前手动创建
$ kafka-topics.sh \
--bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 \
--create \
--partitions 3 \
--replication-factor 3 \
--topic t1
$ kafka-topics.sh \
--bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 \
--create \
--partitions 3 \
--replication-factor 3 \
--topic t2
$ kafka-topics.sh \
--bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 \
--create \
--partitions 3 \
--replication-factor 3 \
--topic t3
开启 3 个消费者用于测试:
$ kafka-console-consumer.sh \
--bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 \
--topic t1 \
--group g1
$ kafka-console-consumer.sh \
--bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 \
--topic t2 \
--group g1
$ kafka-console-consumer.sh \
--bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 \
--topic t3 \
--group g1
启动 Flume Agent:
$ flume-ng agent -n a1 -c conf -f a1.conf
发送数据:
$ nc localhost 44444
Allen
Kate
Zed
消费情况:
- 从 topic t1 中消费到 Allen
- 从 topic t2 中消费到 Kate
- 从 topic t3 中消费到 Zed
如果 useFlumeEventFormat = true,则消费到的数据是:
topict1Allen
topict2Kate
topict3Zed
Kafka Source
需求:读取 Kafka topic t1 中的数据,打印到控制台
数据流:Kafka Source -> Memory Channel -> Logger Sink
配置文件 a2.conf
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a2.sources.r1.kafka.bootstrap.servers = adp-01:9092,adp-02:9092,adp-03:9092
a2.sources.r1.kafka.topics = t1
a2.sources.r1.kafka.consumer.group.id = g1
a2.sources.r1.kafka.consumer.enable.auto.commit = true
a2.sources.r1.kafka.consumer.auto.offset.reset = earliest
a2.sources.r1.kafka.consumer.auto.commit.interval.ms = 1000
# 不能大于 a2.channels.c1.transactionCapacity 的值
# 否则报错:source's batch size is greater than the channels transaction capacity
a2.sources.r1.batchSize = 100
# 如果使用 Flume 写数据到 Kafka Sink 时配置了 useFlumeEventFormat = true
# 那么使用 Kafka Source 消费时也要设置
a2.sources.r1.useFlumeEventFormat = true
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
启动 Flume Agent:
$ flume-ng agent -n a2 -c conf -f a2.conf
# useFlumeEventFormat = true
INFO sink.LoggerSink: Event: { headers:{topic=t1, partition=0, offset=40, timestamp=1684747045840} body: 41 6C 6C 65 6E Allen }
# useFlumeEventFormat = false
INFO sink.LoggerSink: Event: { headers:{topic=t1, partition=2, offset=41, timestamp=1684747118213} body: 02 0A 74 6F 70 69 63 04 74 31 00 0A 41 6C 6C 65 ..topic.t1..Alle }
Kafka Channel
需求1:端口数据写入 Kafka topic
数据流:NetCat Source -> Kafka Channel
配置文件: a3.conf
a3.sources = r1
a3.channels = c1
a3.sources.r1.type = netcat
a3.sources.r1.bind = localhost
a3.sources.r1.port = 44444
a3.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a3.channels.c1.kafka.bootstrap.servers = adp-01:9092,adp-02:9092,adp-03:9092
a3.channels.c1.kafka.topic = t1
a3.channels.c1.parseAsFlumeEvent = true
a3.sources.r1.channels = c1
启动:
$ flume-ng agent -n a3 -c conf -f a3.conf
发送数据:
$ nc localhost 44444
Allen
OK
消费数据:
$ kafka-console-consumer.sh --bootstrap-server adp-01:9092,adp-02:9092,adp-03:9092 --topic t1 --group g1
Allen
需求2:直接从 Kafka Channel 中消费数据
配置文件:a4.conf
a4.channels = c1
a4.sinks = k1
a4.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a4.channels.c1.kafka.bootstrap.servers = adp-01:9092,adp-02:9092,adp-03:9092
a4.channels.c1.kafka.topic = t1
a4.channels.c1.kafka.consumer.group.id = g1
a4.channels.c1.parseAsFlumeEvent = true
a4.channels.c1.kafka.consumer.auto.offset.reset = earliest
a4.sinks.k1.type = logger
a4.sinks.k1.channel = c1
启动:
$ flume-ng agent -n a4 -c conf -f a4.conf
# parseAsFlumeEvent = true
INFO sink.LoggerSink: Event: { headers:{topic=t1} body: 41 6C 6C 65 6E Allen }
# parseAsFlumeEvent = false
INFO sink.LoggerSink: Event: { headers:{} body: 02 0A 74 6F 70 69 63 04 74 31 00 0A 41 6C 6C 65 ..topic.t1..Alle }