Flume事务
- Put事务流程
- doPut:将批量数据写入临时缓冲区
- doCommit:检查channel内存队列是否足够合并
- doRollback:channel内存队列空间不足,回滚数据
- Take事务
- doTake:将数据拉取到临时缓冲区takeList,并将数据发送到HDFS
- doCommit:如果数据全部发送成功,则清除临时缓冲区takeList
- doRollback:数据发送过程中如果出现异常,rollback将临时缓冲区takeList中的数据归还给channel内存队列
Flume Agent 内部原理
-
channelSelector:选出Event要发往哪个Channel。有两种类型:Replicating(复制,default)和Multiplexing(多路复用)。Replicating会将source来的数据发往所有channel,Multiplexing选择发往哪些Channel。
-
SinkProcessor:DefaultSinkProcessor对应单个Sink,LoadBalancingSinkProcessor和FailoverSinkProcessor对应的是Sink Group,LoadBalancingSinkProcessor可以实现负载均衡,FailOverSinkProcessor可以实现故障转移功能。
-
MemoryChannel/FileChannel/KafkaChannel
Flume 拓扑结构
简单串联
复制和多路复用
# 将数据流发送给所有channel
a1.sources.r1.selector.type = replicating
负载均衡和故障转移
a1.sinkgroups.g1.processor.type = failover
a1.sinkgroups.g1.processor.priority.k1 = 5
a1.sinkgroups.g1.processor.priority.k2 = 10
a1.sinkgroups.g1.processor.maxpenalty = 10000
聚合
Flume 简单实现
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /opt/module/flume/tail_dir.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/flume/files/file.*
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop102:9000/flume/upload/%Y%m%d/%H
# 上传文件前缀
a1.sinks.k1.hdfs.filePrefix = upload-
# 是否按照时间滚动文件
a1.sinks.k1.hdfs.round = true
# 多少时间创建一个新的文件夹
a1.sinks.k1.hdfs.roundValue = 1
# 重新定义时间单位
a1.sinks.k1.hdfs.roundUnit = hour
# 是否使用本地时间戳
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# 积攒多少个Event才Flush到HDFS一次
a1.sinks.k1.hdfs.batchSize = 100
# 设置文件类型,可支持压缩
a1.sinks.k1.hdfs.fileType = DataStream
# 多久生成一个新文件
a1.sinks.k1.hdfs.rollInterval = 60
# 设置每个文件的滚动大小,128M
a1.sinks.k1.hdfs.rollSize = 134217700
# 文件的滚动与Event的数量无关
a1.sinks.k1.hdfs.rollCount = 0
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Exec Source适用于监控一个实时追加的文件,但不能保证数据不丢失;Spooldir Source能够保证数据不丢失,且能够实现断点续传,但延迟高,不能实时监控;Taildir Source既能实现断点续传,又可以保证数据不丢失,还能实时监控。
Taildir Source维护了一个Json格式的Position File,会定期往Position File中更新每个文件读取到的最新位置。
自定义Interceptor
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.List;
public class CustomInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
byte[] body = event.getBody();
if (body[]0)
}
public Event intercept(Event event) {
byte[] body = event.getBody();
if (body[0] < 'z' && body[0] > 'a') {
event.getHeaders().put("type", "letter");
} else if (body[0] > '0' && body[0] < '9') {
event.getHeaders().put("type", "number");
}
return event;
}
@Override
public List<Event> intercept(List<Event> events) {
for (Event event : events) {
intercept(event);
}
return events;
}
@Override
public void close() {
}
public static class Builder implements Interceptor.Builder {
@Override
public Interceptor build() {
return new CustomInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
flume配置文件
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.arkmu.flume.interceptor.CustomInterceptor$Builder
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = type
a1.sources.r1.selector.mapping.letter = c1
a1.sources.r1.selector.mapping.number = c2
自定义Source
import org.apache.flume.Context;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;
import java.util.HashMap;
public class MySource extends AbstractSource implements Configurable, PollableSource {
private Long delay;
private String Field;
@Override
public void configure(Context context) {
delay = context.getLong("delay");
field = context.getString("field", "Hello!"):
}
@Override
public Status process() throws EventDeliveryException {
try {
HashMap<String, String> headerMap = new HashMap<>();
SimpleEvent event = new SimpleEvent();
for (int i = 0; i < 5; i++) {
event.setHeaders(headerMap);
event.setBody((field + i).getBytes());
getChannelProcessor().processEvent(event);
Thread.sleep(delay);
}
} catch (Exception e) {
e.printStackTrace();
return Status.BackOFF;
}
return Status.READY;
}
@Override
public long getBackOffSleepIcrement() {
return 0;
}
@Override
public long getMaxBackOffSleepInterval() {
return 0;
}
}
自定义Sink
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class MySink extends AbstractSink implements Configurable {
private static final Logger LOG = LoggerFactory.getLogger(AbstractSink.class);
private String prefix;
private String suffix;
@Override
public Status process() throws EventDeliveryException {
// 声明返回值状态
Status status;
// 获取当前Sink绑定的channel
Channel ch = getChannel();
// 获取事务
Transaction txn = ch.getTransaction();
// 声明事件
Event event;
// 开启事务
txn.begin();
// 读取channel中的事件,知道读取事件循环结束
while(true) {
event = ch.take();
if (event != null) {
break;
}
}
try {
LOG.info(prefix + new String(event.getBody()) + suffix);
txn.commit();
status = Status.READY;
} catch (Execption e) {
// 遇到异常,回滚事务
txn.rollback();
status = STATUS.BACKOFF;
} finally {
txn.close()
}
return status;
}
@Override
public void configure(Context context) {
// 读取配置文件内容,有默认值
prefix = context.getString("prefix", "hello:");
// 读取配置文件内容,无默认值
suffix = context.getString("suffix");
}
}
Flume监控
sudo yum -y install httpd php
sudo yum -y install rrdtool perl-rrdtool rrdtool-devel
sudo yum -y install apr-devel
// 安装ganglia
sudo rpm -Uvh http://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
sudo yum -y install ganglia-gmetad
sudo yum -y install ganglia-web
sudo yum -y install ganglia-gmond
sudo vim /etc/httpd/conf.d/ganglia.conf
Allow from all
sudo vim /etc/ganglia/gmetad.conf
data_source "hadoop102" 192.168.9.102
sudo vim /etc/ganglia/gmond.conf
name = "hadoop102"
host = 192.168.9.102
bind = 192.168.9.102
sudo vim /etc/selinux/config
SELINUX=disabled # 需重启生效,或(sudo setenforce 0)
// 启动
sudo service httpd start
sudo service gmetad start
sudo service gmond start
http://192.168.9.102/ganglia
修改/opt/module/flume/conf/flume-env.sh
JAVA_OPTS="-Dflume.monitoring.type=ganglia
-Dflume.monitoring.hosts=192.168.9.102:8649
-Xms100m
-Xmx200m"
启动flume
bin/flume-ng agent \
--conf conf/ \
--name a1 \
--conf-file job/flume-netcat-logger.conf \
-Dflume.root.logger=INFO,console \
-Dflume.monitoring.type=ganglia \
-Dflume.monitoring.hosts=192.168.9.102:8649
Flume调优
- 增加Source个数(使用TailDir Source时,增加FileGroups个数)可以增大Source读取数据的能力;batchSize决定Source单次批量运输到Channel的event条数,适当调大batchSize可以提高Source搬运Event到Channel时的性能。
- memoryChannel性能优于FileChannel,但是当Agent进程意外挂掉会丢失数据。使用fileChannel时,配置多个不同盘下的目录可以提高性能。Capacity参数决定Channel可容纳的最大event条数,transactionCapacity参数决定Source往channel里面写的最大event条数和每次Sink从channel里面读取的最大event条数。transactionCapacity需要大于Source和Sink的batchSize参数。
- 增加Sink个数可以增加Sink消费能力,但不是越多越好。增大batchSize可以增加Sink的单次拉取量。