概述
Flume :分布式的海量日志采集、聚合和传输的系统
安装 : 修改 conf 下的flume-env.sh的java的配置
监控数据端口数据
通过netcat工具向本机的 44444 端口发送数据
nc localhost 44444    # 客户端:连接本机 44444 端口发送数据(nc 没有 --client 选项)
nc -lk 44444          # 服务端:-l 监听端口,-k 保持监听不退出(nc 没有 --service 选项)
netstat -tnlp | grep 44444    # 查看 44444 端口是否被占用
kill -9 pid
相关应用场景配置
一、netcat到打印控制台
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/netcat-flume-logger.conf --name a1 -Dflume.root.logger=INFO,console
二、netcat到HDFS配置
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://master:9000/flume
a1.sinks.k1.hdfs.filePrefix = events
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 60
a1.sinks.k1.hdfs.fileType = DataStream
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/hdfs-flume.conf --name a1 -Dflume.root.logger=INFO,console
三、netcat到控制台输出(拦截器的使用)
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sinks.k1.type = logger
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type =regex_filter
a1.sources.r1.interceptors.i1.regex =^[0-9]*$
a1.sources.r1.interceptors.i1.excludeEvents =true
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/interceptors-flume.conf --name a1 -Dflume.root.logger=INFO,console
四、HTTP head 的使用
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = org.apache.flume.source.http.HTTPSource
a1.sources.r1.bind = master
a1.sources.r1.port = 9989
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/http-flume.conf --name a1 -Dflume.root.logger=INFO,console
curl -X POST -d '[{"headers" : {"timestamp" : "141212123", "host" : "www.baidu.com"}, "body" : "elaiza"}]' master:9989
五、log-file到hdfs
a2.sources = r2
a2.sinks = k2
a2.channels = c2
a2.sources.r2.type = exec
a2.sources.r2.command = tail -F /usr/local/src/apache-hive-1.2.2-bin/logs/hive.log
a2.sinks.k2.type = hdfs
a2.sinks.k2.hdfs.path = hdfs://master:9000/flume2/%Y%m%d/%H
a2.sinks.k2.hdfs.filePrefix = logs-
a2.sinks.k2.hdfs.round = true
a2.sinks.k2.hdfs.roundValue = 1
a2.sinks.k2.hdfs.roundUnit = hour
a2.sinks.k2.hdfs.useLocalTimeStamp = true
a2.sinks.k2.hdfs.fileType = DataStream
a2.sinks.k2.hdfs.rollInterval = 30
a2.sinks.k2.hdfs.rollSize = 134217700
a2.sinks.k2.hdfs.rollCount = 0
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
bin/flume-ng agent -c conf/ -f job/file-flume-hdfs.conf -n a2
六、spooldir 到 hdfs
a2.sources = r2
a2.sinks = k2
a2.channels = c2
a2.sources.r2.type = spooldir
a2.sources.r2.spoolDir = /usr/local/src/apache-flume-1.7.0-bin/upload
a2.sources.r2.fileSuffix = .COMPLETED
a2.sources.r2.fileHeader = true
a2.sources.r2.ignorePattern = ([^ ]*\.tmp)
a2.sinks.k2.type = hdfs
a2.sinks.k2.hdfs.path = hdfs://master:9000/flume2/%Y%m%d/%H
a2.sinks.k2.hdfs.filePrefix = logs-
a2.sinks.k2.hdfs.round = true
a2.sinks.k2.hdfs.roundValue = 1
a2.sinks.k2.hdfs.roundUnit = hour
a2.sinks.k2.hdfs.useLocalTimeStamp = true
a2.sinks.k2.hdfs.fileType = DataStream
a2.sinks.k2.hdfs.rollInterval = 30
a2.sinks.k2.hdfs.rollSize = 134217700
a2.sinks.k2.hdfs.rollCount = 0
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
bin/flume-ng agent -c conf/ -f job/dir-flume-hdfs.conf -n a2
七、taildir 断点续传
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1 f2
a1.sources.r1.filegroups.f1 = /usr/local/src/apache-flume-1.7.0-bin/files/1.txt
a1.sources.r1.filegroups.f2 = /usr/local/src/apache-flume-1.7.0-bin/files/2.txt
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position.json
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/files-flume-logger.conf -n a1 -Dflume.root.logger=INFO,console
Flume 进阶教程
一、单数据源多出口
单数据源多出口案例(选择器-副本级别)两个channel
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /usr/local/src/data/flume_about/hive.log
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position1.json
a1.sources.r1.selector.type = replicating
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4142
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = hdfs
a2.sinks.k1.hdfs.path = hdfs://master:9000/group1/%Y%m%d/%H
a2.sinks.k1.hdfs.filePrefix = logs-
a2.sinks.k1.hdfs.round = true
a2.sinks.k1.hdfs.roundValue = 1
a2.sinks.k1.hdfs.roundUnit = hour
a2.sinks.k1.hdfs.useLocalTimeStamp = true
a2.sinks.k1.hdfs.fileType = DataStream
a2.sinks.k1.hdfs.rollInterval = 30
a2.sinks.k1.hdfs.rollSize = 134217700
a2.sinks.k1.hdfs.rollCount = 0
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = file_roll
a3.sinks.k1.sink.directory = /usr/local/src/data/group1
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group1/flume1.conf -n a1
bin/flume-ng agent -c conf/ -f job/group1/flume2.conf -n a2
bin/flume-ng agent -c conf/ -f job/group1/flume3.conf -n a3
二、故障转移
a1.sources = r1
a1.channels = c1
a1.sinks = k1 k2
a1.sinkgroups = g1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4142
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = failover
a1.sinkgroups.g1.processor.priority.k1 = 5
a1.sinkgroups.g1.processor.priority.k2 = 10
a1.sinkgroups.g1.processor.maxpenalty = 10000
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group2/flume1.conf -n a1
bin/flume-ng agent -c conf/ -f job/group2/flume2.conf -n a2 -Dflume.root.logger=INFO,console
bin/flume-ng agent -c conf/ -f job/group2/flume3.conf -n a3 -Dflume.root.logger=INFO,console
三、负载均衡
a1.sources = r1
a1.channels = c1
a1.sinks = k1 k2
a1.sinkgroups = g1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4142
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = load_balance
a1.sinkgroups.g1.processor.backoff = true
a1.sinkgroups.g1.processor.selector = random
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group3/flume1.conf -n a1
bin/flume-ng agent -c conf/ -f job/group3/flume2.conf -n a2 -Dflume.root.logger=INFO,console
bin/flume-ng agent -c conf/ -f job/group3/flume3.conf -n a3 -Dflume.root.logger=INFO,console
四、聚合
将source理解成服务端,将sink理解成客户端
master 上的 Flume1 监控文件 hive.log
slave1 上的 Flume2 监控某个端口的数据流
Flume1 和 Flume2 将数据发送给 slave2 的Flume3 , Flume3将最终的数据打印到控制台
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /usr/local/src/apache-flume-1.7.0-bin/files/nodes.txt
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position2.json
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave2
a1.sinks.k1.port = 4141
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = netcat
a2.sources.r1.bind = localhost
a2.sources.r1.port = 44444
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = avro
a2.sinks.k1.hostname = slave2
a2.sinks.k1.port = 4141
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = slave2
a3.sources.r1.port = 4141
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group4/flume_master.conf -n a1
bin/flume-ng agent -c conf/ -f job/group4/flume_slave1.conf -n a2
bin/flume-ng agent -c conf/ -f job/group4/flume_slave2.conf -n a3 -Dflume.root.logger=INFO,console
五、发送不同端口
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /usr/local/src/apache-flume-1.7.0-bin/files/nodes.txt
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position3.json
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave2
a1.sinks.k1.port = 4141
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = netcat
a2.sources.r1.bind = localhost
a2.sources.r1.port = 44444
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = avro
a2.sinks.k1.hostname = slave2
a2.sinks.k1.port = 4142
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1 r2
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = slave2
a3.sources.r1.port = 4141
a3.sources.r2.type = avro
a3.sources.r2.bind = slave2
a3.sources.r2.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sources.r2.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group5/flume_master.conf -n a1
bin/flume-ng agent -c conf/ -f job/group5/flume_slave1.conf -n a2
bin/flume-ng agent -c conf/ -f job/group5/flume_slave2.conf -n a3 -Dflume.root.logger=INFO,console
六、自定义拦截器
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = type
a1.sources.r1.selector.mapping.elaiza = c1
a1.sources.r1.selector.mapping.bigdata = c2
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.elaiza.interceptor.TypeInterceptor$Builder
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave1
a1.sinks.k1.port = 4142
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = slave2
a1.sinks.k2.port = 4142
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = slave1
a2.sources.r1.port = 4142
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = slave2
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/interceptor/flume_master.conf -n a1
bin/flume-ng agent -c conf/ -f job/interceptor/flume_slave1.conf -n a2 -Dflume.root.logger=INFO,console
bin/flume-ng agent -c conf/ -f job/interceptor/flume_slave2.conf -n a3 -Dflume.root.logger=INFO,console
七、file到kafka
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /usr/local/src/data/original_data/flume_exec_test.txt
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.brokerList=master:9092
a1.sinks.k1.topic=test
a1.sinks.k1.serializer.class=kafka.serializer.StringEncoder
a1.channels.c1.type=memory
a1.channels.c1.capacity = 100000
a1.channels.c1.transactionCapacity = 1000
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
---开启python写入文件脚本(/usr/local/src/code/python_code/flume_data_write.py)
---flume监控写入数据的文件(/usr/local/src/data/original_data/flume_exec_test.txt)
---flume sink 到kafka test主题
---开启消费者
开启 flume 配置监控本地文件 : bin/flume-ng agent -c conf/ -f job/file-flume-kafka.conf -n a1
开启 kafka消费者 : kafka-console-consumer.sh --bootstrap-server master:9092 --topic test --from-beginning
八、log 到 hive
### 写入到hive start ###
1.将 $HIVE_HOME/hcatalog/share/hcatalog 下的所有依赖包添加到 $HIVE_HOME/lib/ 下
2.修改 hive-site.xml 的配置信息
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>1</value>
</property>
3.创建hive表
create table hive_flume(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string )
clustered by (order_id) into 5 buckets stored as orc;
4.flume->hive 的flume的conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /usr/local/src/data/original_data/flume_to_hive.txt
# Describe the sink
a1.sinks.k1.type = hive
a1.sinks.k1.hive.metastore = thrift://master:9083
a1.sinks.k1.hive.database = elaiza
a1.sinks.k1.hive.table = hive_flume
a1.sinks.k1.serializer = DELIMITED
a1.sinks.k1.serializer.delimiter = ","
a1.sinks.k1.serializer.serdeSeparator = ','
a1.sinks.k1.serializer.fieldnames = order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
# capacity 设置大一点,默认是 1000(properties 文件不支持行尾注释,注释需单独成行)
a1.channels.c1.capacity = 10000
# transactionCapacity 设置大一点,默认是 100
a1.channels.c1.transactionCapacity = 1000
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# --- 开启命令 ---
bin/flume-ng agent -c conf/ -f job/file-flume-hive.conf -n a1 -Dflume.root.logger=INFO,console