概述
Flume :分布式的海量日志采集、聚合和传输的系统
安装 : 修改 conf 下的flume-env.sh的java的配置
监控数据端口数据
通过netcat工具向本机的 44444 端口发送数据
nc localhost 44444    # 客户端:连接本机 44444 端口发送数据(nc 没有 --client 选项)
nc -lk 44444          # 服务端:-l 监听端口,-k 保持监听不退出(nc 没有 --service 选项)
netstat -tnlp | grep 44444    # 查看 44444 端口是否被占用
kill -9 pid
相关应用场景配置
一、netcat到打印控制台
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/netcat-flume-logger.conf --name a1 -Dflume.root.logger=INFO,console
二、netcat到HDFS配置
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://master:9000/flume
a1.sinks.k1.hdfs.filePrefix = events
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 60
a1.sinks.k1.hdfs.fileType = DataStream
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/hdfs-flume.conf --name a1 -Dflume.root.logger=INFO,console
三、netcat到控制台输出(拦截器的使用)
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sinks.k1.type = logger
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type =regex_filter
a1.sources.r1.interceptors.i1.regex =^[0-9]*$
a1.sources.r1.interceptors.i1.excludeEvents =true
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/interceptors-flume.conf --name a1 -Dflume.root.logger=INFO,console
四、HTTP head 的使用
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = org.apache.flume.source.http.HTTPSource
a1.sources.r1.bind = master
a1.sources.r1.port = 9989
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent --conf conf --conf-file job/http-flume.conf --name a1 -Dflume.root.logger=INFO,console
curl -X POST -d '[{"headers" : {"timestamp" : "141212123", "host" : "www.baidu.com"}, "body" : "elaiza"}]' master:9989
五、log-file到hdfs
a2.sources = r2
a2.sinks = k2
a2.channels = c2
a2.sources.r2.type = exec
a2.sources.r2.command = tail -F /usr/local/src/apache-hive-1.2.2-bin/logs/hive.log
a2.sinks.k2.type = hdfs
a2.sinks.k2.hdfs.path = hdfs://master:9000/flume2/%Y%m%d/%H
a2.sinks.k2.hdfs.filePrefix = logs-
a2.sinks.k2.hdfs.round = true
a2.sinks.k2.hdfs.roundValue = 1
a2.sinks.k2.hdfs.roundUnit = hour
a2.sinks.k2.hdfs.useLocalTimeStamp = true
a2.sinks.k2.hdfs.fileType = DataStream
a2.sinks.k2.hdfs.rollInterval = 30
a2.sinks.k2.hdfs.rollSize = 134217700
a2.sinks.k2.hdfs.rollCount = 0
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
bin/flume-ng agent -c conf/ -f job/file-flume-hdfs.conf -n a2
六、spooldir 到 hdfs
a2.sources = r2
a2.sinks = k2
a2.channels = c2
a2.sources.r2.type = spooldir
a2.sources.r2.spoolDir = /usr/local/src/apache-flume-1.7.0-bin/upload
a2.sources.r2.fileSuffix = .COMPLETED
a2.sources.r2.fileHeader = true
a2.sources.r2.ignorePattern = ([^ ]*\.tmp)
a2.sinks.k2.type = hdfs
a2.sinks.k2.hdfs.path = hdfs://master:9000/flume2/%Y%m%d/%H
a2.sinks.k2.hdfs.filePrefix = logs-
a2.sinks.k2.hdfs.round = true
a2.sinks.k2.hdfs.roundValue = 1
a2.sinks.k2.hdfs.roundUnit = hour
a2.sinks.k2.hdfs.useLocalTimeStamp = true
a2.sinks.k2.hdfs.fileType = DataStream
a2.sinks.k2.hdfs.rollInterval = 30
a2.sinks.k2.hdfs.rollSize = 134217700
a2.sinks.k2.hdfs.rollCount = 0
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
bin/flume-ng agent -c conf/ -f job/dir-flume-hdfs.conf -n a2
七、taildir 断点续传
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1 f2
a1.sources.r1.filegroups.f1 = /usr/local/src/apache-flume-1.7.0-bin/files/1.txt
a1.sources.r1.filegroups.f2 = /usr/local/src/apache-flume-1.7.0-bin/files/2.txt
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position.json
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/files-flume-logger.conf -n a1 -Dflume.root.logger=INFO,console
Flume 进阶教程
一、单数据源多出口
单数据源多出口案例(选择器-副本级别)两个channel
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /usr/local/src/data/flume_about/hive.log
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position1.json
a1.sources.r1.selector.type = replicating
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4142
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = hdfs
a2.sinks.k1.hdfs.path = hdfs://master:9000/group1/%Y%m%d/%H
a2.sinks.k1.hdfs.filePrefix = logs-
a2.sinks.k1.hdfs.round = true
a2.sinks.k1.hdfs.roundValue = 1
a2.sinks.k1.hdfs.roundUnit = hour
a2.sinks.k1.hdfs.useLocalTimeStamp = true
a2.sinks.k1.hdfs.fileType = DataStream
a2.sinks.k1.hdfs.rollInterval = 30
a2.sinks.k1.hdfs.rollSize = 134217700
a2.sinks.k1.hdfs.rollCount = 0
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = file_roll
a3.sinks.k1.sink.directory = /usr/local/src/data/group1
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group1/flume1.conf -n a1
bin/flume-ng agent -c conf/ -f job/group1/flume2.conf -n a2
bin/flume-ng agent -c conf/ -f job/group1/flume3.conf -n a3
二、故障转移
a1.sources = r1
a1.channels = c1
a1.sinks = k1 k2
a1.sinkgroups = g1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4142
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = failover
a1.sinkgroups.g1.processor.priority.k1 = 5
a1.sinkgroups.g1.processor.priority.k2 = 10
a1.sinkgroups.g1.processor.maxpenalty = 10000
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group2/flume1.conf -n a1
bin/flume-ng agent -c conf/ -f job/group2/flume2.conf -n a2 -Dflume.root.logger=INFO,console
bin/flume-ng agent -c conf/ -f job/group2/flume3.conf -n a3 -Dflume.root.logger=INFO,console
三、负载均衡
a1.sources = r1
a1.channels = c1
a1.sinks = k1 k2
a1.sinkgroups = g1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = master
a1.sinks.k1.port = 4141
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = master
a1.sinks.k2.port = 4142
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = load_balance
a1.sinkgroups.g1.processor.backoff = true
a1.sinkgroups.g1.processor.selector = random
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = master
a2.sources.r1.port = 4141
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = master
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group3/flume1.conf -n a1
bin/flume-ng agent -c conf/ -f job/group3/flume2.conf -n a2 -Dflume.root.logger=INFO,console
bin/flume-ng agent -c conf/ -f job/group3/flume3.conf -n a3 -Dflume.root.logger=INFO,console
四、聚合
将source理解成服务端,将sink理解成客户端
master 上的 Flume1 监控文件 hive.log
slave1 上的 Flume2 监控某个端口的数据流
Flume1 和 Flume2 将数据发送给 slave2 的Flume3 , Flume3将最终的数据打印到控制台
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /usr/local/src/apache-flume-1.7.0-bin/files/nodes.txt
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position2.json
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave2
a1.sinks.k1.port = 4141
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = netcat
a2.sources.r1.bind = localhost
a2.sources.r1.port = 44444
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = avro
a2.sinks.k1.hostname = slave2
a2.sinks.k1.port = 4141
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = slave2
a3.sources.r1.port = 4141
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group4/flume_master.conf -n a1
bin/flume-ng agent -c conf/ -f job/group4/flume_slave1.conf -n a2
bin/flume-ng agent -c conf/ -f job/group4/flume_slave2.conf -n a3 -Dflume.root.logger=INFO,console
五、发送不同端口
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /usr/local/src/apache-flume-1.7.0-bin/files/nodes.txt
a1.sources.r1.positionFile = /usr/local/src/apache-flume-1.7.0-bin/position/position3.json
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave2
a1.sinks.k1.port = 4141
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = netcat
a2.sources.r1.bind = localhost
a2.sources.r1.port = 44444
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = avro
a2.sinks.k1.hostname = slave2
a2.sinks.k1.port = 4142
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1 r2
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = slave2
a3.sources.r1.port = 4141
a3.sources.r2.type = avro
a3.sources.r2.bind = slave2
a3.sources.r2.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sources.r2.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/group5/flume_master.conf -n a1
bin/flume-ng agent -c conf/ -f job/group5/flume_slave1.conf -n a2
bin/flume-ng agent -c conf/ -f job/group5/flume_slave2.conf -n a3 -Dflume.root.logger=INFO,console
六、自定义拦截器
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = type
a1.sources.r1.selector.mapping.elaiza = c1
a1.sources.r1.selector.mapping.bigdata = c2
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.elaiza.interceptor.TypeInterceptor$Builder
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave1
a1.sinks.k1.port = 4142
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = slave2
a1.sinks.k2.port = 4142
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
a2.sources = r1
a2.channels = c1
a2.sinks = k1
a2.sources.r1.type = avro
a2.sources.r1.bind = slave1
a2.sources.r1.port = 4142
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100
a2.sinks.k1.type = logger
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = avro
a3.sources.r1.bind = slave2
a3.sources.r1.port = 4142
a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100
a3.sinks.k1.type = logger
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
bin/flume-ng agent -c conf/ -f job/interceptor/flume_master.conf -n a1
bin/flume-ng agent -c conf/ -f job/interceptor/flume_slave1.conf -n a2 -Dflume.root.logger=INFO,console
bin/flume-ng agent -c conf/ -f job/interceptor/flume_slave2.conf -n a3 -Dflume.root.logger=INFO,console
七、file到kafka
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /usr/local/src/data/original_data/flume_exec_test.txt
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.brokerList=master:9092
a1.sinks.k1.topic=test
a1.sinks.k1.serializer.class=kafka.serializer.StringEncoder
a1.channels.c1.type=memory
a1.channels.c1.capacity = 100000
a1.channels.c1.transactionCapacity = 1000
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
---开启python写入文件脚本(/usr/local/src/code/python_code/flume_data_write.py)
---flume监控写入数据的文件(/usr/local/src/data/original_data/flume_exec_test.txt)
---flume sink 到kafka test主题
---开启消费者
开启 flume 配置监控本地文件 : bin/flume-ng agent -c conf/ -f job/file-flume-kafka.conf -n a1
开启 kafka消费者 : kafka-console-consumer.sh --bootstrap-server master:9092 --topic test --from-beginning
八、log 到 hive
### 写入到hive start ###
1.将 $HIVE_HOME/hcatalog/share/hcatalog 下的所有依赖包添加到 $HIVE_HOME/lib/ 下
2.修改 hive-site.xml 的配置信息
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>1</value>
</property>
3.创建hive表
create table hive_flume(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string )
clustered by (order_id) into 5 buckets stored as orc;
4.flume->hive 的flume的conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /usr/local/src/data/original_data/flume_to_hive.txt
# Describe the sink
a1.sinks.k1.type = hive
a1.sinks.k1.hive.metastore = thrift://master:9083
a1.sinks.k1.hive.database = elaiza
a1.sinks.k1.hive.table = hive_flume
a1.sinks.k1.serializer = DELIMITED
a1.sinks.k1.serializer.delimiter = ","
a1.sinks.k1.serializer.serdeSeparator = ','
a1.sinks.k1.serializer.fieldnames = order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
# capacity 设置大一点,默认是 1000(properties 文件不支持行尾注释,注释需单独成行)
a1.channels.c1.capacity = 10000
# transactionCapacity 设置大一点,默认是 100
a1.channels.c1.transactionCapacity = 1000
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# --- 开启命令 ---
bin/flume-ng agent -c conf/ -f job/file-flume-hive.conf -n a1 -Dflume.root.logger=INFO,console