1. Download Flume
Flume download address: mirror.bit.edu.cn/apache/flum…
Reference link:
wget http://mirror.bit.edu.cn/apache/flume/1.8.0/apache-flume-1.8.0-bin.tar.gz
tar zxvf apache-flume-1.8.0-bin.tar.gz
2. Configuration file
# cd /usr/local/src/apache-flume-1.8.0-bin/conf
# Go into the Flume directory, edit flume-env.sh under conf, and set JAVA_HOME in it
cp flume-env.sh.template flume-env.sh
vim flume-env.sh
export JAVA_HOME=/usr/local/src/jdk1.8.0_231
3. Verify the installation
# cd /usr/local/src/apache-flume-1.8.0-bin/
./bin/flume-ng version
4. The flume-ng command
Usage: bin/flume-ng [options]
# commands:
agent run a Flume agent
avro-client run an avro Flume client
# options
# global options:
--conf,-c <conf> use configs in <conf> directory
# agent options:
--name,-n <name> the name of this agent (required)
--conf-file,-f <file> specify a config file (required if -z missing)
# avro-client options:
--rpcProps,-P <file> RPC client properties file with server connection params
--host,-H <host> hostname to which events will be sent
--port,-p <port> port of the avro source
--dirname <dir> directory to stream to avro source
--filename,-F <file> text file to stream to avro source (default: std input)
--headerFile,-R <file> File containing event headers as key/value pairs on each new line
# Commands for submitting a job:
bin/flume-ng agent --conf conf --name agent --conf-file conf/test.properties
bin/flume-ng agent -c conf -n agent -f conf/test.properties -Dflume.root.logger=INFO,console
bin/flume-ng avro-client --conf conf --host hadoop --port 8080
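For example, the avro-client mode can stream a local text file into a running avro source; the host and port follow the command above, and the file path below is only a placeholder:
bin/flume-ng avro-client --conf conf --host hadoop --port 8080 --filename /tmp/test.log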
5. Choosing a configuration for your deployment
Specify the collection-scheme configuration file and start the Flume agent on the corresponding node.
1. Flume is installed on a node inside the Hadoop cluster
- Configure JAVA_HOME:
export JAVA_HOME=/usr/local/src/jdk1.8.0_231
2. Flume is installed on a node inside the Hadoop cluster, and Hadoop has additional configuration
- The HDFS access entry point has changed
- Configure JAVA_HOME: export JAVA_HOME=/usr/local/src/jdk1.8.0_231
- Also copy Hadoop's core-site.xml and hdfs-site.xml into Flume's conf directory
3. Flume is not installed on a node in the Hadoop cluster
- Configure JAVA_HOME
export JAVA_HOME=/usr/local/src/jdk1.8.0_231
- Also copy Hadoop's core-site.xml and hdfs-site.xml into Flume's conf directory
- Also add the required Hadoop jars to Flume's lib directory; they must match the cluster's Hadoop version (see the sketch below)
Not yet tested in practice.
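An untested sketch of scenario 3, assuming Hadoop 2.7.7 is installed under /usr/local/src/hadoop-2.7.7 (the version and paths are assumptions; adjust to your layout):
# copy the HDFS client configuration into Flume's conf directory
cp /usr/local/src/hadoop-2.7.7/etc/hadoop/core-site.xml /usr/local/src/apache-flume-1.8.0-bin/conf/
cp /usr/local/src/hadoop-2.7.7/etc/hadoop/hdfs-site.xml /usr/local/src/apache-flume-1.8.0-bin/conf/
# copy the Hadoop client jars (hadoop-common, hadoop-hdfs and their dependencies) into Flume's lib directory;
# the jar versions must match the cluster
cp /usr/local/src/hadoop-2.7.7/share/hadoop/common/hadoop-common-2.7.7.jar /usr/local/src/apache-flume-1.8.0-bin/lib/
cp /usr/local/src/hadoop-2.7.7/share/hadoop/hdfs/hadoop-hdfs-2.7.7.jar /usr/local/src/apache-flume-1.8.0-bin/lib/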
6. Run the example from the official documentation
1. Create the Flume job configuration file flume-conf.properties
# Create a configuration file under Flume's conf directory, either by copying the template below
# or by creating a new file such as netcat-logger.conf (the run command further down uses flume-conf.properties)
cp flume-conf.properties.template flume-conf.properties
# Name the components of this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source component r1
a1.sources.r1.type = netcat
# Bind to the agent host (or localhost)
a1.sources.r1.bind = master
a1.sources.r1.port = 44444
# Describe/configure the sink component k1
a1.sinks.k1.type = logger
# Describe/configure the channel component; an in-memory channel is used here
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Wire the source and the sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
2. Run Flume
# cd /usr/local/src/apache-flume-1.8.0-bin/
bin/flume-ng agent --name a1 --conf conf --conf-file conf/flume-conf.properties -Dflume.root.logger=INFO,console
# bin/flume-ng agent -c conf -f conf/flume-conf.properties -n a1 -Dflume.root.logger=INFO,console
# -c conf: the directory containing Flume's own configuration files
# -f conf/flume-conf.properties: the collection scheme we defined above
# -n a1: the name of this agent
3. Install telnet
yum -y install telnet
4. Connect to port 44444 and type test input
# telnet agent-hostname port
telnet master 44444
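Type a line such as "hello flume" in the telnet session and the agent console should log the received event. If telnet is unavailable, nc can be used instead (assuming the netcat package is installed):
echo "hello flume" | nc master 44444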
7. Collect a directory into HDFS
Requirement: new files keep appearing in a specific directory on a server; whenever a new file appears, it must be collected into HDFS.
Based on this requirement, first define the three key elements:
- Collection source: a spooldir source that monitors a directory for new files
- Sink target: the HDFS file system, i.e. an hdfs sink
- Channel between source and sink: either a file channel or a memory channel can be used
Configuration file:
# Name the three components
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# Configure the source component
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /home/hadoop/logs/
agent1.sources.source1.fileHeader = false
# Configure the interceptor
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
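# The host interceptor above stamps each event with the agent's host address under the header
# name "hostname"; it could be referenced in the HDFS path as %{hostname} if needed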
# Configure the sink component
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://hdp-node-01:9000/weblog/flume-collection/%y-%m-%d/%H-%M
agent1.sinks.sink1.hdfs.filePrefix = access_log
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize = 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text
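# Roll over to a new HDFS file when any of the following thresholds is reached:
# rollSize is in bytes, rollCount in events, rollInterval in seconds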
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
#agent1.sinks.sink1.hdfs.round = true
#agent1.sinks.sink1.hdfs.roundValue = 10
#agent1.sinks.sink1.hdfs.roundUnit = minute
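# round/roundValue/roundUnit (commented out above) would round the path timestamp down to 10-minute buckets;
# useLocalTimeStamp resolves the %y-%m-%d/%H-%M escapes from the agent's local time instead of a timestamp event header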
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
# Channel parameter explanation:
capacity: the maximum number of events the channel can hold
transactionCapacity: the maximum number of events taken from the source, or delivered to the sink, in one transaction
keep-alive: the time allowed for adding an event to, or removing an event from, the channel
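Assuming the configuration above is saved as conf/spooldir-hdfs.properties (the config file name and the test file below are placeholders), the agent can be started and tested roughly as follows:
bin/flume-ng agent -c conf -f conf/spooldir-hdfs.properties -n agent1 -Dflume.root.logger=INFO,console
# in another shell, drop a file into the spool directory; Flume renames it to *.COMPLETED once collected
cp /tmp/test.log /home/hadoop/logs/
# check the output written to HDFS
hdfs dfs -ls -R /weblog/flume-collection/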
8. Collect a file into HDFS
Requirement: a business system writes its logs with log4j and the log file keeps growing; data appended to the log file must be collected into HDFS in real time.
Based on this requirement, first define the three key elements:
- Collection source: an exec source that monitors appended file content by running 'tail -F file'
- Sink target: the HDFS file system, i.e. an hdfs sink
- Channel between source and sink: either a file channel or a memory channel can be used
Configuration file:
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# Describe/configure tail -F source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /home/hadoop/logs/access_log
agent1.sources.source1.channels = channel1
#configure host for source
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Describe sink1
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://hdp-node-01:9000/weblog/flume-collection/%y-%m-%d/%H-%M
agent1.sinks.sink1.hdfs.filePrefix = access_log
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize = 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
agent1.sinks.sink1.hdfs.round = true
agent1.sinks.sink1.hdfs.roundValue = 10
agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
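Assuming this configuration is saved as conf/tail-hdfs.properties (the file name is an assumption), start the agent and append to the tailed log to verify that data flows into HDFS:
bin/flume-ng agent -c conf -f conf/tail-hdfs.properties -n agent1 -Dflume.root.logger=INFO,console
# in another shell, simulate the application appending to the log
echo "test log line" >> /home/hadoop/logs/access_log
# after rollInterval (60 s) or another roll threshold is reached, check HDFS
hdfs dfs -ls -R /weblog/flume-collection/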