Extracting local XML data into HBase with a custom Flume source and sink


1. Project Requirements

Watch a local directory for XML files and import them into HBase, so that XML data is loaded into HBase incrementally.

Original file

<?xml version="1.0" ?>
<Document>
 <DOI/>
 <Title_china>立法与行政的权限分配</Title_china>
 <Title_english/>
 <Creator>黄宇骁</Creator>
 <Keywords>立法与行政</Keywords>
 <Keywords>法规</Keywords>
 <Keywords>法律保留</Keywords>
 <Keywords>法的一般性</Keywords>
 <Keywords>自主行政行为</Keywords>
 <ForeignCreator/>
 <Organization>北京大学法学院</Organization>
 <PeriodicalChinese>法学家</PeriodicalChinese>
 <PeriodicalEnglish>The Jurist</PeriodicalEnglish>
 <PublishYear>2020--1</PublishYear>
 <Column>民法典编纂研究</Column>
 <OriginalClassCode/>
 <Abstract>所谓立法与行政的权限分配即是理顺法的制定与执行之间的上下游关系,本质是权力等级序列问题,它与权力分立语境下机关之间的监督与制约关系并非一回事.所有的立法与行政权限分配方式都可以总结为分离型与下降型两种模型,其中分离型又可以细分为双重分离型、立法与行政分离型和立法分离型.“分离”意味着要么是限制下游,要么是限制上游.对学说、实务、规范三者进行彻底分析可以得知,我国立法与行政的权限分配应当是一种立法分离型结构.立法权自身分离给立法机关与行政机关,但这种分离是限制下游式,受到制约的是行政立法而不是法律;相反,行政权则是立法权的下降,既不应当存在自主行政行为,也不应当限制立法作具体措施.</Abstract>
 <PageNo>17</PageNo>
 <Page>47-63</Page>
</Document>

2. Frameworks Used

  • hadoop-3.1.3
  • hbase-2.0.5
  • flume-1.9.0
  • kafka-2.4.1
  • zookeeper-3.5.7

3. Implementation

3.1 Custom Flume Source

The goal is to wrap each XML document as JSON, turn it into a Flume event, and deliver it to Kafka.
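For the sample file above, the source produces JSON of roughly the following shape (an illustration derived from the conversion logic, not output captured from a run; remaining fields omitted). Repeated elements such as Keywords are collected into an array, and empty elements become empty strings.

{
  "Document": {
    "DOI": "",
    "Title_china": "立法与行政的权限分配",
    "Creator": "黄宇骁",
    "Keywords": ["立法与行政", "法规", "法律保留", "法的一般性", "自主行政行为"],
    "PeriodicalChinese": "法学家",
    "Page": "47-63"
  }
}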

  • Custom source code (a small local test sketch follows the listing)
package com.lyc;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import java.io.File;
import java.util.HashMap;
import java.util.List;

/**
 * @ProjectName flume-interceptor
 * @ClassName XmlDirPollSource
 * @Description Polls a local directory for XML files, converts each file to JSON and emits it as a Flume event
 * @Author lyc
 * @Date 2022/5/21 6:52
 * @Version 1.0
 **/
public class XmlDirPollSource extends AbstractSource implements Configurable, PollableSource {
    private Long delay; // check interval between files (ms)
    private String xmlSourceDir; // directory that holds the XML files
    private String fileFilterSuffix; // only files with this suffix are processed
    private String completeFlag; // suffix appended to files that have been processed

    /**
     * @Description: Read the source parameters from the Flume configuration
     * @Author: lyc
     * @Date: 2022/5/22 10:59
     * @param context:
     * @return: void
     **/
    @Override
    public void configure(Context context) {
        delay = context.getLong("delay");
        xmlSourceDir = context.getString("xmlSourceDir");
        fileFilterSuffix = context.getString("fileFilterSuffix");
        completeFlag = context.getString("completeFlag");
    }

    /**
     * @Description: Wrap the JSON data into an event and mark files that have been processed
     * @Author: lyc
     * @Date: 2022/5/22 10:59
     * @return: org.apache.flume.PollableSource.Status
     **/
    @Override
    public Status process() throws EventDeliveryException {
        String[] fileList = listFiles(xmlSourceDir);
        Status status = Status.BACKOFF; // default when no new files are found

        for(String fileName : fileList) {
            if (fileName.endsWith(fileFilterSuffix)) {
                SimpleEvent event = new SimpleEvent();
                HashMap<String, String> headerMap = new HashMap<>();
                headerMap.put("filename", fileName);
                try {
                    JSONObject jsonObject = xmltoJson(xmlSourceDir + "/" + fileName);
                    event.setHeaders(headerMap);
                    event.setBody(jsonObject.toString().getBytes());
                    getChannelProcessor().processEvent(event);

                    File file = new File(xmlSourceDir + "/" + fileName);
                    File dest = new File(file.getPath() + completeFlag);
                    // Mark the processed file by renaming it
                    file.renameTo(dest);
                    Thread.sleep(delay);
                    status = Status.READY;

                } catch (Exception e) {
                    e.printStackTrace();
                    status = Status.BACKOFF;
                }

            }
        }
        return status;
    }

    @Override
    public long getBackOffSleepIncrement() {
        return 0;
    }

    @Override
    public long getMaxBackOffSleepInterval() {
        return 0;
    }

    public String[] listFiles(String path) {
        File file = new File(path);
        return file.list();
    }
    /**
     * @Description: Convert an XML file into a JSONObject
     * @Author: lyc
     * @Date: 2022/5/22 10:57
     * @param xml: path to the XML file
     * @return: com.alibaba.fastjson.JSONObject
     **/
    public JSONObject xmltoJson(String xml) throws Exception {
        JSONObject jsonObject = new JSONObject();
        File file = new File(xml);
        SAXReader saxReader = new SAXReader();
        saxReader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        Document document = saxReader.read(file);
        Element root = document.getRootElement();
        iterateNodes(root, jsonObject);
        return jsonObject;
    }

    /**
     * @Description: Recursively convert an XML element tree into JSON
     * @Author: lyc
     * @Date: 2022/5/22 10:58
     * @param node:
     * @param json:
     * @return: void
     **/
    public static void iterateNodes(Element node,JSONObject json){
        // Name of the current element
        String nodeName = node.getName();
        // Check whether an element with this name has already been collected at this level
        if(json.containsKey(nodeName)){
            // The element occurs more than once at this level, so collect the values into a JSONArray
            Object existing = json.get(nodeName);
            JSONArray array = null;
            if(existing instanceof JSONArray){
                array = (JSONArray) existing;
            }else {
                array = new JSONArray();
                array.add(existing);
            }
            // Child elements of the current element
            List<Element> listElement = node.elements();
            if(listElement.isEmpty()){
                // Leaf element: take its trimmed text value
                String nodeValue = node.getTextTrim();
                array.add(nodeValue);
                json.put(nodeName, array);
                return ;
            }
            // The element has children
            JSONObject newJson = new JSONObject();
            // Recurse into every child element
            for(Element e:listElement){
                iterateNodes(e,newJson);
            }
            }
            array.add(newJson);
            json.put(nodeName, array);
            return ;
        }
        // First occurrence of this element name at this level
        // Child elements of the current element
        List<Element> listElement = node.elements();
        if(listElement.isEmpty()){
            // Leaf element: take its trimmed text value
            String nodeValue = node.getTextTrim();
            json.put(nodeName, nodeValue);
            return ;
        }
        // The element has children: build a new JSONObject for their values
        JSONObject object = new JSONObject();
        // Recurse into every direct child element
        for(Element e:listElement){
            iterateNodes(e,object);
        }
        json.put(nodeName, object);
        return ;
    }
}
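Before wiring the source into Flume, the XML-to-JSON conversion can be sanity-checked locally. A minimal sketch, assuming a sample file path; the class and path below are hypothetical and not part of the deployment:

package com.lyc;

public class XmlToJsonCheck {
    public static void main(String[] args) throws Exception {
        XmlDirPollSource source = new XmlDirPollSource();
        // Hypothetical path to one of the sample XML files; adjust as needed
        System.out.println(source.xmltoJson("/home/paper/sample.xml").toJSONString());
    }
}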
  • Flume configuration (a sample start command follows the listing)
# Name the agent components
a1.sources = r1
a1.channels = c1

# Source
# Fully qualified class name of the custom source
a1.sources.r1.type = com.lyc.XmlDirPollSource
a1.sources.r1.delay = 5000
# Directory to watch
a1.sources.r1.xmlSourceDir = /home/paper/
# Only pick up file names ending with this suffix
a1.sources.r1.fileFilterSuffix = xml
# Suffix appended to files after they have been processed
a1.sources.r1.completeFlag = .complete

# Channel
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop104:9092
# Kafka topic
a1.channels.c1.kafka.topic = topic_paper
a1.channels.c1.parseAsFlumeEvent = false
a1.sources.r1.channels = c1
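After building the jar and copying it into Flume's lib directory (see section 3.3), the source agent can be started with the standard flume-ng command; the configuration file name below is a placeholder:

bin/flume-ng agent --name a1 --conf conf --conf-file conf/xml-to-kafka.conf -Dflume.root.logger=INFO,console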

3.2 Custom Flume Sink

Consume the data from Kafka and write it into HBase.

package com.lyc;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * @ProjectName flume-interceptor
 * @ClassName MyHbaseSink
 * @Description Reads JSON events from the Kafka channel and writes each document as a row into HBase
 * @Author lyc
 * @Date 2022/5/21 11:46
 * @Version 1.0
 **/
public class MyHbaseSink extends AbstractSink implements Configurable {

    public static Configuration configuration;
    public static Connection connection;
    public static HBaseAdmin admin;
    public static String zookeeperQuorum;
    // ZooKeeper client port
    public static String port;
    // HBase table name
    public static String tableName;
    // HBase column family
    public static String columnFamily;
    public static Table table;
    // Thread pool shared by the HBase connection
    public static ExecutorService pool = Executors.newScheduledThreadPool(20);
    public static final Logger logger = LoggerFactory.getLogger(MyHbaseSink.class);



    @Override
    public void configure(Context context) {
        zookeeperQuorum = context.getString("zookeeperQuorum");
        port = context.getString("port");
        tableName = context.getString("tableName");
        columnFamily = context.getString("columnFamily");
        try {
            configuration = HBaseConfiguration.create();
            configuration.set("hbase.zookeeper.property.clientPort", port);
            configuration.set("hbase.zookeeper.quorum", zookeeperQuorum);
            connection = ConnectionFactory.createConnection(configuration, pool);
            admin = (HBaseAdmin)connection.getAdmin();
        } catch (IOException e) {
            logger.error("==============failed to connect to HBase==============", e);
        }
    }

    @Override
    public Status process(){
        Status status = null;
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        // Start the channel transaction
        transaction.begin();
        try {
            Event event;
            // channel.take() may return null when the channel is empty, so poll until an event arrives
            while(true) {
                event = channel.take();
                if (event != null) {
                    logger.info("==============event received==============");
                    break;
                }
            }
            processEvent(event);
            // Commit the transaction
            transaction.commit();
            logger.info("==============insert succeeded==============");
            status = Status.READY;
        } catch (Exception e) {
            // Roll back the transaction on any failure
            transaction.rollback();
            logger.error("==============insert failed==============", e);
            status = Status.BACKOFF;
        } finally {
            transaction.close();
        }
        return status;
    }

    public void processEvent(Event event) {
        String body = new String(event.getBody());
        JSONObject jsonObject = (JSONObject) JSON.parse(body);
        logger.info("==============data from Kafka: " + jsonObject.toString() + "==============");
        // One HBase row per XML document, keyed by a random UUID
        String rowkey = UUID.randomUUID().toString().replaceAll("-","");
        Put put = new Put(Bytes.toBytes(rowkey));
        ArrayList<Put> putlist = new ArrayList<>();
        jsonObject = (JSONObject)jsonObject.get("Document");
        // One column per top-level element of the document
        for (Map.Entry<String, Object> entry : jsonObject.entrySet()) {
            put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(entry.getKey()), Bytes.toBytes(entry.getValue().toString()));
        }
        // A single Put carries all columns for this document
        putlist.add(put);
        putBatch(tableName, putlist);
    }
    
    /**
     * @Description: Batch-insert a list of Puts into the HBase table
     * @Author: lyc
     * @Date: 2022/5/22 11:55
     * @param tableName: 
     * @param putList:  
     * @return: void
     **/
    public void putBatch(String tableName, List<Put> putList) {
        if (tableExists(tableName)) {
            try {
                table = connection.getTable(TableName.valueOf(tableName));
                table.put(putList);
                // Occasionally force a flush of the table (roughly one call in twenty)
                if (System.currentTimeMillis() % 20 == 1) {
                    admin.flush(TableName.valueOf(tableName));
                }
            } catch (IOException e) {
                e.printStackTrace();
            }finally {
                closeTable(table);
            }
        }else {
            throw new IllegalArgumentException("table " + tableName + " does not exist");
        }

    }
    
    /**
     * @Description: Check whether the given HBase table exists
     * @Author: lyc
     * @Date: 2022/5/22 11:56
     * @param tableName:  
     * @return: boolean
     **/
    public boolean tableExists(String tableName) {
        TableName table = TableName.valueOf(tableName);
        boolean tableExistsFlag = false;
        try {
            tableExistsFlag = admin.tableExists(table);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return tableExistsFlag;
    }
    
    /**
     * @Description: Close the HBase table
     * @Author: lyc
     * @Date: 2022/5/22 11:57
     * @param table:  
     * @return: void
     **/
    public void closeTable(Table table) {
        try {
            table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
  • Flume configuration (table creation and a sample start command follow the listing)
# Name the components of agent a2
a2.channels = c2
a2.sinks = k2

# Kafka channel configuration
a2.channels.c2.type = org.apache.flume.channel.kafka.KafkaChannel
a2.channels.c2.kafka.bootstrap.servers = hadoop105:9092
a2.channels.c2.kafka.topic = topic_paper
# Required; without it the sink cannot parse the events
a2.channels.c2.parseAsFlumeEvent = false

# HBase sink: fully qualified class name of the custom sink
a2.sinks.k2.type = com.lyc.MyHbaseSink
# ZooKeeper quorum hosts
a2.sinks.k2.zookeeperQuorum = hadoop104,hadoop105,hadoop106
# ZooKeeper client port
a2.sinks.k2.port = 2181
# HBase table name
a2.sinks.k2.tableName = paperdata
# HBase column family
a2.sinks.k2.columnFamily = info
# Bind the sink to the channel
a2.sinks.k2.channel = c2
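Before starting the sink agent, the target table and column family must already exist; a minimal hbase shell statement, using the table name and column family from the configuration above:

create 'paperdata', 'info'

The sink agent is then started the same way as the source agent; the configuration file name is again a placeholder:

bin/flume-ng agent --name a2 --conf conf --conf-file conf/kafka-to-hbase.conf -Dflume.root.logger=INFO,console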

3.3 pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>flume-interceptor</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.0.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>2.0.5</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
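With the assembly plugin bound to the package phase, building produces a fat jar named after the artifactId and version above. A sketch of the packaging and deployment steps, assuming $FLUME_HOME points at the Flume installation on the agent hosts:

mvn clean package
cp target/flume-interceptor-1.0-SNAPSHOT-jar-with-dependencies.jar $FLUME_HOME/lib/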