hudi on flink

599 阅读6分钟

Hudi集成Flink

DataStreamAPI

添加hudi-flinkX-bundle依赖

<!-- Flink 1.14 -->
<dependency>
    <groupId>org.apache.hudi</groupId>
    <artifactId>hudi-flink1.14-bundle</artifactId>
    <version>0.13.1</version>
</dependency>

可以根据需要修改版本,我这里flink用的是1.14.5

Insert

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.util.HoodiePipeline;
import org.apache.flink.table.data.RowData;
import java.util.HashMap;
import java.util.Map;

public class InsertDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.enableCheckpointing(10000);
        String targetTable = "t1";
        String basePath = "F:\study\hudi\code\hudi_flink\hudi_flink\output\t1";
        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(),basePath);
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.PRECOMBINE_FIELD.key(), "ts");
        DataStreamSource<String> source = env.socketTextStream("hadoop001", 2222);
        SingleOutputStreamOperator<RowData> dataStream = source.map(new MapFunction<String, RowData>() {
            @Override
            public RowData map(String s) throws Exception {
                String[] split = s.split(",");
                GenericRowData genericRowData = new GenericRowData(5);
                genericRowData.setField(0, StringData.fromString(split[0]));
                genericRowData.setField(1, StringData.fromString(split[1]));
                genericRowData.setField(2, Integer.valueOf(split[2]));
                genericRowData.setField(3, TimestampData.fromEpochMillis(Long.valueOf(split[3])));
                genericRowData.setField(4, StringData.fromString(split[4]));
                return genericRowData;
            }
        });
        HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
                .column("uuid VARCHAR(20)")
                .column("name VARCHAR(10)")
                .column("age INT")
                .column("ts TIMESTAMP(3)")
                .column("`partition` VARCHAR(20)")
                .pk("uuid")
                .partition("partition")
                .options(options);
        dataStream.print();
        builder.sink(dataStream, false);
        env.execute("sink");
    }
}

在hadoop001节点执行 nc -l 2222
测试数据

001,hehe,15,1000,20230101
002,hehe2,12,1000,20230102
003,hehe3,13,1000,20230103
004,hehe4,14,1000,20230101
005,hehe5,15,1000,20230101
005,hehe5,15,1000,20230102

运行后的数据目录

image.png 这里是根据最后一列作为分区的,所以有20230101、20230102、20230103三个分区目录

Query

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.util.HoodiePipeline;
import java.util.HashMap;

public class QueryDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.enableCheckpointing(10000);
        String targetTable = "t1";
        String basePath = "F:\study\hudi\code\hudi_flink\hudi_flink\output\t1";
        HashMap<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), basePath);
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.READ_AS_STREAMING.key(), "true");
        // 在这里可以指定commit时间增量查询
//        options.put(FlinkOptions.READ_START_COMMIT.key(), "'20210316134557'");
        HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
                .column("uuid varchar(20)")
                .column("name varchar(10)")
                .column("age INT")
                .column("ts TIMESTAMP(3)")
                .column("`partition` varchar(20)")
                .pk("uuid")
                .partition("partition")
                .options(options);
        DataStream<RowData> rowDataDataStream = builder.source(env);
        rowDataDataStream.print();
        env.execute("");
    }
}

在这里可以看到 005这个数据重复了,由于uuid是表的pk,所以会进行去重操作,可以看到结果中的第三行是-D 也就是delete操作。

+I(001,hehe,15,1970-01-01T00:00:01,20230101)
+I(004,hehe4,14,1970-01-01T00:00:01,20230101)
-D(005,null,null,null,null)
+I(003,hehe3,13,1970-01-01T00:00:01,20230103)
+I(002,hehe2,12,1970-01-01T00:00:01,20230102)
+I(005,hehe5,15,1970-01-01T00:00:01,20230102)

当把表的文件路径改为hdfs路径时,运行insert代码时会报错。

Exception in thread "main" org.apache.flink.util.FlinkException: Failed to execute job 'sink'.
	at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.executeAsync(StreamExecutionEnvironment.java:2055)
	at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1930)
	at org.apache.flink.streaming.api.environment.LocalStreamEnvironment.execute(LocalStreamEnvironment.java:69)
	at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1916)
	at com.liker.demo.InsertDemo.main(InsertDemo.java:56)
Caused by: java.lang.RuntimeException: Error while waiting for job to be initialized
	at org.apache.flink.client.ClientUtils.waitUntilJobInitializationFinished(ClientUtils.java:160)
	at org.apache.flink.client.program.PerJobMiniClusterFactory.lambda$submitJob$2(PerJobMiniClusterFactory.java:83)
	at org.apache.flink.util.function.FunctionUtils.lambda$uncheckedFunction$2(FunctionUtils.java:73)
	at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
	at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
	at java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:457)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067)
	at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703)
	at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172)
Caused by: java.lang.IllegalStateException: MiniCluster is not yet running or has already been shut down.
	at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193)
	at org.apache.flink.runtime.minicluster.MiniCluster.getDispatcherGatewayFuture(MiniCluster.java:878)
	at org.apache.flink.runtime.minicluster.MiniCluster.runDispatcherCommand(MiniCluster.java:778)
	at org.apache.flink.runtime.minicluster.MiniCluster.getJobStatus(MiniCluster.java:731)
	at org.apache.flink.client.program.PerJobMiniClusterFactory.lambda$null$0(PerJobMiniClusterFactory.java:86)
	at org.apache.flink.client.ClientUtils.waitUntilJobInitializationFinished(ClientUtils.java:144)
	... 9 more

打开log4j的日志级别为INFO后重新启动。发现报错原因是用户权限的问题。

Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException): Permission denied: user=Administrator, access=WRITE, inode="/":liker:supergroup:drwxr-xr-x

在代码中增加如下一行代码指定hadoop的用户为有权限的用户,我这里是liker,重新启动成功。

System.setProperty("HADOOP_USER_NAME", "liker");

FlinkSql

准备工作

编译hudi或下载hudi-flink-bundle jar包。
下载地址:Apache Hudi 官网的 Download 页面

编译

mvn clean package -DskipTests -Dspark3.1 -Dflink1.14 -Drat.skip=true -Pflink-bundle-shade-hive3

编译好的hudi-flink-bundle jar包在packaging/hudi-flink-bundle/target 目录下
启动sql-client

./bin/sql-client.sh embedded -j ../hudi/hudi-flink1.14-bundle-0.13.1.jar shell

image.png

创建表

Flink SQL> CREATE TABLE tbl(
>   uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
>   name VARCHAR(10),
>   age INT,
>   ts TIMESTAMP(3),
>   `partition` VARCHAR(20)
> )
> PARTITIONED BY (`partition`)
> WITH (
>   'connector' = 'hudi',
>   'path' = 'hdfs://hadoop001:9820/hudi/tbl',
>   'table.type' = 'MERGE_ON_READ'
> );
[INFO] Execute statement succeed.

Insert数据

Flink SQL> INSERT INTO tbl VALUES
>   ('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','20230601'),
>   ('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','20230601'),
>   ('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','20230602'),
>   ('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','20230602'),
>   ('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','20230603'),
>   ('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','20230603'),
>   ('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','20230604'),
>   ('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','20230604');
[INFO] Submitting SQL update statement to the cluster...
[INFO] SQL update statement has been successfully submitted to the cluster:
Job ID: 588d740c7f6bf4c15b61086b11f8d0f5

查询数据

Flink SQL> select * from tbl;
2023-06-23 17:39:38,822 INFO  org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient [] - SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| op |                           uuid |                           name |         age |                      ts |                      partition |
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| +I |                            id8 |                            Han |          56 | 1970-01-01 00:00:08.000 |                       20230604 |
| +I |                            id7 |                            Bob |          44 | 1970-01-01 00:00:07.000 |                       20230604 |
| +I |                            id6 |                           Emma |          20 | 1970-01-01 00:00:06.000 |                       20230603 |
| +I |                            id5 |                         Sophia |          18 | 1970-01-01 00:00:05.000 |                       20230603 |
| +I |                            id4 |                         Fabian |          31 | 1970-01-01 00:00:04.000 |                       20230602 |
| +I |                            id3 |                         Julian |          53 | 1970-01-01 00:00:03.000 |                       20230602 |
| +I |                            id2 |                        Stephen |          33 | 1970-01-01 00:00:02.000 |                       20230601 |
| +I |                            id1 |                          Danny |          23 | 1970-01-01 00:00:01.000 |                       20230601 |
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
Received a total of 8 rows

StreamQuery

  • 关键参数
    • read.streaming.enabled:开启流式读取模式,默认为false
    • read.start-commit:设置读取的开始时间,可以指定earliest、latest或具体的时间,如20230610010101
> CREATE TABLE stream_read(
>   uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
>   name VARCHAR(10),
>   age INT,
>   ts TIMESTAMP(3),
>   `partition` VARCHAR(20)
> )
> PARTITIONED BY (`partition`)
> WITH (
>   'connector' = 'hudi',
>   'path' = 'hdfs://hadoop001:9820/hudi/tbl',
>   'table.type' = 'MERGE_ON_READ',
>   'read.streaming.enabled' = 'true',  -- this option enable the streaming read
>   'read.start-commit' = 'earliest', -- specifies the start commit instant time
>   'read.streaming.check-interval' = '4' -- specifies the check interval for finding new source commits, default 60s.
> );
[INFO] Execute statement succeed.

Flink SQL> select * from stream_read;
2023-06-23 17:42:41,198 INFO  org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient [] - SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| op |                           uuid |                           name |         age |                      ts |                      partition |
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| +I |                            id2 |                        Stephen |          33 | 1970-01-01 00:00:02.000 |                       20230601 |
| +I |                            id1 |                          Danny |          23 | 1970-01-01 00:00:01.000 |                       20230601 |
| +I |                            id4 |                         Fabian |          31 | 1970-01-01 00:00:04.000 |                       20230602 |
| +I |                            id3 |                         Julian |          53 | 1970-01-01 00:00:03.000 |                       20230602 |
| +I |                            id6 |                           Emma |          20 | 1970-01-01 00:00:06.000 |                       20230603 |
| +I |                            id5 |                         Sophia |          18 | 1970-01-01 00:00:05.000 |                       20230603 |
| +I |                            id8 |                            Han |          56 | 1970-01-01 00:00:08.000 |                       20230604 |
| +I |                            id7 |                            Bob |          44 | 1970-01-01 00:00:07.000 |                       20230604 |

可以看到StreamQuery的方式并没有在查询完后,任务就停止,这是因为stream query读取数据的方式,会认为source为unbound数据流。