Hudi集成Flink
DataStreamAPI
添加hudi-flinkX-bundle依赖
<!-- Flink 1.14 -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink1.14-bundle</artifactId>
<version>0.13.1</version>
</dependency>
可以根据需要修改版本,我这里flink用的是1.14.5
Insert
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.util.HoodiePipeline;
import org.apache.flink.table.data.RowData;
import java.util.HashMap;
import java.util.Map;
public class InsertDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.enableCheckpointing(10000);
String targetTable = "t1";
String basePath = "F:\study\hudi\code\hudi_flink\hudi_flink\output\t1";
Map<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(),basePath);
options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
options.put(FlinkOptions.PRECOMBINE_FIELD.key(), "ts");
DataStreamSource<String> source = env.socketTextStream("hadoop001", 2222);
SingleOutputStreamOperator<RowData> dataStream = source.map(new MapFunction<String, RowData>() {
@Override
public RowData map(String s) throws Exception {
String[] split = s.split(",");
GenericRowData genericRowData = new GenericRowData(5);
genericRowData.setField(0, StringData.fromString(split[0]));
genericRowData.setField(1, StringData.fromString(split[1]));
genericRowData.setField(2, Integer.valueOf(split[2]));
genericRowData.setField(3, TimestampData.fromEpochMillis(Long.valueOf(split[3])));
genericRowData.setField(4, StringData.fromString(split[4]));
return genericRowData;
}
});
HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
.column("uuid VARCHAR(20)")
.column("name VARCHAR(10)")
.column("age INT")
.column("ts TIMESTAMP(3)")
.column("`partition` VARCHAR(20)")
.pk("uuid")
.partition("partition")
.options(options);
dataStream.print();
builder.sink(dataStream, false);
env.execute("sink");
}
}
在hadoop001节点执行 nc -l 2222
测试数据
001,hehe,15,1000,20230101
002,hehe2,12,1000,20230102
003,hehe3,13,1000,20230103
004,hehe4,14,1000,20230101
005,hehe5,15,1000,20230101
005,hehe5,15,1000,20230102
运行后的数据目录
这里是根据最后一列作为分区的,所以有20230101、20230102、20230103三个分区目录
Query
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.util.HoodiePipeline;
import java.util.HashMap;
public class QueryDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.enableCheckpointing(10000);
String targetTable = "t1";
String basePath = "F:\study\hudi\code\hudi_flink\hudi_flink\output\t1";
HashMap<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(), basePath);
options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
options.put(FlinkOptions.READ_AS_STREAMING.key(), "true");
// 在这里可以指定commit时间增量查询
// options.put(FlinkOptions.READ_START_COMMIT.key(), "'20210316134557'");
HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
.column("uuid varchar(20)")
.column("name varchar(10)")
.column("age INT")
.column("ts TIMESTAMP(3)")
.column("`partition` varchar(20)")
.pk("uuid")
.partition("partition")
.options(options);
DataStream<RowData> rowDataDataStream = builder.source(env);
rowDataDataStream.print();
env.execute("");
}
}
在这里可以看到 005这个数据重复了,由于uuid是表的pk,所以会进行去重操作,可以看到结果中的第三行是-D 也就是delete操作。
+I(001,hehe,15,1970-01-01T00:00:01,20230101)
+I(004,hehe4,14,1970-01-01T00:00:01,20230101)
-D(005,null,null,null,null)
+I(003,hehe3,13,1970-01-01T00:00:01,20230103)
+I(002,hehe2,12,1970-01-01T00:00:01,20230102)
+I(005,hehe5,15,1970-01-01T00:00:01,20230102)
当把表的文件路径改为hdfs路径时,运行insert代码时会报错。
Exception in thread "main" org.apache.flink.util.FlinkException: Failed to execute job 'sink'.
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.executeAsync(StreamExecutionEnvironment.java:2055)
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1930)
at org.apache.flink.streaming.api.environment.LocalStreamEnvironment.execute(LocalStreamEnvironment.java:69)
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1916)
at com.liker.demo.InsertDemo.main(InsertDemo.java:56)
Caused by: java.lang.RuntimeException: Error while waiting for job to be initialized
at org.apache.flink.client.ClientUtils.waitUntilJobInitializationFinished(ClientUtils.java:160)
at org.apache.flink.client.program.PerJobMiniClusterFactory.lambda$submitJob$2(PerJobMiniClusterFactory.java:83)
at org.apache.flink.util.function.FunctionUtils.lambda$uncheckedFunction$2(FunctionUtils.java:73)
at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
at java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:457)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703)
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172)
Caused by: java.lang.IllegalStateException: MiniCluster is not yet running or has already been shut down.
at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193)
at org.apache.flink.runtime.minicluster.MiniCluster.getDispatcherGatewayFuture(MiniCluster.java:878)
at org.apache.flink.runtime.minicluster.MiniCluster.runDispatcherCommand(MiniCluster.java:778)
at org.apache.flink.runtime.minicluster.MiniCluster.getJobStatus(MiniCluster.java:731)
at org.apache.flink.client.program.PerJobMiniClusterFactory.lambda$null$0(PerJobMiniClusterFactory.java:86)
at org.apache.flink.client.ClientUtils.waitUntilJobInitializationFinished(ClientUtils.java:144)
... 9 more
打开log4j的日志级别为INFO后重新启动。发现报错原因是用户权限的问题。
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException): Permission denied: user=Administrator, access=WRITE, inode="/":liker:supergroup:drwxr-xr-x
在代码中增加如下一行代码指定hadoop的用户为有权限的用户,我这里是liker,重新启动成功。
System.setProperty("HADOOP_USER_NAME", "liker");
FlinkSql
准备工作
编译hudi或下载hudi-flink-bundle jar包。
下载地址:Apache Hudi 官网的 Download 页面(https://hudi.apache.org/releases/download)
编译
mvn clean package -DskipTests -Dspark3.1 -Dflink1.14 -Drat.skip=true -Pflink-bundle-shade-hive3
编译好的hudi-flink-bundle jar包在packaging/hudi-flink-bundle/target 目录下
启动sql-client
./bin/sql-client.sh embedded -j ../hudi/hudi-flink1.14-bundle-0.13.1.jar shell
创建表
Flink SQL> CREATE TABLE tbl(
> uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
> name VARCHAR(10),
> age INT,
> ts TIMESTAMP(3),
> `partition` VARCHAR(20)
> )
> PARTITIONED BY (`partition`)
> WITH (
> 'connector' = 'hudi',
> 'path' = 'hdfs://hadoop001:9820/hudi/tbl',
> 'table.type' = 'MERGE_ON_READ'
> );
[INFO] Execute statement succeed.
Insert数据
Flink SQL> INSERT INTO tbl VALUES
> ('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','20230601'),
> ('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','20230601'),
> ('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','20230602'),
> ('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','20230602'),
> ('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','20230603'),
> ('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','20230603'),
> ('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','20230604'),
> ('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','20230604');
[INFO] Submitting SQL update statement to the cluster...
[INFO] SQL update statement has been successfully submitted to the cluster:
Job ID: 588d740c7f6bf4c15b61086b11f8d0f5
查询数据
Flink SQL> select * from tbl;
2023-06-23 17:39:38,822 INFO org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient [] - SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| op | uuid | name | age | ts | partition |
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| +I | id8 | Han | 56 | 1970-01-01 00:00:08.000 | 20230604 |
| +I | id7 | Bob | 44 | 1970-01-01 00:00:07.000 | 20230604 |
| +I | id6 | Emma | 20 | 1970-01-01 00:00:06.000 | 20230603 |
| +I | id5 | Sophia | 18 | 1970-01-01 00:00:05.000 | 20230603 |
| +I | id4 | Fabian | 31 | 1970-01-01 00:00:04.000 | 20230602 |
| +I | id3 | Julian | 53 | 1970-01-01 00:00:03.000 | 20230602 |
| +I | id2 | Stephen | 33 | 1970-01-01 00:00:02.000 | 20230601 |
| +I | id1 | Danny | 23 | 1970-01-01 00:00:01.000 | 20230601 |
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
Received a total of 8 rows
StreamQuery
- 关键参数
-
- read.streaming.enabled:开启流式读取模式,默认为false
-
- read.start-commit:设置读取的开始时间,可以指定earliest、latest或具体的时间,如20230610010101
> CREATE TABLE stream_read(
> uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
> name VARCHAR(10),
> age INT,
> ts TIMESTAMP(3),
> `partition` VARCHAR(20)
> )
> PARTITIONED BY (`partition`)
> WITH (
> 'connector' = 'hudi',
> 'path' = 'hdfs://hadoop001:9820/hudi/tbl',
> 'table.type' = 'MERGE_ON_READ',
> 'read.streaming.enabled' = 'true', -- this option enable the streaming read
> 'read.start-commit' = 'earliest', -- specifies the start commit instant time
> 'read.streaming.check-interval' = '4' -- specifies the check interval for finding new source commits, default 60s.
> );
[INFO] Execute statement succeed.
Flink SQL> select * from stream_read;
2023-06-23 17:42:41,198 INFO org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient [] - SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| op | uuid | name | age | ts | partition |
+----+--------------------------------+--------------------------------+-------------+-------------------------+--------------------------------+
| +I | id2 | Stephen | 33 | 1970-01-01 00:00:02.000 | 20230601 |
| +I | id1 | Danny | 23 | 1970-01-01 00:00:01.000 | 20230601 |
| +I | id4 | Fabian | 31 | 1970-01-01 00:00:04.000 | 20230602 |
| +I | id3 | Julian | 53 | 1970-01-01 00:00:03.000 | 20230602 |
| +I | id6 | Emma | 20 | 1970-01-01 00:00:06.000 | 20230603 |
| +I | id5 | Sophia | 18 | 1970-01-01 00:00:05.000 | 20230603 |
| +I | id8 | Han | 56 | 1970-01-01 00:00:08.000 | 20230604 |
| +I | id7 | Bob | 44 | 1970-01-01 00:00:07.000 | 20230604 |
可以看到 Stream Query 的方式并没有在查询完成后停止任务,这是因为流式读取模式下,source 会被视为 unbounded(无界)数据流,作业会持续运行并等待新的 commit。