POM dependencies
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<flink.version>1.13.6</flink.version>
<hudi.version>0.12.0</hudi.version>
<java.version>1.8</java.version>
<scala.binary.version>2.11</scala.binary.version>
<slf4j.version>1.7.30</slf4j.version>
<hadoop.version>3.0.0-cdh6.1.1</hadoop.version>
<kafka.version>2.2.0</kafka.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- enables the Flink web UI when the job is run from the IDE (see the sketch after the dependency list) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-to-slf4j</artifactId>
<version>2.14.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- table -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- manually install this artifact into the local Maven repository -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink_2.12</artifactId>
<version>${hudi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.21</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-mysql-cdc</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>com.alibaba.fastjson2</groupId>
<artifactId>fastjson2</artifactId>
<version>2.0.18</version>
</dependency>
</dependencies>
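As noted in the comment above, flink-runtime-web is what makes the Flink web UI available when the job is launched from the IDE. A minimal sketch of how such a local environment can be created (this is not part of the original job; the port override is optional):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.RestOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

Configuration conf = new Configuration();
conf.setInteger(RestOptions.PORT, 8081); // default REST/web UI port; change it if 8081 is already taken
// starts a local mini cluster with the web UI, provided flink-runtime-web is on the classpath
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);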
Core code
// Environment initialization
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Set the state backend to RocksDB (recommended for Hudi)
EmbeddedRocksDBStateBackend embeddedRocksDBStateBackend = new EmbeddedRocksDBStateBackend(true);
embeddedRocksDBStateBackend.setPredefinedOptions(PredefinedOptions.SPINNING_DISK_OPTIMIZED_HIGH_MEM);
env.setStateBackend(embeddedRocksDBStateBackend);
env.setRestartStrategy(RestartStrategies.noRestart());
// Checkpoint configuration
env.enableCheckpointing(TimeUnit.SECONDS.toMillis(30), CheckpointingMode.EXACTLY_ONCE);
CheckpointConfig checkpointConfig = env.getCheckpointConfig();
checkpointConfig.setCheckpointStorage(ckpPath);
checkpointConfig.setMinPauseBetweenCheckpoints(TimeUnit.SECONDS.toMillis(30));
checkpointConfig.setTolerableCheckpointFailureNumber(5);
checkpointConfig.setCheckpointTimeout(TimeUnit.MINUTES.toMillis(1));
checkpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
// CDC configuration
Properties cdcProperties = new Properties();
cdcProperties.setProperty("snapshot.mode", "schema_only");
/*
 * .startupOptions(...) choices:
 * 1. initial()  take a full snapshot first, then keep reading the latest binlog; best practice for the first run
 * 2. earliest() read from the beginning of the available binlog
 * 3. latest()   read only from the latest binlog position
 * 4. specificOffset(String specificOffsetFile, int specificOffsetPos)  read from a specific binlog offset
 * 5. timestamp(long startupTimestampMillis)  read from a given timestamp
 */
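// For example (a sketch of the alternatives listed above; the timestamp and binlog position are placeholders,
// and depending on the flink-connector-mysql-cdc version some of these modes may not yet be supported by the incremental source):
//   .startupOptions(StartupOptions.timestamp(1660000000000L))              // resume from an epoch-millis timestamp
//   .startupOptions(StartupOptions.specificOffset("mysql-bin.000003", 4))  // resume from a specific binlog file/position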
// Custom date/time converter configuration (see the MySqlDateTimeConverter class below)
// cdcProperties.setProperty("converters", "dateConverters")
// cdcProperties.setProperty("dateConverters.type", "com.qupeiyin.hudi.schma.MySqlDateTimeConverter")
// cdcProperties.setProperty("dateConverters.format.date", "yyyy-MM-dd")
// cdcProperties.setProperty("dateConverters.format.time", "HH:mm:ss")
// cdcProperties.setProperty("dateConverters.format.datetime", "yyyy-MM-dd HH:mm:ss")
// cdcProperties.setProperty("dateConverters.format.timestamp", "yyyy-MM-dd HH:mm:ss")
// cdcProperties.setProperty("dateConverters.format.timestamp.zone", "Asia/Shanghai")
cdcProperties.setProperty("debezium.snapshot.locking.mode", "none")
cdcProperties.setProperty("include.schema.changes", "true")
// 使用flink mysql cdc 发现bigint unsigned类型的字段,capture以后转成了字符串类型,
// 用的这个解析吧JsonDebeziumDeserializationSchema。
cdcProperties.setProperty("bigint.unsigned.handling.mode", "long")
cdcProperties.setProperty("decimal.handling.mode", "string")
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
        .hostname(host)
        .port(Integer.valueOf(port))
        .databaseList("xxx")
        .tableList("xxx.xxx") // set captured tables, e.g. db.table
        .username(username)
        .password(password)
        .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
        .startupOptions(StartupOptions.initial())
        .debeziumProperties(cdcProperties)
        .build();
// The op field is c, u, d, or r, corresponding to create, update, delete, read
DataStream<FundOrderEntity> processStream = env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQLSource")
        // use a single parallel source task
        .setParallelism(1).process(new ProcessFunction<String, FundOrderEntity>() {
            @Override
            public void processElement(String s, Context context, Collector<FundOrderEntity> collector) throws Exception {
                JSONObject jsonObject = JSON.parseObject(s);
                String op = jsonObject.getString("op");
                FundOrderEntity fundOrderEntity = null;
                if (op.equals("d")) {
                    fundOrderEntity = JSON.parseObject(jsonObject.getString("before"), FundOrderEntity.class);
                    fundOrderEntity.set_HOODIE_IS_DELETED(true);
                } else {
                    fundOrderEntity = JSON.parseObject(jsonObject.getString("after"), FundOrderEntity.class);
                    fundOrderEntity.set_HOODIE_IS_DELETED(false);
                }
                // Timestamps written to Hudi end up shifted by +8 hours, so subtract 8 hours here as a workaround
                long l = fundOrderEntity.getRefresh_time().toInstant().toEpochMilli();
                long newTime = l - 8 * 60 * 60 * 1000L;
                Timestamp timestamp = new Timestamp(newTime);
                fundOrderEntity.setRefresh_time(timestamp);
                collector.collect(fundOrderEntity);
            }
        });
StreamTableEnvironment sTableEnv = StreamTableEnvironment.create(env);
sTableEnv.createTemporaryView("fund_order", processStream, "id, trade_id, from_app, _HOODIE_IS_DELETED");
String sqlNormal = "insert into fund_order_sink_normal select id, trade_id, from_app, _HOODIE_IS_DELETED from fund_order";
String sinkNormalHuDi = "create table fund_order_sink_normal(\n" +
        "  id bigint not null PRIMARY KEY NOT ENFORCED,\n" +
        "  trade_id string,\n" +
        "  from_app string,\n" +
        "  _HOODIE_IS_DELETED boolean\n" +
        ")\n" +
        "partitioned by (from_app)\n" +
        "with (\n" +
        "  'connector' = 'hudi',\n" +
        "  'path' = '" + dataPath + "',\n" +
        "  'compaction.schedule.enabled' = 'true',\n" +
        // It is recommended to disable async compaction and let Hudi's scheduled (offline) compaction do the merging instead.
        // If compaction fails, add hadoop-mapreduce-client-core-3.0.0.jar to the Flink lib directory.
        "  'compaction.async.enabled' = 'true',\n" +
        "  'compaction.trigger.strategy' = 'num_or_time',\n" +
        "  'compaction.delta_commits' = '10',\n" +
        "  'compaction.delta_seconds' = '60',\n" +
        "  'compaction.max_memory' = '1024',\n" +
        "  'compaction.tasks' = '1',\n" +
        "  'write.task.max.size' = '2048',\n" +
        "  'write.merge.max_memory' = '1024',\n" +
        "  'table.type' = 'MERGE_ON_READ',\n" +
        "  'hoodie.cleaner.policy' = 'KEEP_LATEST_COMMITS',\n" +
        "  'hoodie.cleaner.commits.retained' = '1',\n" +
        "  'HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION' = 'true',\n" +
        "  'hive_sync.enable' = 'true',\n" +
        "  'hive_sync.table' = 'fund_order',\n" +
        "  'hive_sync.db' = 'hudi',\n" +
        "  'hive_sync.mode' = 'hms',\n" +
        "  'hive_sync.metastore.uris' = '" + uris + "',\n" +
        "  'hive_sync.support_timestamp' = 'true'\n" +
        ")";
sTableEnv.executeSql(sinkNormalHuDi);
sTableEnv.executeSql(sqlNormal);
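For reference, a minimal sketch of the FundOrderEntity POJO assumed by the code above; only the fields actually referenced are shown (the real entity has more columns), and the accessor names follow the usages in the process function:

import java.sql.Timestamp;

public class FundOrderEntity {
    private Long id;
    private String trade_id;
    private String from_app;            // used as the Hudi partition field
    private Timestamp refresh_time;
    private boolean _HOODIE_IS_DELETED; // Hudi applies deletes for records where this flag is true

    public Long getId() { return id; }
    public void setId(Long id) { this.id = id; }
    public String getTrade_id() { return trade_id; }
    public void setTrade_id(String trade_id) { this.trade_id = trade_id; }
    public String getFrom_app() { return from_app; }
    public void setFrom_app(String from_app) { this.from_app = from_app; }
    public Timestamp getRefresh_time() { return refresh_time; }
    public void setRefresh_time(Timestamp refresh_time) { this.refresh_time = refresh_time; }
    public boolean get_HOODIE_IS_DELETED() { return _HOODIE_IS_DELETED; }
    public void set_HOODIE_IS_DELETED(boolean deleted) { this._HOODIE_IS_DELETED = deleted; }
}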
Time zone conversion class
import io.debezium.spi.converter.CustomConverter;
import io.debezium.spi.converter.RelationalColumn;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.Properties;
import java.util.function.Consumer;

public class MySqlDateTimeConverter implements CustomConverter<SchemaBuilder, RelationalColumn> {

    private final static Logger logger = LoggerFactory.getLogger(MySqlDateTimeConverter.class);

    private DateTimeFormatter dateFormatter = DateTimeFormatter.ISO_DATE;
    private DateTimeFormatter timeFormatter = DateTimeFormatter.ISO_TIME;
    private DateTimeFormatter datetimeFormatter = DateTimeFormatter.ISO_DATE_TIME;
    private DateTimeFormatter timestampFormatter = DateTimeFormatter.ISO_DATE_TIME;
    private ZoneId timestampZoneId = ZoneId.systemDefault();
    @Override
    public void configure(Properties props) {
        readProps(props, "format.date", p -> dateFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.time", p -> timeFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.datetime", p -> datetimeFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.timestamp", p -> timestampFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.timestamp.zone", z -> timestampZoneId = ZoneId.of(z));
    }

    private void readProps(Properties properties, String settingKey, Consumer<String> callback) {
        String settingValue = (String) properties.get(settingKey);
        if (settingValue == null || settingValue.length() == 0) {
            return;
        }
        try {
            callback.accept(settingValue.trim());
        } catch (IllegalArgumentException | DateTimeException e) {
            logger.error("The {} setting is illegal: {}", settingKey, settingValue);
            throw e;
        }
    }
    @Override
    public void converterFor(RelationalColumn column, ConverterRegistration<SchemaBuilder> registration) {
        String sqlType = column.typeName().toUpperCase();
        SchemaBuilder schemaBuilder = null;
        Converter converter = null;
        if ("DATE".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.date.string");
            converter = this::convertDate;
        }
        if ("TIME".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.time.string");
            converter = this::convertTime;
        }
        if ("DATETIME".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.datetime.string");
            converter = this::convertDateTime;
        }
        if ("TIMESTAMP".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.timestamp.string");
            converter = this::convertTimestamp;
        }
        if (schemaBuilder != null) {
            registration.register(schemaBuilder, converter);
        }
    }
    private String convertDate(Object input) {
        if (input instanceof LocalDate) {
            return dateFormatter.format((LocalDate) input);
        }
        if (input instanceof Integer) {
            LocalDate date = LocalDate.ofEpochDay((Integer) input);
            return dateFormatter.format(date);
        }
        return null;
    }

    private String convertTime(Object input) {
        if (input instanceof Duration) {
            Duration duration = (Duration) input;
            long seconds = duration.getSeconds();
            int nano = duration.getNano();
            LocalTime time = LocalTime.ofSecondOfDay(seconds).withNano(nano);
            return timeFormatter.format(time);
        }
        return null;
    }

    private String convertDateTime(Object input) {
        if (input instanceof LocalDateTime) {
            return datetimeFormatter.format((LocalDateTime) input);
        }
        return null;
    }

    private String convertTimestamp(Object input) {
        if (input instanceof ZonedDateTime) {
            // MySQL TIMESTAMP values are stored as UTC, so the ZonedDateTime here is in UTC;
            // shift it to the configured time zone before formatting
            ZonedDateTime zonedDateTime = (ZonedDateTime) input;
            LocalDateTime localDateTime = zonedDateTime.withZoneSameInstant(timestampZoneId).toLocalDateTime();
            return timestampFormatter.format(localDateTime);
        }
        return null;
    }
}
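To activate the converter, register it through the Debezium properties. This mirrors the configuration shown commented out in the core code above; adjust the fully-qualified class name to wherever the class actually lives in your project:

Properties cdcProperties = new Properties();
cdcProperties.setProperty("converters", "dateConverters");
// fully-qualified name of the class above; the package follows the commented-out example in the core code
cdcProperties.setProperty("dateConverters.type", "com.qupeiyin.hudi.schma.MySqlDateTimeConverter");
cdcProperties.setProperty("dateConverters.format.date", "yyyy-MM-dd");
cdcProperties.setProperty("dateConverters.format.time", "HH:mm:ss");
cdcProperties.setProperty("dateConverters.format.datetime", "yyyy-MM-dd HH:mm:ss");
cdcProperties.setProperty("dateConverters.format.timestamp", "yyyy-MM-dd HH:mm:ss");
cdcProperties.setProperty("dateConverters.format.timestamp.zone", "Asia/Shanghai");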