Notes on Writing MySQL Order Data into Hudi with Flink CDC


POM dependencies


  <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <flink.version>1.13.6</flink.version>
        <hudi.version>0.12.0</hudi.version>
        <java.version>1.8</java.version>
        <scala.binary.version>2.11</scala.binary.version>
        <slf4j.version>1.7.30</slf4j.version>
        <hadoop.version>3.0.0-cdh6.1.1</hadoop.version>
        <kafka.version>2.2.0</kafka.version>
    </properties>


    <dependencies>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Provides the Flink web UI when running from the IDE -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-to-slf4j</artifactId>
            <version>2.14.0</version>
            <scope>provided</scope>
        </dependency>


        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- table -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.11</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>


        <!-- Install manually into the local Maven repository -->
        <dependency>
            <groupId>org.apache.hudi</groupId>
            <artifactId>hudi-flink_2.12</artifactId>
            <version>${hudi.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>


        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.21</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>com.ververica</groupId>
            <artifactId>flink-connector-mysql-cdc</artifactId>
            <version>2.2.1</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba.fastjson2</groupId>
            <artifactId>fastjson2</artifactId>
            <version>2.0.18</version>
        </dependency>

    </dependencies>

Core code

        // Initialize the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Set RocksDB as the state backend
        // Hudi recommends RocksDB
        EmbeddedRocksDBStateBackend embeddedRocksDBStateBackend = new EmbeddedRocksDBStateBackend(true); 
        embeddedRocksDBStateBackend.setPredefinedOptions(PredefinedOptions.SPINNING_DISK_OPTIMIZED_HIGH_MEM);
        env.setStateBackend(embeddedRocksDBStateBackend);
        env.setRestartStrategy(RestartStrategies.noRestart());
        // Checkpoint configuration
        env.enableCheckpointing(TimeUnit.SECONDS.toMillis(30), CheckpointingMode.EXACTLY_ONCE);
        CheckpointConfig checkpointConfig = env.getCheckpointConfig();
        checkpointConfig.setCheckpointStorage(ckpPath);
        checkpointConfig.setMinPauseBetweenCheckpoints(TimeUnit.SECONDS.toMillis(30));
        checkpointConfig.setTolerableCheckpointFailureNumber(5);
        checkpointConfig.setCheckpointTimeout(TimeUnit.MINUTES.toMillis(1));
        checkpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        // CDC (Debezium) configuration
        Properties cdcProperties = new Properties();
        cdcProperties.setProperty("snapshot.mode", "schema_only"); // incremental reads only (capture the schema, then read from the binlog)
        /*
         *  .startupOptions(StartupOptions.latest()) options:
         *  1. initial() - take a full snapshot first, then keep reading the latest binlog; the usual best practice for the first run
         *  2. earliest() - read from the beginning of the binlog (from whenever binlog was enabled)
         *  3. latest() - read from the latest binlog position
         *  4. specificOffset(String specificOffsetFile, int specificOffsetPos) - read from the given offset
         *  5. timestamp(long startupTimestampMillis) - read from the given timestamp
         * */

        // Custom date/time converter configuration (see MySqlDateTimeConverter below)
//        cdcProperties.setProperty("converters", "dateConverters");
//        cdcProperties.setProperty("dateConverters.type", "com.qupeiyin.hudi.schma.MySqlDateTimeConverter");
//        cdcProperties.setProperty("dateConverters.format.date", "yyyy-MM-dd");
//        cdcProperties.setProperty("dateConverters.format.time", "HH:mm:ss");
//        cdcProperties.setProperty("dateConverters.format.datetime", "yyyy-MM-dd HH:mm:ss");
//        cdcProperties.setProperty("dateConverters.format.timestamp", "yyyy-MM-dd HH:mm:ss");
//        cdcProperties.setProperty("dateConverters.format.timestamp.zone", "Asia/Shanghai");
        cdcProperties.setProperty("debezium.snapshot.locking.mode", "none"); // skip the global read/write lock, which could otherwise impact online traffic
        cdcProperties.setProperty("include.schema.changes", "true");
        // With Flink MySQL CDC and JsonDebeziumDeserializationSchema, BIGINT UNSIGNED columns
        // are captured as strings, so handle them as longs instead.
        cdcProperties.setProperty("bigint.unsigned.handling.mode", "long");
        cdcProperties.setProperty("decimal.handling.mode", "string");


        MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
                .hostname(host)
                .port(Integer.valueOf(port))
                .databaseList("xxx")
                .tableList("xxx.xxx") // set captured tables [product, user, address ,order, custom]
                .username(username)
                .password(password)
                .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
                .startupOptions(StartupOptions.initial())
                .debeziumProperties(cdcProperties)
                .build();

        // The op field values c, u, d, r correspond to create, update, delete, read
        DataStream<FundOrderEntity> processStream = env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQLSource")
                // set a single parallel source task
                .setParallelism(1).process(new ProcessFunction<String, FundOrderEntity>() {
                    @Override
                    public void processElement(String s, Context context, Collector<FundOrderEntity> collector) throws Exception {
                        JSONObject jsonObject = JSON.parseObject(s);
                        String op = jsonObject.getString("op");
                        FundOrderEntity fundOrderEntity = null;
                        if (op.equals("d")) {
                            fundOrderEntity = JSON.parseObject(jsonObject.getString("before"), FundOrderEntity.class);
                            fundOrderEntity.set_HOODIE_IS_DELETED(true);
                        } else {
                            fundOrderEntity = JSON.parseObject(jsonObject.getString("after"), FundOrderEntity.class);
                            fundOrderEntity.set_HOODIE_IS_DELETED(false);
                        }
                        // Timestamps gain +8 hours when written to Hudi, so subtract 8 hours here
                        long l = fundOrderEntity.getRefresh_time().toInstant().toEpochMilli();
                        long newTime = l - 8 * 60 * 60 * 1000L;
                        Timestamp timestamp = new Timestamp(newTime);
                        fundOrderEntity.setRefresh_time(timestamp);
                        collector.collect(fundOrderEntity);
                    }
                });

        StreamTableEnvironment sTableEnv = StreamTableEnvironment.create(env);

        // from_app is used as the Hudi partition key below, so expose it in the view
        // (this assumes FundOrderEntity carries a from_app field)
        sTableEnv.createTemporaryView("fund_order", processStream, "id, trade_id, from_app, _HOODIE_IS_DELETED");

        String sqlNormal = "insert into fund_order_sink_normal select id, trade_id, from_app, _HOODIE_IS_DELETED from fund_order";

        String sinkNormalHuDi = " create table fund_order_sink_normal(\n" +
                "  id             bigint   not null PRIMARY KEY NOT ENFORCED,\n" +
                "    trade_id       string,\n" +
                "    from_app       string,\n" +
                "    _HOODIE_IS_DELETED       boolean\n" +
                ")\n" +
                " partitioned by (from_app)\n" +
                " with (\n" +
                "  'connector' = 'hudi',\n" +
                "  'path' = '" + dataPath + "',\n" +
                "  'compaction.schedule.enabled' = 'true',\n" +
                // It is recommended to disable async compaction and instead run Hudi's offline compaction on a schedule
                // If compaction fails, add hadoop-mapreduce-client-core-3.0.0.jar to the lib directory
                "  'compaction.async.enabled' = 'true',\n" +
                "  'compaction.trigger.strategy' = 'num_or_time',\n" +
                "  'compaction.delta_commits' = '10',\n" +
                "  'compaction.delta_seconds' = '60',\n" +
                "  'compaction.max_memory' = '1024',\n" +
                "  'compaction.tasks' = '1',\n" +
                "  'write.task.max.size' = '2048',\n" +
                "  'write.merge.max_memory' = '1024',\n" +
                "  'table.type' = 'MERGE_ON_READ',\n" +
                "  'hoodie.cleaner.policy' = 'KEEP_LATEST_COMMITS',\n" +
                "  'hoodie.cleaner.commits.retained' = '1',\n" +
                "  'HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION' = 'true',\n" +
                "  'hive_sync.enable'='true', \n" +
                "  'hive_sync.table'='fund_order', \n" +
                "  'hive_sync.db'='hudi', \n" +
                "  'hive_sync.mode' = 'hms',\n" +
                "  'hive_sync.metastore.uris' = '" + uris + "',\n" +
                "  'hive_sync.support_timestamp'= 'true'\n" +
                "  )";

        sTableEnv.executeSql(sinkNormalHuDi);
        sTableEnv.executeSql(sqlNormal);
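
The FundOrderEntity POJO is not shown in the post; below is a minimal sketch inferred from the code above (the accessors getRefresh_time/set_HOODIE_IS_DELETED and the from_app partition key come from the snippets, the remaining field types are assumptions):

import java.sql.Timestamp;

// Hypothetical sketch of the entity used above; field names follow the accessors
// referenced in the pipeline code, column types are assumed.
public class FundOrderEntity {
    private Long id;
    private String trade_id;
    private String from_app;
    private Timestamp refresh_time;
    private Boolean _HOODIE_IS_DELETED;

    public Long getId() { return id; }
    public void setId(Long id) { this.id = id; }
    public String getTrade_id() { return trade_id; }
    public void setTrade_id(String trade_id) { this.trade_id = trade_id; }
    public String getFrom_app() { return from_app; }
    public void setFrom_app(String from_app) { this.from_app = from_app; }
    public Timestamp getRefresh_time() { return refresh_time; }
    public void setRefresh_time(Timestamp refresh_time) { this.refresh_time = refresh_time; }
    public Boolean get_HOODIE_IS_DELETED() { return _HOODIE_IS_DELETED; }
    public void set_HOODIE_IS_DELETED(Boolean deleted) { this._HOODIE_IS_DELETED = deleted; }
}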

Time zone conversion class

import io.debezium.spi.converter.CustomConverter;
import io.debezium.spi.converter.RelationalColumn;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.Properties;
import java.util.function.Consumer;

public class MySqlDateTimeConverter implements CustomConverter<SchemaBuilder, RelationalColumn> {
    private final static Logger logger = LoggerFactory.getLogger(MySqlDateTimeConverter.class);

    private DateTimeFormatter dateFormatter = DateTimeFormatter.ISO_DATE;
    private DateTimeFormatter timeFormatter = DateTimeFormatter.ISO_TIME;
    private DateTimeFormatter datetimeFormatter = DateTimeFormatter.ISO_DATE_TIME;
    private DateTimeFormatter timestampFormatter = DateTimeFormatter.ISO_DATE_TIME;

    private ZoneId timestampZoneId = ZoneId.systemDefault();

    @Override
    public void configure(Properties props) {
        readProps(props, "format.date", p -> dateFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.time", p -> timeFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.datetime", p -> datetimeFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.timestamp", p -> timestampFormatter = DateTimeFormatter.ofPattern(p));
        readProps(props, "format.timestamp.zone", z -> timestampZoneId = ZoneId.of(z));
    }

    private void readProps(Properties properties, String settingKey, Consumer<String> callback) {
        String settingValue = (String) properties.get(settingKey);
        if (settingValue == null || settingValue.length() == 0) {
            return;
        }
        try {
            callback.accept(settingValue.trim());
        } catch (IllegalArgumentException | DateTimeException e) {
            logger.error("The {} setting is illegal: {}", settingKey, settingValue);
            throw e;
        }
    }



    @Override
    public void converterFor(RelationalColumn column, ConverterRegistration<SchemaBuilder> registration) {
        String sqlType = column.typeName().toUpperCase();
        SchemaBuilder schemaBuilder = null;
        Converter converter = null;
        if ("DATE".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.date.string");
            converter = this::convertDate;
        }
        if ("TIME".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.time.string");
            converter = this::convertTime;
        }
        if ("DATETIME".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.datetime.string");
            converter = this::convertDateTime;
        }
        if ("TIMESTAMP".equals(sqlType)) {
            schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.timestamp.string");
            converter = this::convertTimestamp;
        }
        if (schemaBuilder != null) {
            registration.register(schemaBuilder, converter);
        }
    }

    private String convertDate(Object input) {
        if (input instanceof LocalDate) {
            return dateFormatter.format((LocalDate) input);
        }
        if (input instanceof Integer) {
            LocalDate date = LocalDate.ofEpochDay((Integer) input);
            return dateFormatter.format(date);
        }
        return null;
    }

    private String convertTime(Object input) {
        if (input instanceof Duration) {
            Duration duration = (Duration) input;
            long seconds = duration.getSeconds();
            int nano = duration.getNano();
            LocalTime time = LocalTime.ofSecondOfDay(seconds).withNano(nano);
            return timeFormatter.format(time);
        }
        return null;
    }

    private String convertDateTime(Object input) {
        if (input instanceof LocalDateTime) {
            return datetimeFormatter.format((LocalDateTime) input);
        }
        return null;
    }

    private String convertTimestamp(Object input) {
        if (input instanceof ZonedDateTime) {
            // MySQL TIMESTAMP values are stored in UTC, so the ZonedDateTime here is in UTC
            ZonedDateTime zonedDateTime = (ZonedDateTime) input;
            LocalDateTime localDateTime = zonedDateTime.withZoneSameInstant(timestampZoneId).toLocalDateTime();
            return timestampFormatter.format(localDateTime);
        }
        return null;
    }
}
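
To actually use the converter, uncomment the converter properties shown earlier in the CDC configuration and make sure the class is packaged into the job jar. The snippet below simply repeats those properties (the package path com.qupeiyin.hudi.schma is copied from the original config and must match wherever the class really lives):

// Register the custom date/time converter with Debezium
// (same cdcProperties object as in the core code above)
cdcProperties.setProperty("converters", "dateConverters");
cdcProperties.setProperty("dateConverters.type", "com.qupeiyin.hudi.schma.MySqlDateTimeConverter");
cdcProperties.setProperty("dateConverters.format.date", "yyyy-MM-dd");
cdcProperties.setProperty("dateConverters.format.time", "HH:mm:ss");
cdcProperties.setProperty("dateConverters.format.datetime", "yyyy-MM-dd HH:mm:ss");
cdcProperties.setProperty("dateConverters.format.timestamp", "yyyy-MM-dd HH:mm:ss");
cdcProperties.setProperty("dateConverters.format.timestamp.zone", "Asia/Shanghai");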