Extracting parquet file data into ClickHouse with Flink


Overview

First preprocess the parquet files (add a unique primary key and a remark field), then load the processed parquet files into a ClickHouse (CK) database.

Environment and main software versions

The versions below are the ones declared in the pom files used throughout this article:

  • JDK 1.8
  • Flink 1.17.2
  • Hadoop 3.3.1 (client libraries)
  • parquet-avro 1.13.1
  • flink-connector-clickhouse 1.16.0-SNAPSHOT (built from the third-party project referenced below)

I. Parquet file preprocessing

1. Preparing the test dataset

The data comes from the NYC Green Taxi trip records (https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). A few test files were downloaded (a quick schema check is sketched right after the list):

  • green_tripdata_2020-04.parquet
  • green_tripdata_2021-04.parquet
  • green_tripdata_2022-03.parquet
  • green_tripdata_2022-04.parquet
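
Before preprocessing, it can be useful to peek at the column layout of one of the downloaded files. Below is a minimal sketch using the same parquet-avro and hadoop dependencies introduced in the next section; the class name and the local path are only examples.

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

public class SchemaPeek {

    public static void main(String[] args) throws Exception {
        // Example local path to one of the downloaded files; adjust to your own location
        Path path = new Path("F:\\home\\parquet\\in\\green_tripdata_2020-04.parquet");
        try (ParquetReader<GenericRecord> reader =
                     AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), path).build()) {
            GenericRecord first = reader.read();
            if (first != null) {
                // Print the Avro schema of the first record, i.e. the columns of the file
                System.out.println(first.getSchema().toString(true));
            }
        }
    }
}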

2. Writing a Java preprocessing program

pom file

<properties>
    <java.version>1.8</java.version>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <hutool.version>5.8.22</hutool.version>
    <hadoop.version>3.3.1</hadoop.version>
    <parquet-avro.version>1.13.1</parquet-avro.version>
</properties>

<dependencies>

    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.24</version>
    </dependency>
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>${hutool.version}</version>
    </dependency>
    <!-- parquet文件预处理 start -->
    <dependency>
        <groupId>org.apache.parquet</groupId>
        <artifactId>parquet-avro</artifactId>
        <version>${parquet-avro.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- parquet文件预处理 end -->
</dependencies>

Java code

import cn.hutool.core.lang.Snowflake;
import cn.hutool.core.util.IdUtil;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HadoopOutputFile;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetRecordReader {

    /**
     * Add a remark field and an id (primary key) field to a parquet file.
     *
     * @param inputFilePath  input file path
     * @param outputFilePath output file path
     * @param remark         remark value written into every record
     * @throws IOException on read/write failure
     */
    public static void addRemarkAndWriteToFile(String inputFilePath, String outputFilePath, String remark) throws IOException {
        // Read the original Parquet file
        Path inputPath = new Path(inputFilePath);
        ParquetReader<GenericRecord> reader = AvroParquetReader
                .builder(new AvroReadSupport<GenericRecord>(), inputPath)
                .build();
        GenericRecord record;
        Schema schema = null;

        while ((record = reader.read()) != null) {
            schema = record.getSchema();
            break; // only need to read the schema once
        }

        if (schema == null) {
            throw new IOException("Failed to read schema from input file.");
        }

        // Create a new schema that includes the remark field
        Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false);
        List<Schema.Field> newFields = new ArrayList<>();

        // Copy the fields of the original schema, keeping UNION (nullable) types as they are
        for (Schema.Field field : schema.getFields()) {
            // Copy each field together with its type
            Schema fieldType = field.schema();
            Schema.Field newField = new Schema.Field(field.name(), fieldType, field.doc(), field.defaultVal());
            newFields.add(newField);
        }

        // Add a remark field if one does not already exist
        boolean remarkFieldExists = newFields.stream().anyMatch(field -> "remark".equals(field.name()));

        if (!remarkFieldExists) {
            newFields.add(new Schema.Field(
                    "remark",
                    Schema.create(Schema.Type.STRING),
                    "A remark field",
                    null)
            );
        }

        // Add an id (primary key) field if one does not already exist
        boolean idFieldExists = newFields.stream().anyMatch(field -> "id".equals(field.name()));

        if (!idFieldExists) {
            newFields.add(new Schema.Field(
                    "id",
                    Schema.create(Schema.Type.LONG),
                    "id field",
                    null)
            );
        }

        newSchema.setFields(newFields);

        // Reopen the reader to read the data again from the beginning
        reader.close();
        reader = AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), inputPath).build();

        List<GenericRecord> recordList = new ArrayList<>();
        Snowflake snowflake = IdUtil.getSnowflake();
        while ((record = reader.read()) != null) {
            // Build a new GenericRecord and populate the remark and id fields
            GenericRecord newRecord = new GenericData.Record(newSchema);
            GenericRecord finalRecord = record;
            schema.getFields().forEach(item -> {
                String name = item.name();
                newRecord.put(name, finalRecord.get(name));
            });
            newRecord.put("remark", remark);
            newRecord.put("id", snowflake.nextId());
            recordList.add(newRecord);
        }
        reader.close();

        // Write the new Parquet file
        Path outputPath = new Path(outputFilePath);
        HadoopOutputFile hadoopOutputFile = HadoopOutputFile.fromPath(outputPath, new Configuration());
        ParquetWriter<Object> writer = AvroParquetWriter.builder(hadoopOutputFile)
                .withSchema(newSchema)
                .withConf(new Configuration())
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                .build();

        for (GenericRecord rec : recordList) {
            writer.write(rec);
        }
        writer.close();
    }


    /**
     * Read a parquet file into a List (utility method, not used in the pipeline).
     * @param filePath parquet file path
     * @return one Map per row
     * @throws IOException on read failure
     */
    public static List<Map<String, Object>> readParquetFileWithRecord(String filePath) throws IOException {
        // Build the full parquet file path
        Path parquetFilePath = new Path(filePath);
        ParquetReader<GenericRecord> reader = AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), parquetFilePath).build();
        GenericRecord record;
        List<Map<String, Object>> recordList = new ArrayList<>();
        // Iterate over the rows
        while ((record = reader.read()) != null) {
            Map<String, Object> recordMap = new HashMap<>();
            Schema schema = record.getSchema();
            // Field metadata of the row
            List<Schema.Field> fields = schema.getFields();
            GenericRecord finalRecord = record;
            fields.forEach(item -> {
                // Look up the value by field name
                String name = item.name();
                Object val = finalRecord.get(name);
                recordMap.put(name, val);
            });
            recordList.add(recordMap);
        }
        reader.close();
        return recordList;
    }
}
import lombok.extern.slf4j.Slf4j;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;

/**
 * Walks the input directory and preprocesses every parquet file found in it.
 */
@Slf4j
public class FileMain {

    public static void main(String[] args) {
        final String outPath = "F:\\home\\parquet\\out\\";
        final String inPath = "F:\\home\\parquet\\in";
        executeFiles(inPath, outPath);
    }

    public static void executeFiles(String inPath, String outPath) {
        Path dirPath = Paths.get(inPath);
        try (Stream<Path> stream = Files.walk(dirPath)) {
            stream.filter(Files::isRegularFile)
                    .filter(path -> path.toString().toLowerCase().endsWith(".parquet"))
                    .forEach(filePath -> {
                        // Absolute path of the file
                        Path filePathAbsolutePath = filePath.toAbsolutePath();
                        System.out.println("filePathAbsolutePath = " + filePathAbsolutePath);
                        log.info("filePathAbsolutePath = {}", filePathAbsolutePath);
                        // Derive the output file name from the input path
                        String outFileName = filePathAbsolutePath.toString()
                                .replace("F:\\home\\parquet\\in\\", "")
                                .replaceAll("[\\\\|:]+", "_");
                        System.out.println("outFileName = " + outFileName);
                        log.info("outFileName = {}", outFileName);
                        try {
                            ParquetRecordReader.addRemarkAndWriteToFile(filePath.toAbsolutePath().toString(),
                                    outPath + outFileName, outFileName);
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
                    });
        } catch (IOException e) {
            log.error("执行异常!", e);
        }
    }
}
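
As a quick sanity check on the preprocessing step, the output files can be read back with the same reader API to confirm that every record now carries the added id and remark fields. A minimal sketch (the output path is only an example):

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

public class VerifyMain {

    public static void main(String[] args) throws Exception {
        // Example output file produced by FileMain; adjust to your own path
        Path path = new Path("F:\\home\\parquet\\out\\green_tripdata_2020-04.parquet");
        try (ParquetReader<GenericRecord> reader =
                     AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), path).build()) {
            long count = 0;
            GenericRecord record;
            while ((record = reader.read()) != null) {
                if (count == 0) {
                    // The two fields added by the preprocessing step
                    System.out.println("id     = " + record.get("id"));
                    System.out.println("remark = " + record.get("remark"));
                }
                count++;
            }
            System.out.println("total records = " + count);
        }
    }
}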

II. Loading the parquet data into ClickHouse (CK)

1. Implementation in Java code

pom

<properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <flink.version>1.17.2</flink.version>
        <flink-connector-clickhouse.version>1.16.0-SNAPSHOT</flink-connector-clickhouse.version>
    </properties>
    <dependencies>
        <!-- flink start -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-parquet</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-files</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-client</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Third-party connector: https://github.com/itinycheng/flink-connector-clickhouse.git
        Switch to the 1.16.0-SNAPSHOT code line, set flink.version to 1.17.2, then run mvn clean install -DskipTests -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-clickhouse</artifactId>
            <version>${flink-connector-clickhouse.version}</version>
        </dependency>
        <!-- flink end -->
    </dependencies>

Java (replace the paths and connection parameters with your own)

import lombok.extern.slf4j.Slf4j;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * parquet --> ClickHouse
 * Data source: NYC Green Taxi trip records, <a href="https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"/>
 * Only a handful of columns are used for this test.
 */
@Slf4j
public class Parquet2Ck {

    public static void main(String[] args) {
        try {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            // use a single parallel source task
            env.setParallelism(1);
            // set the batch runtime mode
            //env.setRuntimeMode(RuntimeExecutionMode.BATCH);
            //env.enableCheckpointing(500, CheckpointingMode.EXACTLY_ONCE);
            StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
            // source table (parquet files on the local filesystem)
            String sourceDDL =
                    "CREATE TABLE IF NOT EXISTS TRIPDATA_T (\n" +
                            "  id BIGINT,\n" +
                            "  lpep_pickup_datetime BIGINT,\n" +
                            "  remark STRING,\n" +
                            "  trip_distance DOUBLE,\n" +
                            "  DOLocationID BIGINT,\n" +
                            "  PRIMARY KEY (id) NOT ENFORCED\n" +
                            ")  WITH (\n" +
                            "  'connector'='filesystem',\n" +
                            "  'path'='file:///home/parquet/out',\n" +
                            "  'format'='parquet'\n" +
                            ");" ;

            // sink table (ClickHouse)
            String sinkDDL =
                    "CREATE TABLE IF NOT EXISTS ODS_TRIPDATA_T (\n" +
                            "  id BIGINT,\n" +
                            "  lpep_pickup_datetime BIGINT,\n" +
                            "  remark STRING,\n" +
                            "  trip_distance DOUBLE,\n" +
                            "  DOLocationID BIGINT,\n" +
                            "  PRIMARY KEY (id) NOT ENFORCED\n" +
                            ") WITH (\n" +
                            "    'connector' = 'clickhouse',\n" +
                            "    'url' = 'clickhouse://IP:PORT',\n" +
                            "    'username' = 'username',\n" +
                            "    'password' = 'password',\n" +
                            "    'database-name' = 'databaseName',\n" +
                            "    'table-name' = 'ODS_TRIPDATA_T',\n" +
                            "    'sink.batch-size' = '500',\n" +
                            "    'sink.flush-interval' = '1000',\n" +
                            "    'sink.max-retries' = '3'\n" +
                            ");";
            // No transformation, just a straight insert from source into sink
            String transformSQL = "INSERT INTO ODS_TRIPDATA_T SELECT * FROM TRIPDATA_T";
            tableEnv.executeSql(sourceDDL);
            tableEnv.executeSql(sinkDDL);
            TableResult result = tableEnv.executeSql(transformSQL);
            result.print();
            // executeSql already submits the INSERT job; wait for it to finish here
            // instead of calling env.execute(), which would fail with no DataStream operators defined
            result.await();
        } catch (Exception e) {
            log.error("parquet --> ck, Exception=", e);
        }
    }

}
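
The ClickHouse connector writes into the table named by 'table-name', so ODS_TRIPDATA_T needs to exist on the ClickHouse side beforehand. A possible DDL sketch (the MergeTree engine and the ORDER BY clause are my assumptions, not something prescribed by the connector); the Flink types BIGINT, STRING and DOUBLE map to Int64, String and Float64 here:

-- Hypothetical target table; adjust the database name, engine and ORDER BY to your setup
CREATE TABLE IF NOT EXISTS databaseName.ODS_TRIPDATA_T
(
    id                   Int64,
    lpep_pickup_datetime Int64,
    remark               String,
    trip_distance        Float64,
    DOLocationID         Int64
)
ENGINE = MergeTree
ORDER BY id;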

2. Implementation with Dinky

Everything runs in a minimal single-node setup.

Deployment reference: www.dinky.org.cn/docs/next/d…

  • The default H2 database is used

Dinky deployment

After unpacking, place the required Flink jars and other dependencies in the locations below:
# dinky/extends/flink1.17 contains the following
# (unless noted otherwise, the jars come from the Flink distribution; note that the planner jar shipped
# in Flink's lib/ is the loader variant, which must be removed and replaced with the non-loader
# flink-table-planner jar found under the opt/ directory of the Flink distribution)
flink-csv-1.17.2.jar
flink-sql-parquet-1.17.2.jar  -- downloaded from the Flink website
log4j-1.2-api-2.17.1.jar
flink-cep-1.17.2.jar
flink-dist-1.17.2.jar
flink-table-api-java-uber-1.17.2.jar
log4j-api-2.17.1.jar
flink-connector-clickhouse-1.16.0-SNAPSHOT.jar  -- built from the third-party project
flink-json-1.17.2.jar
flink-table-planner_2.12-1.17.2.jar
log4j-core-2.17.1.jar
flink-connector-files-1.17.2.jar
flink-scala_2.12-1.17.2.jar
flink-table-runtime-1.17.2.jar
log4j-slf4j-impl-2.17.1.jar

# dinky/extends contains the following (taken from the lib directory produced by packaging the Java program in method 1 above)
guava-27.0-jre.jar
hadoop-annotations-3.3.1.jar
hadoop-auth-3.3.1.jar
hadoop-client-3.3.1.jar
hadoop-common-3.3.1.jar
hadoop-hdfs-client-3.3.1.jar
hadoop-mapreduce-client-common-3.3.1.jar
hadoop-mapreduce-client-core-3.3.1.jar
hadoop-mapreduce-client-jobclient-3.3.1.jar
hadoop-shaded-guava-1.1.1.jar
hadoop-shaded-protobuf_3_7-1.1.1.jar
hadoop-yarn-api-3.3.1.jar
hadoop-yarn-client-3.3.1.jar
hadoop-yarn-common-3.3.1.jar
stax2-api-4.2.1.jar
woodstox-core-5.3.0.jar
# start Dinky once everything is in place
sh auto.sh start

Once startup completes, open http://IP:8888/ and log in with admin / the default password to use the web UI.

Task creation and submission


Create a task, write the following FlinkSQL, and run it:

CREATE TABLE IF NOT EXISTS TRIPDATA_T (
  id BIGINT,
  lpep_pickup_datetime BIGINT,
  remark STRING,
  trip_distance DOUBLE,
  DOLocationID BIGINT,
  PRIMARY KEY (id) NOT ENFORCED
)  WITH (
  'connector'='filesystem',
  'path'='/home/kc_apps/parquet-data',
  'format'='parquet'
);

CREATE TABLE IF NOT EXISTS ODS_TRIPDATA_T (
  id BIGINT,
  lpep_pickup_datetime BIGINT,
  remark STRING,
  trip_distance DOUBLE,
  DOLocationID BIGINT,
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'clickhouse',
    'url' = 'clickhouse://IP:PORT',
    'username' = 'username',
    'password' = 'password',
    'database-name' = 'databaseName',
    'table-name' = 'ODS_TRIPDATA_T',
    'sink.batch-size' = '500',
    'sink.flush-interval' = '1000',
    'sink.max-retries' = '3'
);

INSERT INTO ODS_TRIPDATA_T SELECT * FROM TRIPDATA_T;
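
After the job finishes, a quick check on the ClickHouse side confirms the load (a sketch; database and table names as configured above):

-- Row count of the loaded data
SELECT count() AS total_rows FROM databaseName.ODS_TRIPDATA_T;

-- Spot-check the fields added during preprocessing
SELECT id, remark, trip_distance FROM databaseName.ODS_TRIPDATA_T LIMIT 5;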

That completes the pipeline. I am not sure whether there is a better approach; comments and suggestions are welcome.