Overview
First preprocess the Parquet files (add a unique primary key and a remark column), then load the data from the processed Parquet files into a ClickHouse (CK) database.
Environment and main software versions
- flink-sql-parquet-1.17.2.jar
- flink-connector-clickhouse-1.16.0-SNAPSHOT.jar (built from the third-party repo github.com/itinycheng/… : check out the 1.16.0-SNAPSHOT branch, change flink.version in the pom to 1.17.2, then run mvn clean install -DskipTests)
- dinky-release-1.17-1.2.0.tar.gz
- Java 8
I. Parquet file preprocessing
1. Preparing the test dataset
The data comes from the NYC Green taxi trip records at www.nyc.gov/site/tlc/ab… ; a few test files were downloaded:
- green_tripdata_2020-04.parquet
- green_tripdata_2021-04.parquet
- green_tripdata_2022-03.parquet
- green_tripdata_2022-04.parquet
2. Writing a Java preprocessing program
POM file
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<hutool.version>5.8.22</hutool.version>
<hadoop.version>3.3.1</hadoop.version>
<parquet-avro.version>1.13.1</parquet-avro.version>
</properties>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.24</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>${hutool.version}</version>
</dependency>
        <!-- Parquet file preprocessing: start -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>${parquet-avro.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
        <!-- Parquet file preprocessing: end -->
</dependencies>
Java code
import cn.hutool.core.lang.Snowflake;
import cn.hutool.core.util.IdUtil;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class ParquetRecordReader {
    /**
     * Add a remark column and a unique id column to a Parquet file.
     *
     * @param inputFilePath  input file path
     * @param outputFilePath output file path
     * @param remark         remark value written into every row
     * @throws IOException on read/write failure
     */
    public static void addRemarkAndWriteToFile(String inputFilePath, String outputFilePath, String remark) throws IOException {
        // Read the original Parquet file
Path inputPath = new Path(inputFilePath);
ParquetReader<GenericRecord> reader = AvroParquetReader
.builder(new AvroReadSupport<GenericRecord>(), inputPath)
.build();
GenericRecord record;
Schema schema = null;
while ((record = reader.read()) != null) {
schema = record.getSchema();
            break; // we only need to read the schema once
}
if (schema == null) {
throw new IOException("Failed to read schema from input file.");
}
        // Build a new schema that also contains the remark field
Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false);
List<Schema.Field> newFields = new ArrayList<>();
        // Copy the fields of the original schema (nullable UNION types are carried over unchanged)
        for (Schema.Field field : schema.getFields()) {
            // Re-create the field with its original type
Schema fieldType = field.schema();
Schema.Field newField = new Schema.Field(field.name(), fieldType, field.doc(), field.defaultVal());
newFields.add(newField);
}
        // Add a remark field if it does not exist yet
boolean remarkFieldExists = newFields.stream().anyMatch(field -> "remark".equals(field.name()));
if (!remarkFieldExists) {
newFields.add(new Schema.Field(
"remark",
Schema.create(Schema.Type.STRING),
"A remark field",
null)
);
}
        // Add a primary-key id field if it does not exist yet
boolean idFieldExists = newFields.stream().anyMatch(field -> "id".equals(field.name()));
if (!idFieldExists) {
newFields.add(new Schema.Field(
"id",
Schema.create(Schema.Type.LONG),
"id field",
null)
);
}
newSchema.setFields(newFields);
        // Re-open the reader to read the data from the beginning
reader.close();
reader = AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), inputPath).build();
List<GenericRecord> recordList = new ArrayList<>();
Snowflake snowflake = IdUtil.getSnowflake();
while ((record = reader.read()) != null) {
            // Create a new GenericRecord and add the remark field
GenericRecord newRecord = new GenericData.Record(newSchema);
GenericRecord finalRecord = record;
schema.getFields().forEach(item -> {
String name = item.name();
newRecord.put(name, finalRecord.get(name));
});
newRecord.put("remark", remark);
newRecord.put("id", snowflake.nextId());
recordList.add(newRecord);
}
reader.close();
        // Write the new Parquet file
Path outputPath = new Path(outputFilePath);
HadoopOutputFile hadoopOutputFile = HadoopOutputFile.fromPath(outputPath, new Configuration());
ParquetWriter<Object> writer = AvroParquetWriter.builder(hadoopOutputFile)
.withSchema(newSchema)
.withConf(new Configuration())
.withCompressionCodec(CompressionCodecName.SNAPPY)
.build();
for (GenericRecord rec : recordList) {
writer.write(rec);
}
writer.close();
}
    /**
     * Read a Parquet file into a List (helper, currently unused).
     *
     * @param filePath path of the Parquet file
     * @return the rows of the file as a list of maps
     * @throws IOException on read failure
     */
    public static List<Map<String, Object>> readParquetFileWithRecord(String filePath) throws IOException {
        // Build the full Parquet file path
Path parquetFilePath = new Path(filePath);
ParquetReader<GenericRecord> reader = AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), parquetFilePath).build();
GenericRecord record;
List<Map<String, Object>> recordList = new ArrayList<>();
        // Iterate over the rows
while ((record = reader.read()) != null) {
Map<String, Object> recordMap = new HashMap<>();
Schema schema = record.getSchema();
            // Field metadata of the row
List<Schema.Field> fields = schema.getFields();
GenericRecord finalRecord = record;
fields.forEach(item -> {
                // Look up the value by field name
String name = item.name();
Object val = finalRecord.get(name);
recordMap.put(name, val);
});
recordList.add(recordMap);
}
reader.close();
return recordList;
}
}
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
/**
* @author admin
*/
@Slf4j
public class FileMain {
public static void main(String[] args) {
        final String outPath = "F:\\home\\parquet\\out\\";
        final String inPath = "F:\\home\\parquet\\in";
executeFiles(inPath, outPath);
}
public static void executeFiles(String inPath, String outPath) {
Path dirPath = Paths.get(inPath);
try (Stream<Path> stream = Files.walk(dirPath)) {
stream.filter(Files::isRegularFile)
.filter(path -> path.toString().toLowerCase().endsWith(".parquet"))
.forEach(filePath -> {
                        // Full path of the file
Path filePathAbsolutePath = filePath.toAbsolutePath();
System.out.println("filePathAbsolutePath = " + filePathAbsolutePath);
log.info("filePathAbsolutePath = {}", filePathAbsolutePath);
                        // Derive the output file name from the file path
                        // (literal replace of the input-dir prefix, then turn remaining path separators/colons into underscores)
                        String outFileName = filePathAbsolutePath.toString()
                                .replace("F:\\home\\parquet\\in\\", "")
                                .replaceAll("[\\\\|:]+", "_");
System.out.println("outFileName = " + outFileName);
log.info("outFileName = {}", outFileName);
try {
ParquetRecordReader.addRemarkAndWriteToFile(filePath.toAbsolutePath().toString(),
outPath + outFileName, outFileName);
} catch (Exception e) {
throw new RuntimeException(e);
}
});
} catch (IOException e) {
log.error("执行异常!", e);
}
}
}
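With the files processed, the otherwise unused readParquetFileWithRecord helper above is handy for a quick sanity check that the new id and remark columns actually made it into the output. Below is a minimal sketch assuming the classes above are on the classpath; the VerifyMain class name and the file path are just examples.
import java.util.List;
import java.util.Map;

public class VerifyMain {
    public static void main(String[] args) throws Exception {
        // Example output file written by FileMain; point this at a real file
        String file = "F:\\home\\parquet\\out\\green_tripdata_2020-04.parquet";
        List<Map<String, Object>> rows = ParquetRecordReader.readParquetFileWithRecord(file);
        System.out.println("row count = " + rows.size());
        if (!rows.isEmpty()) {
            // The preprocessing step should have added these two columns to every row
            Map<String, Object> first = rows.get(0);
            System.out.println("id = " + first.get("id") + ", remark = " + first.get("remark"));
        }
    }
}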
II. Loading the Parquet data into ClickHouse (CK)
1. Java implementation
pom
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<flink.version>1.17.2</flink.version>
<flink-connector-clickhouse.version>1.16.0-SNAPSHOT</flink-connector-clickhouse.version>
</properties>
<dependencies>
<!-- flink start -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-parquet</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-files</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sql-client</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java</artifactId>
<version>${flink.version}</version>
</dependency>
        <!-- Third-party library: https://github.com/itinycheng/flink-connector-clickhouse.git
             check out 1.16.0-SNAPSHOT, change flink.version to 1.17.2, then run mvn clean install -DskipTests -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-clickhouse</artifactId>
<version>${flink-connector-clickhouse.version}</version>
</dependency>
<!-- flink end -->
</dependencies>
Java (replace the paths and connection parameters with your own)
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
/**
* parquet --> ck
 * Data comes from the NYC Green taxi trip records: <a href="https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"/>
 * Only a subset of the columns is used for this test.
*/
@Slf4j
public class Parquet2Ck {
public static void main(String[] args) {
try {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            // Use a single parallel source task
env.setParallelism(1);
// set the batch runtime mode
//env.setRuntimeMode(RuntimeExecutionMode.BATCH);
//env.enableCheckpointing(500, CheckpointingMode.EXACTLY_ONCE);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
            // Source table
String sourceDDL =
"CREATE TABLE IF NOT EXISTS TRIPDATA_T (\n" +
" id BIGINT,\n" +
" lpep_pickup_datetime BIGINT,\n" +
" remark STRING,\n" +
" trip_distance DOUBLE,\n" +
" DOLocationID BIGINT,\n" +
" PRIMARY KEY (id) NOT ENFORCED\n" +
") WITH (\n" +
" 'connector'='filesystem',\n" +
" 'path'='file:///home/parquet/out',\n" +
" 'format'='parquet'\n" +
");" ;
            // Sink table
String sinkDDL =
"CREATE TABLE IF NOT EXISTS ODS_TRIPDATA_T (\n" +
" id BIGINT,\n" +
" lpep_pickup_datetime BIGINT,\n" +
" remark STRING,\n" +
" trip_distance DOUBLE,\n" +
" DOLocationID BIGINT,\n" +
" PRIMARY KEY (id) NOT ENFORCED\n" +
") WITH (\n" +
" 'connector' = 'clickhouse',\n" +
" 'url' = 'clickhouse://IP:PORT',\n" +
" 'username' = 'username',\n" +
" 'password' = 'password',\n" +
" 'database-name' = 'databaseName',\n" +
" 'table-name' = 'ODS_TRIPDATA_T',\n" +
" 'sink.batch-size' = '500',\n" +
" 'sink.flush-interval' = '1000',\n" +
" 'sink.max-retries' = '3'\n" +
");";
            // Copy the source rows straight into the sink table (no aggregation involved)
            String transformSQL = "INSERT INTO ODS_TRIPDATA_T SELECT * FROM TRIPDATA_T";
            tableEnv.executeSql(sourceDDL);
            tableEnv.executeSql(sinkDDL);
            // executeSql already submits the INSERT job, so no separate env.execute() call is needed here
            TableResult result = tableEnv.executeSql(transformSQL);
            result.print();
} catch (Exception e) {
log.error("parquet --> ck, Exception=", e);
}
}
}
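Once the job has finished, a quick row count over JDBC is an easy way to confirm the data landed in ClickHouse. Below is a minimal sketch assuming a ClickHouse JDBC driver (for example com.clickhouse:clickhouse-jdbc) is on the classpath; the URL, credentials, and table name mirror the placeholders used in the sink DDL above.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class CountCheck {
    public static void main(String[] args) throws Exception {
        // Placeholders: use the same host/port/credentials as in the sink DDL
        String url = "jdbc:clickhouse://IP:PORT/databaseName";
        try (Connection conn = DriverManager.getConnection(url, "username", "password");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT count() FROM ODS_TRIPDATA_T")) {
            if (rs.next()) {
                System.out.println("rows in ODS_TRIPDATA_T = " + rs.getLong(1));
            }
        }
    }
}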
2. Dinky implementation
Everything runs on a minimal single-node setup.
Deployment reference: www.dinky.org.cn/docs/next/d…
- Uses the default H2 database
Dinky deployment
After unpacking, place the required Flink jars and other dependencies in the locations below:
# dinky/extends/flink1.17 contains the following jars
# (unless marked otherwise, these come from the Flink distribution; note that the planner jar shipped in Flink's lib is the loader variant,
#  which must be removed and replaced with the non-loader planner jar found under opt in the Flink distribution)
flink-csv-1.17.2.jar
flink-sql-parquet-1.17.2.jar --downloaded from the official Flink site
log4j-1.2-api-2.17.1.jar
flink-cep-1.17.2.jar
flink-dist-1.17.2.jar
flink-table-api-java-uber-1.17.2.jar
log4j-api-2.17.1.jar
flink-connector-clickhouse-1.16.0-SNAPSHOT.jar --built from the third-party repo (see above)
flink-json-1.17.2.jar
flink-table-planner_2.12-1.17.2.jar
log4j-core-2.17.1.jar
flink-connector-files-1.17.2.jar
flink-scala_2.12-1.17.2.jar
flink-table-runtime-1.17.2.jar
log4j-slf4j-impl-2.17.1.jar
# dinky/extends contains the following (these jars are taken from the lib directory of the packaged Java project from method 1 above)
guava-27.0-jre.jar
hadoop-annotations-3.3.1.jar
hadoop-auth-3.3.1.jar
hadoop-client-3.3.1.jar
hadoop-common-3.3.1.jar
hadoop-hdfs-client-3.3.1.jar
hadoop-mapreduce-client-common-3.3.1.jar
hadoop-mapreduce-client-core-3.3.1.jar
hadoop-mapreduce-client-jobclient-3.3.1.jar
hadoop-shaded-guava-1.1.1.jar
hadoop-shaded-protobuf_3_7-1.1.1.jar
hadoop-yarn-api-3.3.1.jar
hadoop-yarn-client-3.3.1.jar
hadoop-yarn-common-3.3.1.jar
stax2-api-4.2.1.jar
woodstox-core-5.3.0.jar
# start Dinky once everything is in place
sh auto.sh start
Once it is up, open http://IP:8888/ and log in with admin / the default password to work in the web UI.
Creating and submitting the job
Just write and run the following FlinkSQL task:
CREATE TABLE IF NOT EXISTS TRIPDATA_T (
id BIGINT,
lpep_pickup_datetime BIGINT,
remark STRING,
trip_distance DOUBLE,
DOLocationID BIGINT,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector'='filesystem',
'path'='/home/kc_apps/parquet-data',
'format'='parquet'
);
CREATE TABLE IF NOT EXISTS ODS_TRIPDATA_T (
id BIGINT,
lpep_pickup_datetime BIGINT,
remark STRING,
trip_distance DOUBLE,
DOLocationID BIGINT,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'clickhouse',
'url' = 'clickhouse://IP:PORT',
'username' = 'username',
'password' = 'password',
'database-name' = 'databaseName',
'table-name' = 'ODS_TRIPDATA_T',
'sink.batch-size' = '500',
'sink.flush-interval' = '1000',
'sink.max-retries' = '3'
);
INSERT INTO ODS_TRIPDATA_T SELECT * FROM TRIPDATA_T;
That is all there is to it. There may well be a better approach; comments and discussion are welcome.