A Beginner's Demo of Reading and Writing Iceberg in Java

This was my first time working with Iceberg: the task was to read and write Iceberg tables from Java, with Hive Metastore as the metadata store and MinIO as the distributed data store. After going through the relevant material, the demo finally ran end to end; the notes are recorded below:

  • pom dependencies
<dependency>
    <groupId>org.apache.iceberg</groupId>
    <artifactId>iceberg-core</artifactId>
    <version>1.4.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-hive-metastore -->
<dependency>
    <groupId>org.apache.iceberg</groupId>
    <artifactId>iceberg-hive-metastore</artifactId>
    <version>1.4.2</version>
</dependency>

<dependency>
    <groupId>io.minio</groupId>
    <artifactId>minio</artifactId>
    <version>8.5.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
<dependency>
    <groupId>com.amazonaws</groupId>
    <artifactId>aws-java-sdk-s3</artifactId>
    <version>1.12.620</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-aws</artifactId>
    <version>${hadoop.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>${hadoop.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-metastore</artifactId>
    <version>3.1.3</version>
    <!--<exclusions>
        <exclusion>
            <artifactId>parquet-hadoop-bundle</artifactId>
            <groupId>org.apache.parquet</groupId>
        </exclusion>
    </exclusions>-->
</dependency>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>${hadoop.version}</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-parquet -->
<dependency>
    <groupId>org.apache.iceberg</groupId>
    <artifactId>iceberg-parquet</artifactId>
    <version>1.4.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-hadoop-bundle -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop-bundle</artifactId>
    <version>1.13.1</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-data -->
<dependency>
    <groupId>org.apache.iceberg</groupId>
    <artifactId>iceberg-data</artifactId>
    <version>1.4.2</version>
</dependency>

Note: set the Hadoop version (the ${hadoop.version} property referenced above, declared in the pom's properties section) according to your environment; Hadoop 3.x is recommended.

  • Code implementation

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.*;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.types.Types;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Map;
import java.util.UUID;

import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED;

public class IcebergDao {

    private static final Logger log = LoggerFactory.getLogger(IcebergDao.class);


    private HiveCatalog catalog;

    private Table table;

    private String namespace;

    private String tableName;

    private Schema schema;

    private Configuration configuration;

    private Map<String, String> catalogProperties;

    private Map<String, String> tableProperties;

    public IcebergDao() {
    }

    public IcebergDao(Configuration configuration, Map<String, String> catalogProperties,
                      Map<String, String> tableProperties, String namespace, String tableName, Schema schema) {
        this.configuration = configuration;
        this.catalogProperties = catalogProperties;
        this.tableProperties = tableProperties;
        this.namespace = namespace;
        this.tableName = tableName;
        this.schema = schema;
    }

    /**
     * Initialize the HiveCatalog and load (or create) the table
     */
    public void init() {
        catalog = new HiveCatalog();
        catalog.setConf(configuration);
        catalog.initialize("hive", catalogProperties);
        table = getTable();
    }

    public Table getTable() {
        Namespace ns = Namespace.of(namespace);
        if (!catalog.namespaceExists(ns)) {
            catalog.createNamespace(ns);
        }
        TableIdentifier tableIdentifier = TableIdentifier.of(ns, tableName);
        Table table;
        if (!catalog.tableExists(tableIdentifier)) {
            table = catalog.createTable(tableIdentifier, schema, PartitionSpec.unpartitioned(), tableProperties);
        } else {
            table = catalog.loadTable(tableIdentifier);
        }
        return table;
    }

    /**
     * Write the given record to a new Parquet data file under the table location
     * @return the DataFile metadata for the written file
     * @throws IOException
     */
    public DataFile writeParquet(Map<String, Object> dataMap) throws IOException {
        // 1. Build the record to insert from the data map
        GenericRecord record = GenericRecord.create(schema);
        ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();

        builder.add(record.copy(ImmutableMap.copyOf(dataMap)));
        ImmutableList<GenericRecord> records = builder.build();
        // 2. Write the record to a Parquet data file
        log.info("table.location(): " + table.location());
        String filepath = table.location() + "/" + UUID.randomUUID().toString();
        OutputFile file = table.io().newOutputFile(filepath);
        DataWriter<GenericRecord> dataWriter =
                Parquet.writeData(file)
                        .schema(schema)
                        .createWriterFunc(GenericParquetWriter::buildWriter)
                        .overwrite()
                        .withSpec(PartitionSpec.unpartitioned())
                        .build();
        try {
            dataWriter.write(records);
        } finally {
            dataWriter.close();
        }
        DataFile dataFile = dataWriter.toDataFile();
        return dataFile;
    }


    /**
     * Append data to the table
     * @throws IOException
     */
    public void appendData(Map<String, Object> dataMap) throws IOException {

        // Write the Parquet data file
        DataFile dataFile = writeParquet(dataMap);

        // Commit the data file to the table
        table.newAppend().appendFile(dataFile).commit();
    }

    /**
     * Update: commit the new version of a row together with an equality delete
     * @throws IOException
     */
    public void hiveCatalogUpdate(Map<String, Object> dataMap) throws IOException {
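        // Note: committing delete files via a row delta requires an Iceberg format-version 2 table
        // (the test code below sets TableProperties.FORMAT_VERSION to "2").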

        // Write the Parquet file holding the post-update (after-image / U+) record
        DataFile dataFile = writeParquet(dataMap);

        // Commit: the data file and the equality delete file must be committed together
        table.newRowDelta()
                .addRows(dataFile)
                .addDeletes(unPartitionedEqDeletes())
                .commit();
    }

    /**
     * Build an equality DeleteFile for the unpartitioned table
     * @return the DeleteFile metadata pointing at the written Parquet delete file
     * @throws IOException
     */
    public DeleteFile unPartitionedEqDeletes() throws IOException {
        String filePath = table.location() + "/" + "equalDelete.parquet";
        // Write the Parquet delete file
        createParquet(filePath);
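        // Note: the file size and record count passed to the builder below are hard-coded demo values; in real use
        // they should describe the actual delete file, and ofEqualityDeletes(...) would normally list the equality
        // field id(s) that identify the rows to delete.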

        return FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
                .ofEqualityDeletes()
                .withFormat(FileFormat.PARQUET)
                .withPath(filePath)
                .withFileSizeInBytes(100)
                .withRecordCount(10)
                .build();
    }

    /**
     * Write a minimal Parquet file (a single int32 id column) at the given path
     * @param filePath
     * @throws IOException
     */
    public void createParquet(String filePath) throws IOException {
        Configuration conf = new Configuration();
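        // Note: this fresh Configuration must be able to reach the table location (for an s3a:// warehouse it needs
        // the same fs.s3a.* settings as the catalog Configuration, e.g. via core-site.xml).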
        Path path = new Path(filePath);
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }

        MessageType schema = org.apache.parquet.schema.Types.buildMessage()
                .required(PrimitiveType.PrimitiveTypeName.INT32).named("id")
                .named("test");

        GroupWriteSupport.setSchema(schema, conf);
        SimpleGroupFactory f = new SimpleGroupFactory(schema);
        ParquetWriter<Group> writer = new ParquetWriter<>(path, new GroupWriteSupport(),
                UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);

        writer.write(f.newGroup()
                .append("id", 111));

        writer.close();
    }


    /**
     * Print all rows of the table
     * @param table
     */
    public void hiveCatalogScan(Table table) {
        IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);
        CloseableIterable<Record> records = scanBuilder.build();
        for (Record r : records) {
            System.out.print(r.get(0));
            System.out.print("|");
            System.out.print(r.get(1));
            System.out.print("|");
            System.out.print(r.get(2));
            System.out.println();
        }
    }

    /**
     * Drop the whole table
     */
    public void hiveCatalogDelete(String namespace, String tableName) {
        TableIdentifier name = TableIdentifier.of(namespace, tableName);
        catalog.dropTable(name, true);
    }

    /**
     * Add an optional string column
     * @param namespace
     * @param tableName
     * @param columnName
     */
    public void addColumn(String namespace, String tableName, String columnName) {
        Table table = catalog.loadTable(TableIdentifier.of(namespace, tableName));
        UpdateSchema updateSchema = table.updateSchema();
        updateSchema.addColumn(columnName, Types.StringType.get());
        updateSchema.commit();
    }

    /**
     * Drop a column
     * @param namespace
     * @param tableName
     * @param columnName
     */
    public void deleteColumn(String namespace, String tableName, String columnName) {
        Table table = catalog.loadTable(TableIdentifier.of(namespace, tableName));
        UpdateSchema updateSchema = table.updateSchema();
        updateSchema.deleteColumn(columnName);
        updateSchema.commit();
    }


    /**
     * Row-level delete by id
     * @param id
     */
    public void deleteRow(int id) {
        Transaction t = table.newTransaction();
        t.newDelete().deleteFromRowFilter(Expressions.equal("id",  id)).commit();
        t.commitTransaction();
    }


    /**
     * Delete first, then append, in the same transaction
     * @param id
     * @param dataFile
     */
    public void updateRow(int id, DataFile dataFile) {
        Transaction t = table.newTransaction();
        t.newDelete().deleteFromRowFilter(Expressions.equal("id",  id)).commit();
        t.newAppend().appendFile(dataFile).commit();
        t.commitTransaction();
    }


}
  • Test code

The following test code writes data into the Iceberg table.

import com.seago.dataflow.dao.IcebergDao;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.types.Types;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class TestIcebergDao {

    public static void main(String[] args) {
        IcebergDao icebergDao = new IcebergDao(initHadoopConf(), initCatalogProperties(),
                initTableProperties(), "test", "test_iceberg", initSchema());
        icebergDao.init();
        try {
            // Write data
            icebergDao.appendData(initDataMap());
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    public static Configuration initHadoopConf() {
        Configuration conf = new Configuration();
        conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider");
        conf.set("fs.s3a.connection.ssl.enabled", "false");
        conf.set("fs.s3a.endpoint", "http://172.27.191.206:30700");
        conf.set("fs.s3a.access.key", "admin");
        conf.set("fs.s3a.secret.key", "0zhgr4APhG");
        conf.set("fs.s3a.path.style.access", "true");
        conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
        conf.set("fs.s3a.fast.upload", "true");
        return conf;
    }

    public static Map<String, String> initCatalogProperties() {
        Map<String, String> properties = new HashMap<String, String>();
        // Warehouse location
        properties.put(CatalogProperties.WAREHOUSE_LOCATION, "s3a://datalake");
        // Hive Metastore connection URI
        properties.put(CatalogProperties.URI, "thrift://172.27.191.206:30586");
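        // Note: CATALOG_IMPL is only consulted when a catalog is built via CatalogUtil; it is not required here,
        // since IcebergDao instantiates HiveCatalog directly.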
        properties.put(CatalogProperties.CATALOG_IMPL, "org.apache.iceberg.hive.HiveCatalog");
        return properties;
    }

    public static Map<String, String> initTableProperties() {
        Map<String, String> properties = new HashMap<String, String>();
        properties.put(TableProperties.ENGINE_HIVE_ENABLED, "true");
        properties.put(TableProperties.FORMAT_VERSION, "2");
        properties.put(TableProperties.UPSERT_ENABLED, "true");
        return properties;
    }


    public static Schema initSchema() {
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.IntegerType.get()),
                Types.NestedField.required(2, "name", Types.StringType.get()),
                Types.NestedField.optional(3, "birth", Types.StringType.get())
        );

        return schema;
    }

    public static Map<String, Object> initDataMap() {
        Map<String, Object> map = new HashMap<>();
        map.put("id", 112);
        map.put("name", "Jack");
        map.put("birth", "1999-01-01");
        return map;
    }

}

Note: this test code depends on a local Hadoop client environment. Download a Hadoop 3.x binary distribution, unpack it, and set the HADOOP_HOME environment variable. When running on Windows you also need winutils, unpacked into the bin directory under $HADOOP_HOME, and hadoop.dll must be placed in C:\Windows\System32.
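
The test above only exercises writing via appendData; the remaining IcebergDao operations (scan, schema evolution, row-level delete) can be driven in the same way. The snippet below is a minimal sketch rather than part of the original demo: it reuses the helper methods of TestIcebergDao, the class name and the "address" column are made up for illustration, and id 112 is the row appended above.

import com.seago.dataflow.dao.IcebergDao;

public class TestIcebergDaoMore {

    public static void main(String[] args) {
        // Build the DAO exactly as in TestIcebergDao above
        IcebergDao dao = new IcebergDao(TestIcebergDao.initHadoopConf(), TestIcebergDao.initCatalogProperties(),
                TestIcebergDao.initTableProperties(), "test", "test_iceberg", TestIcebergDao.initSchema());
        dao.init();

        // Print every row currently in the table
        dao.hiveCatalogScan(dao.getTable());

        // Schema evolution: add an optional string column ("address" is a hypothetical name)
        dao.addColumn("test", "test_iceberg", "address");

        // Row-level delete: remove the row with id = 112 appended by the test above
        dao.deleteRow(112);
    }
}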

Deploy Trino locally and connect it to Iceberg through the following catalog configuration (typically placed in etc/catalog/iceberg.properties):

connector.name=iceberg
hive.metastore.uri=thrift://hive-metastore-service:9083
hive.s3.endpoint=http://minio:9000
hive.s3.aws-access-key=admin
hive.s3.aws-secret-key=0zhgr4APhG
hive.s3.path-style-access=true
hive.s3.ssl.enabled=false
hive.s3.max-connections=100

Note: hive-metastore-service is deployed on Kubernetes, so the configuration uses the Kubernetes service name.

Query through the Trino CLI as follows:

[trino@trino-deployment-5f8c8548f5-j9nfc catalog]$ trino --catalog iceberg
trino> use test;
USE
trino:test> select * from test_iceberg;
 id  | name |   birth
-----+------+------------
 112 | Jack | 1999-01-01
(1 row)

Query 20240617_091458_00013_5ckgu, FINISHED, 1 node
Splits: 1 total, 1 done (100.00%)
0.23 [1 rows, 930B] [4 rows/s, 3.88KB/s]

trino:test>
