This was my first time working with Iceberg. The task was to read and write Iceberg tables from Java, with Hive Metastore as Iceberg's metadata store and MinIO as the distributed data store. After digging through the relevant material I finally got a demo running; the notes are recorded below.
- pom dependencies
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-core</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-hive-metastore -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-hive-metastore</artifactId>
<version>1.4.2</version>
</dependency>
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>1.12.620</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-metastore</artifactId>
<version>3.1.3</version>
<!--<exclusions>
<exclusion>
<artifactId>parquet-hadoop-bundle</artifactId>
<groupId>org.apache.parquet</groupId>
</exclusion>
</exclusions>-->
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-parquet -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-parquet</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-hadoop-bundle -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop-bundle</artifactId>
<version>1.13.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-data -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-data</artifactId>
<version>1.4.2</version>
</dependency>
Note: set the Hadoop version according to your environment; Hadoop 3.x is recommended. The ${hadoop.version} property referenced above is defined in the pom's <properties> section, as sketched below.
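For reference, a minimal sketch of that property definition (the 3.3.6 value is only an example; substitute the Hadoop 3.x release that matches your cluster):
<properties>
    <!-- example value only; align with your actual Hadoop 3.x distribution -->
    <hadoop.version>3.3.6</hadoop.version>
</properties>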
- Code implementation
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.*;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.types.Types;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Map;
import java.util.UUID;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED;
public class IcebergDao {
private static final Logger log = LoggerFactory.getLogger(IcebergDao.class);
private HiveCatalog catalog;
private Table table;
private String namespace;
private String tableName;
private Schema schema;
private Configuration configuration;
private Map<String, String> catalogProperties;
private Map<String, String> tableProperties;
public IcebergDao() {
}
public IcebergDao(Configuration configuration, Map<String, String> catalogProperties,
Map<String, String> tableProperties, String namespace, String tableName, Schema schema) {
this.configuration = configuration;
this.catalogProperties = catalogProperties;
this.tableProperties = tableProperties;
this.namespace = namespace;
this.tableName = tableName;
this.schema = schema;
}
/**
* Initialize the catalog and the table
*/
public void init() {
catalog = new HiveCatalog();
catalog.setConf(configuration);
catalog.initialize("hive", catalogProperties);
table = getTable();
}
public Table getTable() {
Namespace ns = Namespace.of(namespace);
if (!catalog.namespaceExists(ns)) {
catalog.createNamespace(ns);
}
TableIdentifier tableIdentifier = TableIdentifier.of(ns, tableName);
Table table;
if (!catalog.tableExists(tableIdentifier)) {
table = catalog.createTable(tableIdentifier, schema, PartitionSpec.unpartitioned(), tableProperties);
} else {
table = catalog.loadTable(tableIdentifier);
}
return table;
}
/**
* Write the record to a parquet data file
* @return the DataFile describing the newly written file
* @throws IOException
*/
public DataFile writeParquet(Map<String, Object> dataMap) throws IOException {
// 1. Build the record to insert
GenericRecord record = GenericRecord.create(schema);
ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
builder.add(record.copy(ImmutableMap.copyOf(dataMap)));
ImmutableList<GenericRecord> records = builder.build();
// 2. Write the record to a parquet file
log.info("table.location(): {}", table.location());
String filepath = table.location() + "/" + UUID.randomUUID().toString();
OutputFile file = table.io().newOutputFile(filepath);
DataWriter<GenericRecord> dataWriter =
Parquet.writeData(file)
.schema(schema)
.createWriterFunc(GenericParquetWriter::buildWriter)
.overwrite()
.withSpec(PartitionSpec.unpartitioned())
.build();
try {
// DataWriter writes one record at a time
for (GenericRecord r : records) {
dataWriter.write(r);
}
} finally {
dataWriter.close();
}
DataFile dataFile = dataWriter.toDataFile();
return dataFile;
}
/**
* Append data (insert)
* @throws IOException
*/
public void appendData(Map<String, Object> dataMap) throws IOException {
// Write the parquet data file
DataFile dataFile = writeParquet(dataMap);
// Commit the file to the table
table.newAppend().appendFile(dataFile).commit();
}
/**
* Update a row via a row delta (new data file + equality-delete file)
* @throws IOException
*/
public void hiveCatalogUpdate(Map<String, Object> dataMap) throws IOException {
// Write the parquet file holding the after-image (the U+ record)
DataFile dataFile = writeParquet(dataMap);
// Commit: the data file and the delete file must be committed together
table.newRowDelta()
.addRows(dataFile)
.addDeletes(unPartitionedEqDeletes())
.commit();
}
/**
* Build an equality DeleteFile for the unpartitioned table
* @return the DeleteFile metadata pointing at the parquet delete file
* @throws IOException
*/
public DeleteFile unPartitionedEqDeletes() throws IOException {
String filePath = table.location() + "/" + "equalDelete.parquet";
// write the parquet file that holds the delete key
createParquet(filePath);
return FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
// field id 1 is the "id" column, used here as the equality-delete key
.ofEqualityDeletes(1)
.withFormat(FileFormat.PARQUET)
.withPath(filePath)
// demo placeholder values; in real code use the actual file size and record count
.withFileSizeInBytes(100)
.withRecordCount(10)
.build();
}
/**
* Write a plain parquet file (used here as the equality-delete file)
* @param filePath
* @throws IOException
*/
public void createParquet(String filePath) throws IOException {
Configuration conf = new Configuration();
Path path = new Path(filePath);
FileSystem fs = path.getFileSystem(conf);
if (fs.exists(path)) {
fs.delete(path, true);
}
MessageType schema = org.apache.parquet.schema.Types.buildMessage()
.required(PrimitiveType.PrimitiveTypeName.INT32).named("id")
.named("test");
GroupWriteSupport.setSchema(schema, conf);
SimpleGroupFactory f = new SimpleGroupFactory(schema);
// deprecated ParquetWriter constructor, kept for brevity; closed via try-with-resources
try (ParquetWriter<Group> writer = new ParquetWriter<>(path, new GroupWriteSupport(),
UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf)) {
// single delete record: the id value to match for the equality delete
writer.write(f.newGroup().append("id", 111));
}
}
/**
* Print the table's rows
* @param table
*/
public void hiveCatalogScan(Table table) {
IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);
// close the scan to release underlying file handles
try (CloseableIterable<Record> records = scanBuilder.build()) {
for (Record r : records) {
System.out.println(r.get(0) + "|" + r.get(1) + "|" + r.get(2));
}
} catch (IOException e) {
log.error("Failed to scan table", e);
}
}
/**
* Drop the entire table (with purge)
*/
public void hiveCatalogDelete(String namespace, String tableName) {
TableIdentifier name = TableIdentifier.of(namespace, tableName);
catalog.dropTable(name, true);
}
/**
* Add a column (string type)
* @param namespace
* @param tableName
* @param columnName
*/
public void addColumn(String namespace, String tableName, String columnName) {
Table table = catalog.loadTable(TableIdentifier.of(namespace, tableName));
UpdateSchema updateSchema = table.updateSchema();
updateSchema.addColumn(columnName, Types.StringType.get());
updateSchema.commit();
}
/**
* Drop a column
* @param namespace
* @param tableName
* @param columnName
*/
public void deleteColumn(String namespace, String tableName, String columnName) {
Table table = catalog.loadTable(TableIdentifier.of(namespace, tableName));
UpdateSchema updateSchema = table.updateSchema();
updateSchema.deleteColumn(columnName);
updateSchema.commit();
}
/**
* Row-level delete by id
* @param id
*/
public void deleteRow(int id) {
Transaction t = table.newTransaction();
t.newDelete().deleteFromRowFilter(Expressions.equal("id", id)).commit();
t.commitTransaction();
}
/**
* Update: delete first, then append, within a single transaction
* @param id
* @param dataFile
*/
public void updateRow(int id, DataFile dataFile) {
Transaction t = table.newTransaction();
t.newDelete().deleteFromRowFilter(Expressions.equal("id", id)).commit();
t.newAppend().appendFile(dataFile).commit();
t.commitTransaction();
}
}
- Test code
The test code below writes a record into the Iceberg table.
import com.seago.dataflow.dao.IcebergDao;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.types.Types;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class TestIcebergDao {
public static void main(String[] args) {
IcebergDao icebergDao = new IcebergDao(initHadoopConf(), initCatalogProperties(),
initTableProperties(), "test", "test_iceberg", initSchema());
icebergDao.init();
try {
// write one record
icebergDao.appendData(initDataMap());
} catch (IOException e) {
e.printStackTrace();
}
}
public static Configuration initHadoopConf() {
Configuration conf = new Configuration();
conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider");
conf.set("fs.s3a.connection.ssl.enabled", "false");
conf.set("fs.s3a.endpoint", "http://172.27.191.206:30700");
conf.set("fs.s3a.access.key", "admin");
conf.set("fs.s3a.secret.key", "0zhgr4APhG");
conf.set("fs.s3a.path.style.access", "true");
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
conf.set("fs.s3a.fast.upload", "true");
return conf;
}
public static Map<String, String> initCatalogProperties() {
Map<String, String> properties = new HashMap<String, String>();
// warehouse location (bucket on MinIO)
properties.put(CatalogProperties.WAREHOUSE_LOCATION, "s3a://datalake");
// Hive Metastore thrift URI
properties.put(CatalogProperties.URI, "thrift://172.27.191.206:30586");
properties.put(CatalogProperties.CATALOG_IMPL, "org.apache.iceberg.hive.HiveCatalog");
return properties;
}
public static Map<String, String> initTableProperties() {
Map<String, String> properties = new HashMap<String, String>();
properties.put(TableProperties.ENGINE_HIVE_ENABLED, "true");
properties.put(TableProperties.FORMAT_VERSION, "2");
properties.put(TableProperties.UPSERT_ENABLED, "true");
return properties;
}
public static Schema initSchema() {
Schema schema = new Schema(
Types.NestedField.required(1, "id", Types.IntegerType.get()),
Types.NestedField.required(2, "name", Types.StringType.get()),
Types.NestedField.optional(3, "birth", Types.StringType.get())
);
return schema;
}
public static Map<String, Object> initDataMap() {
Map<String, Object> map = new HashMap<>();
map.put("id", 112);
map.put("name", "Jack");
map.put("birth", "1999-01-01");
return map;
}
}
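The test above only exercises appendData. The other IcebergDao operations can be driven the same way; the calls below are a sketch that is not part of the original demo and would go inside the same try block in main (the "address" column name is just an illustrative example):
// Print the current rows as id | name | birth.
icebergDao.hiveCatalogScan(icebergDao.getTable());
// Add an optional string column (UpdateSchema.addColumn adds optional columns by default).
icebergDao.addColumn("test", "test_iceberg", "address");
// Row-level delete of the record written above (id = 112).
icebergDao.deleteRow(112);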
Note: this test code depends on a local Hadoop client environment. Download a Hadoop 3.x binary distribution, unpack it, and set the HADOOP_HOME environment variable. On Windows you also need winutils: unpack it into $HADOOP_HOME/bin and copy hadoop.dll into C:\Windows\System32.
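In addition, the warehouse bucket must exist in MinIO before the first write. Since the pom already pulls in the MinIO Java client, a small pre-flight check is easy to add; the sketch below is not part of the original demo and simply reuses the endpoint, credentials, and the datalake bucket from the configuration above:
import io.minio.BucketExistsArgs;
import io.minio.MakeBucketArgs;
import io.minio.MinioClient;

public class EnsureWarehouseBucket {
    public static void main(String[] args) throws Exception {
        // Same endpoint and credentials as the fs.s3a.* settings in initHadoopConf().
        MinioClient client = MinioClient.builder()
                .endpoint("http://172.27.191.206:30700")
                .credentials("admin", "0zhgr4APhG")
                .build();
        // "datalake" is the bucket behind the s3a://datalake warehouse location.
        boolean exists = client.bucketExists(BucketExistsArgs.builder().bucket("datalake").build());
        if (!exists) {
            client.makeBucket(MakeBucketArgs.builder().bucket("datalake").build());
        }
    }
}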
I deployed Trino locally and connected it to Iceberg; the catalog configuration is as follows:
connector.name=iceberg
hive.metastore.uri=thrift://hive-metastore-service:9083
hive.s3.endpoint=http://minio:9000
hive.s3.aws-access-key=admin
hive.s3.aws-secret-key=0zhgr4APhG
hive.s3.path-style-access=true
hive.s3.ssl.enabled=false
hive.s3.max-connections=100
Note: hive-metastore-service is deployed on Kubernetes, so the Kubernetes service name is used in the configuration.
Query through the Trino CLI as follows:
[trino@trino-deployment-5f8c8548f5-j9nfc catalog]$ trino --catalog iceberg
trino> use test;
USE
trino:test> select * from test_iceberg;
id | name | birth
-----+------+------------
112 | Jack | 1999-01-01
(1 row)
Query 20240617_091458_00013_5ckgu, FINISHED, 1 node
Splits: 1 total, 1 done (100.00%)
0.23 [1 rows, 930B] [4 rows/s, 3.88KB/s]
trino:test>