A typical Flink write job into Iceberg is assembled like this:
FlinkSink
.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA)
.table(table)
.tableLoader(tableLoader)
.writeParallelism(1)
.equalityFieldColumns(ImmutableList.of("id"))
.build();
env.execute();
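The tableLoader used above has to be created beforehand. A minimal sketch, assuming a Hadoop table location (the path here is hypothetical; for catalog-managed tables TableLoader.fromCatalog(...) is the counterpart):
// Hedged sketch: build a TableLoader for the sink from a Hadoop table location (path is hypothetical).
TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/t");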
First come the builder settings. Part of the Builder is shown below; rowDataInput is the input DataStream:
public static class Builder {
private DataStream<RowData> rowDataInput = null;
private TableLoader tableLoader;
private Table table;
private TableSchema tableSchema;
private boolean overwrite = false;
private DistributionMode distributionMode = null;
private Integer writeParallelism = null;
private List<String> equalityFieldColumns = null;
private Builder() {
}
private Builder forRowData(DataStream<RowData> newRowDataInput) {
this.rowDataInput = newRowDataInput;
return this;
}
}
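The remaining setters used in the example follow the same fluent pattern; a sketch of what they look like (mirroring the fields above, not copied verbatim from the source):
public Builder table(Table newTable) {
  this.table = newTable;
  return this;
}
public Builder tableLoader(TableLoader newTableLoader) {
  this.tableLoader = newTableLoader;
  return this;
}
public Builder writeParallelism(int newWriteParallelism) {
  this.writeParallelism = newWriteParallelism;
  return this;
}
public Builder equalityFieldColumns(List<String> columns) {
  this.equalityFieldColumns = columns;
  return this;
}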
Now step into the build() method:
public DataStreamSink<RowData> build() {
Preconditions.checkArgument(rowDataInput != null,
"Please use forRowData() to initialize the input DataStream.");
Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null");
if (table == null) {
tableLoader.open();
try (TableLoader loader = tableLoader) {
this.table = loader.loadTable();
} catch (IOException e) {
throw new UncheckedIOException("Failed to load iceberg table from table loader: " + tableLoader, e);
}
}
// Find out the equality field id list based on the user-provided equality field column names.
List<Integer> equalityFieldIds = Lists.newArrayList();
if (equalityFieldColumns != null && equalityFieldColumns.size() > 0) {
for (String column : equalityFieldColumns) {
org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column);
Preconditions.checkNotNull(field, "Missing required equality field column '%s' in table schema %s",
column, table.schema());
equalityFieldIds.add(field.fieldId());
}
}
// Convert the requested flink table schema to flink row type.
RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema);
// Distribute the records from input data stream based on the write.distribution-mode.
rowDataInput = distributeDataStream(rowDataInput, table.properties(), table.spec(), table.schema(), flinkRowType);
// Chain the iceberg stream writer and committer operator.
IcebergStreamWriter<RowData> streamWriter = createStreamWriter(table, flinkRowType, equalityFieldIds);
IcebergFilesCommitter filesCommitter = new IcebergFilesCommitter(tableLoader, overwrite);
this.writeParallelism = writeParallelism == null ? rowDataInput.getParallelism() : writeParallelism;
DataStream<Void> returnStream = rowDataInput
.transform(ICEBERG_STREAM_WRITER_NAME, TypeInformation.of(WriteResult.class), streamWriter)
.setParallelism(writeParallelism)
.transform(ICEBERG_FILES_COMMITTER_NAME, Types.VOID, filesCommitter)
.setParallelism(1)
.setMaxParallelism(1);
return returnStream.addSink(new DiscardingSink())
.name(String.format("IcebergSink %s", table.name()))
.setParallelism(1);
}
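Of these steps, distributeDataStream(...) shuffles the input according to write.distribution-mode before it reaches the writer. A minimal sketch of the hash case only, assuming the connector's PartitionKeySelector is used to key records by partition (the selector and its constructor arguments are assumptions; the NONE mode simply returns the input unchanged):
// Hedged sketch of the HASH branch of distributeDataStream (simplified, not the exact source).
if (DistributionMode.HASH.equals(distributionMode) && !table.spec().isUnpartitioned()) {
  rowDataInput = rowDataInput.keyBy(new PartitionKeySelector(table.spec(), table.schema(), flinkRowType));
}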
The writer operator is constructed in IcebergStreamWriter<RowData> streamWriter = createStreamWriter(table, flinkRowType, equalityFieldIds):
static IcebergStreamWriter<RowData> createStreamWriter(Table table,
RowType flinkRowType,
List<Integer> equalityFieldIds) {
Map<String, String> props = table.properties();
long targetFileSize = getTargetFileSizeBytes(props);
FileFormat fileFormat = getFileFormat(props);
TaskWriterFactory<RowData> taskWriterFactory = new RowDataTaskWriterFactory(table.schema(), flinkRowType,
table.spec(), table.locationProvider(), table.io(), table.encryption(), targetFileSize, fileFormat, props,
equalityFieldIds);
return new IcebergStreamWriter<>(table.name(), taskWriterFactory);
}
Here a RowDataTaskWriterFactory is built from the table metadata, and the IcebergStreamWriter operator shown next uses that taskWriterFactory to create the actual writer:
class IcebergStreamWriter<T> extends AbstractStreamOperator<WriteResult>
implements OneInputStreamOperator<T, WriteResult>, BoundedOneInput {
private static final long serialVersionUID = 1L;
private final String fullTableName;
private final TaskWriterFactory<T> taskWriterFactory;
private transient TaskWriter<T> writer;
private transient int subTaskId;
private transient int attemptId;
IcebergStreamWriter(String fullTableName, TaskWriterFactory<T> taskWriterFactory) {
this.fullTableName = fullTableName;
this.taskWriterFactory = taskWriterFactory;
setChainingStrategy(ChainingStrategy.ALWAYS);
}
@Override
public void open() {
this.subTaskId = getRuntimeContext().getIndexOfThisSubtask();
this.attemptId = getRuntimeContext().getAttemptNumber();
// Initialize the task writer factory.
this.taskWriterFactory.initialize(subTaskId, attemptId);
// Initialize the task writer.
this.writer = taskWriterFactory.create();
}
@Override
public void prepareSnapshotPreBarrier(long checkpointId) throws Exception {
// close all open files and emit files to downstream committer operator
emit(writer.complete());
this.writer = taskWriterFactory.create();
}
@Override
public void processElement(StreamRecord<T> element) throws Exception {
writer.write(element.getValue());
}
}
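The excerpt above omits a few members; in particular the emit(...) call in prepareSnapshotPreBarrier is essentially a one-line forward to the downstream committer. A sketch (not the exact source):
// Forward the completed WriteResult (data files + delete files of this subtask) to the committer.
private void emit(WriteResult result) {
  output.collect(new StreamRecord<>(result));
}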
In open() the operator initializes a TaskWriter; processElement() then calls its write() method to write incoming records, and the completed files are handed to the committer at checkpoint time.
The commit path starts in IcebergFilesCommitter#snapshotState:
@Override
public void snapshotState(StateSnapshotContext context) throws Exception {
super.snapshotState(context);
long checkpointId = context.getCheckpointId();
LOG.info("Start to flush snapshot state to state backend, table: {}, checkpointId: {}", table, checkpointId);
// Update the checkpoint state.
dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId));
// Reset the snapshot state to the latest state.
checkpointsState.clear();
checkpointsState.add(dataFilesPerCheckpoint);
jobIdState.clear();
jobIdState.add(flinkJobId);
// Clear the local buffer for current checkpoint.
writeResultsOfCurrentCkpt.clear();
}
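For context, checkpointsState and jobIdState used above are the committer's operator state; a minimal sketch of how such state could be declared and obtained (field names, descriptor name and types are assumptions):
// Hedged sketch: per-checkpoint manifest bytes and the owning Flink job id kept in operator list state.
private static final ListStateDescriptor<String> JOB_ID_DESCRIPTOR =
    new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO);
private transient ListState<SortedMap<Long, byte[]>> checkpointsState;
private transient ListState<String> jobIdState;

@Override
public void initializeState(StateInitializationContext context) throws Exception {
  super.initializeState(context);
  this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR);
  // checkpointsState is obtained the same way, with a descriptor for SortedMap<Long, byte[]>.
}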
The call dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); records the current checkpoint id together with the files written for it:
/**
* Write all the complete data files to a newly created manifest file and return the manifest's avro serialized bytes.
*/
private byte[] writeToManifest(long checkpointId) throws IOException {
if (writeResultsOfCurrentCkpt.isEmpty()) {
return EMPTY_MANIFEST_DATA;
}
WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build();
DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles(result,
() -> manifestOutputFileFactory.create(checkpointId), table.spec());
return SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, deltaManifests);
}
writeResultsOfCurrentCkpt holds the WriteResult objects received from the writers, i.e. the data-file and delete-file information of the current checkpoint; it is filled in the committer's processElement() (see the sketch below), and FlinkManifestUtil.writeCompletedFiles, shown after the sketch, turns the merged result into manifest files.
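The buffer is filled as WriteResult records arrive from the writer operator; a sketch of the committer's processElement (essentially a one-liner, not the exact source):
// Buffer every WriteResult emitted by the writers until the next checkpoint.
@Override
public void processElement(StreamRecord<WriteResult> element) {
  this.writeResultsOfCurrentCkpt.add(element.getValue());
}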
static DeltaManifests writeCompletedFiles(WriteResult result,
Supplier<OutputFile> outputFileSupplier,
PartitionSpec spec) throws IOException {
ManifestFile dataManifest = null;
ManifestFile deleteManifest = null;
// Write the completed data files into a newly created data manifest file.
if (result.dataFiles() != null && result.dataFiles().length > 0) {
dataManifest = writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles()));
}
// Write the completed delete files into a newly created delete manifest file.
if (result.deleteFiles() != null && result.deleteFiles().length > 0) {
OutputFile deleteManifestFile = outputFileSupplier.get();
ManifestWriter<DeleteFile> deleteManifestWriter = ManifestFiles.writeDeleteManifest(FORMAT_V2, spec,
deleteManifestFile, DUMMY_SNAPSHOT_ID);
try (ManifestWriter<DeleteFile> writer = deleteManifestWriter) {
for (DeleteFile deleteFile : result.deleteFiles()) {
writer.add(deleteFile);
}
}
deleteManifest = deleteManifestWriter.toManifestFile();
}
return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles());
}
Here the data files and the delete files each get their own manifest; the resulting DeltaManifests is serialized and returned. This completes the writing of the manifest files.
The actual table commit happens later, in notifyCheckpointComplete:
@Override
public void notifyCheckpointComplete(long checkpointId) throws Exception {
super.notifyCheckpointComplete(checkpointId);
// It's possible that we have the following events:
// 1. snapshotState(ckpId);
// 2. snapshotState(ckpId+1);
// 3. notifyCheckpointComplete(ckpId+1);
// 4. notifyCheckpointComplete(ckpId);
// For step#4, we don't need to commit iceberg table again because in step#3 we've committed all the files,
// Besides, we need to maintain the max-committed-checkpoint-id to be increasing.
if (checkpointId > maxCommittedCheckpointId) {
commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, checkpointId);
this.maxCommittedCheckpointId = checkpointId;
}
}
private void commitUpToCheckpoint(NavigableMap<Long, byte[]> deltaManifestsMap,
String newFlinkJobId,
long checkpointId) throws IOException {
NavigableMap<Long, byte[]> pendingMap = deltaManifestsMap.headMap(checkpointId, true);
List<ManifestFile> manifests = Lists.newArrayList();
NavigableMap<Long, WriteResult> pendingResults = Maps.newTreeMap();
for (Map.Entry<Long, byte[]> e : pendingMap.entrySet()) {
if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) {
// Skip the empty flink manifest.
continue;
}
DeltaManifests deltaManifests = SimpleVersionedSerialization
.readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, e.getValue());
pendingResults.put(e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()));
manifests.addAll(deltaManifests.manifests());
}
int totalFiles = pendingResults.values().stream()
.mapToInt(r -> r.dataFiles().length + r.deleteFiles().length).sum();
continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0;
if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) {
if (replacePartitions) {
replacePartitions(pendingResults, newFlinkJobId, checkpointId);
} else {
commitDeltaTxn(pendingResults, newFlinkJobId, checkpointId);
}
continuousEmptyCheckpoints = 0;
}
pendingMap.clear();
// Delete the committed manifests.
for (ManifestFile manifest : manifests) {
try {
table.io().deleteFile(manifest.path());
} catch (Exception e) {
// The flink manifests cleaning failure shouldn't abort the completed checkpoint.
String details = MoreObjects.toStringHelper(this)
.add("flinkJobId", newFlinkJobId)
.add("checkpointId", checkpointId)
.add("manifestPath", manifest.path())
.toString();
LOG.warn("The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}",
details, e);
}
}
}
Here the bytes serialized earlier are deserialized back into DeltaManifests objects, whose manifests are collected into the manifests list; these manifests also carry the sequence-number information. The transaction is then committed:
private void commitDeltaTxn(NavigableMap<Long, WriteResult> pendingResults, String newFlinkJobId, long checkpointId) {
int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum();
if (deleteFilesNum == 0) {
// To be compatible with iceberg format V1.
AppendFiles appendFiles = table.newAppend();
int numFiles = 0;
for (WriteResult result : pendingResults.values()) {
Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files.");
numFiles += result.dataFiles().length;
Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
}
commitOperation(appendFiles, numFiles, 0, "append", newFlinkJobId, checkpointId);
} else {
// To be compatible with iceberg format V2.
for (Map.Entry<Long, WriteResult> e : pendingResults.entrySet()) {
// We don't commit the merged result into a single transaction because for the sequential transaction txn1 and
// txn2, the equality-delete files of txn2 are required to be applied to data files from txn1. Committing the
// merged one will lead to the incorrect delete semantic.
WriteResult result = e.getValue();
RowDelta rowDelta = table.newRowDelta()
.validateDataFilesExist(ImmutableList.copyOf(result.referencedDataFiles()))
.validateDeletedFiles();
int numDataFiles = result.dataFiles().length;
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
int numDeleteFiles = result.deleteFiles().length;
Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
commitOperation(rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey());
}
}
}
Here a RowDelta object rowDelta is created; its implementation class BaseRowDelta extends MergingSnapshotProducer, so the delta is committed as a new snapshot.
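For reference, RowDelta is part of the public Table API; a minimal illustration (inputs are hypothetical) showing that each commit() yields exactly one new snapshot:
// Minimal illustration: commit one data file plus one equality-delete file as a row delta.
static void commitOneDelta(Table table, DataFile dataFile, DeleteFile equalityDeletes) {
  table.newRowDelta()
      .addRows(dataFile)           // new rows written in this checkpoint
      .addDeletes(equalityDeletes) // deletes that must apply to earlier data files
      .commit();                   // produces a single new snapshot
}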
private void commitOperation(SnapshotUpdate<?> operation, int numDataFiles, int numDeleteFiles, String description,
String newFlinkJobId, long checkpointId) {
LOG.info("Committing {} with {} data files and {} delete files to table {}", description, numDataFiles,
numDeleteFiles, table);
operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId));
operation.set(FLINK_JOB_ID, newFlinkJobId);
long start = System.currentTimeMillis();
operation.commit(); // abort is automatically called if this fails.
long duration = System.currentTimeMillis() - start;
LOG.info("Committed in {} ms", duration);
}
Then operation.commit() calls commit() in SnapshotProducer, which goes through SnapshotProducer's apply() method; following the same flow as in the earlier small-file compaction article, it obtains the sequence number and snapshot id, and finally commits by writing the snapshot (manifest list) file and the new metadata file.
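The two summary properties set in commitOperation() (MAX_COMMITTED_CHECKPOINT_ID and FLINK_JOB_ID) are also what make commits idempotent across restarts: after a failover the committer can walk the snapshot history and recover the highest checkpoint id it already committed, which feeds the maxCommittedCheckpointId check in notifyCheckpointComplete. A hedged sketch of that lookup (the helper name and the literal property keys are assumptions):
// Hedged sketch: walk snapshots from newest to oldest and return the max checkpoint id committed by
// this Flink job, or -1 if none is found. The literal keys are assumed to match the constants above.
static long getMaxCommittedCheckpointId(Table table, String flinkJobId) {
  Snapshot snapshot = table.currentSnapshot();
  while (snapshot != null) {
    Map<String, String> summary = snapshot.summary();
    if (flinkJobId.equals(summary.get("flink.job-id"))) {
      String value = summary.get("flink.max-committed-checkpoint-id");
      if (value != null) {
        return Long.parseLong(value);
      }
    }
    Long parentId = snapshot.parentId();
    snapshot = parentId != null ? table.snapshot(parentId) : null;
  }
  return -1L;
}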