Iceberg Flink Write and Commit Flow

A typical Flink write job is constructed like this:

        FlinkSink
                .forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA)
                .table(table)
                .tableLoader(tableLoader)
                .writeParallelism(1)
                .equalityFieldColumns(ImmutableList.of("id"))
                .build();

        env.execute();
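For context, a complete job around this sink might look roughly like the following sketch. The table path, stream variable, and checkpoint interval are illustrative, and checkpointing must be enabled because the sink only commits when a checkpoint completes (as shown later).

    // Minimal sketch of a full job (illustrative names and paths, not the exact setup above).
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(60_000L); // commits only happen on checkpoint completion

    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    DataStream<RowData> dataStream = ...; // some upstream source producing RowData

    FlinkSink.forRowData(dataStream)
        .tableLoader(tableLoader)
        .writeParallelism(1)
        .equalityFieldColumns(ImmutableList.of("id"))
        .build();

    env.execute("iceberg-sink-demo");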

The entry point first populates a Builder object; part of it is shown below, where rowDataInput is the input DataStream:

public static class Builder {
    private DataStream<RowData> rowDataInput = null;
    private TableLoader tableLoader;
    private Table table;
    private TableSchema tableSchema;
    private boolean overwrite = false;
    private DistributionMode distributionMode = null;
    private Integer writeParallelism = null;
    private List<String> equalityFieldColumns = null;

    private Builder() {
    }

    private Builder forRowData(DataStream<RowData> newRowDataInput) {
      this.rowDataInput = newRowDataInput;
      return this;
    }
}

Now into the build() method:

public DataStreamSink<RowData> build() {
      Preconditions.checkArgument(rowDataInput != null,
          "Please use forRowData() to initialize the input DataStream.");
      Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null");

      if (table == null) {
        tableLoader.open();
        try (TableLoader loader = tableLoader) {
          this.table = loader.loadTable();
        } catch (IOException e) {
          throw new UncheckedIOException("Failed to load iceberg table from table loader: " + tableLoader, e);
        }
      }

      // Find out the equality field id list based on the user-provided equality field column names.
      List<Integer> equalityFieldIds = Lists.newArrayList();
      if (equalityFieldColumns != null && equalityFieldColumns.size() > 0) {
        for (String column : equalityFieldColumns) {
          org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column);
          Preconditions.checkNotNull(field, "Missing required equality field column '%s' in table schema %s",
              column, table.schema());
          equalityFieldIds.add(field.fieldId());
        }
      }

      // Convert the requested flink table schema to flink row type.
      RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema);

      // Distribute the records from input data stream based on the write.distribution-mode.
      rowDataInput = distributeDataStream(rowDataInput, table.properties(), table.spec(), table.schema(), flinkRowType);

      // Chain the iceberg stream writer and committer operator.
      IcebergStreamWriter<RowData> streamWriter = createStreamWriter(table, flinkRowType, equalityFieldIds);
      IcebergFilesCommitter filesCommitter = new IcebergFilesCommitter(tableLoader, overwrite);

      this.writeParallelism = writeParallelism == null ? rowDataInput.getParallelism() : writeParallelism;

      DataStream<Void> returnStream = rowDataInput
          .transform(ICEBERG_STREAM_WRITER_NAME, TypeInformation.of(WriteResult.class), streamWriter)
          .setParallelism(writeParallelism)
          .transform(ICEBERG_FILES_COMMITTER_NAME, Types.VOID, filesCommitter)
          .setParallelism(1)
          .setMaxParallelism(1);

      return returnStream.addSink(new DiscardingSink())
          .name(String.format("IcebergSink %s", table.name()))
          .setParallelism(1);
    }

The writer operator is constructed in IcebergStreamWriter<RowData> streamWriter = createStreamWriter(table, flinkRowType, equalityFieldIds):

static IcebergStreamWriter<RowData> createStreamWriter(Table table,
                                                         RowType flinkRowType,
                                                         List<Integer> equalityFieldIds) {
    Map<String, String> props = table.properties();
    long targetFileSize = getTargetFileSizeBytes(props);
    FileFormat fileFormat = getFileFormat(props);

    TaskWriterFactory<RowData> taskWriterFactory = new RowDataTaskWriterFactory(table.schema(), flinkRowType,
        table.spec(), table.locationProvider(), table.io(), table.encryption(), targetFileSize, fileFormat, props,
        equalityFieldIds);

    return new IcebergStreamWriter<>(table.name(), taskWriterFactory);
  }

Here a RowDataTaskWriterFactory is built from the table's metadata, and the IcebergStreamWriter operator is then constructed around that factory:

class IcebergStreamWriter<T> extends AbstractStreamOperator<WriteResult>
    implements OneInputStreamOperator<T, WriteResult>, BoundedOneInput {

  private static final long serialVersionUID = 1L;

  private final String fullTableName;
  private final TaskWriterFactory<T> taskWriterFactory;

  private transient TaskWriter<T> writer;
  private transient int subTaskId;
  private transient int attemptId;

  IcebergStreamWriter(String fullTableName, TaskWriterFactory<T> taskWriterFactory) {
    this.fullTableName = fullTableName;
    this.taskWriterFactory = taskWriterFactory;
    setChainingStrategy(ChainingStrategy.ALWAYS);
  }

  @Override
  public void open() {
    this.subTaskId = getRuntimeContext().getIndexOfThisSubtask();
    this.attemptId = getRuntimeContext().getAttemptNumber();

    // Initialize the task writer factory.
    this.taskWriterFactory.initialize(subTaskId, attemptId);

    // Initialize the task writer.
    this.writer = taskWriterFactory.create();
  }

  @Override
  public void prepareSnapshotPreBarrier(long checkpointId) throws Exception {
    // close all open files and emit files to downstream committer operator
    emit(writer.complete());

    this.writer = taskWriterFactory.create();
  }

  @Override
  public void processElement(StreamRecord<T> element) throws Exception {
    writer.write(element.getValue());
  }
}

In open(), the operator initializes a TaskWriter through the factory; processElement() then calls its write() method for each record, and at checkpoint time prepareSnapshotPreBarrier() completes the writer and emits the resulting files downstream for commit.
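Outside of Flink, that writer lifecycle can be sketched roughly as follows (the subtask/attempt ids and the row variable are illustrative; exception handling is omitted):

    // Sketch of the TaskWriter lifecycle driven by IcebergStreamWriter.
    taskWriterFactory.initialize(0, 0);        // open(): subTaskId, attemptId
    TaskWriter<RowData> writer = taskWriterFactory.create();

    writer.write(rowData);                     // processElement(): one call per record

    WriteResult result = writer.complete();    // prepareSnapshotPreBarrier(): close open files
    // result.dataFiles() / result.deleteFiles() are emitted to the committer, and a fresh
    // writer is created for the next checkpoint interval.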

The commit flow itself lives in IcebergFilesCommitter#snapshotState:

@Override
  public void snapshotState(StateSnapshotContext context) throws Exception {
    super.snapshotState(context);
    long checkpointId = context.getCheckpointId();
    LOG.info("Start to flush snapshot state to state backend, table: {}, checkpointId: {}", table, checkpointId);

    // Update the checkpoint state.
    dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId));
    // Reset the snapshot state to the latest state.
    checkpointsState.clear();
    checkpointsState.add(dataFilesPerCheckpoint);

    jobIdState.clear();
    jobIdState.add(flinkJobId);

    // Clear the local buffer for current checkpoint.
    writeResultsOfCurrentCkpt.clear();
  }

dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)) records the serialized manifest data for the current checkpoint id:

/**
   * Write all the complete data files to a newly created manifest file and return the manifest's avro serialized bytes.
   */
  private byte[] writeToManifest(long checkpointId) throws IOException {
    if (writeResultsOfCurrentCkpt.isEmpty()) {
      return EMPTY_MANIFEST_DATA;
    }

    WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build();
    DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles(result,
        () -> manifestOutputFileFactory.create(checkpointId), table.spec());

    return SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, deltaManifests);
  }

writeResultsOfCurrentCkpt holds the data-file and delete-file information received from the upstream writers.
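A WriteResult is assembled through its builder; roughly (the file objects here are illustrative):

    // Sketch: what a single WriteResult aggregates.
    WriteResult result = WriteResult.builder()
        .addDataFiles(dataFile)                  // completed data files
        .addDeleteFiles(deleteFile)              // completed equality/position delete files
        .addReferencedDataFiles(referencedPath)  // data files referenced by position deletes
        .build();

FlinkManifestUtil.writeCompletedFiles then writes these files into a data manifest and a delete manifest: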

static DeltaManifests writeCompletedFiles(WriteResult result,
                                            Supplier<OutputFile> outputFileSupplier,
                                            PartitionSpec spec) throws IOException {

    ManifestFile dataManifest = null;
    ManifestFile deleteManifest = null;

    // Write the completed data files into a newly created data manifest file.
    if (result.dataFiles() != null && result.dataFiles().length > 0) {
      dataManifest = writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles()));
    }

    // Write the completed delete files into a newly created delete manifest file.
    if (result.deleteFiles() != null && result.deleteFiles().length > 0) {
      OutputFile deleteManifestFile = outputFileSupplier.get();

      ManifestWriter<DeleteFile> deleteManifestWriter = ManifestFiles.writeDeleteManifest(FORMAT_V2, spec,
          deleteManifestFile, DUMMY_SNAPSHOT_ID);
      try (ManifestWriter<DeleteFile> writer = deleteManifestWriter) {
        for (DeleteFile deleteFile : result.deleteFiles()) {
          writer.add(deleteFile);
        }
      }

      deleteManifest = deleteManifestWriter.toManifestFile();
    }

    return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles());
  }

As the code shows, the data files and the delete files each get their own manifest; the resulting DeltaManifests object is serialized and returned. This completes the writing of the temporary Flink manifest files.
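The serialization is symmetric with the deserialization used at commit time; schematically (reusing the calls from writeToManifest above and commitUpToCheckpoint below):

    // Round trip between DeltaManifests and the bytes kept in the checkpoint state.
    byte[] bytes = SimpleVersionedSerialization.writeVersionAndSerialize(
        DeltaManifestsSerializer.INSTANCE, deltaManifests);
    DeltaManifests restored = SimpleVersionedSerialization.readVersionAndDeSerialize(
        DeltaManifestsSerializer.INSTANCE, bytes);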

The actual table commit happens later, in notifyCheckpointComplete:

@Override
  public void notifyCheckpointComplete(long checkpointId) throws Exception {
    super.notifyCheckpointComplete(checkpointId);
    // It's possible that we have the following events:
    //   1. snapshotState(ckpId);
    //   2. snapshotState(ckpId+1);
    //   3. notifyCheckpointComplete(ckpId+1);
    //   4. notifyCheckpointComplete(ckpId);
    // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all the files,
    // Besides, we need to maintain the max-committed-checkpoint-id to be increasing.
    if (checkpointId > maxCommittedCheckpointId) {
      commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, checkpointId);
      this.maxCommittedCheckpointId = checkpointId;
    }
  }

  private void commitUpToCheckpoint(NavigableMap<Long, byte[]> deltaManifestsMap,
                                    String newFlinkJobId,
                                    long checkpointId) throws IOException {
    NavigableMap<Long, byte[]> pendingMap = deltaManifestsMap.headMap(checkpointId, true);
    List<ManifestFile> manifests = Lists.newArrayList();
    NavigableMap<Long, WriteResult> pendingResults = Maps.newTreeMap();
    for (Map.Entry<Long, byte[]> e : pendingMap.entrySet()) {
      if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) {
        // Skip the empty flink manifest.
        continue;
      }

      DeltaManifests deltaManifests = SimpleVersionedSerialization
          .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, e.getValue());
      pendingResults.put(e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()));
      manifests.addAll(deltaManifests.manifests());
    }

    int totalFiles = pendingResults.values().stream()
        .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length).sum();
    continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0;
    if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) {
      if (replacePartitions) {
        replacePartitions(pendingResults, newFlinkJobId, checkpointId);
      } else {
        commitDeltaTxn(pendingResults, newFlinkJobId, checkpointId);
      }
      continuousEmptyCheckpoints = 0;
    }
    pendingMap.clear();

    // Delete the committed manifests.
    for (ManifestFile manifest : manifests) {
      try {
        table.io().deleteFile(manifest.path());
      } catch (Exception e) {
        // The flink manifests cleaning failure shouldn't abort the completed checkpoint.
        String details = MoreObjects.toStringHelper(this)
            .add("flinkJobId", newFlinkJobId)
            .add("checkpointId", checkpointId)
            .add("manifestPath", manifest.path())
            .toString();
        LOG.warn("The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}",
            details, e);
      }
    }
  }

Here the bytes saved for each checkpoint are deserialized back into DeltaManifests; their manifest files are collected into manifests, and the corresponding WriteResults are read back via FlinkManifestUtil.readCompletedFiles.

These also carry the sequenceNumber information; the transaction is then committed:

  private void commitDeltaTxn(NavigableMap<Long, WriteResult> pendingResults, String newFlinkJobId, long checkpointId) {
    int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum();

    if (deleteFilesNum == 0) {
      // To be compatible with iceberg format V1.
      AppendFiles appendFiles = table.newAppend();

      int numFiles = 0;
      for (WriteResult result : pendingResults.values()) {
        Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files.");

        numFiles += result.dataFiles().length;
        Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
      }

      commitOperation(appendFiles, numFiles, 0, "append", newFlinkJobId, checkpointId);
    } else {
      // To be compatible with iceberg format V2.
      for (Map.Entry<Long, WriteResult> e : pendingResults.entrySet()) {
        // We don't commit the merged result into a single transaction because for the sequential transaction txn1 and
        // txn2, the equality-delete files of txn2 are required to be applied to data files from txn1. Committing the
        // merged one will lead to the incorrect delete semantic.
        WriteResult result = e.getValue();
        RowDelta rowDelta = table.newRowDelta()
            .validateDataFilesExist(ImmutableList.copyOf(result.referencedDataFiles()))
            .validateDeletedFiles();

        int numDataFiles = result.dataFiles().length;
        Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);

        int numDeleteFiles = result.deleteFiles().length;
        Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);

        commitOperation(rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey());
      }
    }
  }

Here a RowDelta object is created; its implementation, BaseRowDelta, extends MergingSnapshotProducer, and it is committed as a new snapshot:

  private void commitOperation(SnapshotUpdate<?> operation, int numDataFiles, int numDeleteFiles, String description,
                               String newFlinkJobId, long checkpointId) {
    LOG.info("Committing {} with {} data files and {} delete files to table {}", description, numDataFiles,
        numDeleteFiles, table);
    operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId));
    operation.set(FLINK_JOB_ID, newFlinkJobId);

    long start = System.currentTimeMillis();
    operation.commit(); // abort is automatically called if this fails.
    long duration = System.currentTimeMillis() - start;
    LOG.info("Committed in {} ms", duration);
  }

operation.commit() then goes through SnapshotProducer#commit(), which calls SnapshotProducer#apply(); following the same flow as in the earlier small-file compaction write-up, it obtains the sequence number and snapshot id, and then commits by writing the new snapshot file and the table metadata file.
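After a successful commit, the result can be inspected on the table. A small sketch, assuming the MAX_COMMITTED_CHECKPOINT_ID constant set in commitOperation resolves to "flink.max-committed-checkpoint-id":

    // Sketch: inspecting the snapshot produced by the committer.
    table.refresh();
    Snapshot committed = table.currentSnapshot();
    LOG.info("snapshotId={}, sequenceNumber={}, maxCommittedCheckpointId={}",
        committed.snapshotId(),
        committed.sequenceNumber(),
        committed.summary().get("flink.max-committed-checkpoint-id"));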