CUDNN 9 (5) Backend Descriptor Types & Frontend v1.0 API Examples

CUDNN 9 (3) Frontend API example introduced the idea that the FE and backend APIs are both entry points to the same set of functionality (the Graph API), and walked through examples built on the FE v0.x API. This article covers examples using the backend API and the FE v1.0 API.

Creating a tensor: frontend v0.x API vs. backend API

The following code creates the same tensor twice, first with the frontend v0.x API and then with the backend API.

// Headers this test needs (Catch2 v3 macro header assumed):
#include <array>
#include <iostream>

#include <cudnn.h>
#include <cudnn_frontend.h>

#include <catch2/catch_test_macros.hpp>

TEST_CASE("Tensor creation comparison", "[frontend][comparison][backend]") {
    // Consider creation of a 4d tensor with
    // n,c,h,w as 4,32,32,32
    std::cout << "Tensor creation comparison" << std::endl;
    std::array<int64_t, 4> tensor_dim = {4, 32, 32, 32};
    std::array<int64_t, 4> tensor_str = {32768, 1024, 32, 1};  // NCHW format
    cudnnDataType_t data_type         = CUDNN_DATA_FLOAT;
    int64_t alignment                 = sizeof(float);
    int64_t id                        = 0xD0D0CACA;  // Some magic number

    // Frontend code

    try {
        auto tensor = cudnn_frontend::TensorBuilder()
                          .setDim(tensor_dim.size(), tensor_dim.data())
                          .setStrides(tensor_str.size(), tensor_str.data())
                          .setId(id)
                          .setAlignment(alignment)
                          .setDataType(data_type)
                          .build();
    } catch (cudnn_frontend::cudnnException& e) {
        std::cout << "Exception in tensor creation " << e.what() << std::endl;
    }

    auto check_status = [](cudnnStatus_t status) { REQUIRE(status == CUDNN_STATUS_SUCCESS); };

    // Equivalent Backend code
    {
        cudnnBackendDescriptor_t tensor;

        // Allocate memory for the descriptor.
        // This is a C-style allocation that requires a matching
        // one-time deletion. Like raw pointers, raw backend code
        // makes you track every allocation and free yourself,
        // otherwise it may leak memory.
        check_status(cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor));

        // Set the following attributes
        // Dimensions, Strides, Alignment, Id, DataType
        check_status(
            cudnnBackendSetAttribute(tensor, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &data_type));
        check_status(cudnnBackendSetAttribute(
            tensor, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, tensor_dim.size(), tensor_dim.data()));
        check_status(cudnnBackendSetAttribute(
            tensor, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, tensor_str.size(), tensor_str.data()));
        check_status(cudnnBackendSetAttribute(tensor, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &id));
        check_status(
            cudnnBackendSetAttribute(tensor, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
        // Finalize the descriptor
        check_status(cudnnBackendFinalize(tensor));

        // Free the memory allocated above. Any short-circuit return will
        // cause a memory leak.
        check_status(cudnnBackendDestroyDescriptor(tensor));
    }

    std::cout << "\n========================================================================================\n";
}
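
The backend also has a symmetric query call, cudnnBackendGetAttribute, for reading attributes back out of a finalized descriptor. A minimal sketch, assuming it is placed between cudnnBackendFinalize and cudnnBackendDestroyDescriptor in the backend block above:

// Query the dimensions back from the finalized tensor descriptor.
int64_t dims[CUDNN_DIM_MAX];  // CUDNN_DIM_MAX bounds the tensor rank
int64_t dim_count = 0;
check_status(cudnnBackendGetAttribute(
    tensor, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, CUDNN_DIM_MAX, &dim_count, dims));
// dim_count is now 4 and the first dim_count entries of dims hold {4, 32, 32, 32}.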

Backend Descriptor Types

Backend descriptor types share one unified API (declared in cudnn_graph_v9.h): the same calls create, configure, and finalize any descriptor, and the descriptor type you pass (e.g. CUDNN_BACKEND_ENGINE_DESCRIPTOR) selects what gets built, be it data, operation, engine, plan, variant pack, and so on. For example:

Create, set, and finalize an operation graph descriptor

cudnnBackendDescriptor_t op_graph;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph);
cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS,
                        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &fprop);
cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                        CUDNN_TYPE_HANDLE, 1, &handle);
cudnnBackendFinalize(op_graph);
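
The snippet assumes fprop, a previously finalized forward-convolution operation descriptor that is not shown above. In outline it is built the same way (xDesc here stands for a finalized tensor descriptor; the remaining attributes are elided):

cudnnBackendDescriptor_t fprop;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &fprop);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X,
                        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
// ... likewise set _W, _Y, _CONV_DESC, _ALPHA, and _BETA, then:
cudnnBackendFinalize(fprop);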

Create, set, and finalize an engine descriptor

cudnnBackendDescriptor_t engine;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine);
cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH,
                        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph);
int64_t gidx = 0;
cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX,
                        CUDNN_TYPE_INT64, 1, &gidx);
cudnnBackendFinalize(engine);
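
The execution plan below consumes engcfg, an engine config descriptor that this sequence skips over. In its simplest form (no knob choices) it just wraps the engine:

cudnnBackendDescriptor_t engcfg;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg);
cudnnBackendSetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_ENGINE,
                        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine);
cudnnBackendFinalize(engcfg);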

Create, set, and finalize an execution plan descriptor. Obtain workspace size to allocate

cudnnBackendDescriptor_t plan;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan);
cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);
cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
                        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg);
cudnnBackendFinalize(plan);

int64_t workspaceSize;
cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
                        CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);
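
With the size queried, the workspace buffer that the variant pack below points at can be allocated with a plain CUDA runtime call, for example:

void *workspace = nullptr;  // consumed by the variant pack below
if (workspaceSize > 0) {
    cudaMalloc(&workspace, (size_t)workspaceSize);
}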

Create, set, and finalize a variant pack descriptor

void *dev_ptrs[3] = {xData, wData, yData}; // device pointers
int64_t uids[3] = {'x', 'w', 'y'};
// 'workspace' is the device buffer allocated above

cudnnBackendDescriptor_t varpack;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS,
                        CUDNN_TYPE_VOID_PTR, 3, dev_ptrs);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS,
                        CUDNN_TYPE_INT64, 3, uids);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE,
                        CUDNN_TYPE_VOID_PTR, 1, &workspace);
cudnnBackendFinalize(varpack);

Execute the plan with a variant pack

cudnnBackendExecute(handle, plan, varpack);
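
One detail the snippets leave out: backend descriptors are not reference-counted, so each descriptor created above needs its own cudnnBackendDestroyDescriptor call once execution is finished, e.g.:

cudnnBackendDestroyDescriptor(varpack);
cudnnBackendDestroyDescriptor(plan);
cudnnBackendDestroyDescriptor(engcfg);
cudnnBackendDestroyDescriptor(engine);
cudnnBackendDestroyDescriptor(op_graph);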

As these snippets show, every descriptor is created with cudnnBackendCreateDescriptor, its attributes are populated with cudnnBackendSetAttribute, construction is completed with cudnnBackendFinalize, and the computation finally runs through cudnnBackendExecute. The frontend API merely wraps this family of functions; the underlying implementation is the same, and the wrapper exists purely to make the API easier to use.
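
To make the "thin wrapper" point concrete, here is a minimal sketch (my own illustration, not the frontend's actual source) of the RAII pattern the frontend layers over these calls:

// Minimal RAII wrapper around a backend descriptor (illustrative only).
class BackendDescriptor {
   public:
    explicit BackendDescriptor(cudnnBackendDescriptorType_t type) {
        status_ = cudnnBackendCreateDescriptor(type, &desc_);
    }
    ~BackendDescriptor() {
        // Runs on every exit path, so no leak on an early return.
        if (desc_ != nullptr) cudnnBackendDestroyDescriptor(desc_);
    }
    BackendDescriptor(const BackendDescriptor&)            = delete;
    BackendDescriptor& operator=(const BackendDescriptor&) = delete;

    cudnnBackendDescriptor_t get() const { return desc_; }
    cudnnStatus_t status() const { return status_; }

   private:
    cudnnBackendDescriptor_t desc_   = nullptr;
    cudnnStatus_t            status_ = CUDNN_STATUS_SUCCESS;
};

A builder such as cudnn_frontend::TensorBuilder chains attribute setters on an object like this and finalizes it in build(), which is why the frontend code above needs no manual destroy calls.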

Frontend v1.0 API

Workflow

  1. Create the cudnn graph and specify its global properties. Global properties such as the compute precision and the input/output data types help infer properties that are not explicitly set.
  2. Create and add the input tensors.
  3. Create and add the operation nodes. The outputs of these operations are tensors, which can in turn be used as inputs to the next node.
  4. Validate the operation graph. This step ensures the graph is well formed, with no dangling tensors or nodes.
  5. Build the cudnn operation graph. This step lowers the graph into cudnn's normalized form.
  6. Create execution plans, based on the heuristics type of your choice.
  7. [Optional] Check support for the operation graph.
  8. [Optional] Filter out plans by your own criteria (see the sketch after this list).
  9. Build one or all of the execution plans.
  10. [Optional] Run autotuning on the filtered plans.
  11. Execute the graph with the relevant data pointers.
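
The example below exercises steps 1-7, 9, and 11 but skips the optional filtering and autotuning. As referenced in step 8, here is a sketch of what steps 8 and 9 can look like in that style, assuming the cudnn-frontend v1.x names deselect_workspace_greater_than and BuildPlanPolicy_t::ALL:

// Step 8 (optional): drop candidate plans whose workspace exceeds a budget.
graph->deselect_workspace_greater_than(128 * 1024 * 1024);
// Step 9: build all remaining plans, not just the heuristics' first choice;
// an autotuning pass (step 10) would then time and rank them.
REQUIRE(graph->build_plans(handle, fe::BuildPlanPolicy_t::ALL).is_good());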

Example

// This test uses helpers from the cudnn-frontend samples: Surface<> (an RAII
// device buffer), checkCudnnErr, and is_arch_supported_by_cudnn.
TEST_CASE("Convolution fprop", "[conv][graph][caching]") {
    namespace fe = cudnn_frontend;

    if (!is_arch_supported_by_cudnn()) {
        SKIP("Architecture is not supported by current cudnn version");
    }

    int64_t n = 16, c = 128, h = 64, w = 64, k = 256, r = 1, s = 1;

    auto build_new_graph = [=](cudnnHandle_t handle) {
        auto graph = std::make_shared<fe::graph::Graph>();
        graph->set_io_data_type(fe::DataType_t::HALF).set_compute_data_type(fe::DataType_t::FLOAT);

        // Dims are given in NCHW order; these strides describe an NHWC
        // (channels-last) memory layout.
        auto X = graph->tensor(fe::graph::Tensor_attributes()
                                   .set_name("image")
                                   .set_dim({n, c, h, w})
                                   .set_stride({c * h * w, 1, c * w, c}));

        auto W = graph->tensor(fe::graph::Tensor_attributes()
                                   .set_name("filter")
                                   .set_dim({k, c, r, s})
                                   .set_stride({c * r * s, 1, c * s, c}));

        auto conv_options =
            fe::graph::Conv_fprop_attributes().set_padding({0, 0}).set_stride({1, 1}).set_dilation({1, 1});
        auto Y = graph->conv_fprop(X, W, conv_options);

        Y->set_output(true);

        REQUIRE(graph->validate().is_good());

        REQUIRE(graph->build_operation_graph(handle).is_good());

        REQUIRE(graph->create_execution_plans({fe::HeurMode_t::A}).is_good());

        REQUIRE(graph->check_support(handle).is_good());

        REQUIRE(graph->build_plans(handle).is_good());

        return std::make_tuple(graph, X, W, Y);
    };

    cudnnHandle_t handle;

    checkCudnnErr(cudnnCreate(&handle));

    auto [graph, X, W, Y] = build_new_graph(handle);

    Surface<half> x_tensor(n * c * h * w, false);
    Surface<half> w_tensor(k * c * r * s, false);
    Surface<half> y_tensor(n * k * h * w, false);  // Output spatial dims are strictly (p, q); here p == h and q == w (1x1 conv, stride 1, no padding).

    std::unordered_map<int64_t, void*> variant_pack = {
        {X->get_uid(), x_tensor.devPtr}, {W->get_uid(), w_tensor.devPtr}, {Y->get_uid(), y_tensor.devPtr}};

    Surface<int8_t> workspace(graph->get_workspace_size(), false);

    std::cout << *graph << std::endl;

    REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good());
    cudnnDestroy(handle);
}

The code above walks through the standard flow described in the workflow.

Step 1 of the workflow: instantiate the graph object via std::make_shared<fe::graph::Graph>(); everything that follows operates on this object.

Step 2: create the input tensors that feed the operations in the graph. To add a tensor to the graph, build it with graph->tensor(fe::graph::Tensor_attributes().set_***(...)) and configure its attributes through the various set functions.

Step 3 creates the operations. In this example, the attributes are configured with fe::graph::Conv_fprop_attributes(), and the forward convolution node is then added with graph->conv_fprop(X, W, conv_options).
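
Step 3's point that one node's output tensor can feed the next node is easy to see with a hypothetical extension of this example (relu_options and R are names introduced here, not part of the original test):

auto relu_options = fe::graph::Pointwise_attributes().set_mode(fe::PointwiseMode_t::RELU_FWD);
auto R            = graph->pointwise(Y, relu_options);  // conv output feeds the ReLU node
R->set_output(true);  // the ReLU result becomes a graph output; Y could then stay virtual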

Next, steps 4 through 9 of the workflow are applied to the graph in order: validate, build the operation graph, create execution plans, check support, and build the plans.

After that, a handle is created with cudnnCreate(&handle), the variant_pack is defined to bundle the corresponding data pointers, and a workspace is defined. These three arguments are passed to graph->execute(handle, variant_pack, workspace.devPtr) to run the computation, and the result lands in y_tensor.devPtr.

Finally, cudnnDestroy(handle) ends the handle's lifetime.