下载cudnn_samples_v9、运行示例

Install the cuDNN samples.

sudo apt-get -y install libcudnn9-samples
Go to the writable path.

cd $HOME/cudnn_samples_v9/mnistCUDNN
Compile the mnistCUDNN sample.

make clean && make
Run the mnistCUDNN sample.

./mnistCUDNN

或者直接clone我在github上的这一例程： Tr-buaa/mnistCUDNN (github.com)

运行结果（部分）：

源码解读

从main函数开始看，首先是解析命令行、获取设备信息、cudnn版本等，这些就不展开描述了。

main函数中核心的代码如下：

            std::cout << "\nTesting single precision\n";
            network_t<float> mnist;
            Layer_t<float> conv1(1, 20, 5, conv1_bin, conv1_bias_bin, argv[0]);
            Layer_t<float> conv2(20, 50, 5, conv2_bin, conv2_bias_bin, argv[0]);
            Layer_t<float> ip1(800, 500, 1, ip1_bin, ip1_bias_bin, argv[0]);
            Layer_t<float> ip2(500, 10, 1, ip2_bin, ip2_bias_bin, argv[0]);
            get_path(image_path, first_image, argv[0]);
            i1 = mnist.classify_example(image_path.c_str(), conv1, conv2, ip1, ip2);

network_t<float> mnist;创建了一个名为mnist的对象，在class network_t中定义了cudnn中的各种Descriptor，以及各种前向传播的函数。

Layer_t<float> conv1创建了网络各层的对象，包括卷积层与全连接层，参数分别为input、output、kernel_dim、weight路径、bias路径、可执行文件路径。

get_path获取待输入的图片的绝对路径。

mnist.classify_example是核心功能函数，通过cudnn执行前向计算，并得到结果返回给i1。

network_t

template <class value_type>
class network_t {
    typedef typename ScaleFactorTypeMap<value_type>::Type scaling_type;
    int convAlgorithm;
    cudnnDataType_t dataType;
    cudnnTensorFormat_t tensorFormat;
    cudnnHandle_t cudnnHandle;
    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc, biasTensorDesc;
    cudnnFilterDescriptor_t filterDesc;
    cudnnConvolutionDescriptor_t convDesc;
    cudnnPoolingDescriptor_t poolingDesc;
    cudnnActivationDescriptor_t activDesc;
    cudnnLRNDescriptor_t normDesc;
    cublasHandle_t cublasHandle;

该class定义了各种cudnn需要的参数，包括handle、网络各层的Descriptor。

Handle是使用cudnn必须初始化的指针。

Descriptor是指向不透明结构的指针，用于描述各种对象（Tensor、Fliter、Pooling等）的尺寸、布局和数据类型。在cudnn的各种函数中都需要用到（创建实例、初始化实例等），使得cudnn库能够以通用的方式处理不同类型和布局的数据。

在使用描述符之前，通常需要按照以下步骤操作：

创建描述符：使用相应的创建函数，如cudnnCreateTensorDescriptor。
设置描述符：使用设置函数，如cudnnSetTensorNdDescriptor，根据需要提供数据类型、维度等信息。
使用描述符：在cuDNN函数调用中使用描述符，如卷积、池化等。
销毁描述符：使用结束后，通过相应的销毁函数释放资源，如cudnnDestroyTensorDescriptor。

在创建了network_t对象后，调用默认构造函数，创建了所有Handle与Deccriptor：

public:
    network_t() {
        convAlgorithm = -1;
        switch (sizeof(value_type)) {
            case 2:
                dataType = CUDNN_DATA_HALF;
                break;
            case 4:
                dataType = CUDNN_DATA_FLOAT;
                break;
            case 8:
                dataType = CUDNN_DATA_DOUBLE;
                break;
            default:
                FatalError("Unsupported data type");
        }
        tensorFormat = CUDNN_TENSOR_NCHW;
        createHandles();
    };
  
 private:
    void
    createHandles() {
        checkCUDNN(cudnnCreate(&cudnnHandle));
        checkCUDNN(cudnnCreateTensorDescriptor(&srcTensorDesc));
        checkCUDNN(cudnnCreateTensorDescriptor(&dstTensorDesc));
        checkCUDNN(cudnnCreateTensorDescriptor(&biasTensorDesc));
        checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc));
        checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
        checkCUDNN(cudnnCreatePoolingDescriptor(&poolingDesc));
        checkCUDNN(cudnnCreateActivationDescriptor(&activDesc));
        checkCUDNN(cudnnCreateLRNDescriptor(&normDesc));

        checkCublasErrors(cublasCreate(&cublasHandle));
    }

Layer_t

template <class value_type>
struct Layer_t {
    fp16Import_t fp16Import;
    int inputs;
    int outputs;

    // linear dimension (i.e. size is kernel_dim * kernel_dim)
    int kernel_dim;
    value_type *data_h, *data_d;
    value_type *bias_h, *bias_d;

    Layer_t()
        : data_h(NULL),
          data_d(NULL),
          bias_h(NULL),
          bias_d(NULL),
          inputs(0),
          outputs(0),
          kernel_dim(0),
          fp16Import(FP16_HOST) {}

    Layer_t(int _inputs,
            int _outputs,
            int _kernel_dim,
            const char* fname_weights,
            const char* fname_bias,
            const char* pname        = NULL,
            fp16Import_t _fp16Import = FP16_HOST)
        : inputs(_inputs), outputs(_outputs), kernel_dim(_kernel_dim) {
        fp16Import = _fp16Import;
        std::string weights_path, bias_path;
        if (pname != NULL) {
            get_path(weights_path, fname_weights, pname);
            get_path(bias_path, fname_bias, pname);
        } else {
            weights_path = fname_weights;
            bias_path    = fname_bias;
        }
        readAllocInit(weights_path.c_str(), inputs * outputs * kernel_dim * kernel_dim, &data_h, &data_d);
        readAllocInit(bias_path.c_str(), outputs, &bias_h, &bias_d);
    }

    ~Layer_t() {
        if (data_h != NULL) delete[] data_h;
        if (data_d != NULL) checkCudaErrors(cudaFree(data_d));
        if (bias_h != NULL) delete[] bias_h;
        if (bias_d != NULL) checkCudaErrors(cudaFree(bias_d));
    }

   private:
    void
    readAllocInit(const char* fname, int size, value_type** data_h, value_type** data_d) {
        readAllocMemcpy<value_type>(fname, size, data_h, data_d);
    }
};

创建Layer_t对象时，构造函数通过data、bias路径读取layer的数据，然后分配内存、将数据从host复制至device。

classify_example

int classify_example(const char* fname,
                 const Layer_t<value_type>& conv1,
                 const Layer_t<value_type>& conv2,
                 const Layer_t<value_type>& ip1,
                 const Layer_t<value_type>& ip2) {
    int n, c, h, w;
    value_type *srcData = NULL, *dstData = NULL;
    value_type imgData_h[IMAGE_H * IMAGE_W];

    readImage(fname, imgData_h);

    std::cout << "Performing forward propagation ...\n";

    checkCudaErrors(cudaMalloc((void **)&srcData, IMAGE_H * IMAGE_W * sizeof(value_type)));
    checkCudaErrors(cudaMemcpy(srcData, imgData_h, IMAGE_H * IMAGE_W * sizeof(value_type), cudaMemcpyHostToDevice));

    n = c = 1;
    h     = IMAGE_H;
    w     = IMAGE_W;
    convoluteForward(conv1, n, c, h, w, srcData, &dstData);
    poolForward(n, c, h, w, dstData, &srcData);

    convoluteForward(conv2, n, c, h, w, srcData, &dstData);
    poolForward(n, c, h, w, dstData, &srcData);

    fullyConnectedForward(ip1, n, c, h, w, srcData, &dstData);
    activationForward(n, c, h, w, dstData, &srcData);
    lrnForward(n, c, h, w, srcData, &dstData);

    fullyConnectedForward(ip2, n, c, h, w, dstData, &srcData);
    softmaxForward(n, c, h, w, srcData, &dstData);

    // cuDNN and cuBLAS library calls are asynchronous w.r.t. the host.
    // Need a device sync here before copying back the results.
    checkCudaErrors(cudaDeviceSynchronize());
    const int max_digits = 10;

    // Take care of half precision
    value_type result[max_digits];
    checkCudaErrors(cudaMemcpy(result, dstData, max_digits * sizeof(value_type), cudaMemcpyDeviceToHost));
    int id = 0;
    for (int i = 1; i < max_digits; i++) {
        if (Convert<scaling_type>(result[id]) < Convert<scaling_type>(result[i])) {
            id = i;
        }
    }

    std::cout << "Resulting weights from Softmax:" << std::endl;
    printDeviceVector(n * c * h * w, dstData);

    checkCudaErrors(cudaFree(srcData));
    checkCudaErrors(cudaFree(dstData));
    return id;
}

该函数首先通过FreeImage库读取了输入的图片，分配、复制了数据至device memory。然后按网络顺序执行了conv、fc、pooling等操作的forward运算，然后通过softmax得到的结果取概率最大值得到最终结果id。

接下来进入一个forward函数中具体查看。

convoluteForward

void convoluteForward(const Layer_t<value_type>& conv,
                     int& n,int& c,int& h,int& w,
                     value_type* srcData,value_type** dstData) {
    cudnnConvolutionFwdAlgo_t algo;

    setTensorDesc(srcTensorDesc, tensorFormat, dataType, n, c, h, w);

    const int tensorDims             = 4;
    int tensorOuputDimA[tensorDims]  = {n, c, h, w};
    const int filterDimA[tensorDims] = {conv.outputs, conv.inputs, conv.kernel_dim, conv.kernel_dim};

    checkCUDNN(cudnnSetFilterNdDescriptor(filterDesc, dataType, CUDNN_TENSOR_NCHW, 
                tensorDims, filterDimA));

    const int convDims           = 2;
    int padA[convDims]           = {0, 0};
    int filterStrideA[convDims]  = {1, 1};
    int upscaleA[convDims]       = {1, 1};
    cudnnDataType_t convDataType = dataType;

    // Math are done in FP32 when tensor are in FP16.
    if (dataType == CUDNN_DATA_HALF) {
        convDataType = CUDNN_DATA_FLOAT;
    }

    checkCUDNN(cudnnSetConvolutionNdDescriptor(convDesc, convDims, padA, filterStrideA, 
                upscaleA, CUDNN_CROSS_CORRELATION, convDataType));

    // find dimension of convolution output
    checkCUDNN(cudnnGetConvolutionNdForwardOutputDim(convDesc, srcTensorDesc, 
                filterDesc, tensorDims, tensorOuputDimA));
    n = tensorOuputDimA[0];
    c = tensorOuputDimA[1];
    h = tensorOuputDimA[2];
    w = tensorOuputDimA[3];

    setTensorDesc(dstTensorDesc, tensorFormat, dataType, n, c, h, w);

    if (convAlgorithm < 0) {
        int requestedAlgoCount = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
        int returnedAlgoCount  = -1;
        cudnnConvolutionFwdAlgoPerf_t results[2 * CUDNN_CONVOLUTION_FWD_ALGO_COUNT];

        // Choose the best according to the preference
        std::cout << "Testing cudnnGetConvolutionForwardAlgorithm_v7 ...\n";
        checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle,
                                                          srcTensorDesc,
                                                          filterDesc,
                                                          convDesc,
                                                          dstTensorDesc,
                                                          requestedAlgoCount,
                                                          &returnedAlgoCount,
                                                          results));
        for (int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex) {
            printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                   cudnnGetErrorString(results[algoIndex].status),
                   results[algoIndex].algo,
                   results[algoIndex].time,
                   (unsigned long long)results[algoIndex].memory);
        }

        // New way of finding the fastest config
        // Setup for findFastest call
        std::cout << "Testing cudnnFindConvolutionForwardAlgorithm ...\n";
        checkCUDNN(cudnnFindConvolutionForwardAlgorithm(cudnnHandle,
                                                        srcTensorDesc,
                                                        filterDesc,
                                                        convDesc,
                                                        dstTensorDesc,
                                                        requestedAlgoCount,
                                                        &returnedAlgoCount,
                                                        results));
        for (int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex) {
            printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                   cudnnGetErrorString(results[algoIndex].status),
                   results[algoIndex].algo,
                   results[algoIndex].time,
                   (unsigned long long)results[algoIndex].memory);
        }

        algo = results[0].algo;
    } else {
        algo = (cudnnConvolutionFwdAlgo_t)convAlgorithm;
    }

    resize(n * c * h * w, dstData);
    size_t sizeInBytes = 0;
    void* workSpace    = NULL;
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
        cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, &sizeInBytes));
    if (sizeInBytes != 0) {
        checkCudaErrors(cudaMalloc(&workSpace, sizeInBytes));
    }
    scaling_type alpha = scaling_type(1);
    scaling_type beta  = scaling_type(0);
    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
                                       &alpha,
                                       srcTensorDesc,
                                       srcData,
                                       filterDesc,
                                       conv.data_d,
                                       convDesc,
                                       algo,
                                       workSpace,
                                       sizeInBytes,
                                       &beta,
                                       dstTensorDesc,
                                       *dstData));
    addBias(dstTensorDesc, conv, c, *dstData);
    if (sizeInBytes != 0) {
        checkCudaErrors(cudaFree(workSpace));
    }
}

第4行，cudnnConvolutionFwdAlgo_t是一个列举了可用于前向卷积预算的不同算法的enum。

第6行，setTensorDesc用于创建Tensor描述符，函数中调用了cudnn_ops_library中的cudnnSetTensorNdDescriptor，初始化先前通过cudnnCreateTensorDescriptor创建的tensor描述符。

第6、12、26、37行，通过cudnn的set函数初始化srcTensor、Filter（卷积核）、Convolution、dstTensor。

第30行，cudnnGetConvolutionNdForwardOutputDim可以通过给定的输入、卷积、卷积核描述符返回卷积结果tensor的维度，以便事先分配适当的内存。

第39-84行，接下来if中的功能是：如果未设置卷积算法，则通过cudnnGetConvolutionForwardAlgorithm_v7和cudnnFindConvolutionForwardAlgorithm寻找最优算法，否则使用设置的卷积算法。

第86-93行，通过cudnnGetConvolutionForwardWorkspaceSize分配GPU内存空间，以便能够使用指定算法调用 cudnnConvolutionForward()。最后一个参数为需要分配的GPU内存空间大小。

第94-108行，执行Conv的前向计算，该函数各参数解释如下：

cudnnConvolutionForward(cudnnHandle, // 上下文句柄，封装cudnn运行时状态
                       &alpha, // 指向缩放因子的指针
                       srcTensorDesc, // 输入Tensor的描述符
                       srcData, // 与输入Tensor描述符关联的指向GPU内存数据的指针
                       filterDesc, // 输入卷积核的描述符
                       conv.data_d, // 指向数据的指针
                       convDesc, // 卷积操作的描述符
                       algo, // 所用的卷积算法enum
                       workSpace, // 指向指定算法所需的工作空间的指针（可能为NULL）
                       sizeInBytes, // 指定算法所需工作空间的大小
                       &beta, //指向缩放因子的指针
                       dstTensorDesc, // 输出Tensor的描述符，带有卷积结果
                       *dstData // output，指向输出Tensor关联的指向GPU内存数据的指针
                       )

第109行，调用cudnnAddTensor完成bias的Tensor add。

最后释放内存，这一层的前向计算结束，进入下一层的计算。值得注意的是，在全连接层的前向运算中，调用了cublas库的相关功能。

CUDNN例程解析 mnistCUDNN