02GPU线程模型

295 阅读3分钟

线程模型结构

1719404930420.png

  • grid:网格
  • block:线程块
  • 线程块是逻辑的划分,不是物理上的划分
  • <<<grid_size,block_size>>>
  • 最大的线程块大小:1024

一维线程模型

1719405265943.png

每个线程的唯一标识由执行配置 <<<grid_size, block_size>>> 确定;grid_size 和 block_size 保存在

内建变量(built-in variable)中,目前考虑的是一维的情况:

(1)gridDim.x:该变量的数值等于执行配置中变量grid_size的值;

(2)blockDim.x:该变量的数值等于执行配置中变量block_size的值。

线程索引保存成内建变量(built-in variable):

(1)blockIdx.x:该变量指定一个线程在一个网格中的线程块索引值,范围为0~ gridDim.x-1;

(2)threadIdx.x:该变量指定一个线程在一个线程块中的线程索引值,范围为0~ blockDim.x-1。

kernel_fun<<<2, 4>>>() ;

Idx = threadIdx.x + blockIdx.x * blockDim.x

二维线程模型

dim3 grid_size(2, 2);

dim3 block_size(4, 4);

int blockId = blockIdx.x + blockIdx.y * gridDim.x;

int threadId = threadIdx.y *blockDim.x + threadIdx.x;

int id = blockId * (blockDim.x * blockDim.y) + threadId;

三维线程模型

dim3 grid_size(2, 2, 2);

dim3 block_size(4, 4, 2);

int blockId = blockIdx.x + blockIdx.y * gridDim.x+ gridDim.x * gridDim.y * blockIdx.z;

int threadId = (threadIdx.z * (blockDim.x * blockDim.y))+ (threadIdx.y * blockDim.x) +threadIdx.x;

int id = blockId * (blockDim.x * blockDim.y * blockDim.z) + threadId;

GPU架构和计算能力

1719406838705.png

# GPU的属性

1719409109540.png

  • 计算能力(compute capability)
  • 全局内存(Amount of global memory)
  • 常量内存(Amount of constant memory)
  • 最大的网格大小(Maximum grid size)
  • 最大的线程块大小(Maximum block size)
  • SM的数量(Number of SMs)
  • 每一个block的最大的共享内存(Maximum amount of shared memory per block)
  • 每一个SM有的最大的共享内存(Maximum amount of shared memory per SM)
  • 每一个线程块的最大的寄存器数量(Maximum number of registers per block)
  • 每一个sm中的最大的寄存器数量(Maximum number of registers per SM)
  • 每一个block能够拥有的最大的线程数(Maximum number of threads per block)
  • 每一个sm能够拥有的最大的线程数(Maximum number of threads per SM)
/*
 * Query CUDA device 0 and print its key hardware properties
 * (compute capability, memory sizes, launch limits, SM resources).
 * Output lines correspond one-to-one to the attribute list above.
 */
int main(void)
{
    const int deviceId = 0;
    ErrorCheck(cudaSetDevice(deviceId), __FILE__, __LINE__);

    // Fetch the full property record for the selected device.
    cudaDeviceProp deviceProp;
    ErrorCheck(cudaGetDeviceProperties(&deviceProp, deviceId), __FILE__, __LINE__);

    printf("Device id:                                 %d\n", deviceId);
    printf("Device name:                               %s\n", deviceProp.name);
    printf("Compute capability:                        %d.%d\n",
        deviceProp.major, deviceProp.minor);
    // Byte counts converted to GB / KB for readability.
    printf("Amount of global memory:                   %g GB\n",
        deviceProp.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf("Amount of constant memory:                 %g KB\n",
        deviceProp.totalConstMem / 1024.0);
    printf("Maximum grid size:                         %d %d %d\n",
        deviceProp.maxGridSize[0],
        deviceProp.maxGridSize[1],
        deviceProp.maxGridSize[2]);
    printf("Maximum block size:                        %d %d %d\n",
        deviceProp.maxThreadsDim[0],
        deviceProp.maxThreadsDim[1],
        deviceProp.maxThreadsDim[2]);
    printf("Number of SMs:                             %d\n",
        deviceProp.multiProcessorCount);
    printf("Maximum amount of shared memory per block: %g KB\n",
        deviceProp.sharedMemPerBlock / 1024.0);
    printf("Maximum amount of shared memory per SM:    %g KB\n",
        deviceProp.sharedMemPerMultiprocessor / 1024.0);
    // Register counts reported in units of 1024 registers ("K").
    printf("Maximum number of registers per block:     %d K\n",
        deviceProp.regsPerBlock / 1024);
    printf("Maximum number of registers per SM:        %d K\n",
        deviceProp.regsPerMultiprocessor / 1024);
    printf("Maximum number of threads per block:       %d\n",
        deviceProp.maxThreadsPerBlock);
    printf("Maximum number of threads per SM:          %d\n",
        deviceProp.maxThreadsPerMultiProcessor);

    return 0;
}

组织线程模型


#include <stdio.h>
#include "../tools/common.cuh"

// Element-wise matrix addition: C = A + B.
// Expects a 2D grid of 2D blocks covering an nx (columns) x ny (rows)
// matrix stored in row-major order in global memory. Threads that fall
// outside the matrix (when nx/ny are not multiples of the block
// dimensions) exit via the bounds guard.
__global__ void addMatrix(int *A, int *B, int *C, const int nx, const int ny)
{
    // Global 2D coordinates of this thread.
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix < nx && iy < ny)
    {
        // Row-major linear index; computed only for in-range threads.
        unsigned int idx = iy * nx + ix;
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Host driver for the addMatrix kernel: allocates and initializes a
 * 16x8 int matrix pair on host and device, launches the kernel with a
 * 2D grid sized by ceil-division, copies the result back, and prints
 * the first 10 elements. All CUDA API calls and the kernel launch are
 * error-checked via the project's ErrorCheck helper.
 */
int main(void)
{
    // 1. Select the GPU device (project helper).
    setGPU();

    // 2. Allocate host and device memory, then initialize.
    int nx = 16;
    int ny = 8;
    int nxy = nx * ny;
    size_t stBytesCount = nxy * sizeof(int);

    // (1) Allocate and initialize host memory.
    int *ipHost_A, *ipHost_B, *ipHost_C;
    ipHost_A = (int *)malloc(stBytesCount);
    ipHost_B = (int *)malloc(stBytesCount);
    ipHost_C = (int *)malloc(stBytesCount);
    if (ipHost_A != NULL && ipHost_B != NULL && ipHost_C != NULL)
    {
        for (int i = 0; i < nxy; i++)
        {
            ipHost_A[i] = i;
            ipHost_B[i] = i + 1;
        }
        memset(ipHost_C, 0, stBytesCount);
    }
    else
    {
        printf("Fail to allocate host memory!\n");
        exit(-1);
    }

    // (2) Allocate and initialize device memory.
    int *ipDevice_A, *ipDevice_B, *ipDevice_C;
    ErrorCheck(cudaMalloc((int**)&ipDevice_A, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMalloc((int**)&ipDevice_B, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMalloc((int**)&ipDevice_C, stBytesCount), __FILE__, __LINE__);
    if (ipDevice_A != NULL && ipDevice_B != NULL && ipDevice_C != NULL)
    {
        ErrorCheck(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
        ErrorCheck(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
        // NOTE(review): C is fully overwritten by the kernel, so this copy
        // of the zeroed output buffer is not strictly necessary.
        ErrorCheck(cudaMemcpy(ipDevice_C, ipHost_C, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    }
    else
    {
        printf("Fail to allocate memory\n");
        free(ipHost_A);
        free(ipHost_B);
        free(ipHost_C);
        exit(1);
    }

    // 3. Compute on the GPU: 4x4 blocks, grid sized by ceil-division so
    // the whole nx x ny matrix is covered even for non-multiple sizes.
    dim3 block(4, 4);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    printf("Thread config:grid:<%d, %d>, block:<%d, %d>\n", grid.x, grid.y, block.x, block.y);

    addMatrix<<<grid, block>>>(ipDevice_A, ipDevice_B, ipDevice_C, nx, ny);
    // Kernel launches do not return an error code: catch launch-config
    // errors immediately, then synchronize to surface async execution
    // errors before reading results.
    ErrorCheck(cudaGetLastError(), __FILE__, __LINE__);
    ErrorCheck(cudaDeviceSynchronize(), __FILE__, __LINE__);

    // 4. Copy the result back (blocking) and print the first 10 entries.
    ErrorCheck(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost), __FILE__, __LINE__);
    for (int i = 0; i < 10; i++)
    {
        printf("id=%d, matrix_A=%d, matrix_B=%d, result=%d\n", i + 1, ipHost_A[i], ipHost_B[i], ipHost_C[i]);
    }

    // 5. Release host and device resources.
    free(ipHost_A);
    free(ipHost_B);
    free(ipHost_C);

    ErrorCheck(cudaFree(ipDevice_A), __FILE__, __LINE__);
    ErrorCheck(cudaFree(ipDevice_B), __FILE__, __LINE__);
    ErrorCheck(cudaFree(ipDevice_C), __FILE__, __LINE__);

    ErrorCheck(cudaDeviceReset(), __FILE__, __LINE__);
    return 0;
}