线程模型结构
- grid:网格
- block:线程块
- 线程块是逻辑的划分,不是物理上的划分
- <<<grid_size,block_size>>>
- 最大的线程块大小:1024
一维线程模型
内建变量(built-in variable),目前考虑的是一维的情况:
(1)gridDim.x:该变量的数值等于执行配置中变量grid_size的值;
(2)blockDim.x:该变量的数值等于执行配置中变量block_size的值。
线程索引保存成内建变量(built-in variable):
(1)blockIdx.x:该变量指定一个线程在一个网格中的线程块索引值,范围为0~ gridDim.x-1;
(2)threadIdx.x:该变量指定一个线程在一个线程块中的线程索引值,范围为0~ blockDim.x-1。
kernel_fun<<<2, 4>>>() ;
Idx = threadIdx.x + blockIdx.x * blockDim.x
二维线程模型
dim3 grid_size(2, 2);
dim3 block_size(4, 4);
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = threadIdx.y *blockDim.x + threadIdx.x;
int id = blockId * (blockDim.x * blockDim.y) + threadId;
三维线程模型
dim3 grid_size(2, 2, 2);
dim3 block_size(4, 4, 2);
int blockId = blockIdx.x + blockIdx.y * gridDim.x+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = (threadIdx.z * (blockDim.x * blockDim.y))+ (threadIdx.y * blockDim.x) +threadIdx.x;
int id = blockId * (blockDim.x * blockDim.y * blockDim.z) + threadId;
GPU架构和计算能力
- 计算能力(compute capability)
- 全局内存(Amount of global memory)
- 常量内存(Amount of constant memory)
- 最大的网格大小(Maximum grid size)
- 最大的线程块大小(Maximum block size)
- SM的数量(Number of SMs)
- 每一个block的最大的共享内存(Maximum amount of shared memory per block)
- 每一个SM有的最大的共享内存(Maximum amount of shared memory per SM)
- 每一个线程块的最大的寄存器数量(Maximum number of registers per block)
- 每一个sm中的最大的寄存器数量(Maximum number of registers per SM)
- 每一个block能够拥有的最大的线程数(Maximum number of threads per block)
- 每一个sm能够拥有的最大的线程数(Maximum number of threads per SM)
// Query CUDA device 0 and print its key hardware properties.
// Requires the project-local ErrorCheck(err, file, line) helper.
int main(void)
{
    const int device_id = 0;
    ErrorCheck(cudaSetDevice(device_id), __FILE__, __LINE__);

    // Fetch the full property structure for the selected device.
    cudaDeviceProp prop;
    ErrorCheck(cudaGetDeviceProperties(&prop, device_id), __FILE__, __LINE__);

    // Unit-conversion divisors (bytes -> KB / GB).
    const double kBytesPerKB = 1024.0;
    const double kBytesPerGB = 1024.0 * 1024.0 * 1024.0;

    printf("Device id: %d\n", device_id);
    printf("Device name: %s\n", prop.name);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    printf("Amount of global memory: %g GB\n",
           prop.totalGlobalMem / kBytesPerGB);
    printf("Amount of constant memory: %g KB\n",
           prop.totalConstMem / kBytesPerKB);
    printf("Maximum grid size: %d %d %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("Maximum block size: %d %d %d\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Number of SMs: %d\n", prop.multiProcessorCount);
    printf("Maximum amount of shared memory per block: %g KB\n",
           prop.sharedMemPerBlock / kBytesPerKB);
    printf("Maximum amount of shared memory per SM: %g KB\n",
           prop.sharedMemPerMultiprocessor / kBytesPerKB);
    // Register counts are reported in units of 1024 registers ("K");
    // integer division intentionally matches the original output.
    printf("Maximum number of registers per block: %d K\n",
           prop.regsPerBlock / 1024);
    printf("Maximum number of registers per SM: %d K\n",
           prop.regsPerMultiprocessor / 1024);
    printf("Maximum number of threads per block: %d\n",
           prop.maxThreadsPerBlock);
    printf("Maximum number of threads per SM: %d\n",
           prop.maxThreadsPerMultiProcessor);
    return 0;
}
组织线程模型
#include <stdio.h>
#include "../tools/common.cuh"
// Element-wise integer matrix addition: C = A + B.
//
// Expects a 2D launch configuration; each thread handles one element of
// an nx-by-ny matrix stored in row-major order (iy * nx + ix).
// Threads whose coordinates fall outside the matrix (the grid is
// rounded up on the host side) do nothing.
__global__ void addMatrix(int *A, int *B, int *C, const int nx, const int ny)
{
    // Global 2D coordinates of this thread.
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // Bounds guard: only compute the flat index for in-range threads.
    // (Original computed idx unconditionally and had a stray ';;'.)
    if (ix < nx && iy < ny)
    {
        const unsigned int idx = iy * nx + ix; // row-major flat index
        C[idx] = A[idx] + B[idx];
    }
}
// Host driver: allocates an nx-by-ny integer matrix on host and device,
// launches addMatrix on the GPU, and prints the first few results.
// Requires the project-local setGPU() and ErrorCheck() helpers.
int main(void)
{
    // 1. Select the GPU device.
    setGPU();

    // 2. Allocate host and device memory, then initialize.
    const int nx = 16;
    const int ny = 8;
    const int nxy = nx * ny;
    const size_t stBytesCount = nxy * sizeof(int);

    // (1) Allocate and initialize host memory.
    int *ipHost_A, *ipHost_B, *ipHost_C;
    ipHost_A = (int *)malloc(stBytesCount);
    ipHost_B = (int *)malloc(stBytesCount);
    ipHost_C = (int *)malloc(stBytesCount);
    if (ipHost_A != NULL && ipHost_B != NULL && ipHost_C != NULL)
    {
        for (int i = 0; i < nxy; i++)
        {
            ipHost_A[i] = i;
            ipHost_B[i] = i + 1;
        }
        memset(ipHost_C, 0, stBytesCount);
    }
    else
    {
        printf("Fail to allocate host memory!\n");
        exit(-1);
    }

    // (2) Allocate and initialize device memory.
    // Initialize to NULL so the checks below never read an
    // indeterminate pointer (the original left these uninitialized).
    int *ipDevice_A = NULL, *ipDevice_B = NULL, *ipDevice_C = NULL;
    ErrorCheck(cudaMalloc((int**)&ipDevice_A, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMalloc((int**)&ipDevice_B, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMalloc((int**)&ipDevice_C, stBytesCount), __FILE__, __LINE__);
    if (ipDevice_A != NULL && ipDevice_B != NULL && ipDevice_C != NULL)
    {
        ErrorCheck(cudaMemcpy(ipDevice_A, ipHost_A, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
        ErrorCheck(cudaMemcpy(ipDevice_B, ipHost_B, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
        ErrorCheck(cudaMemcpy(ipDevice_C, ipHost_C, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    }
    else
    {
        printf("Fail to allocate memory\n");
        free(ipHost_A);
        free(ipHost_B);
        free(ipHost_C);
        exit(1);
    }

    // 3. Compute on the GPU: one thread per matrix element, grid
    // rounded up so the whole matrix is covered.
    dim3 block(4, 4);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    printf("Thread config:grid:<%d, %d>, block:<%d, %d>\n", grid.x, grid.y, block.x, block.y);

    addMatrix<<<grid, block>>>(ipDevice_A, ipDevice_B, ipDevice_C, nx, ny); // launch kernel
    // Kernel launches return no status directly: catch launch-config
    // errors explicitly (the original silently dropped them).
    ErrorCheck(cudaGetLastError(), __FILE__, __LINE__);

    // Blocking copy back: also synchronizes with the kernel.
    ErrorCheck(cudaMemcpy(ipHost_C, ipDevice_C, stBytesCount, cudaMemcpyDeviceToHost), __FILE__, __LINE__);
    for (int i = 0; i < 10; i++)
    {
        printf("id=%d, matrix_A=%d, matrix_B=%d, result=%d\n", i + 1, ipHost_A[i], ipHost_B[i], ipHost_C[i]);
    }

    // 4. Release host and device resources.
    free(ipHost_A);
    free(ipHost_B);
    free(ipHost_C);
    ErrorCheck(cudaFree(ipDevice_A), __FILE__, __LINE__);
    ErrorCheck(cudaFree(ipDevice_B), __FILE__, __LINE__);
    ErrorCheck(cudaFree(ipDevice_C), __FILE__, __LINE__);
    ErrorCheck(cudaDeviceReset(), __FILE__, __LINE__);
    return 0;
}