2021.4.2
- GPU 的内存（显存）需要由 GPU 端来操作：主机端代码不能直接解引用设备指针，只能通过 cudaMemcpy 等运行时 API 或在 kernel 中读写
// Allocates two device buffers and copies the first element of dev_2 into
// dev_1. Device pointers cannot be dereferenced by host code, so the copy is
// done through the CUDA runtime instead of `dev_1[0]=dev_2[0]`.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Both variables must be pointers: `int* dev_1,dev_2;` would declare
    // dev_2 as a plain int.
    int *dev_1 = nullptr;
    int *dev_2 = nullptr;

    if (cudaMalloc((void**)&dev_1, sizeof(int) * 100) != cudaSuccess)
    {
        return 1;
    }
    if (cudaMalloc((void**)&dev_2, sizeof(int) * 100) != cudaSuccess)
    {
        cudaFree(dev_1);
        return 1;
    }

    // Give the source buffer a defined value before it is read.
    cudaError_t status = cudaMemset(dev_2, 0, sizeof(int) * 100);
    if (status == cudaSuccess)
    {
        // Device-to-device copy of one int: the valid form of dev_1[0] = dev_2[0].
        status = cudaMemcpy(dev_1, dev_2, sizeof(int), cudaMemcpyDeviceToDevice);
    }

    cudaFree(dev_2);
    cudaFree(dev_1);
    return status == cudaSuccess ? 0 : 1;
}
- kernel 中分配不了大块内存（设备端分配来自大小有限的设备堆，默认约 8MB，可在主机端用 cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...) 调大）
// Kernel used to probe large in-kernel allocations: each thread that runs it
// requests sizeof(int) * 100000000 bytes through the device-side cudaMalloc.
// NOTE(review): the accompanying note says this request cannot be satisfied;
// presumably the device heap is too small — confirm against the CUDA docs.
__global__ void test_kernal()
{
    int* dev_1 = nullptr;
    auto cudaStatus = cudaMalloc((void**)&dev_1, sizeof(int) * 100000000);
    // Inspect the status instead of discarding it, and release the buffer
    // if the allocation unexpectedly succeeds so it is not leaked.
    if (cudaStatus == cudaSuccess)
    {
        cudaFree(dev_1);
    }
}
2021.4.14
- kernel 中 for 循环的迭代次数有限制（推测：循环过长会使 kernel 运行时间过久，可能触发驱动的超时保护，待确认）
// Kernel whose only work is an empty counting loop of 100000 iterations;
// it reads and writes no memory.
__global__ void test_kernal()
{
    int iteration = 0;
    while (iteration < 100000)
    {
        ++iteration;
    }
}
- kernel 中调用其他 kernel（动态并行）也是有限制的，例如嵌套深度和待执行的子 kernel 启动数量都有上限
// Empty kernel: it does no work and only serves as the innermost launch
// target (it is launched from t2_kernal).
__global__ void t1_kernal()
{
}
// Middle level of the nested-launch experiment: every thread that executes
// this kernel launches t1_kernal with 1024 blocks of 1024 threads.
__global__ void t2_kernal()
{
    const dim3 child_grid(1024);
    const dim3 child_block(1024);
    t1_kernal<<<child_grid, child_block>>>();
}
// Top level of the nested-launch experiment: every thread that executes this
// kernel launches t2_kernal with 1024 blocks of 1024 threads.
__global__ void main_kernal()
{
    const int child_blocks = 1024;
    const int child_threads_per_block = 1024;
    t2_kernal<<<child_blocks, child_threads_per_block>>>();
}
// Host entry point of the nested-launch experiment: launches main_kernal with
// 1024 blocks of 1024 threads and reports whether the launch and the
// execution succeeded. Returns 0 on success and 1 on any CUDA error.
int main()
{
    main_kernal<<<1024,1024>>>();

    // A launch does not return an error itself: configuration errors are
    // reported by cudaGetLastError(), execution errors by the next
    // synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    return status == cudaSuccess ? 0 : 1;
}
- 关于线性插值
% Reference call (presumably MATLAB's interp1): linear interpolation of the
% samples (x_src, y_src) evaluated at the query points x_dest.
y_dest=interp1(x_src,y_src,x_dest,'linear');
// Precomputes, for every query point in `dest`, which interval of `src`
// should be used for linear interpolation and the weight inside it.
//
//   src    : sample positions, `size` elements
//   dest   : query positions, `size` elements
//   indexs : output, indexs[i] is the index k of the interval
//            [src[k], src[k+1]] chosen for dest[i]; always in [0, size-2]
//   ratios : output, ratios[i] = (dest[i] - src[k]) / (src[k+1] - src[k]);
//            outside [0, 1] when dest[i] lies outside the chosen interval
//   size   : element count shared by all four arrays
//
// The interval index only ever moves forward while scanning `dest`, so the
// search is a single pass over both arrays.
// NOTE(review): this presumably expects `src` and `dest` to be ordered in the
// same direction — confirm with callers.
// With fewer than two samples no interval exists; the function returns
// without writing to the outputs.
__host__ void interp1_prepare(float *src, float *dest, int *indexs, float *ratios, int size)
{
    if (size < 2)
    {
        return;
    }

    // Index of the last valid interval [src[size-2], src[size-1]].
    const int last_interval = size - 2;

    int src_index = 0;
    float src_min = src[src_index];
    float src_max = src[src_index + 1];
    for (int i = 0; i < size; i++)
    {
        const float now_dest = dest[i];
        while (true)
        {
            // The query lies inside (or on the border of) the current interval.
            if ((now_dest - src_min) * (now_dest - src_max) <= 0)
            {
                break;
            }

            // At the first or last interval, a query on the near side of the
            // interval start (for either ordering of src) keeps the interval.
            const bool at_edge_interval = (src_index == 0 || src_index == last_interval);
            const bool before_interval_start =
                (now_dest <= src_min && src_min < src_max) ||
                (now_dest >= src_min && src_min > src_max);
            if (at_edge_interval && before_interval_start)
            {
                break;
            }

            // Stop at the last interval: advancing further would read
            // src[size], one element past the end of the array.
            if (src_index >= last_interval)
            {
                break;
            }
            src_index++;
            src_min = src[src_index];
            src_max = src[src_index + 1];
        }
        indexs[i] = src_index;
        ratios[i] = (now_dest - src_min) / (src_max - src_min);
    }
}
// Example use of interp1_prepare. The positions and the ratios must be float
// arrays to match the function's float* parameters (an int array would also
// truncate 1.1, 2.3 and 3.5).
// For these inputs the result is indexs = {0, 1, 1} and
// ratios ~= {0.1, 0.15, 0.75}.
float x_src[3] = {1.0f, 2.0f, 4.0f};
float x_dest[3] = {1.1f, 2.3f, 3.5f};
int indexs[3];
float ratios[3];
interp1_prepare(x_src, x_dest, indexs, ratios, 3);
// Applies precomputed linear-interpolation indices and ratios to `src`.
//
// Each thread flattens its 3D block index and 3D thread index into one linear
// id and, when that id is below `size`, produces one element of `dest`.
// The id is split into a position inside a segment of `row_length` elements
// (id % row_length) and a segment number (id / row_length). indexs/ratios are
// looked up by the position, while src/dest are addressed inside the segment.
//
//   indexs, ratios : per-position interval index and weight
//   src, dest      : input and output values, addressed as
//                    segment * row_length + offset
//   row_length     : number of elements per segment
//   col_length     : not used by this kernel
//   size           : number of output elements to produce
__global__ void interp1_liear_kernal(int *indexs, float *ratios, float *src, float *dest, int row_length,int col_length, int size)
{
    const int block_id = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
    const int thread_in_block = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const int global_id = block_id * (blockDim.x * blockDim.y * blockDim.z) + thread_in_block;

    // Threads beyond the requested element count have nothing to do.
    if (global_id >= size)
    {
        return;
    }

    const int position = global_id % row_length;
    const int segment = global_id / row_length;
    const int segment_base = segment * row_length;

    const int interval = indexs[position];
    const float weight = ratios[position];
    const float lower = src[segment_base + interval];
    const float upper = src[segment_base + interval + 1];

    dest[segment_base + position] = lower + weight * (upper - lower);
}
// Example launch: one block of 16 threads, with row_length = 4, col_length = 4
// and size = 16, so each thread produces one output element.
// NOTE(review): the *_gpu arguments are presumably device buffers prepared
// elsewhere — confirm.
interp1_liear_kernal<<<1,16>>>(indexs_gpu,ratios_gpu,y_src_gpu,y_dest_gpu,4,4,16);
- 用代码来描述NVIDIA GPU
// Pseudo-code model of a GPU: a collection of SM units plus one global memory.
class GPU
{
// The SM units that make up the device.
SM[] SM_Array;
// Memory object held once per GPU (as opposed to the per-SM Shared_Memory).
GlobalMemory Global_Memory;
}
// Pseudo-code model of one SM: a fixed set of 192 SP units, a shared memory
// and a routine that distributes threads over the SPs.
class SM
{
// The SP units owned by this SM.
SP[192] SP_Array;
// Memory object held once per SM.
SharedMemory Shared_Memory;
// NOTE(review): presumably the warp size ("Wrap" looks like a typo for
// "Warp"); the field is not used by the methods below.
int WrapSize=32;
// Runs the given threads in batches of SP_Array.Length: within a batch,
// thread i+j is handed to SP j.
public void Run(Thread[] threads)
{
for(int i=0;i<threads.Length;i+=SP_Array.Length)
{
// The final batch may be partial when threads.Length is not a multiple of
// SP_Array.Length, so stop at the end of the thread array.
for(int j=0;j<SP_Array.Length && i+j<threads.Length;j++)
{
SP_Array[j].Run(threads[i+j]);
}
}
}
// Runs all threads of a block on this SM.
public void Run(Block block)
{
Run(block.Threads);
}
}
// Pseudo-code model of one SP, the unit that SM.Run hands individual threads to.
class SP
{
// Placeholder for executing a single thread; the body is intentionally empty.
public void Run(Thread thread)
{
}
}