hw3的目标是建立一个简单的支持库:n维数组(又名NDArray)。之前使用的是numpy,hw3也要求支持CPU和GPU,各自实现Compact、元素操作、约减、矩阵乘等功能。
文件目录
hw3目录下只包括与数组相关的,还有一定的C和cu代码,以及配置文件。
.
├── CMakeLists.txt
├── hw3.ipynb
├── Makefile
├── python
│ └── needle
│ ├── autograd.py
│ ├── backend_ndarray
│ │ ├── __init__.py
│ │ ├── ndarray_backend_numpy.py
│ │ └── ndarray.py
│ ├── backend_numpy.py
│ ├── backend_selection.py
│ ├── data.py
│ ├── __init__.py
│ ├── init.py
│ ├── nn.py
│ ├── ops.py
│ └── optim.py
├── src
│ ├── ndarray_backend_cpu.cc
│ └── ndarray_backend_cuda.cu
└── tests
├── __pycache__
│ ├── test_ndarray.cpython-38-pytest-6.1.1.pyc
│ └── test_ndarray.cpython-38-pytest-7.1.2.pyc
└── test_ndarray.py
6 directories, 20 files
Getting familiar with the NDArray class
首先熟悉作为作业起点提供的ndarray.py文件,里面主要包括BackendDevice和NDArray两个类。
test_ndarray.py中需要先把python目录加入模块搜索路径:
import sys
sys.path.append('./python')
Part 1: Python array operations
ndarray.py文件
reshape
def reshape(self, new_shape):
    """Return a view of this array with shape `new_shape`.

    Computes the compact (row-major) strides for `new_shape` and
    reinterprets the existing buffer via `as_strided`; no data is copied.

    Raises:
        ValueError: if `new_shape` does not have the same total number of
            elements as the current shape (the original silently produced
            a garbage view in that case).
    """
    ### BEGIN YOUR SOLUTION
    if math.prod(new_shape) != math.prod(self.shape):
        raise ValueError(
            "cannot reshape array of size %d into shape %s"
            % (math.prod(self.shape), tuple(new_shape))
        )
    # NOTE(review): this mapping is only valid when the array is compact
    # (row-major, no gaps) — confirm callers compact first, or add an
    # is_compact() check here.
    return self.as_strided(new_shape, NDArray.compact_strides(new_shape))
    ### END YOUR SOLUTION
permute
def permute(self, new_axes):
    """Return a view with the axes reordered according to `new_axes`.

    No data is moved: the view shares the underlying buffer and simply
    reads the existing shape/strides in the new axis order.
    """
    ### BEGIN YOUR SOLUTION
    shape = tuple(self.shape[axis] for axis in new_axes)
    strides = tuple(self.strides[axis] for axis in new_axes)
    return self.as_strided(shape, strides)
    ### END YOUR SOLUTION
broadcast_to
def broadcast_to(self, new_shape):
    """Broadcast size-1 axes of this array up to `new_shape`.

    `new_shape` must have the same number of dimensions; every axis must
    either already match or be of size 1 here.  Broadcast axes get
    stride 0, so the single element is re-read — no copy is made.
    """
    ### BEGIN YOUR SOLUTION
    strides = []
    for axis, size in enumerate(self.shape):
        assert size == 1 or new_shape[axis] == size
        # Stride 0 makes every index along a broadcast axis hit the
        # same underlying element.
        strides.append(0 if size == 1 else self.strides[axis])
    return self.as_strided(new_shape, tuple(strides))
    ### END YOUR SOLUTION
__getitem__
def __getitem__(self, idxs):
    """Slice the array, returning a view (shared buffer, no copy).

    Integer indices are normalized to one-element slices, so every axis
    is indexed by a slice with non-negative start/stop/step.  The result
    is built by adjusting offset, shape and strides only.
    """
    # handle singleton as tuple, everything as slices
    if not isinstance(idxs, tuple):
        idxs = (idxs,)
    idxs = tuple(
        [
            self.process_slice(s, i) if isinstance(s, slice) else slice(s, s + 1, 1)
            for i, s in enumerate(idxs)
        ]
    )
    assert len(idxs) == self.ndim, "Need indexes equal to number of dimensions"
    ### BEGIN YOUR SOLUTION
    offset = 0
    shape = []
    strides = []
    for axis, sl in enumerate(idxs):
        # Skipping to sl.start moves the view's origin forward.
        offset += sl.start * self.strides[axis]
        # Number of elements selected: ceil((stop - start) / step).
        shape.append(math.ceil((sl.stop - sl.start) / sl.step))
        # Stepping by sl.step multiplies the per-axis stride.
        strides.append(sl.step * self.strides[axis])
    return NDArray.make(tuple(shape), tuple(strides), self.device, self._handle, offset)
    ### END YOUR SOLUTION
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "(permute or reshape or broadcast or getitem) and cpu and not compact"
Part 2: CPU Backend - Compact and setitem
void Compact(const AlignedArray& a, AlignedArray* out, std::vector<uint32_t> shape,
             std::vector<uint32_t> strides, size_t offset) {
  /// BEGIN YOUR SOLUTION
  // Gather the strided view described by (shape, strides, offset) into the
  // contiguous buffer `out`, enumerating elements in row-major order.
  const int ndim = static_cast<int>(shape.size());
  for (size_t cnt = 0; cnt < out->size; cnt++) {
    // Decode the row-major linear index `cnt` into per-axis coordinates
    // (last axis varies fastest) and accumulate the strided source index.
    size_t src = offset;
    size_t rem = cnt;
    for (int ax = ndim - 1; ax >= 0; ax--) {
      src += (rem % shape[ax]) * strides[ax];
      rem /= shape[ax];
    }
    out->ptr[cnt] = a.ptr[src];
  }
  /// END YOUR SOLUTION
}
void EwiseSetitem(const AlignedArray& a, AlignedArray* out, std::vector<uint32_t> shape,
                  std::vector<uint32_t> strides, size_t offset) {
  /// BEGIN YOUR SOLUTION
  // Scatter the compact buffer `a` into the strided view of `out` described
  // by (shape, strides, offset) — the inverse of Compact.
  const int ndim = static_cast<int>(shape.size());
  for (size_t cnt = 0; cnt < a.size; cnt++) {
    // Decode the row-major linear index `cnt` into the strided destination.
    size_t dst = offset;
    size_t rem = cnt;
    for (int ax = ndim - 1; ax >= 0; ax--) {
      dst += (rem % shape[ax]) * strides[ax];
      rem /= shape[ax];
    }
    out->ptr[dst] = a.ptr[cnt];
  }
  /// END YOUR SOLUTION
}
void ScalarSetitem(const size_t size, scalar_t val, AlignedArray* out, std::vector<uint32_t> shape,
                   std::vector<uint32_t> strides, size_t offset) {
  /// BEGIN YOUR SOLUTION
  // Write `val` into every element of the strided view of `out` described
  // by (shape, strides, offset).  `size` is the number of view elements.
  const int ndim = static_cast<int>(shape.size());
  for (size_t cnt = 0; cnt < size; cnt++) {
    // Decode the row-major linear index `cnt` into the strided destination.
    size_t dst = offset;
    size_t rem = cnt;
    for (int ax = ndim - 1; ax >= 0; ax--) {
      dst += (rem % shape[ax]) * strides[ax];
      rem /= shape[ax];
    }
    out->ptr[dst] = val;
  }
  /// END YOUR SOLUTION
}
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "(compact or setitem) and cpu"
Part 3: CPU Backend - Elementwise and scalar operations
任务3用C++实现元素或标量操作
EwiseMul(),ScalarMul()EwiseDiv(),ScalarDiv()ScalarPower()EwiseMaximum(),ScalarMaximum()EwiseEq(),ScalarEq()EwiseGe(),ScalarGe()EwiseLog()EwiseExp()EwiseTanh()
// 1.EwiseMul, ScalarMul
// Elementwise product: out[i] = a[i] * b[i].
void EwiseMul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = a.ptr[idx] * b.ptr[idx];
  }
}
// Multiply every element of `a` by the scalar `val`.
void ScalarMul(const AlignedArray& a, scalar_t val, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = a.ptr[idx] * val;
  }
}
// 2.EwiseDiv, ScalarDiv
// Elementwise quotient: out[i] = a[i] / b[i].
void EwiseDiv(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = a.ptr[idx] / b.ptr[idx];
  }
}
// Divide every element of `a` by the scalar `val`.
void ScalarDiv(const AlignedArray& a, scalar_t val, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = a.ptr[idx] / val;
  }
}
// 3.ScalarPower
// Raise every element of `a` to the power `val`: out[i] = a[i] ** val.
void ScalarPower(const AlignedArray& a, scalar_t val, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = pow(a.ptr[idx], val);
  }
}
// 4.EwiseMaximum, ScalarMaximum
// Elementwise maximum of two arrays.
void EwiseMaximum(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = std::max(a.ptr[idx], b.ptr[idx]);
  }
}
// Maximum of each element with the scalar `val`.
void ScalarMaximum(const AlignedArray& a, scalar_t val, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = std::max(a.ptr[idx], val);
  }
}
// 5.EwiseEq, ScalarEq — equality comparison, result stored as 0.0 / 1.0.
void EwiseEq(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = scalar_t(a.ptr[idx] == b.ptr[idx]);
  }
}
// Compare each element with the scalar `val` for equality.
void ScalarEq(const AlignedArray& a, scalar_t val, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = scalar_t(a.ptr[idx] == val);
  }
}
// 6.EwiseGe, ScalarGe — greater-or-equal comparison, result as 0.0 / 1.0.
void EwiseGe(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = scalar_t(a.ptr[idx] >= b.ptr[idx]);
  }
}
// Compare each element against the scalar `val` (a[i] >= val).
void ScalarGe(const AlignedArray& a, scalar_t val, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = scalar_t(a.ptr[idx] >= val);
  }
}
// 7.EwiseLog
// Natural logarithm applied elementwise.
void EwiseLog(const AlignedArray& a, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = log(a.ptr[idx]);
  }
}
// 8.EwiseExp
// Exponential function applied elementwise.
void EwiseExp(const AlignedArray& a, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = exp(a.ptr[idx]);
  }
}
// 9.EwiseTanh
// Hyperbolic tangent applied elementwise.
void EwiseTanh(const AlignedArray& a, AlignedArray* out) {
  const size_t n = a.size;
  for (size_t idx = 0; idx != n; ++idx) {
    out->ptr[idx] = tanh(a.ptr[idx]);
  }
}
把文件末尾PYBIND11_MODULE中这些被注释掉的注册语句取消注释,否则编译出的Python模块里找不到上面实现的函数。
m.def("ewise_mul", EwiseMul);
m.def("scalar_mul", ScalarMul);
m.def("ewise_div", EwiseDiv);
m.def("scalar_div", ScalarDiv);
m.def("scalar_power", ScalarPower);
m.def("ewise_maximum", EwiseMaximum);
m.def("scalar_maximum", ScalarMaximum);
m.def("ewise_eq", EwiseEq);
m.def("scalar_eq", ScalarEq);
m.def("ewise_ge", EwiseGe);
m.def("scalar_ge", ScalarGe);
m.def("ewise_log", EwiseLog);
m.def("ewise_exp", EwiseExp);
m.def("ewise_tanh", EwiseTanh);
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "(ewise_fn or ewise_max or log or exp or tanh or (scalar and not setitem)) and cpu"
有一个warn
Part 4: CPU Backend - Reductions
在指定维度上进行约简,求和或者最大值(给了步长)
void ReduceMax(const AlignedArray& a, AlignedArray* out, size_t reduce_size) {
/// BEGIN YOUR SOLUTION
for (size_t i = 0; i < out->size; i++) {
auto m = a.ptr[i * reduce_size];
for (size_t j = 1; j < reduce_size; j++) {
m = std::max(m, a.ptr[i * reduce_size + j]);
}
out->ptr[i] = m;
}
/// END YOUR SOLUTION
}
void ReduceSum(const AlignedArray& a, AlignedArray* out, size_t reduce_size) {
/// BEGIN YOUR SOLUTION
for (size_t i = 0; i < out->size; i++) {
scalar_t m = 0;
for (size_t j = 0; j < reduce_size; j++) {
m += a.ptr[i * reduce_size + j];
}
out->ptr[i] = m;
}
/// END YOUR SOLUTION
}
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "reduce and cpu"
Part 5: CPU Backend - Matrix multiplication
矩阵乘法
两个任意大小二维矩阵乘法
void Matmul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, uint32_t n,
uint32_t p) {
/// BEGIN YOUR SOLUTION
for (size_t i = 0; i < m; i++) {
for (size_t j = 0; j < p; j++) {
out->ptr[i * p + j] = 0;
for (size_t k = 0; k < n; k++) {
out->ptr[i * p + j] += a.ptr[i * n + k] * b.ptr[k * p + j];
}
}
}
/// END YOUR SOLUTION
}
两个固定大小二维矩阵乘法
/// Multiply-accumulate two TILE x TILE float tiles: out += a @ b.
/// All three pointers must be aligned to TILE * ELEM_SIZE bytes — the
/// __builtin_assume_aligned casts below promise that to the compiler so
/// the fixed-size loops can be vectorized — and must not alias
/// (__restrict__).  NOTE: `out` is accumulated into, not overwritten;
/// callers must zero it before the first call.
inline void AlignedDot(const float* __restrict__ a,
                       const float* __restrict__ b,
                       float* __restrict__ out) {
  a = (const float*)__builtin_assume_aligned(a, TILE * ELEM_SIZE);
  b = (const float*)__builtin_assume_aligned(b, TILE * ELEM_SIZE);
  out = (float*)__builtin_assume_aligned(out, TILE * ELEM_SIZE);
  // Plain triple loop over the tile; compile-time bounds + alignment and
  // no-alias promises let the compiler emit SIMD code.
  for (size_t i = 0; i < TILE; i++) {
    for (size_t j = 0; j < TILE; j++) {
      for (size_t k = 0; k < TILE; k++) {
        out[i * TILE + j] += a[i * TILE + k] * b[k * TILE + j];
      }
    }
  }
}
分块(tiled)矩阵乘法:矩阵以TILE×TILE小块构成的四维布局存储,逐块取出并用AlignedDot累加,以提高缓存局部性。
void MatmulTiled(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m,
uint32_t n, uint32_t p) {
float* tmp = new float[TILE * TILE];
float* A = new float[TILE * TILE];
float* B = new float[TILE * TILE];
Fill(out, 0);
for (size_t i = 0; i < m / TILE; i++) {
for (size_t j = 0; j < p / TILE; j++) {
for (size_t l = 0; l < TILE * TILE; l++) {
tmp[l] = 0;
}
for (size_t k = 0; k < n / TILE; k++) {
for (size_t l = 0; l < TILE*TILE; l++) {
size_t row = l / TILE, col = l % TILE;
A[l] = a.ptr[i * TILE * n + k * TILE * TILE + l];
B[l] = b.ptr[k * TILE * p + j * TILE * TILE + l];
}
AlignedDot(A, B, tmp);
}
for (size_t l = 0; l < TILE * TILE; l++) {
out->ptr[i * p * TILE + j * TILE * TILE + l] = tmp[l];
}
}
}
}
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "matmul and cpu"
Part 6: CUDA Backend - Compact and setitem
任务6789是把任务2345用cuda重写一下
EwiseSetitemKernel与ScalarSetitemKernel核函数需要自己写上,没有给填空的函数。
__global__ void CompactKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape,
                              CudaVec strides, size_t offset) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  /// BEGIN YOUR SOLUTION
  // One thread per output element.  The mod/div pair peels off one
  // coordinate per axis (last axis fastest): `rem % shape` extracts the
  // coordinate, `rem / shape` drops it, and the coordinate times the axis
  // stride locates the element in the strided source.
  if (gid >= size) return;
  size_t src = offset;
  size_t rem = gid;
  for (int ax = shape.size - 1; ax >= 0; ax--) {
    src += (rem % shape.data[ax]) * strides.data[ax];
    rem /= shape.data[ax];
  }
  out[gid] = a[src];
  /// END YOUR SOLUTION
}
__global__ void EwiseSetitemKernel(const scalar_t*a, scalar_t* out, size_t size, CudaVec shape, CudaVec strides, size_t offset){
  // Inverse of CompactKernel: thread gid reads the compact input element
  // a[gid] and scatters it into the strided view of `out`.
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= size) return;
  size_t dst = offset;
  size_t rem = gid;
  // Decode the row-major linear index into per-axis coordinates.
  for (int ax = shape.size - 1; ax >= 0; ax--) {
    dst += (rem % shape.data[ax]) * strides.data[ax];
    rem /= shape.data[ax];
  }
  out[dst] = a[gid];
}
__global__ void ScalarSetitemKernel(scalar_t val, scalar_t* out, size_t size, CudaVec shape,
                                    CudaVec strides, size_t offset) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  /// BEGIN YOUR SOLUTION
  // Thread gid writes `val` into the gid-th element (row-major order) of
  // the strided view of `out` described by (shape, strides, offset).
  if (gid >= size) return;
  size_t dst = offset;
  size_t rem = gid;
  for (int ax = shape.size - 1; ax >= 0; ax--) {
    dst += (rem % shape.data[ax]) * strides.data[ax];
    rem /= shape.data[ax];
  }
  out[dst] = val;
  /// END YOUR SOLUTION
}
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "(compact or setitem) and cuda"
Part 7: CUDA Backend - Elementwise and scalar operations
// 1 - EwiseMul, ScalarMul
__global__ void EwiseMulKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
  // out[i] = a[i] * b[i], one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = a[tid] * b[tid];
}
void EwiseMul(const CudaArray& a, const CudaArray& b, CudaArray* out) {
  // Launch a 1-D grid covering every output element.
  CudaDims cfg = CudaOneDim(out->size);
  EwiseMulKernel<<<cfg.grid, cfg.block>>>(a.ptr, b.ptr, out->ptr, out->size);
}
__global__ void ScalarMulKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
  // out[i] = a[i] * val, one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = a[tid] * val;
}
void ScalarMul(const CudaArray& a, scalar_t val, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  ScalarMulKernel<<<cfg.grid, cfg.block>>>(a.ptr, val, out->ptr, out->size);
}
// 2 - EwiseDiv, ScalarDiv
__global__ void EwiseDivKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
  // out[i] = a[i] / b[i], one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = a[tid] / b[tid];
}
void EwiseDiv(const CudaArray& a, const CudaArray& b, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseDivKernel<<<cfg.grid, cfg.block>>>(a.ptr, b.ptr, out->ptr, out->size);
}
__global__ void ScalarDivKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
  // out[i] = a[i] / val, one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = a[tid] / val;
}
void ScalarDiv(const CudaArray& a, scalar_t val, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  ScalarDivKernel<<<cfg.grid, cfg.block>>>(a.ptr, val, out->ptr, out->size);
}
// 3 - ScalarPower
__global__ void ScalarPowerKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
  // out[i] = a[i] ** val, one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = pow(a[tid], val);
}
void ScalarPower(const CudaArray& a, scalar_t val, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  ScalarPowerKernel<<<cfg.grid, cfg.block>>>(a.ptr, val, out->ptr, out->size);
}
// 4 - EwiseMaximum, ScalarMaximum
__global__ void EwiseMaximumKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
  // Elementwise maximum of the two inputs.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = max(a[tid], b[tid]);
}
void EwiseMaximum(const CudaArray& a, const CudaArray& b, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseMaximumKernel<<<cfg.grid, cfg.block>>>(a.ptr, b.ptr, out->ptr, out->size);
}
__global__ void ScalarMaximumKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
  // Maximum of each element with the scalar `val`.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = max(a[tid], val);
}
void ScalarMaximum(const CudaArray& a, scalar_t val, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  ScalarMaximumKernel<<<cfg.grid, cfg.block>>>(a.ptr, val, out->ptr, out->size);
}
// 5 - EwiseEq, ScalarEq — equality test, result stored as 0.0 / 1.0.
__global__ void EwiseEqKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = scalar_t(a[tid] == b[tid]);
}
void EwiseEq(const CudaArray& a, const CudaArray& b, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseEqKernel<<<cfg.grid, cfg.block>>>(a.ptr, b.ptr, out->ptr, out->size);
}
__global__ void ScalarEqKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = scalar_t(a[tid] == val);
}
void ScalarEq(const CudaArray& a, scalar_t val, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  ScalarEqKernel<<<cfg.grid, cfg.block>>>(a.ptr, val, out->ptr, out->size);
}
// 6 - EwiseGe, ScalarGe — greater-or-equal test, result as 0.0 / 1.0.
__global__ void EwiseGeKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = scalar_t(a[tid] >= b[tid]);
}
void EwiseGe(const CudaArray& a, const CudaArray& b, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseGeKernel<<<cfg.grid, cfg.block>>>(a.ptr, b.ptr, out->ptr, out->size);
}
__global__ void ScalarGeKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = scalar_t(a[tid] >= val);
}
void ScalarGe(const CudaArray& a, scalar_t val, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  ScalarGeKernel<<<cfg.grid, cfg.block>>>(a.ptr, val, out->ptr, out->size);
}
// 7 - EwiseLog
__global__ void EwiseLogKernel(const scalar_t* a, scalar_t* out, size_t size) {
  // Natural logarithm, one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = log(a[tid]);
}
void EwiseLog(const CudaArray& a, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseLogKernel<<<cfg.grid, cfg.block>>>(a.ptr, out->ptr, out->size);
}
// 8 - EwiseExp
__global__ void EwiseExpKernel(const scalar_t* a, scalar_t* out, size_t size) {
  // Exponential, one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = exp(a[tid]);
}
void EwiseExp(const CudaArray& a, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseExpKernel<<<cfg.grid, cfg.block>>>(a.ptr, out->ptr, out->size);
}
// 9 - EwiseTanh
__global__ void EwiseTanhKernel(const scalar_t* a, scalar_t* out, size_t size) {
  // Hyperbolic tangent, one thread per element.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = tanh(a[tid]);
}
void EwiseTanh(const CudaArray& a, CudaArray* out) {
  CudaDims cfg = CudaOneDim(out->size);
  EwiseTanhKernel<<<cfg.grid, cfg.block>>>(a.ptr, out->ptr, out->size);
}
同样把CUDA文件末尾PYBIND11_MODULE中这些被注释掉的注册语句取消注释,否则编译出的Python模块里找不到这些函数。
m.def("ewise_mul", EwiseMul);
m.def("scalar_mul", ScalarMul);
m.def("ewise_div", EwiseDiv);
m.def("scalar_div", ScalarDiv);
m.def("scalar_power", ScalarPower);
m.def("ewise_maximum", EwiseMaximum);
m.def("scalar_maximum", ScalarMaximum);
m.def("ewise_eq", EwiseEq);
m.def("scalar_eq", ScalarEq);
m.def("ewise_ge", EwiseGe);
m.def("scalar_ge", ScalarGe);
m.def("ewise_log", EwiseLog);
m.def("ewise_exp", EwiseExp);
m.def("ewise_tanh", EwiseTanh);
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "(ewise_fn or ewise_max or log or exp or tanh or (scalar and not setitem)) and cuda"
Part 8: CUDA Backend - Reductions
__global__ void ReduceMaxKernel(scalar_t *a, scalar_t *out, size_t size, size_t reduce_size){
  // Each thread reduces one contiguous group of `reduce_size` inputs
  // to its maximum.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  const scalar_t* base = a + tid * reduce_size;
  scalar_t best = base[0];
  for (size_t k = 1; k < reduce_size; k++) {
    best = max(best, base[k]);
  }
  out[tid] = best;
}
void ReduceMax(const CudaArray& a, CudaArray* out, size_t reduce_size) {
  /// BEGIN YOUR SOLUTION
  // One thread per output element, i.e. per reduction group.
  size_t groups = a.size / reduce_size;
  CudaDims cfg = CudaOneDim(groups);
  ReduceMaxKernel<<<cfg.grid, cfg.block>>>(a.ptr, out->ptr, groups, reduce_size);
  /// END YOUR SOLUTION
}
__global__ void ReduceSumKernel(scalar_t *a, scalar_t *out, size_t size, size_t reduce_size){
  // Each thread sums one contiguous group of `reduce_size` inputs.
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size) return;
  const scalar_t* base = a + tid * reduce_size;
  scalar_t total = 0;
  for (size_t k = 0; k < reduce_size; k++) {
    total += base[k];
  }
  out[tid] = total;
}
void ReduceSum(const CudaArray& a, CudaArray* out, size_t reduce_size) {
  /// BEGIN YOUR SOLUTION
  // One thread per output element, i.e. per reduction group.
  size_t groups = a.size / reduce_size;
  CudaDims cfg = CudaOneDim(groups);
  ReduceSumKernel<<<cfg.grid, cfg.block>>>(a.ptr, out->ptr, groups, reduce_size);
  /// END YOUR SOLUTION
}
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "reduce and cuda"
Part 9: CUDA Backend - Matrix multiplication
__global__ void MatmulKernel(scalar_t *a, scalar_t *b, scalar_t *out, size_t size, uint32_t M,
                             uint32_t N, uint32_t P){
  // One thread per output entry: thread gid computes out[row, col] as the
  // dot product of a's row and b's column (all matrices row-major).
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= size) return;
  size_t row = gid / P;
  size_t col = gid % P;
  scalar_t acc = 0;
  for (size_t k = 0; k < N; k++) {
    acc += a[row * N + k] * b[k * P + col];
  }
  out[gid] = acc;
}
void Matmul(const CudaArray& a, const CudaArray& b, CudaArray* out, uint32_t M, uint32_t N,
            uint32_t P) {
  /// BEGIN YOUR SOLUTION
  // Flat 1-D launch over all M * P output entries.
  CudaDims cfg = CudaOneDim(out->size);
  MatmulKernel<<<cfg.grid, cfg.block>>>(a.ptr, b.ptr, out->ptr, out->size, M, N, P);
  /// END YOUR SOLUTION
}
在项目路径下make一下,编译C语言代码,再执行本地测试命令
make
python3 -m pytest -v -k "matmul and cuda"
总结
本章用CPP和CUDA实现一个N维数组,各自实现Compact、元素操作、约减、矩阵乘等功能。