在开发中经常遇到以 Python 为主的项目,一些模块调用的是 C++/CUDA 子模块,或者说可以把一些 Python 部分的内容使用 C++/CUDA 进行重写实现加速。
由于时间问题没有完整的学习官方教程(后续有空时补充),在 o1-preview 的帮助下跑通了两个简单的模块 demo,便于查看源码过程中能更好的通过关键词进行检索定位,也给未来的学习提供一个开始的切入点。
值得参考的资料:
- PyTorch 官方教程:Custom C++ and CUDA Extensions(pytorch.org/tutorials/advanced/cpp_extension.html)
- PyTorch 源码解读之即时编译篇 - OpenMMLab的文章 - 知乎 zhuanlan.zhihu.com/p/361101354
- and so on...
编译 C++ 模块并导入到 Python
1. 构建如图项目目录
2. C++ 模块代码 my_module.cpp
#include "my_module.h"
#include <Python.h>
// extern "C" disables C++ name mangling so exported symbols keep their plain
// C names — required both for ctypes symbol lookup and so the interpreter can
// find the PyInit_* entry point.
extern "C" {
// Core computation; also callable directly through ctypes.
double add(double a, double b) { return a + b; }
// Python-facing wrapper: unpack two doubles from the args tuple, box result.
static PyObject *py_add(PyObject *self, PyObject *args) {
double a, b;
if (!PyArg_ParseTuple(args, "dd", &a, &b))
return NULL;  // PyArg_ParseTuple has already set a TypeError
return Py_BuildValue("d", add(a, b));
}
// Method table: {name, impl, calling convention, docstring}; NULL-terminated.
static PyMethodDef MyMethods[] = {
{"add", py_add, METH_VARARGS, "Add two numbers"}, {NULL, NULL, 0, NULL}};
// Module definition; the name "my_cpp_module" must match PyInit_<name> below,
// otherwise `import my_cpp_module` raises ImportError.
static struct PyModuleDef mymodule = {PyModuleDef_HEAD_INIT, "my_cpp_module",
NULL, -1, MyMethods};
// Module init function the interpreter calls on import.
PyMODINIT_FUNC PyInit_my_cpp_module(void) { return PyModule_Create(&mymodule); }
}
注意事项:
- 在 cpp 的代码中,需要确保定义 `PyMODINIT_FUNC PyInit_my_cpp_module` 函数,否则后续导入模块时可能会遇到 ImportError 的问题。
- 需要使用 `extern "C"` 声明导出函数,以避免 C++ 名称重整(name mangling),否则会出现 symbol not found 错误。
3. 头文件 my_module.h
#ifndef MY_MODULE_H
#define MY_MODULE_H

/*
 * Public C interface of the extension.
 * The extern "C" block is guarded so this header can also be included from a
 * plain C translation unit, where `extern "C"` would be a syntax error.
 */
#ifdef __cplusplus
extern "C" {
#endif

/* Add two doubles; exported without C++ name mangling. */
double add(double a, double b);

#ifdef __cplusplus
}
#endif

#endif /* MY_MODULE_H */
4. setup.py 文件 setup.py
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext


class CustomBuildExt(build_ext):
    """Hook point for customizing the C++ build.

    The original override of ``build_extension`` only delegated to
    ``super()`` (a no-op). Add overrides here only when real customization
    is needed (extra compiler flags, pre/post-build steps, etc.).
    """


# Plain C++ extension: one translation unit, default compiler flags.
module = Extension('my_cpp_module',
                   sources=['my_module.cpp'])

setup(name='my_cpp_module',
      version='1.0',
      description='A simple C++ extension',
      ext_modules=[module],
      cmdclass={'build_ext': CustomBuildExt})
5. 编译构建
python setup.py build_ext --inplace
# Builds the shared object (.so) directly inside the source tree, so the
# module can be imported without installing it.
# To actually install the module instead, run:
# python setup.py build
# python setup.py install
执行内容大致为:
运行后项目目录结构如图:
6. python 调用示例 example.py
# Demo 1: call the compiled function by loading the .so through ctypes.
import ctypes
import os
import sysconfig

current_dir = os.path.dirname(os.path.abspath(__file__))
# Derive the platform/interpreter-specific extension suffix
# (e.g. ".cpython-310-darwin.so") instead of hardcoding it, so this
# demo works on any OS / Python build.
ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
so_file = os.path.join(current_dir, 'my_cpp_module' + ext_suffix)
my_cpp_module = ctypes.CDLL(so_file)
# ctypes knows nothing about the C signature; declare it explicitly,
# otherwise arguments/return values default to int and get corrupted.
my_cpp_module.add.argtypes = [ctypes.c_double, ctypes.c_double]
my_cpp_module.add.restype = ctypes.c_double
result = my_cpp_module.add(3.0, 4.5)
print(result)

# Demo 2: import the extension as a regular Python module (this goes through
# the PyInit_my_cpp_module entry point). Note it rebinds the name above.
import my_cpp_module
result = my_cpp_module.add(3.0, 4.5)
print(result)
使用 torch::jit 构建 C++/CUDA 模块库并导入到 Python
构建如图项目目录
my_cuda_ops.cu
#include <torch/extension.h>

// Elementwise addition kernel: one thread per element.
__global__ void add_kernel(float *x, float *y, float *result, int size) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < size) {
    result[index] = x[index] + y[index];
  }
}

// Host launcher: result[i] = x[i] + y[i] for 1-D float32 CUDA tensors.
// Writes into `result` in place; all three tensors must be pre-allocated
// with matching lengths.
void add_tensors(torch::Tensor x, torch::Tensor y, torch::Tensor result) {
  // Fail fast with a Python-visible error instead of launching on the wrong
  // device or reading/writing out of bounds (silent memory corruption).
  TORCH_CHECK(x.is_cuda() && y.is_cuda() && result.is_cuda(),
              "add_tensors: all tensors must be CUDA tensors");
  TORCH_CHECK(x.scalar_type() == torch::kFloat32 &&
                  y.scalar_type() == torch::kFloat32 &&
                  result.scalar_type() == torch::kFloat32,
              "add_tensors: all tensors must be float32");
  TORCH_CHECK(x.numel() == y.numel() && x.numel() == result.numel(),
              "add_tensors: tensor sizes must match");
  int size = x.size(0);
  const int threads = 256;  // threads per block; grid covers all elements
  add_kernel<<<(size + threads - 1) / threads, threads>>>(
      x.data_ptr<float>(), y.data_ptr<float>(), result.data_ptr<float>(), size);
}

// Register the op so Python can call torch.ops.my_cuda_library.add_tensors.
TORCH_LIBRARY(my_cuda_library, m) { m.def("add_tensors", &add_tensors); }
my_class.cpp
#include <torch/script.h>
// Minimal custom class exposed to TorchScript/Python.
// CustomClassHolder provides the intrusive refcounting TorchScript requires.
class MyDemoClass : public torch::CustomClassHolder {
public:
MyDemoClass() {}
// int64_t is the integer type TorchScript maps Python ints to.
int64_t multiply(int64_t a, int64_t b) { return a * b; }
};
// Register the class with TorchScript.
// After load_library it is reachable as torch.classes.my_demo.MyDemoClass.
TORCH_LIBRARY(my_demo, m) {
m.class_<MyDemoClass>("MyDemoClass")
.def(torch::init<>())
.def("multiply", &MyDemoClass::multiply);
}
setup.py
from setuptools import setup
from torch.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension

# Pure-C++ extension: holds the TorchScript custom class.
class_ext = CppExtension(
    name='my_library.my_class',
    sources=['my_library/my_class.cpp'],
)

# CUDA extension: holds the add_tensors kernel and its op registration.
cuda_ext = CUDAExtension(
    name='my_cuda_library.my_cuda_ops',
    sources=['my_cuda_library/my_cuda_ops.cu'],
)

setup(
    name='my_library',
    ext_modules=[class_ext, cuda_ext],
    # BuildExtension supplies the correct torch include paths and nvcc flags.
    cmdclass={'build_ext': BuildExtension},
)
编译构建
python setup.py build_ext --inplace
运行后项目目录结构如图:
example.py
import torch
import sysconfig

# Resolve the platform/interpreter-specific extension suffix
# (e.g. ".cpython-38-x86_64-linux-gnu.so") instead of hardcoding it,
# so the demo is portable across OSes and Python versions.
_ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')

# Load the compiled libraries so their TORCH_LIBRARY registrations execute.
torch.classes.load_library('my_library/my_class' + _ext_suffix)
torch.classes.load_library('my_cuda_library/my_cuda_ops' + _ext_suffix)

# Create an instance of the registered custom class.
my_instance = torch.classes.my_demo.MyDemoClass()

# Call the multiply method (Python ints map to int64_t on the C++ side).
result = my_instance.multiply(3, 4)
print("Result of multiplication:", result)

# Example of using the registered CUDA op; `result` is written in place.
x = torch.tensor([1.0, 2.0, 3.0], device='cuda')
y = torch.tensor([4.0, 5.0, 6.0], device='cuda')
result = torch.empty_like(x)
torch.ops.my_cuda_library.add_tensors(x, y, result)
print("Result of CUDA addition:", result)