torch_musa Source Code Reading Notes

Note: these are personal reading notes, so the writing is fairly informal.

torch_musa is a PyTorch plugin from Moore Threads. Its core idea is to register a backend through PyTorch's PrivateUse1 dispatch key so that PyTorch can run on Moore Threads GPUs, staying compatible with the PyTorch ecosystem.
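As a quick illustration (assuming torch_musa is installed and a Moore Threads GPU is available), the plugin registers a "musa" device type that is used like any other PyTorch device:

import torch
import torch_musa  # registers the "musa" device via the PrivateUse1 key

x = torch.randn(2, 3, device="musa")  # allocate on the Moore Threads GPU
y = (x + 1).cpu()                     # ops dispatch to MUSA kernels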

1. setup.py

The torch_musa install script ultimately runs setup.py (via pip install), so let's look at setup.py first. It mainly performs the following steps:

① Call the build_musa_lib() function to build libmusa_python.so:

Ⅰ. cuda_port

Because Moore Threads' MUSA runtime is fully compatible with the CUDA runtime, the CUDA code in PyTorch can be ported to MUSA code by string substitution. The substitution key/value pairs live in /home/torch_musa/torch_musa/utils/mapping/include.json, /home/torch_musa/torch_musa/utils/mapping/general.json, /home/torch_musa/torch_musa/utils/mapping/extra.json, and in the extra_replace_map variable of tools/cuda_porting/cuda_porting.py. The torch_musa code that defines which PyTorch files get ported looks like this:

class PortingFile:
    r"""Class used to manage porting files."""

    def __init__(self, dir_name: str, recursive: bool, need_filter_cpp: bool) -> None:
        r"""Initializes class.

        Args:
            dir_name (str): folder name of porting files.
            recursive (bool): whether to port recursively in the folder or not.
            need_filter_cpp (bool): whether to filter out cpp files or not.
        """
        self.dir_name = dir_name
        self.recursive = recursive
        self.need_filter_cpp = need_filter_cpp


r"""All folders needed for cuda-porting
"""
PORT_FILES = [
    PortingFile("aten/src/ATen/native/cuda", True, False),
    PortingFile("aten/src/ATen/native/nested", True, True),
    PortingFile("aten/src/ATen/native/quantized", True, True),
    PortingFile("aten/src/ATen/native/sparse", True, True),
    PortingFile("aten/src/ATen/native/transformers", True, True),
    PortingFile("aten/src/ATen/cuda", True, False),
    PortingFile("aten/src/THC", True, True),
    PortingFile("c10/cuda", True, False),
    PortingFile("include", True, True),
    PortingFile("c10/core/impl", True, True),
    PortingFile("aten/src/ATen/cuda", True, True),
    PortingFile("torch/csrc/cuda", True, False),
]

The ported CUDA code is written to /home/torch_musa/build/generated_cuda_compatible.
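To make the mechanism concrete, here is a minimal sketch (not the actual cuda_porting.py implementation; load_replace_map and port_file are illustrative names) of how this JSON-driven string substitution could work:

import json
import os


def load_replace_map(*json_paths):
    """Merge the porting key/value pairs from the mapping JSON files."""
    replace_map = {}
    for path in json_paths:
        with open(path, "r") as f:
            replace_map.update(json.load(f))
    return replace_map


def port_file(src_path, dst_path, replace_map):
    """Rewrite one CUDA source file into its MUSA counterpart."""
    with open(src_path, "r") as f:
        code = f.read()
    # Replace longer identifiers first so that e.g. "cudaStreamCreate"
    # is handled before the bare "cuda" prefix.
    for cuda_name in sorted(replace_map, key=len, reverse=True):
        code = code.replace(cuda_name, replace_map[cuda_name])
    os.makedirs(os.path.dirname(dst_path) or ".", exist_ok=True)
    with open(dst_path, "w") as f:
        f.write(code)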

Ⅱ. Building the libraries with CMake

The CMake configure command is as follows:

cmake -GNinja -DBUILD_PYTHON=True \
  -DBUILD_PYTORCH_REPO_PATH=/home/torch_musa/../pytorch \
  -DBUILD_TEST=True \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_INSTALL_PREFIX=/home/torch_musa/torch_musa \
  -DCMAKE_PREFIX_PATH="/opt/conda/envs/py38/lib/python3.8/site-packages;/opt/conda/envs/py38" \
  -DENABLE_COMPILE_FP64=1 \
  -DGENERATED_PORTING_DIR=/home/torch_musa/build/generated_cuda_compatible \
  -DNUMPY_INCLUDE_DIR=/opt/conda/envs/py38/lib/python3.8/site-packages/numpy/core/include \
  -DPYTHON_EXECUTABLE=/opt/conda/envs/py38/bin/python \
  -DPYTHON_INCLUDE_DIR=/opt/conda/envs/py38/include/python3.8 \
  -DPYTHON_LIBRARY=/opt/conda/envs/py38/lib/libpython3.8.so.1.0 \
  -DTORCH_BUILD_VERSION=1.1.0 \
  -DUSE_ASAN=0 \
  -DUSE_MCCL=1 \
  -DUSE_NUMPY=True \
  /home/torch_musa

The top-level /home/torch_musa/CMakeLists.txt contains the following:

include_directories(${GENERATED_PORTING_DIR}/include)
include_directories(${GENERATED_PORTING_DIR}/aten/src)
include_directories(${GENERATED_PORTING_DIR}/include/torch/csrc/api/include)

This shows that torch_musa compiles against the cuda-ported PyTorch headers. The build produces two shared libraries, libmusa_python.so and libmusa_kernels.so. libmusa_kernels.so mainly contains the cuda-ported cpp and .mu files plus the sources under aten/musa/*.cpp, i.e. the operator-related code; it is defined in /home/torch_musa/torch_musa/csrc/CMakeLists.txt as follows:

set(MUSA_KERNELS_LIB "musa_kernels")

FILE(GLOB ATEN_MUSA_CSRCS
  aten/musa/*.cpp
)

FILE(GLOB PORTING_CPP_CSRCS
  aten/ops/musa/*.cpp
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Activation.cpp
  ${GENERATED_PORTING_DIR}/aten/src/ATen/musa/EmptyTensor.cpp
  ${GENERATED_PORTING_DIR}/torch/csrc/musa/comm.cpp
  ${GENERATED_PORTING_DIR}/torch/csrc/musa/Stream.cpp)

FILE(GLOB_RECURSE MU_SRCS
  ${GENERATED_PORTING_DIR}/aten/src/ATen/musa/cub.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/musa/cub-RadixSortKeys.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/musa/cub-RadixSortPairs.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/musa/detail/IndexUtils.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/AbsKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationHardsigmoidKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationHardswishKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationLeakyReluKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationLogSigmoidKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationGeluKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationPreluKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationHardtanhKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationSoftplusKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ActivationGluKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/AmpKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/AveragePool3d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/BinaryMiscBackwardOpsKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/BinaryGeometricKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/BinaryMiscOpsKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/BinaryBitwiseOpsKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/BinaryShiftOpsKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/CompareKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/CrossKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/DistributionUniform.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/DistributionNormal.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/DistributionBernoulli.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/DistributionRandomKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/DilatedMaxPool3d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/DistributionExponentialKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/FillKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ForeachBinaryOpList.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ForeachBinaryOpScalar.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ForeachBinaryOpScalarList.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ForeachPointwiseOp.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ForeachUnaryOp.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/group_norm_kernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/GridSampler.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Indexing.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/IndexKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/LegacyThrustHelpers.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/LossCTC.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Loss.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/MultinomialKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Nonzero.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/NLLLoss2d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/PointwiseOpsKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Randperm.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/RangeFactories.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReduceAMinMaxKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReduceLogicKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReduceMaxValuesKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReduceMomentKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReduceNormKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReflectionPad.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Repeat.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ReplicationPadding.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/ScatterGatherKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Shape.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/TensorCompare.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/TensorFactories.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/TensorTransformations.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UnaryFractionKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UnarySignKernels.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/Unique.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UniqueCub.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UpSampleBicubic2d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UpSampleBilinear2d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UpSampleLinear1d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UpSampleNearest1d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UpSampleNearest3d.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/UnaryOpsKernel.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/WeightNorm.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/musa/linalg/BatchLinearAlgebra.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/quantized/musa/Activation.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/quantized/musa/IntReprQuant.mu
  ${GENERATED_PORTING_DIR}/aten/src/ATen/native/quantized/musa/MakePerTensorQuantizedTensor.mu
  aten/quantized/musa/*.mu
  aten/ops/torchvision/*.mu
  aten/ops/musa/*.mu
  aten/ops/attention/mudnn/*.mu
  core/*.mu
)
musa_add_library(${MUSA_KERNELS_LIB} SHARED ${MU_SRCS} ${PORTING_CPP_CSRCS} ${ATEN_MUSA_CSRCS})

libmusa_python.so, by contrast, mainly contains torch_musa's own cpp files; the relevant CMake code is shown below:

set(PLUGIN_NAME "musa_python")

FILE(GLOB _CSRCS
  aten/ops/*.cpp
  aten/utils/*.cpp
  aten/mudnn/*.cpp
  core/*.cpp
  amp/*.cpp
  utils/*.cpp
  aten/quantized/*.cpp
  aten/quantized/mudnn/*.cpp
)

FILE(GLOB _MUDNN_ATTENTION_CSRCS aten/ops/attention/mudnn/*.cpp)
LIST(APPEND MUSA_CSRCS ${_CSRCS} ${_MUDNN_ATTENTION_CSRCS})

if(USE_MCCL)
  FILE(GLOB _MCCL_CSRCS distributed/*.cpp)
  list(APPEND MUSA_CSRCS ${_MCCL_CSRCS})
endif()

LIST(APPEND MUSA_CSRCS ${_CSRCS})

# Pass to parent
set(MUSA_CSRCS ${MUSA_CSRCS} PARENT_SCOPE)

set(CPP_SRCS ${MUSA_CSRCS})
add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS})

# libmusa_python.so depends on libmusa_kernels.so
target_link_libraries(${PLUGIN_NAME} PUBLIC ${MUSA_KERNELS_LIB})

Once build_musa_lib() finishes, libmusa_kernels.so, libmusa_python.so, and lib_ext_musa_kernels.so (the extension library) are generated under /home/torch_musa/torch_musa/lib.

② Install via the setup() function:

The setup() call in setup.py is defined as follows:

    setup(
        name="torch_musa",
        version=version,
        description="A PyTorch backend extension for Moore Threads MUSA",
        url="https://github.mthreads.com/mthreads/torch_musa",
        author="Moore Threads PyTorch AI Dev Team",
        packages=find_packages(exclude=["tools", "tools*"]),
        ext_modules=configure_extension_build(),
        include_package_data=True,
        install_requires=install_requires,
        extras_require={},
        cmdclass={"build_ext": Build, "clean": Clean, "install": Install},
    )
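Note the cmdclass entry: Build replaces the stock build_ext. As a hypothetical sketch (the real Build, Clean, and Install classes live in setup.py; build_musa_lib() is stubbed here), it plausibly builds the C++ libraries before compiling the Python extensions:

from setuptools.command.build_ext import build_ext


def build_musa_lib():
    """Placeholder for the real build_musa_lib(), which drives CMake/Ninja."""


class Build(build_ext):
    """Build the shared libraries first, then the thin Python extensions."""

    def run(self):
        build_musa_lib()  # emits libmusa_kernels.so / libmusa_python.so
        super().run()     # then compiles torch_musa._MUSAC and torch_musa._ext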

The function that produces ext_modules is defined as follows:

def configure_extension_build():
    if CLEAN_MODE:
        return
    extra_link_args = []
    extra_compile_args = [
        "-std=c++17",
        "-Wall",
        "-Wextra",
        "-Werror",
        "-fno-strict-aliasing",
        "-fstack-protector-all",
    ]

    if build_type.is_debug():
        extra_compile_args += ["-O0", "-g"]
        extra_link_args += ["-O0", "-g"]

    if build_type.is_rel_with_deb_info():
        extra_compile_args += ["-g"]
        extra_link_args += ["-g"]

    use_asan = os.getenv("USE_ASAN", default="").upper() in [
        "ON",
        "1",
        "YES",
        "TRUE",
        "Y",
    ]

    if use_asan:
        extra_compile_args += ["-fsanitize=address"]
        extra_link_args += ["-fsanitize=address"]

    torch_musa_sources = glob.glob("torch_musa/csrc/stub.cpp")
    cpp_extension = CppExtension(
        name="torch_musa._MUSAC",
        sources=torch_musa_sources,
        libraries=["musa_python"],
        include_dirs=[],
        extra_compile_args=extra_compile_args,
        library_dirs=[os.path.join(BASE_DIR, "torch_musa/lib")],
        extra_link_args=extra_link_args + ["-Wl,-rpath,$ORIGIN/lib"],
    )
    ext_extension = CppExtension(
        name="torch_musa._ext",
        sources=glob.glob("torch_musa/csrc/extension/C_frontend.cpp"),
        libraries=["_ext_musa_kernels", "musa_python"],
        include_dirs=[],
        extra_compile_args={"cxx": ['-std=c++17']},
        library_dirs=[os.path.join(BASE_DIR, "torch_musa/lib")],
        extra_link_args=extra_link_args + ["-Wl,-rpath,$ORIGIN/lib"],
    )
    return [cpp_extension, ext_extension]

It returns two extensions: cpp_extension is the main part of torch_musa, and ext_extension is the extension (ext) part. The content of /home/torch_musa/torch_musa/csrc/stub.cpp is as follows:

#include <pybind11/pybind11.h>

// Defined in torch_musa's C++ sources; builds and returns the _MUSAC module.
extern PyObject* InitMusaModule();

// Module init entry point CPython calls when torch_musa._MUSAC is imported.
PyMODINIT_FUNC PyInit__MUSAC(void) {
  return InitMusaModule();
}

So the stub simply initializes the torch_musa module; the underlying mechanism (CPython extension-module initialization) is explained at this link: github.com/xgfone/Pyth…
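In other words, the first import of torch_musa._MUSAC makes CPython load the compiled .so and call PyInit__MUSAC, which returns the module built by InitMusaModule(). Seen from the Python side (assuming torch_musa is installed):

import importlib

# Loading the extension runs PyInit__MUSAC -> InitMusaModule() under the hood.
musac = importlib.import_module("torch_musa._MUSAC")
print(type(musac))  # <class 'module'>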