引言
在LightGBM中的CUDA算子(一)中我们已介绍了LightGBM源码中基础的cuda算子,包括并行规约(数组之和、前缀和)与双调排序算子。在LightGBM中的CUDA算子(二)中我们又继续介绍了直方图相关算子,包括直方图构建、运算。本文我们将继续介绍在boosting过程中基于特征直方图在节点上寻找最佳分裂点的算子。本文将是LightGBM cuda算子系列的最后一章。
寻找最佳分裂点
寻找最佳分裂点算子位于src/treelearner/cuda/cuda_best_split_finder.h和cuda_best_split_finder.cu中。包括为节点寻找最佳分裂点和计算最大增益两类。同直方图算子一样,寻找分裂点算子也有分别针对不同特征类型(dense or sparse),不同设备内存(global or shared)的实现。下面我们先以FindBestSplitsForLeafKernelInner为例介绍基于数值型特征直方图寻找最佳分裂的过程。
// cuda_best_split_finder.cu
// Finds the best split for ONE numerical feature, based on its histogram.
// Runs on a single block: each thread owns one bin of the feature.
// Template flags: USE_RAND (extremely randomized trees), USE_L1 /
// USE_SMOOTHING (regularization options), REVERSE (scan the histogram from
// the right so the prefix sums accumulate the right child's totals).
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING, bool REVERSE>
__device__ void FindBestSplitsForLeafKernelInner(
// input feature information
const double * feature_hist_ptr, // feature histogram: interleaved (grad, hess) pairs, one pair per bin
// input task information: one task = finding the split point for one feature
const SplitFindTask* task, // task
CUDARandom* cuda_random, // random
// input config parameter values: model hyper-parameters used for the gain
const double lambda_l1, // L1 regularization
const double lambda_l2, // L2 regularization
const double path_smooth, // path smoothing: controls overfitting by blending in the parent/ancestor leaf value when a leaf has few samples
const int min_data_in_leaf, // minimum number of training samples per leaf
const double min_sum_hessian_in_leaf, // minimum sum of hessians per leaf
const double min_gain_to_split, // minimum gain required to split
// input parent node information (the node being split)
const double parent_gain, // gain of the current node
const double sum_gradients, // sum of gradients in the current node
const double sum_hessians, // sum of hessians in the current node
const int num_data, // number of samples in the current node
const double parent_output, // output value (leaf_value) of the current node
// output parameters
CUDASplitInfo* cuda_best_split_info // receives the best split information
) {
const double cnt_factor = num_data / sum_hessians; // factor to estimate sample counts from hessian sums
const double min_gain_shift = parent_gain + min_gain_to_split; // total child gain required to satisfy the minimum split gain
cuda_best_split_info->is_valid = false;
double local_grad_hist = 0.0f; // gradient of the bin owned by this thread
double local_hess_hist = 0.0f; // hessian of the bin owned by this thread
double local_gain = 0.0f; // split gain of the bin owned by this thread
bool threshold_found = false; // whether this thread's bin satisfies the gain requirement
uint32_t threshold_value = 0; // threshold (bin index) proposed by this thread
__shared__ int rand_threshold;
// with extremely randomized trees, thread 0 draws one random bin that is
// the only candidate threshold allowed for this feature
if (USE_RAND && threadIdx.x == 0) {
if (task->num_bin - 2 > 0) {
rand_threshold = cuda_random->NextInt(0, task->num_bin - 2);
}
}
__shared__ uint32_t best_thread_index; // thread holding the best split
// shared-memory scratch buffers for the reductions below (one slot per warp)
__shared__ double shared_double_buffer[32];
__shared__ bool shared_bool_buffer[32];
__shared__ uint32_t shared_int_buffer[32];
const unsigned int threadIdx_x = threadIdx.x;
// read task information
// whether the bin owned by this thread is the default bin and must be skipped
const bool skip_sum = REVERSE ?
(task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast<int>(task->default_bin)) :
(task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast<int>(task->default_bin));
const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset;
// load grad and hess of this thread's bin into registers
if (!REVERSE) {
// forward scan with a missing-value bin at index 0: skip it here; its
// totals are reconstructed from the node totals further below
if (task->na_as_missing && task->mfb_offset == 1) {
if (threadIdx_x < static_cast<uint32_t>(task->num_bin) && threadIdx_x > 0) {
const unsigned int bin_offset = (threadIdx_x - 1) << 1;
local_grad_hist = feature_hist_ptr[bin_offset];
local_hess_hist = feature_hist_ptr[bin_offset + 1];
}
} else {
if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) {
const unsigned int bin_offset = threadIdx_x << 1;
local_grad_hist = feature_hist_ptr[bin_offset];
local_hess_hist = feature_hist_ptr[bin_offset + 1];
}
}
} else {
// reverse scan: read the histogram back-to-front so that the prefix sums
// computed below accumulate the right child's totals
if (threadIdx_x >= static_cast<unsigned int>(task->na_as_missing) &&
threadIdx_x < feature_num_bin_minus_offset && !skip_sum) {
const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x;
const unsigned int bin_offset = read_index << 1;
local_grad_hist = feature_hist_ptr[bin_offset];
local_hess_hist = feature_hist_ptr[bin_offset + 1];
}
}
__syncthreads();
// compute prefix sums of grad and hess
// first fold the missing-value bin's totals into thread 0's registers
if (!REVERSE && task->na_as_missing && task->mfb_offset == 1) {
// sum of grad/hess over the non-missing bins of this feature
const double sum_gradients_non_default = ShuffleReduceSum<double>(local_grad_hist, shared_double_buffer, blockDim.x);
__syncthreads();
const double sum_hessians_non_default = ShuffleReduceSum<double>(local_hess_hist, shared_double_buffer, blockDim.x);
if (threadIdx_x == 0) {
local_grad_hist += (sum_gradients - sum_gradients_non_default); // grad of missing values = node total - non-missing total
local_hess_hist += (sum_hessians - sum_hessians_non_default);
}
}
if (threadIdx_x == 0) {
local_hess_hist += kEpsilon; // small constant keeps the hessian prefix sums strictly positive (numerical stability)
}
local_gain = kMinScore;
// prefix sums (the missing-value totals were already folded into thread 0)
local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_double_buffer);
__syncthreads();
local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_double_buffer);
// compute the gain and update threshold_found / threshold_value
if (REVERSE) {
if (threadIdx_x >= static_cast<unsigned int>(task->na_as_missing) && threadIdx_x <= task->num_bin - 2 && !skip_sum) {
// reverse scan: the prefix sums are the right child's totals
const double sum_right_gradient = local_grad_hist;
const double sum_right_hessian = local_hess_hist;
const data_size_t right_count = static_cast<data_size_t>(__double2int_rn(sum_right_hessian * cnt_factor));
const double sum_left_gradient = sum_gradients - sum_right_gradient;
const double sum_left_hessian = sum_hessians - sum_right_hessian;
const data_size_t left_count = num_data - right_count;
// both children must satisfy the minimum hessian/sample constraints;
// with USE_RAND only the randomly drawn threshold is considered
if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf &&
sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf &&
(!USE_RAND || static_cast<int>(task->num_bin - 2 - threadIdx_x) == rand_threshold)) {
double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, sum_right_gradient,
sum_right_hessian, lambda_l1,
lambda_l2, path_smooth, left_count, right_count, parent_output);
// gain with split is worse than without split
if (current_gain > min_gain_shift) {
local_gain = current_gain - min_gain_shift;
threshold_value = static_cast<uint32_t>(task->num_bin - 2 - threadIdx_x);
threshold_found = true;
}
}
}
} else {
const uint32_t end = (task->na_as_missing && task->mfb_offset == 1) ? static_cast<uint32_t>(task->num_bin - 2) : feature_num_bin_minus_offset - 2;
if (threadIdx_x <= end && !skip_sum) {
// forward scan: the prefix sums are the left child's totals
const double sum_left_gradient = local_grad_hist;
const double sum_left_hessian = local_hess_hist;
const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
const double sum_right_gradient = sum_gradients - sum_left_gradient;
const double sum_right_hessian = sum_hessians - sum_left_hessian;
const data_size_t right_count = num_data - left_count;
if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf &&
sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf &&
(!USE_RAND || static_cast<int>(threadIdx_x + task->mfb_offset) == rand_threshold)) {
// compute the split gain
double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, sum_right_gradient,
sum_right_hessian, lambda_l1,
lambda_l2, path_smooth, left_count, right_count, parent_output);
// gain with split is worse than without split
if (current_gain > min_gain_shift) {
local_gain = current_gain - min_gain_shift;
threshold_value = (task->na_as_missing && task->mfb_offset == 1) ?
static_cast<uint32_t>(threadIdx_x) :
static_cast<uint32_t>(threadIdx_x + task->mfb_offset);
threshold_found = true;
}
}
}
}
__syncthreads();
// block-wide reduction: find the thread holding the best gain for this feature
const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_bool_buffer, shared_int_buffer);
if (threadIdx_x == 0) {
best_thread_index = result;
}
__syncthreads();
// the winning thread writes the split result through cuda_best_split_info
if (threshold_found && threadIdx_x == best_thread_index) {
cuda_best_split_info->is_valid = true;
cuda_best_split_info->threshold = threshold_value;
cuda_best_split_info->gain = local_gain;
cuda_best_split_info->default_left = task->assume_out_default_left;
if (REVERSE) {
const double sum_right_gradient = local_grad_hist;
const double sum_right_hessian = local_hess_hist - kEpsilon; // undo the epsilon added before the prefix sum
const data_size_t right_count = static_cast<data_size_t>(__double2int_rn(sum_right_hessian * cnt_factor));
const double sum_left_gradient = sum_gradients - sum_right_gradient;
const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon;
const data_size_t left_count = num_data - right_count;
const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_left_gradient,
sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output);
const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_right_gradient,
sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output);
cuda_best_split_info->left_sum_gradients = sum_left_gradient;
cuda_best_split_info->left_sum_hessians = sum_left_hessian;
cuda_best_split_info->left_count = left_count;
cuda_best_split_info->right_sum_gradients = sum_right_gradient;
cuda_best_split_info->right_sum_hessians = sum_right_hessian;
cuda_best_split_info->right_count = right_count;
cuda_best_split_info->left_value = left_output;
cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
sum_left_hessian, lambda_l1, lambda_l2, left_output);
cuda_best_split_info->right_value = right_output;
cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
sum_right_hessian, lambda_l1, lambda_l2, right_output);
} else {
const double sum_left_gradient = local_grad_hist;
const double sum_left_hessian = local_hess_hist - kEpsilon; // undo the epsilon added before the prefix sum
const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
const double sum_right_gradient = sum_gradients - sum_left_gradient;
const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon;
const data_size_t right_count = num_data - left_count;
// compute the output values of the left and right children
const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_left_gradient,
sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output);
const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_right_gradient,
sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output);
cuda_best_split_info->left_sum_gradients = sum_left_gradient;
cuda_best_split_info->left_sum_hessians = sum_left_hessian;
cuda_best_split_info->left_count = left_count;
cuda_best_split_info->right_sum_gradients = sum_right_gradient;
cuda_best_split_info->right_sum_hessians = sum_right_hessian;
cuda_best_split_info->right_count = right_count;
cuda_best_split_info->left_value = left_output;
cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
sum_left_hessian, lambda_l1, lambda_l2, left_output);
cuda_best_split_info->right_value = right_output;
cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
sum_right_hessian, lambda_l1, lambda_l2, right_output);
}
}
}
FindBestSplitsForLeafKernelInner实现了寻找单个特征最佳分裂点的方法,限定符__device__表明kernel是在设备端运行,FindBestSplitsForLeafKernelInner运行在单个block上,即单个block负责一个feature,其中block中每个thread负责feature下的一个bin,在单个block上寻找最佳分裂点的执行过程分为5步:
- 读取当前特征直方图每个bin下的数据到寄存器;
- 计算每个bin下对应直方图grad、hess的前缀和;
- 根据前缀和计算以当前bin作为分裂点,左子节点和右子节点的grad、hess之和,并计算分裂增益;
- 根据各个bin下的增益,寻找最佳分裂点;
- 使用输出指针接收最佳分裂点信息。
如注释所示,FindBestSplitsForLeafKernelInner参数有:
- 存放了单个特征直方图(grad, hess数据)的指针feature_hist_ptr;
- 存放了单个特征分裂任务信息如bin个数、空值信息的指针task;
- 计算gain和判断节点是否满足分裂条件的超参数lambda_l1, min_gain_to_split等;
- 当前节点信息的parent_gain, parent_output等;
- 用于接收输出的指针cuda_best_split_info。
第25~47行进行了索引、寄存器变量的定义,申请了一些后续计算所使用的共享内存。57~79行读取当前特征直方图每个bin下的grad和hess存入bin对应线程的寄存器变量local_grad_hist和local_hess_hist中。REVERSE控制特征直方图数据的读取顺序与前缀和的计算顺序。正序表示从左往右读取并计算前缀和,得到左子节点的grad、hess之和,然后通过总和减去左子节点之和得到右子节点之和。58行的task->na_as_missing && task->mfb_offset == 1表示正序时当前特征下有空值,且空值位于索引为0的bin上,读取直方图数据时跳过此处。
84~101为前缀和的计算过程,前缀和的计算分为两步:首先84~93计算了空值bin的base grad、hess之和,所使用方法是当前节点总和减去非空bin之和;第二步99、101再在base基础上计算grad和hess的前缀和。103~155使用前缀和计算了当前bin作为分裂点得到的左子节点和右子节点的sum_grad和sum_hess,然后传入CUDALeafSplits::GetSplitGains中计算当前bin分裂的信息增益,并判定是否满足最小增益的要求。REVERSE控制的分支用于兼容两种不同的顺序。
157~160为寻找最佳分裂bin的过程,ReduceBestGain也是一个线程束洗牌指令的规约过程:先在warp内找gain最大的,然后再在block内warp间找gain最大的,最后在第164~215行将输出传入接收指针。ReduceBestGain从各个bin的分裂增益中找到最佳分裂的过程如下:
// cuda_best_split_finder.cu
// 线程束洗牌指令实现的规约算法,用于寻找最大增益对应的bin,实现原理与LightGBM中的CUDA算子(一)中ShuffleReduceSum类似
// Block-wide argmax reduction over candidate split gains, built from warp
// shuffles: each warp first finds its local winner and publishes it to the
// per-warp shared-memory slots, then warp 0 reduces across those winners.
// The returned thread index is read by the caller from thread 0.
__device__ uint32_t ReduceBestGain(double gain, bool found, uint32_t thread_index,
double* shared_gain_buffer, bool* shared_found_buffer, uint32_t* shared_thread_index_buffer) {
  const uint32_t warp_id = threadIdx.x / warpSize;
  const uint32_t lane_id = threadIdx.x % warpSize;
  const uint32_t warp_count = blockDim.x / warpSize;
  // stage 1: per-warp argmax; lane 0 of each warp writes its winner into slot warp_id
  ReduceBestGainWarp(gain, found, thread_index, shared_gain_buffer + warp_id, shared_found_buffer + warp_id, shared_thread_index_buffer + warp_id);
  __syncthreads();
  // stage 2: the first warp reduces across all per-warp winners
  if (warp_id == 0) {
    const bool has_entry = (lane_id < warp_count);
    gain = has_entry ? shared_gain_buffer[lane_id] : kMinScore;
    found = has_entry ? shared_found_buffer[lane_id] : false;
    thread_index = has_entry ? shared_thread_index_buffer[lane_id] : 0;
    thread_index = ReduceBestGainBlock(gain, found, thread_index);
  }
  return thread_index;
}
// Warp-level argmax over split gains using a shuffle-down reduction (same
// pattern as ShuffleReduceSum). After the loop, lane 0 holds the warp's best
// (gain, found, thread_index) triple and writes it to the output slots,
// which point into shared memory.
__device__ void ReduceBestGainWarp(double gain, bool found, uint32_t thread_index, double* out_gain, bool* out_found, uint32_t* out_thread_index) {
const uint32_t mask = 0xffffffff;
const uint32_t warpLane = threadIdx.x % warpSize;
for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) {
const bool other_found = __shfl_down_sync(mask, found, offset); // 'found' flag of lane (warpLane + offset)
const double other_gain = __shfl_down_sync(mask, gain, offset); // gain of lane (warpLane + offset)
const uint32_t other_thread_index = __shfl_down_sync(mask, thread_index, offset);
// take the partner's candidate if it is valid and strictly better, or if
// this lane has no valid candidate yet
if ((other_found && found && other_gain > gain) || (!found && other_found)) {
found = other_found;
gain = other_gain;
thread_index = other_thread_index;
}
}
// lane 0 publishes the warp's winner
if (warpLane == 0) {
*out_gain = gain;
*out_found = found;
*out_thread_index = thread_index;
}
}
// Argmax across the per-warp winners; executed by warp 0 only. Uses the same
// shuffle-down pattern as ReduceBestGainWarp, but returns the winning thread
// index (valid in lane 0) instead of writing it to shared memory.
__device__ uint32_t ReduceBestGainBlock(double gain, bool found, uint32_t thread_index) {
  const uint32_t mask = 0xffffffff;
  for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) {
    const bool other_found = __shfl_down_sync(mask, found, offset);
    const double other_gain = __shfl_down_sync(mask, gain, offset);
    const uint32_t other_thread_index = __shfl_down_sync(mask, thread_index, offset);
    // prefer the partner's candidate when it is valid and strictly better,
    // or when this lane has no valid candidate at all
    const bool take_other = other_found && (!found || other_gain > gain);
    if (take_other) {
      found = true;
      gain = other_gain;
      thread_index = other_thread_index;
    }
  }
  return thread_index;
}
ReduceBestGainWarp 为束内规约寻找最大增益分裂的实现,输出最大增益bin所对应的gain值--out_gain、最大增益是否满足要求--out_found、最大增益bin对应的thread索引--out_thread_index。ReduceBestGainBlock为块内寻找最佳分裂实现,只需输出block下最大增益bin对应的thread索引--thread_index。
小结
本文介绍了LightGBM在boosting过程中基于特征直方图寻找最佳分裂点的CUDA算子。至此关于《LightGBM中的CUDA算子》系列介绍告一段落。本系列针对源码中最为核心三大类的算子:基础算子、直方图、寻找最佳分裂点进行了系统性的介绍,并对每一类算子挑了一种实现进行解读。当然这三类算子在源码中还有针对不同case的兼容实现,感兴趣的读者可以移步到源码中进行更为全面的了解。