llm.c Source Code Walkthrough: The GELU Activation Function
GELU (Gaussian Error Linear Unit) is an activation function first proposed by Dan Hendrycks and Kevin Gimpel in 2016.
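For reference, the exact definition (where Φ is the standard normal CDF) and the widely used tanh approximation that this post revolves around are:

$$
\mathrm{GELU}(x) = x\,\Phi(x) = \frac{x}{2}\left(1 + \operatorname{erf}\!\left(\frac{x}{\sqrt{2}}\right)\right)
\approx \frac{x}{2}\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)
$$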
gelu_forward
Andrej Karpathy's C version of GELU uses the tanh approximation shown above rather than the exact erf form:
void gelu_forward(float* out, float* inp, int N) {
    float s = sqrtf(2.0f / M_PI);            // sqrt(2/pi), the scale inside tanh
    for (int i = 0; i < N; i++) {
        float x = inp[i];
        float cube = 0.044715f * x * x * x;  // 0.044715 * x^3
        out[i] = 0.5f * x * (1.0f + tanhf(s * (x + cube)));
    }
}
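A minimal driver, as a sketch: it assumes gelu_forward above is compiled in the same file, and the main below is purely illustrative, not part of llm.c.

#include <math.h>
#include <stdio.h>

// Assumes gelu_forward above is defined in this translation unit.

int main(void) {
    float inp[5] = {-2.0f, -1.0f, 0.0f, 1.0f, 2.0f};
    float out[5];
    gelu_forward(out, inp, 5);
    for (int i = 0; i < 5; i++) {
        printf("GELU(% .1f) = % .6f\n", inp[i], out[i]);
    }
    return 0;
}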
The PyTorch version of GELU, by contrast, uses an approximate argument to select either the exact formula ("none") or the tanh approximation ("tanh").
In the code below, M_2_SQRTPI is 2/sqrt(π), so kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 = sqrt(2) · (2/sqrt(π)) · 0.5 = sqrt(2/π), the same scale s used in the C code; M_SQRT1_2 is 1/sqrt(2), used in the exact erf form.
# Excerpt from PyTorch's Python reference implementation of gelu (torch._refs).
def gelu(a: TensorLikeType, approximate: str = "none") -> TensorLikeType:
    if not isinstance(a, TensorLike):
        # raise RuntimeError...
        ...
    M_SQRT2 = 1.41421356237309504880
    M_SQRT1_2 = 0.70710678118654752440
    M_2_SQRTPI = 1.12837916709551257390
    if approximate == "tanh":
        kBeta = M_SQRT2 * M_2_SQRTPI * 0.5  # = sqrt(2/pi)
        kKappa = 0.044715
        a_cube = a * a * a
        inner = kBeta * (a + kKappa * a_cube)
        return 0.5 * a * (1 + torch.tanh(inner))
    elif approximate == "none":
        kAlpha = M_SQRT1_2  # = 1/sqrt(2)
        return a * 0.5 * (1 + torch.erf(a * kAlpha))
    else:
        raise RuntimeError("approximate argument must be either none or tanh.")
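To get a feel for how close the two formulas are, here is a small, self-contained C sketch (not taken from llm.c or PyTorch) that evaluates the exact erf-based GELU and the tanh approximation over a range of inputs and reports the largest absolute difference; erff is the standard C99 error function.

#include <math.h>
#include <stdio.h>

// Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
static float gelu_exact(float x) {
    return 0.5f * x * (1.0f + erff(x * 0.70710678f));  // 0.7071... = 1/sqrt(2)
}

// Tanh approximation, same formula as gelu_forward above
static float gelu_tanh(float x) {
    float s = sqrtf(2.0f / (float)M_PI);
    float cube = 0.044715f * x * x * x;
    return 0.5f * x * (1.0f + tanhf(s * (x + cube)));
}

int main(void) {
    float max_diff = 0.0f;
    for (float x = -5.0f; x <= 5.0f; x += 0.01f) {
        float d = fabsf(gelu_exact(x) - gelu_tanh(x));
        if (d > max_diff) max_diff = d;
    }
    // The two curves agree to within a small tolerance over this range.
    printf("max |exact - tanh| on [-5, 5]: %g\n", max_diff);
    return 0;
}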
gelu_backward
void gelu_backward(float* dinp, float* inp, float* dout, int N) {
    float s = sqrtf(2.0f / M_PI);                     // sqrt(2/pi)
    for (int i = 0; i < N; i++) {
        float x = inp[i];
        float cube = 0.044715f * x * x * x;
        float tanh_arg = s * (x + cube);              // same argument of tanh as in the forward pass
        float tanh_out = tanhf(tanh_arg);
        float coshf_out = coshf(tanh_arg);
        float sech_out = 1.0f / (coshf_out * coshf_out);  // sech^2(tanh_arg)
        // d GELU(x) / dx for the tanh approximation
        float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * s * (1.0f + 3.0f * 0.044715f * x * x);
        dinp[i] += local_grad * dout[i];              // chain rule; note the accumulation (+=)
    }
}
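A quick way to sanity-check gelu_backward is to compare local_grad against a central finite difference of gelu_forward. The sketch below is illustrative rather than part of llm.c; it assumes both functions above are compiled in the same file, and it zero-initializes dinp because gelu_backward accumulates with +=.

#include <math.h>
#include <stdio.h>

// Assumes gelu_forward and gelu_backward above are defined in this file.

int main(void) {
    float x[3]    = {-1.5f, 0.3f, 2.0f};
    float dout[3] = {1.0f, 1.0f, 1.0f};   // upstream gradient of 1 isolates local_grad
    float dinp[3] = {0.0f, 0.0f, 0.0f};   // must start at zero: gelu_backward accumulates
    gelu_backward(dinp, x, dout, 3);

    const float h = 1e-3f;
    for (int i = 0; i < 3; i++) {
        // central finite difference of the forward pass
        float xp = x[i] + h, xm = x[i] - h;
        float yp, ym;
        gelu_forward(&yp, &xp, 1);
        gelu_forward(&ym, &xm, 1);
        float num_grad = (yp - ym) / (2.0f * h);
        printf("x=% .2f  analytic=% .6f  numeric=% .6f\n", x[i], dinp[i], num_grad);
    }
    return 0;
}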
The local gradient local_grad comes from differentiating the tanh approximation with respect to x.
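Writing u(x) = sqrt(2/π)·(x + 0.044715·x³) for the tanh argument (tanh_arg in the code), the product and chain rules give:

$$
\frac{d}{dx}\,\mathrm{GELU}_{\tanh}(x)
= \frac{d}{dx}\Bigl[\tfrac{1}{2}\,x\,\bigl(1+\tanh u(x)\bigr)\Bigr]
= \tfrac{1}{2}\bigl(1+\tanh u(x)\bigr)
+ \tfrac{1}{2}\,x\,\operatorname{sech}^{2}u(x)\,\sqrt{\tfrac{2}{\pi}}\,\bigl(1 + 3\cdot 0.044715\,x^{2}\bigr)
$$

The first term corresponds to 0.5f * (1.0f + tanh_out) and the second to x * 0.5f * sech_out * s * (1.0f + 3.0f * 0.044715f * x * x); by the chain rule, local_grad is then multiplied by the incoming gradient dout[i] and accumulated into dinp[i].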