判断 Pod 异常时，通常需要结合 phase、conditions、containerStatuses 等字段综合分析原因，下面给出一个代码示例：
const (
	// --- Substrings searched for in the message of an Unschedulable pod
	// condition, or in the resource name extracted from that message. ---

	// Insufficient marks a resource-shortage entry in the condition message.
	Insufficient = "Insufficient"
	// InsufficientCpu is the substring identifying CPU as the missing resource.
	InsufficientCpu = "cpu"
	// InsufficientMemory is the substring identifying memory as the missing resource.
	InsufficientMemory = "memory"
	// InsufficientEphemeralStorage is the substring identifying ephemeral
	// storage as the missing resource.
	InsufficientEphemeralStorage = "ephemeral-storage"
	// PersistentVolumeClaim is the substring identifying a volume-claim
	// problem in the condition message.
	PersistentVolumeClaim = "persistentvolumeclaim"

	// --- Pod condition reason. ---

	// ContainersNotReady is compared against PodCondition.Reason.
	ContainersNotReady = "ContainersNotReady"

	// --- Reasons compared against a container's Waiting state. ---

	// ErrImagePull is a waiting reason treated as an image pull failure.
	ErrImagePull = "ErrImagePull"
	// ImagePullBackOff is a waiting reason treated as an image pull failure.
	ImagePullBackOff = "ImagePullBackOff"
	// CreateContainerError is a waiting reason treated as a container creation failure.
	CreateContainerError = "CreateContainerError"
	// CreateContainerConfigError is a waiting reason treated as a container
	// creation failure.
	CreateContainerConfigError = "CreateContainerConfigError"

	// --- Reasons used once the pod has been running or has failed. ---

	// Evicted is compared against PodStatus.Reason.
	Evicted = "Evicted"
	// OOMKilled is compared against a container's Terminated state reason.
	OOMKilled = "OOMKilled"
	// CrashLoopBackOff is compared against a container's Waiting state reason.
	CrashLoopBackOff = "CrashLoopBackOff"
)
// Diagnosis is the outcome of inspecting a pod's status: an identifying
// code/message-ID pair plus the raw reason and message copied from the
// status field that triggered it.
type Diagnosis struct {
	// Code and MessageID identify which diagnosis was reached.
	Code, MessageID string
	// Reason and Message are copied verbatim from the pod condition, pod
	// status, or container state the diagnosis is based on; they may be empty.
	Reason, Message string
}
// judgeDiagnosisForPendingPod walks the pod's conditions in order and returns
// the first diagnosis that can be derived from one of them.
//
// An Unschedulable condition always produces a diagnosis. A ContainersNotReady
// condition produces one only when some container is waiting for a recognised
// reason. The result is nil when no condition yields a diagnosis.
func judgeDiagnosisForPendingPod(pod *corev1.Pod) *Diagnosis {
	for _, cond := range pod.Status.Conditions {
		var found *Diagnosis
		switch cond.Reason {
		case corev1.PodReasonUnschedulable:
			found = unschedulableDiagnosis(cond.Reason, cond.Message)
		case ContainersNotReady:
			// ContainersNotReady can also be a transient, healthy state that
			// resolves after a retry, so a nil result here is expected.
			found = waitingContainerDiagnosis(pod.Status.ContainerStatuses, cond.Reason, cond.Message)
		}
		if found != nil {
			return found
		}
	}
	return nil
}

// unschedulableDiagnosis classifies an Unschedulable condition by inspecting
// its message. It never returns nil.
func unschedulableDiagnosis(reason, message string) *Diagnosis {
	result := &Diagnosis{Reason: reason, Message: message}

	if !strings.Contains(message, Insufficient) {
		if strings.Contains(message, PersistentVolumeClaim) {
			// Scheduling blocked by a volume-claim problem.
			result.Code, result.MessageID = PodUnschedulableForPVCErrorDiagnosisCode, PodUnschedulableForPVCError
		} else {
			// Unschedulable for a reason not classified further.
			result.Code, result.MessageID = PodUnschedulableDiagnosisCode, PodUnschedulable
		}
		return result
	}

	// Scheduling blocked by a resource shortage: work out which resource.
	lacking := findInsufficientResource(message)
	if biz.ContainsGpuResource(lacking) {
		result.Code, result.MessageID = PodUnschedulableForInsufficientGpuDiagnosisCode, PodUnschedulableForInsufficientGpu
		return result
	}
	switch {
	case strings.Contains(lacking, InsufficientEphemeralStorage):
		result.Code, result.MessageID = PodUnschedulableForInsufficientEphemeralStorageDiagnosisCode, PodUnschedulableForInsufficientEphemeralStorage
	case strings.Contains(lacking, InsufficientMemory):
		result.Code, result.MessageID = PodUnschedulableForInsufficientMemoryDiagnosisCode, PodUnschedulableForInsufficientMemory
	case strings.Contains(lacking, InsufficientCpu):
		result.Code, result.MessageID = PodUnschedulableForInsufficientCpuDiagnosisCode, PodUnschedulableForInsufficientCpu
	default:
		result.Code, result.MessageID = PodUnschedulableForInsufficientDiagnosisCode, PodUnschedulableForInsufficient
	}
	return result
}

// waitingContainerDiagnosis returns a diagnosis for the first container whose
// waiting reason is recognised, or nil when there is none. reason and message
// are the values of the pod condition being examined and are copied into the
// result.
func waitingContainerDiagnosis(statuses []corev1.ContainerStatus, reason, message string) *Diagnosis {
	for i := range statuses {
		waiting := statuses[i].State.Waiting
		if waiting == nil {
			continue
		}
		switch waiting.Reason {
		case ErrImagePull, ImagePullBackOff:
			// Image pull failed.
			return &Diagnosis{
				Code:      ContainersNotReadyForImagePullErrorDiagnosisCode,
				MessageID: ContainersNotReadyForImagePullError,
				Reason:    reason,
				Message:   message,
			}
		case CreateContainerError, CreateContainerConfigError:
			result := &Diagnosis{Reason: reason, Message: message}
			if strings.Contains(waiting.Message, "volumeMount") {
				// Volume mount failed.
				result.Code, result.MessageID = ContainersNotReadyForVolumeMountErrorDiagnosisCode, ContainersNotReadyForVolumeMountError
			} else {
				// Container (config) creation failed.
				result.Code, result.MessageID = ContainersNotReadyForCreateContainerErrorDiagnosisCode, ContainersNotReadyForCreateContainerError
			}
			return result
		}
	}
	return nil
}
// insufficientResourceRe matches "<count> Insufficient <resource>" fragments.
// It is compiled once at package scope rather than on every call.
var insufficientResourceRe = regexp.MustCompile(`(\d+)\s+Insufficient\s+(\S+)`)

// findInsufficientResource extracts the name of the resource that is most
// likely the one actually missing from an Unschedulable condition message.
//
// Approach: find the number in front of every "Insufficient" entry and pick
// the resource with the smallest number. If several resources share that
// smallest number, their names are joined with a single space. Trailing ","
// and "." punctuation from the message is stripped from each name. An empty
// string is returned when the message contains no such entry.
//
// Example message:
//
//	0/1 nodes are available: 3 Insufficient cpu, 2 Insufficient ephemeral-storage,
//	1 Insufficient hygon.com/dcu, 2 Insufficient memory. preemption: 0/1 nodes are
//	available: 1 No preemption victims found for incoming pod..
//
// For this message the result is "hygon.com/dcu".
func findInsufficientResource(message string) string {
	minCount := -1 // -1 means "no entry seen yet"; \d+ never yields a negative number.
	var names []string
	for _, m := range insufficientResourceRe.FindAllStringSubmatch(message, -1) {
		count, err := strconv.Atoi(m[1])
		if err != nil {
			// The digits did not fit in an int; skip this entry.
			continue
		}
		// \S+ also swallows the separator that follows the name ("cpu,", "memory.").
		name := strings.TrimRight(m[2], ",.")
		if name == "" {
			continue
		}
		switch {
		case minCount == -1 || count < minCount:
			minCount = count
			names = append(names[:0], name)
		case count == minCount:
			names = append(names, name)
		}
	}
	return strings.Join(names, " ")
}
// judgeDiagnosisForContainerStatuses derives a diagnosis from the pod status
// reason and, failing that, from the first container status that indicates a
// problem.
//
// It is meant for two situations:
//  1. The pod phase is Running: with restartPolicy Always the phase stays
//     Running even when the workload itself is failing.
//  2. The pod phase is Failed.
//
// Container statuses are examined in order and the first match wins. The
// result is nil when the pod is not evicted and every container is neither
// terminated, crash-looping, reported as not started, nor not ready.
func judgeDiagnosisForContainerStatuses(pod *corev1.Pod) *Diagnosis {
	if pod.Status.Reason == Evicted {
		// The pod was evicted.
		return &Diagnosis{
			Code:      PodIsEvictedDiagnosisCode,
			MessageID: PodIsEvicted,
			Reason:    pod.Status.Reason,
			Message:   pod.Status.Message,
		}
	}

	// diagnoseTerminated classifies a terminated container state. It returns
	// *Diagnosis so its result can be returned directly from this function.
	diagnoseTerminated := func(term *corev1.ContainerStateTerminated) *Diagnosis {
		d := &Diagnosis{Reason: term.Reason, Message: term.Message}
		switch {
		case term.Reason == OOMKilled:
			// The process exceeded its memory limit and was OOM-killed.
			d.Code, d.MessageID = PodIsOOMKilledDiagnosisCode, PodIsOOMKilled
		case term.ExitCode == 137:
			// The process was killed (137 is conventionally 128 + SIGKILL).
			d.Code, d.MessageID = PodIsKilledDiagnosisCode, PodIsKilled
		default:
			// The process terminated for some other reason.
			d.Code, d.MessageID = PodIsTerminatedDiagnosisCode, PodIsTerminated
		}
		return d
	}

	for i := range pod.Status.ContainerStatuses {
		status := &pod.Status.ContainerStatuses[i]

		if term := status.State.Terminated; term != nil {
			return diagnoseTerminated(term)
		}

		if wait := status.State.Waiting; wait != nil && wait.Reason == CrashLoopBackOff {
			// For CrashLoopBackOff the previous termination explains the crash.
			if term := status.LastTerminationState.Terminated; term != nil {
				return diagnoseTerminated(term)
			}
			// No previous termination recorded: report the crash loop itself,
			// carrying the waiting state's reason and message.
			return &Diagnosis{
				Code:      PodIsCrashLoopBackOffDiagnosisCode,
				MessageID: PodIsCrashLoopBackOff,
				Reason:    wait.Reason,
				Message:   wait.Message,
			}
		}

		if status.Started != nil && !*status.Started {
			// Container not started yet; this may be a normal transient state.
			return &Diagnosis{Code: PodIsNotStartedDiagnosisCode, MessageID: PodIsNotStarted}
		}

		if !status.Ready {
			// Container not ready yet; this may be a normal transient state.
			return &Diagnosis{Code: PodIsNotReadyDiagnosisCode, MessageID: PodIsNotReady}
		}
	}
	return nil
}