hungtask问题处理：内核某进程/线程长期处于D状态（TASK_UNINTERRUPTIBLE），不能接收信号，只能在资源满足条件时被唤醒；如果资源超过120s仍不能满足，即发生hungtask。

1.hungtask问题现象

内核打印“INFO: task xxx:xxx blocked for more than 120 seconds.”

2.hungtask问题原因

内核某进程/线程长期处于D状态（TASK_UNINTERRUPTIBLE），不能接收信号，只能在资源满足条件时进行唤醒。但是资源超过120s不能满足，即发生hungtask。

3.hungtask基本原理

3.1 task_struct中hung task相关成员

在进行hung task分析之前，需要了解struct task_struct中的state、nvcsw、nivcsw、last_switch_count几个成员的含义。

struct task_struct {
...
    /* -1 unrunnable, 0 runnable, >0 stopped */
    volatile long state; 
...
    // nvcsw counts voluntary context switches and nivcsw counts involuntary
    // ones; their sum is the task's total number of context switches.
    unsigned long nvcsw, nivcsw; 
#ifdef CONFIG_DETECT_HUNG_TASK
    // Snapshot of the switch count used by the hung-task detector. Per the
    // original note it is written in only two places: at task creation
    // (initialised to nvcsw + nivcsw) and by khungtaskd when it refreshes it.
    unsigned long last_switch_count;
#endif
...
};

3.2 khungtaskd线程创建

watchdog()是khungtaskd线程主函数，线程每隔sysctl_hung_task_timeout_secs秒醒来一次，调用check_hung_uninterruptible_tasks()检查所有进程。

/*
 * Main loop of the khungtaskd kernel thread.
 *
 * Each pass works out how much of the current check interval is left.
 * While time remains the thread sleeps for that remainder; once the
 * interval has elapsed it scans for hung tasks (unless a reset was
 * requested since the last pass) and starts a new interval.
 *
 * The loop never terminates, so the final return is not reached.
 */
static int watchdog(void *dummy)
{
    unsigned long last_checked = jiffies;

    /* Run at nice 0, i.e. ordinary priority. */
    set_user_nice(current, 0);

    for ( ; ; ) {
        /* The timeout is re-read from the sysctl on every pass. */
        unsigned long timeout = sysctl_hung_task_timeout_secs;
        /* Time left until the next scan is due (presumably in jiffies). */
        long remaining = hung_timeout_jiffies(last_checked, timeout);

        if (remaining > 0) {
            /* Not due yet: sleep interruptibly for the time that is left. */
            schedule_timeout_interruptible(remaining);
            continue;
        }

        /*
         * Interval elapsed. Fetch and clear the reset flag; the scan is
         * skipped for this round if the flag had been set.
         */
        if (atomic_xchg(&reset_hung_task, 0) == 0)
            check_hung_uninterruptible_tasks(timeout);
        last_checked = jiffies;
    }

    return 0;
}

/*
 * Boot-time setup for the hung-task detector: hooks panic_block into the
 * panic notifier chain and starts the khungtaskd kernel thread.
 * Always returns 0.
 */
static int __init hung_task_init(void)
{
    // Register panic_block on the panic notifier chain so its callback runs on panic.
    atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
    // Create the kernel thread "khungtaskd" with watchdog() as its entry point.
    // NOTE(review): the kthread_run() result is stored but not checked for failure.
    watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");

    return 0;
}
subsys_initcall(hung_task_init);

panic_block注册到panic_notifier_list通知链表上，如果系统产生panic，那么did_panic就会被置1。

/*
 * Panic notifier callback: records that a panic has occurred by setting
 * did_panic. The arguments are unused. Returns NOTIFY_DONE.
 */
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
    /* Read by check_hung_uninterruptible_tasks() to stop further reports. */
    did_panic = 1;

    return NOTIFY_DONE;
}

/* Notifier block whose callback is hung_task_panic(); registered in hung_task_init(). */
static struct notifier_block panic_block = {
    .notifier_call = hung_task_panic,
};

3.3 检查进程是否hung

check_hung_uninterruptible_tasks()遍历内核中所有task_struct，首先判断状态是否是TASK_UNINTERRUPTIBLE。

/*
 * Walk the threads in the system and pass every one that is in
 * TASK_UNINTERRUPTIBLE state to check_hung_task().
 *
 * @timeout: hang threshold, forwarded unchanged to check_hung_task().
 *
 * The walk stops early once sysctl_hung_task_check_count tasks have been
 * visited, or when rcu_lock_break() reports that it cannot continue.
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
    /* Budget of tasks that may still be examined during this scan. */
    int tasks_left = sysctl_hung_task_check_count;
    /* Tasks that may still be visited before the RCU lock is briefly dropped. */
    int batch_left = HUNG_TASK_BATCHING;
    struct task_struct *proc, *thr;

    /*
     * If the system crashed already then all bets are off,
     * do not report extra hung tasks:
     */
    if (test_taint(TAINT_DIE) || did_panic)
        return;

    rcu_read_lock();
    for_each_process_thread(proc, thr) {
        /*
         * Exits use goto rather than break so that the whole walk is
         * left regardless of how the iteration macro nests its loops.
         */
        if (tasks_left-- == 0)
            goto unlock;
        if (--batch_left == 0) {
            batch_left = HUNG_TASK_BATCHING;
            /*
             * Avoid holding the RCU read lock for too long. Per the
             * original note, rcu_lock_break() drops the lock, yields,
             * and reports whether the current tasks still exist; if
             * they do not, the walk is abandoned.
             */
            if (!rcu_lock_break(proc, thr))
                goto unlock;
        }
        /* Only tasks in exactly TASK_UNINTERRUPTIBLE state are examined. */
        if (thr->state == TASK_UNINTERRUPTIBLE)
            check_hung_task(thr, timeout);
    }
 unlock:
    rcu_read_unlock();
}


/*
 * Decide whether task @t has been stuck since the previous scan and, if
 * so, report it.
 *
 * @t:       task to examine (the caller passes TASK_UNINTERRUPTIBLE tasks).
 * @timeout: value printed in the warning as the number of seconds blocked.
 *
 * A task counts as hung when its total context-switch count equals the
 * snapshot stored in last_switch_count, i.e. it has not been switched
 * since khungtaskd last looked at it. Depending on the sysctls this
 * prints a warning, panics, or both.
 */
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
    /* Total context switches so far: voluntary plus involuntary. */
    unsigned long total_switches = t->nvcsw + t->nivcsw;

    /*
     * Ensure the task is not frozen.
     * Also, skip vfork and any other user process that freezer should skip.
     */
    if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
        return;

    /*
     * A freshly created task that was scheduled once and then entered
     * TASK_UNINTERRUPTIBLE without ever being switched out has a count
     * of zero; it must not be checked.
     */
    if (unlikely(total_switches == 0))
        return;

    /*
     * A differing count means the task was switched at least once since
     * the snapshot was taken, so it is not hung: refresh the snapshot and
     * stop. An equal count means no switch happened in that period.
     */
    if (total_switches != t->last_switch_count) {
        t->last_switch_count = total_switches;
        return;
    }

    trace_sched_process_hang(t);

    /* Nothing more to do when both warnings and panic are disabled. */
    if (!(sysctl_hung_task_warnings || sysctl_hung_task_panic))
        return;

    /*
     * Warnings are rate-limited by a countdown: each report consumes one
     * unit of sysctl_hung_task_warnings, and none are printed at zero.
     */
    if (sysctl_hung_task_warnings) {
        sysctl_hung_task_warnings--;
        pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
            t->comm, t->pid, timeout);
        pr_err("      %s %s %.*s\n",
            print_tainted(), init_utsname()->release,
            (int)strcspn(init_utsname()->version, " "),
            init_utsname()->version);
        pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
            " disables this message.\n");
        /* Dump the task's identity, state and stack. */
        sched_show_task(t);
        /* Dump held locks (per the original note, only with lock debugging enabled). */
        debug_show_all_locks();
    }

    touch_nmi_watchdog();

    /* Optionally escalate to a panic after dumping all CPU backtraces. */
    if (sysctl_hung_task_panic) {
        trigger_all_cpu_backtrace();
        panic("hung_task: blocked tasks");
    }
}

4. 对khungtaskd的配置

通过sysctl或者在/proc/sys/kernel/中进行配置：

hung_task_panic------------------------是否在检测到hung后panic，默认值0
hung_task_check_count---------------最大检查task数量，默认值32768
hung_task_timeout_secs--------------超时时间，默认值120
hung_task_warnings--------------------打印hung warning的次数，默认值10

5.hungtask问题如何处理

5.1 可能出现问题的情况

处于io schedule
内存不足，导致多次持有shrink_node相关的锁
等待各种uninterruptible的锁

5.2 如何分析

通过log找到出现问题的cpu的栈

如果是io schedule，分析具体的存储过程，判断是否io异常导致
如果是shrink_node相关，则可能是内存不足导致
如果是等待锁，可能当前线程栈只是个受害者

分析hung住的线程的线程栈的代码
使用其他工具进行分析：例如kdump、打印锁的持有者信息