Android R LMKD介绍

884 阅读9分钟

LMKD 介绍

lmkd(Low Memory Killer Daemon)是低内存终止守护进程,用来监控运行中android系统内存的状态,通过终止最不必要的进程来应对内存压力较高的问题,使系统以可接受的水平运行。

启动

lmkd是属于core组服务进程,rc的配置如下:

 service lmkd /system/bin/lmkd
     class core
     user lmkd
     group lmkd system readproc
     capabilities DAC_OVERRIDE KILL IPC_LOCK SYS_NICE SYS_RESOURCE
     critical
     socket lmkd seqpacket+passcred 0660 system system
     writepid /dev/cpuset/system-background/tasks
 
 on property:lmkd.reinit=1
     exec_background /system/bin/lmkd --reinit

我们看下main函数主要做了哪些工作:

  1. 属性值的初始化函数update_props():读取设备中的属性配置,属性未设置时使用代码中给出的默认值
/*
 * Reload every lmkd tunable from its system property.  Each global is set to
 * the property's value, or to the default passed in the call when the
 * property is not set.
 */
static void update_props() {
    /* oom_adj level per vmpressure level; low level events are disabled by default */
    level_oomadj[VMPRESS_LEVEL_LOW] = property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
    level_oomadj[VMPRESS_LEVEL_MEDIUM] = property_get_int32("ro.lmk.medium", 800);
    level_oomadj[VMPRESS_LEVEL_CRITICAL] = property_get_int32("ro.lmk.critical", 0);

    debug_process_killing = property_get_bool("ro.lmk.debug", false);

    /* Upgrade/downgrade logic is disabled by default */
    enable_pressure_upgrade = property_get_bool("ro.lmk.critical_upgrade", false);
    upgrade_pressure = (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
    downgrade_pressure = (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);

    kill_heaviest_task = property_get_bool("ro.lmk.kill_heaviest_task", false);
    low_ram_device = property_get_bool("ro.config.low_ram", false);
    kill_timeout_ms = (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 100);
    use_minfree_levels = property_get_bool("ro.lmk.use_minfree_levels", false);

    /* Per-app memcg follows the low-RAM flag read above unless overridden */
    per_app_memcg = property_get_bool("ro.config.per_app_memcg", low_ram_device);

    swap_free_low_percentage =
        clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage", DEF_LOW_SWAP));

    /* Defaults below differ between low-RAM and regular devices */
    const int partial_stall_default =
        low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL;
    const int thrashing_default =
        low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING;
    const int thrashing_decay_default =
        low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY;

    psi_partial_stall_ms =
        property_get_int32("ro.lmk.psi_partial_stall_ms", partial_stall_default);
    psi_complete_stall_ms =
        property_get_int32("ro.lmk.psi_complete_stall_ms", DEF_COMPLETE_STALL);

    /* Thrashing limits: negative values are raised to 0, the decay is kept within 0..100 */
    thrashing_limit_pct =
        max(0, property_get_int32("ro.lmk.thrashing_limit", thrashing_default));
    thrashing_limit_decay_pct =
        clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay", thrashing_decay_default));

    /* The critical limit defaults to twice the thrashing limit computed above */
    thrashing_critical_pct =
        max(0, property_get_int32("ro.lmk.thrashing_limit_critical", thrashing_limit_pct * 2));
}
  1. create_android_logger 创建android event logtag
     /* Android Logger event logtags (see event.logtags) */
#define KILLINFO_LOG_TAG 10195355


# for killinfo logs
10195355 killinfo (Pid|1|5),(Uid|1|5),(OomAdj|1),(MinOomAdj|1),(TaskSize|1),(enum kill_reasons|1|5),(MemFree|1),(Cached|1),(SwapCached|1),(Buffers|1),(Shmem|1),(Unevictable|1),(SwapTotal|1),(SwapFree|1),(ActiveAnon|1),(InactiveAnon|1),(ActiveFile|1),(InactiveFile|1),(SReclaimable|1),(SUnreclaim|1),(KernelStack|1),(PageTables|1),(IonHeap|1),(IonHeapPool|1),(CmaFree|1),(MsSinceEvent|1),(MsSincePrevWakeup|1),(WakeupsSinceEvent|1),(SkippedWakeups|1)
  1. 执行init()操作,初始化epoll事件监听,创建socket “/dev/socket/lmkd” 并监听socket 连接,当有客户端连接时,回调函数ctrl_connect_handler,注意这里我们用的是用户空间的lmkd,并不是内核空间的实现,这里use_inkernel_interface 是false
    has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
    use_inkernel_interface = has_inkernel_module;

    if (use_inkernel_interface) {
        ALOGI("Using in-kernel low memory killer interface");
        if (init_poll_kernel()) {
            epev.events = EPOLLIN;
            epev.data.ptr = (void*)&kernel_poll_hinfo;
            if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
                ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
                close(kpoll_fd);
                kpoll_fd = -1;
            } else {
                maxevents++;
                /* let the others know it does support reporting kills */
                property_set("sys.lmk.reportkills", "1");
            }
        }
    } else {
        if (!init_monitors()) {
            return -1;
        }
        /* let the others know it does support reporting kills */
        property_set("sys.lmk.reportkills", "1");
    }
  1. init_monitors()函数中主要调用了init_psi_monitors函数,init_psi_monitors函数主要调用init_mp_psi函数进行psi监控初始化
/*
 * Create the PSI monitor for one vmpressure level and register it with
 * epollfd.
 *
 * Returns true on success, and also when the level has no threshold
 * configured (nothing is registered in that case).  Returns false when the
 * monitor cannot be created or registered.
 */
static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
    /* Do not register a handler if threshold_ms is not set */
    if (psi_thresholds[level].threshold_ms == 0) {
        return true;
    }

    /* Threshold and window are handed over in microseconds */
    int monitor_fd = init_psi_monitor(psi_thresholds[level].stall_type,
                                      psi_thresholds[level].threshold_ms * US_PER_MS,
                                      PSI_WINDOW_SIZE_MS * US_PER_MS);
    if (monitor_fd < 0) {
        return false;
    }

    /* The strategy flag selects which event handler serves this level */
    if (use_new_strategy) {
        vmpressure_hinfo[level].handler = mp_event_psi;
    } else {
        vmpressure_hinfo[level].handler = mp_event_common;
    }
    vmpressure_hinfo[level].data = level;

    if (register_psi_monitor(epollfd, monitor_fd, &vmpressure_hinfo[level]) < 0) {
        /* Registration failed: release the monitor created above */
        destroy_psi_monitor(monitor_fd);
        return false;
    }

    /* One more event source to wait on; remember its fd for this level */
    maxevents++;
    mpevfd[level] = monitor_fd;

    return true;
}
  1. init_psi_monitor 往节点"/proc/pressure/memory"中写入stall_type、threshold_us和window_us(window_us即PSI_WINDOW_SIZE_MS换算成微秒后的值)
    fd = TEMP_FAILURE_RETRY(open(PSI_MON_FILE_MEMORY, O_WRONLY | O_CLOEXEC));
    if (fd < 0) {
        ALOGE("No kernel psi monitor support (errno=%d)", errno);
        return -1;
    }

    switch (stall_type) {
    case (PSI_SOME):
    case (PSI_FULL):
        res = snprintf(buf, sizeof(buf), "%s %d %d",
            stall_type_name[stall_type], threshold_us, window_us);
        break;
    default:
        ALOGE("Invalid psi stall type: %d", stall_type);
        errno = EINVAL;
        goto err;
    }

    if (res >= (ssize_t)sizeof(buf)) {
        ALOGE("%s line overflow for psi stall type '%s'",
            PSI_MON_FILE_MEMORY, stall_type_name[stall_type]);
        errno = EINVAL;
        goto err;
    }

    res = TEMP_FAILURE_RETRY(write(fd, buf, strlen(buf) + 1));
    if (res < 0) {
        ALOGE("%s write failed for psi stall type '%s'; errno=%d",
            PSI_MON_FILE_MEMORY, stall_type_name[stall_type], errno);
        goto err;
    }

  1. main函数最后进入loop循环,epoll_wait等待系统内存压力上报

启动总结

总结一下启动顺序。由于这些调用都发生在同一个进程、同一个文件中,这里直接将函数作为时序图的节点:

lmkd.png

如何杀进程来释放内存

我们要区分两个handler:

  1. 用于处理和AMS的通信的handler:ctrl_data_handler
  2. 用于处理memory压力上报的handler:mp_event_common

这里我们要引入两个关于内存信息的节点,在函数mp_event_common中会先处理meminfo和zoneinfo的两个节点信息:

#define ZONEINFO_PATH "/proc/zoneinfo"
#define MEMINFO_PATH "/proc/meminfo"

// 这里解析meminfo和zoneinfo
    if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
        ALOGE("Failed to get free memory!");
        return;
    }

我们看下meminfo_parse函数主要做什么:

/*
 * Populate *mi from MEMINFO_PATH.
 *
 * The structure is zeroed first, then every newline-separated line of the
 * file is passed to meminfo_parse_line().  Returns 0 on success and -1 when
 * the file cannot be read or a line fails to parse.
 */
static int meminfo_parse(union meminfo *mi) {
    static struct reread_data file_data = {
        .filename = MEMINFO_PATH,
        .fd = -1,
    };
    char *contents;
    char *tok_state;
    char *entry;

    memset(mi, 0, sizeof(*mi));

    contents = reread_file(&file_data);
    if (contents == NULL) {
        return -1;
    }

    entry = strtok_r(contents, "\n", &tok_state);
    while (entry != NULL) {
        if (!meminfo_parse_line(entry, mi)) {
            ALOGE("%s parse error", file_data.filename);
            return -1;
        }
        entry = strtok_r(NULL, "\n", &tok_state);
    }

    /* nr_file_pages is derived here rather than parsed: cached + swap_cached + buffers */
    mi->field.nr_file_pages =
        mi->field.cached + mi->field.swap_cached + mi->field.buffers;

    return 0;
}

下面看下zoneinfo_parse函数:

    /* calculate totals fields */
    for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
        node = &zi->nodes[node_idx];
        for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
            struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
            zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
        }
        zi->total_inactive_file += node->fields.field.nr_inactive_file;
        zi->total_active_file += node->fields.field.nr_active_file;
        zi->total_workingset_refault += node->fields.field.workingset_refault;
    }

继续往下看

if (use_minfree_levels) {  // use_minfree_levels 为true
    int i;
 
    // other_free表示系统中可用内存,从meminfo和zoneinfo中算出来
    // mi.field.nr_free_pages 代表meminfo中的MemFree,表示当前系统空闲内存大小,是完全没有被占用的内存
    //zi.totalreserve_pages 是zoneinfo中的max_protection + high,其中max_protection在android中为0
    other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
    //nr_file_pages = Cached + SwapCached + Buffers 缓存页总和
    if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
        // other_file = Cached + SwapCached + Buffers - Shmem - Unevictable - SwapCached
        other_file = (mi.field.nr_file_pages - mi.field.shmem -
                      mi.field.unevictable - mi.field.swap_cached);
    } else {
        other_file = 0;
    }
     
    // 根据上面算出来的可用内存 来判断目标min_score_adj
    min_score_adj = OOM_SCORE_ADJ_MAX + 1; // 1000 + 1
    for (i = 0; i < lowmem_targets_size; i++) {
        minfree = lowmem_minfree[i];
        if (other_free < minfree && other_file < minfree) {
            min_score_adj = lowmem_adj[i];
            break;
        }
    }
 
 
    // 如果目标adj 等于1001 也就没有意义了 跳过这次kill
    if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
        if (debug_process_killing) {
            ALOGI("Ignore %s memory pressure event "
                  "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
                  level_name[level], other_free * page_k, other_file * page_k,
                  (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
        }
        return;
    }
 
    goto do_kill;
}

我们找到min_score_adj之后就goto do_kill了,这里继续调用find_and_kill_process函数:

// 这里choose_heaviest_task 初始为false(kill_heaviest_task 属性默认是false),但当遍历到的adj 小于等于PERCEPTIBLE_APP_ADJ(200)时,会把choose_heaviest_task 置为true
    for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
        struct proc *procp;
 
        if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
            /*
             * If we have to choose a perceptible process, choose the heaviest one to
             * hopefully minimize the number of victims.
             */
            choose_heaviest_task = true;
        }
 
        //  如果是choose_heaviest_task 则寻找tasksize最大的那个,否则就找到第一个就可以,proc/1586/statm
        while (true) {
            procp = choose_heaviest_task ?
                proc_get_heaviest(i) : proc_adj_lru(i);
 
            if (!procp)
                break;
 
            killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc,
                                           mi, wi, tm);
            if (killed_size >= 0) {
                if (!lmk_state_change_start) {
                    lmk_state_change_start = true;
                    stats_write_lmk_state_changed(
                            android::lmkd::stats::LMK_STATE_CHANGED__STATE__START);
                }
                break;
            }
        }
        if (killed_size) {
            break;
        }
    }
 
/*
 * Return the process in the slot for the given oom_adj whose
 * proc_get_size() is largest, or NULL when no entry reports a positive
 * size.  Entries whose size is <= 0 are passed to pid_remove() during the
 * scan.
 */
static struct proc *proc_get_heaviest(int oomadj) {
    struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
    struct adjslot_list *node = head->next;
    struct proc *heaviest = NULL;
    int heaviest_size = 0;

    while (node != head) {
        struct proc *candidate = (struct proc *)node;
        int pid = candidate->pid;
        int size = proc_get_size(pid);
        /*
         * Fetch the successor before any removal; presumably pid_remove()
         * unlinks the node, so node->next must not be read afterwards.
         */
        struct adjslot_list *following = node->next;

        if (size <= 0) {
            pid_remove(pid);
        } else if (size > heaviest_size) {
            heaviest_size = size;
            heaviest = candidate;
        }
        node = following;
    }
    return heaviest;
}
 
 
/*
 * Return the resident size of a process, read from the second field of
 * /proc/<pid>/statm (the first field, the total size, is parsed but not
 * used).  The value is in whatever unit statm reports; proc(5) documents
 * it as pages.
 *
 * Returns -1 when the file cannot be opened or read, or when the line does
 * not contain both numeric fields.  A parse failure used to fall through
 * and return 0 or a partial result; the caller in this file treats any
 * value <= 0 as "no usable size", so reporting -1 keeps it working.
 */
static int proc_get_size(int pid) {
    char path[PATH_MAX];
    char line[LINE_MAX];
    int fd;
    int rss = 0;
    int total;
    ssize_t ret;

    /* gid containing AID_READPROC required */
    snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
    fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd == -1)
        return -1;

    /* Leave room for the terminator added below */
    ret = read_all(fd, line, sizeof(line) - 1);
    /* The descriptor is not needed past this point on any path */
    close(fd);
    if (ret < 0) {
        return -1;
    }
    line[ret] = '\0';

    /* Both fields must convert, otherwise rss would not be meaningful */
    if (sscanf(line, "%d %d ", &total, &rss) != 2) {
        return -1;
    }
    return rss;
}

最后调用kill_one_process发送sigkill信号杀死进程,然后返回rss,作为size,这里放一张简单的图:

lmkdk.png

和AMS的交互

我们看下cmd

/*
 * Commands supported by LMKD.  Values are consecutive, starting at 0.
 */
enum lmk_cmd {
    LMK_TARGET       = 0, /* Associate minfree with oom_adj_score */
    LMK_PROCPRIO     = 1, /* Register a process and set its oom_adj_score */
    LMK_PROCREMOVE   = 2, /* Unregister a process */
    LMK_PROCPURGE    = 3, /* Purge all registered processes */
    LMK_GETKILLCNT   = 4, /* Get number of kills */
    LMK_SUBSCRIBE    = 5, /* Subscribe for asynchronous events */
    LMK_PROCKILL     = 6, /* Unsolicited message to subscribed clients on process kills */
    LMK_UPDATE_PROPS = 7, /* Reinit properties */
};

lmkd2.png 我们先看下第一个cmd LMK_TARGET,就是写入min free和oom_adj_score:

[sys.lmk.minfree_levels]: [18432:0,23040:100,27648:200,32256:250,55296:900,80640:950]

我们看下面这个时序图:开机过程中,wm.displayReady函数最终会调用到updateOomLevels,然后通过writeLmkd把minfree和对应的adj发送给lmkd:

lmkdtargetcmd.png

我们看下代码:

private final int[] mOomAdj = new int[] {
        FOREGROUND_APP_ADJ, VISIBLE_APP_ADJ, PERCEPTIBLE_APP_ADJ,
        PERCEPTIBLE_LOW_APP_ADJ, CACHED_APP_MIN_ADJ, CACHED_APP_LMK_FIRST_ADJ
}; // 0,100,200,250,900,950
 
 
// These are the low-end OOM level limits.  This is appropriate for an
// HVGA or smaller phone with less than 512MB.  Values are in KB.
private final int[] mOomMinFreeLow = new int[] {
        12288, 18432, 24576,
        36864, 43008, 49152
};
 
// These are the high-end OOM level limits.  This is appropriate for a
// 1280x800 or larger screen with around 1GB RAM.  Values are in KB.
private final int[] mOomMinFreeHigh = new int[] {
        73728, 92160, 110592,
        129024, 147456, 184320
};
 
// 这里scale = 1.0
 
    for (int i = 0; i < mOomAdj.length; i++) {
        int low = mOomMinFreeLow[i];
        int high = mOomMinFreeHigh[i];
        if (is64bit) {
            // Increase the high min-free levels for cached processes for 64-bit
            if (i == 4) high = (high * 3) / 2;
            else if (i == 5) high = (high * 7) / 4;
        }
        mOomMinFree[i] = (int)(low + ((high - low) * scale));
    }
 
    if (write) {
        ByteBuffer buf = ByteBuffer.allocate(4 * (2 * mOomAdj.length + 1));
        buf.putInt(LMK_TARGET);
        for (int i = 0; i < mOomAdj.length; i++) {
            buf.putInt((mOomMinFree[i] * 1024)/PAGE_SIZE); // 相当于除以4
            buf.putInt(mOomAdj[i]);
        }
 
        writeLmkd(buf, null);
        SystemProperties.set("sys.sysctl.extra_free_kbytes", Integer.toString(reserve));
        mOomLevelsSet = true;
    }

LMK_PROCPRIO用于在进程创建或oom adj更新时,把进程的oom adj设置给lmkd。调用的地方比较多,就不一一列举,调用入口是updateOomAdjLocked:


/**
 * Sends an LMK_PROCPRIO command carrying the pid, uid and adj value through
 * writeLmkd, and logs a warning when the write took longer than 250ms.
 *
 * Nothing is sent when pid is not positive or when amt is UNKNOWN_ADJ.
 */
public static void setOomAdj(int pid, int uid, int amt) {
    // A non-positive pid indicates the process is not started yet, and
    // UNKNOWN_ADJ is never forwarded; in both cases there is nothing to do.
    if (pid <= 0 || amt == UNKNOWN_ADJ) {
        return;
    }

    final long begin = SystemClock.elapsedRealtime();

    // Four ints: command id, pid, uid, adj.
    final ByteBuffer cmd = ByteBuffer.allocate(4 * 4);
    cmd.putInt(LMK_PROCPRIO);
    cmd.putInt(pid);
    cmd.putInt(uid);
    cmd.putInt(amt);
    writeLmkd(cmd, null);

    final long elapsed = SystemClock.elapsedRealtime() - begin;
    if (elapsed > 250) {
        Slog.w("ActivityManager", "SLOW OOM ADJ: " + elapsed + "ms for pid " + pid
                + " = " + amt);
    }
}

LMK_GETKILLCNT命令用来获取一个adj 区间内发生的kill次数,比如adj在0到900之间(dumpsys输出为"at or below",即包含900):

/**
 * Prints the kill count that ProcessList.getLmkdKillCount reports for the
 * adj range from 0 to oom_adj.
 *
 * @param pw      writer that receives the line
 * @param oom_adj upper adj bound passed to getLmkdKillCount
 * @return true if a count was available and printed, false if the lookup returned null
 */
private boolean reportLmkKillAtOrBelow(PrintWriter pw, int oom_adj) {
    final Integer kills = ProcessList.getLmkdKillCount(0, oom_adj);
    if (kills == null) {
        return false;
    }
    pw.println("    kills at or below oom_adj " + oom_adj + ": " + kills);
    return true;
}

我们也可以通过dumpsys命令来查看:

ACTIVITY MANAGER LMK KILLS (dumpsys activity lmk)
 Total number of kills: 0
   kills at or below oom_adj 999: 0
   kills at or below oom_adj 900: 0
   kills at or below oom_adj 800: 0
   kills at or below oom_adj 700: 0
   kills at or below oom_adj 600: 0
   kills at or below oom_adj 500: 0
   kills at or below oom_adj 400: 0
   kills at or below oom_adj 300: 0
   kills at or below oom_adj 250: 0
   kills at or below oom_adj 200: 0
   kills at or below oom_adj 100: 0
   kills at or below oom_adj 0: 0

LMK_PROCREMOVE 是应用死亡之后, AMS 向LMKD报告,移除相关的数据结构

/**
 * Handles the death of an application process (excerpt; the remainder of
 * the method is omitted in this listing).
 *
 * Runs cleanUpApplicationRecordLocked for the record; when the record is
 * not kept and the process is not restarting, the process is removed from
 * the LRU list and, if it has a positive pid, ProcessList.remove(pid) is
 * called.
 */
final void handleAppDiedLocked(ProcessRecord app,
        boolean restarting, boolean allowRestart) {
    int pid = app.pid;
    boolean kept = cleanUpApplicationRecordLocked(app, restarting, allowRestart, -1,
            false /*replacingPid*/);
    if (!kept && !restarting) {
        removeLruProcessLocked(app);
        if (pid > 0) {
            ProcessList.remove(pid); // sends the process-removal command to lmkd
        }
    }
    // (rest of the method omitted)
}

总结

  1. LMKD 初始化时,读取系统属性来确定自己的参数,然后注册两个handler,分别用于处理内核的内存压力上报和AMS发来的cmd
  2. LMKD接收内核内存压力上报,然后结合当前内存的状态,根据minfree配置参数去杀进程
  3. AMS 负责Android侧的进程管理,调整adj的时候通过cmd发送给LMKD,维护在LMKD的链表中