记一次内存分配失败导致的自动重启

1,536 阅读9分钟

1、问题现象

  1. 机器开机进行日常测试3小时
  2. USB wifi模块插入机器后,机器自动重启

2、kernel log文件

全部贴上来会很长,在下面一点点单独解释。

3、从log来看,重启是由内核空指针解引用(NULL pointer dereference)导致

Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
Mem abort info:
ESR = 0x96000006
EC = 0x25: DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
Data abort info:
ISV = 0, ISS = 0x00000006
CM = 0, WnR = 0
user pgtable: 4k pages, 39-bit VAs, pgdp=000000000d701000
[0000000000000000] pgd=000000000d702003, pud=000000000d702003, pmd=0000000000000000
Internal error: Oops: 96000006 [#1] PREEMPT SMP

4、可能造成空指针的原因

继续往上查看log,发现插入USB wifi模块后,存在内存分配失败的现象,可能是内存分配失败导致使用时出现空指针。

kworker/0:0: page allocation failure: order:4, mode:0xa20(GFP_ATOMIC), nodemask=(null),cpuset=/,mems_allowed=0
CPU: 0 PID: 5171 Comm: kworker/0:0 Tainted: P        WC O      5.4.125-android11-2-g6d2ca40337f4-dirty-ab2800 #1
Hardware name: Amlogic (DT)
Workqueue: usb_hub_wq hub_event
Call trace:
[ffffffc0200eb240+  64][<ffffffc0100c63b4>] dump_backtrace+0x0/0x18c
[ffffffc0200eb280+  32][<ffffffc0100c6564>] show_stack+0x24/0x34
[ffffffc0200eb2a0+ 160][<ffffffc010dce124>] dump_stack+0xbc/0x108
[ffffffc0200eb340+ 224][<ffffffc0102c6624>] warn_alloc+0xd8/0x128
[ffffffc0200eb420+ 144][<ffffffc0102c78e0>] __alloc_pages_slowpath+0xb10/0xb3c
[ffffffc0200eb4b0+  80][<ffffffc0102c69b8>] __alloc_pages_nodemask+0x2d4/0x35c
[ffffffc0200eb500+  96][<ffffffc01029e8d8>] kmalloc_order+0x5c/0x1a8
[ffffffc0200eb560+  96][<ffffffc01029ea64>] kmalloc_order_trace+0x40/0x108
[ffffffc0200eb5c0+ 144][<ffffffc00a72fd30>] glSetHifInfo+0x670/0x7c0 [wlan_mt7663_usb]
[ffffffc0200eb650+  96][<ffffffc00a6c5e70>] cleanup_module+0x10bc/0x2438 [wlan_mt7663_usb]
[ffffffc0200eb6b0+  48][<ffffffc00a72e434>] glRegisterBus+0x1b4/0x978 [wlan_mt7663_usb]

4.1、查看代码

根据打印的堆栈,问题出在函数glSetHifInfo()中,查看源码可知,glSetHifInfo中会使用kmalloc分配连续的物理内存。

prUsbReq->prBufCtrl->pucBuf = kmalloc(USB_TX_CMD_BUF_SIZE, GFP_ATOMIC);

5、kmalloc介绍

kmalloc用来在内核中分配在物理上连续的内存,虚拟地址自然也是连续的。

kmalloc使用GFP_ATOMIC、GFP_KERNEL分配内存的区别?
  • GFP_ATOMIC:分配内存的过程是一个原子过程,分配内存的过程不会被(高优先级进程或中断)打断,可能返回失败;
  • GFP_KERNEL:正常分配内存,可能会阻塞;
  • GFP_DMA —— 给 DMA 控制器分配内存,需要使用该标志(DMA要求分配虚拟地址和物理地址连续)。
/* GFP flag definitions as quoted from the kernel under analysis
 * (include/linux/gfp.h); the hex values match mode:0xa20 in the log. */
#define GFP_ATOMIC	    (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) //0xa20u
#define GFP_KERNEL	    (__GFP_RECLAIM | __GFP_IO | __GFP_FS) //0xcc0u
#define GFP_DMA	            __GFP_DMA //0x01u

#define __GFP_HIGH	            ((__force gfp_t)___GFP_HIGH) //indicates a higher-priority allocation
#define __GFP_IO	            ((__force gfp_t)___GFP_IO)  //NOTE(review): this flag allows starting physical I/O; the original note ("caller may not reclaim or sleep") looks wrong -- confirm against include/linux/gfp.h
#define __GFP_FS	            ((__force gfp_t)___GFP_FS)
#define __GFP_RECLAIM               ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define __GFP_KSWAPD_RECLAIM        ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
#define __GFP_DMA	            ((__force gfp_t)___GFP_DMA)

#define ___GFP_DMA		    0x01u
#define ___GFP_HIGHMEM		    0x02u
#define ___GFP_DMA32		    0x04u
#define ___GFP_MOVABLE		    0x08u
#define ___GFP_RECLAIMABLE	    0x10u
#define ___GFP_HIGH		    0x20u
#define ___GFP_IO		    0x40u
#define ___GFP_FS		    0x80u
#define ___GFP_ZERO		    0x100u
#define ___GFP_ATOMIC		    0x200u
#define ___GFP_DIRECT_RECLAIM	    0x400u
#define ___GFP_KSWAPD_RECLAIM	    0x800u
为什么使用GFP_ATOMIC参数申请内存呢?

结合出现问题的场景:当插入USB wifi模块时,hub驱动的中断处理程序检测到port端口有设备插入,并调度工作队列;随后hub_event()在工作队列(kworker,进程上下文)中被回调,开始进行usb枚举(参见linux USB子系统详解)。驱动在这里使用GFP_ATOMIC分配内存,以保证分配过程不睡眠、不等待;不过由于工作队列运行在进程上下文,实际上也可以使用GFP_KERNEL进行分配。

6、分析log

"kworker/0:0: page allocation failure......"是warn_alloc()的输出,表示无法满足分配2^order个连续页面的请求。warn_alloc()会被如下函数调用:__alloc_pages_slowpath()、__vmalloc_area_node()、__vmalloc_node_range()。

warn_alloc()打印的含义
//path: mm/page_alloc.c
//NOTE: abridged excerpt -- "......" marks code elided by the article.
//On allocation failure the slowpath calls warn_alloc(), which produces the
//"page allocation failure: order:%u" line seen in the kernel log above.
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac)
{
    ......
    alloc_flags = gfp_to_alloc_flags(gfp_mask);
    ......
fail:
    warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);
got_pg:
    return page;
}

/*
 * Rate-limited report for a failed page allocation: at most one report
 * every 10 seconds (DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1)), and
 * nothing at all for __GFP_NOWARN allocations.
 */
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;
	static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);

	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
       //prints the failing task's name (current->comm), gfp mask and nodemask
	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
			current->comm, &vaf, gfp_mask, &gfp_mask,
			nodemask_pr_args(nodemask));
	va_end(args);

	cpuset_print_current_mems_allowed();
	pr_cont("\n");
       //dump the kernel call stack of the failing allocation
	dump_stack();
	warn_alloc_show_mem(gfp_mask, nodemask);
}

/*
 * Decide how much memory information to print (drop the per-node filter
 * when the allocation could not be confined to the current node), then
 * dump it via show_mem().
 */
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;

	if (!(gfp_mask & __GFP_NOMEMALLOC))
		if (tsk_is_oom_victim(current) ||
		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
			filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
		filter &= ~SHOW_MEM_FILTER_NODES;
       //dump the memory statistics -- this is the key output analyzed below
	show_mem(filter, nodemask);
}

//lib/show_mem.c
/*
 * Print a platform-wide memory summary: per-node/zone free areas via
 * show_free_areas(), then total / highmem / reserved page counts
 * accumulated from every populated zone.
 */
void show_mem(unsigned int filter, nodemask_t *nodemask)
{
	pg_data_t *pgdat;
	unsigned long total = 0, reserved = 0, highmem = 0;

	printk("Mem-Info:\n");
	show_free_areas(filter, nodemask);

	for_each_online_pgdat(pgdat) {
		int zoneid;

		for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
			struct zone *zone = &pgdat->node_zones[zoneid];
			if (!populated_zone(zone))
				continue;

			total += zone->present_pages;
			/* reserved = present but not managed by the buddy allocator */
			reserved += zone->present_pages - zone_managed_pages(zone);

			if (is_highmem_idx(zoneid))
				highmem += zone->present_pages;
		}
	}
       //platform-wide page statistics: total pages, reserved, cma, etc.
	printk("%lu pages RAM\n", total);
	printk("%lu pages HighMem/MovableOnly\n", highmem);
	printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMA
	printk("%lu pages cma reserved\n", totalcma_pages);
#endif
#ifdef CONFIG_MEMORY_FAILURE
	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
}
show_mem()打印内容详解

如下是出问题时,kernel中打印出来的内存信息:

//show_mem()打印内容详解:
Mem-Info:
//显示所有node的统计信息
active_anon:152878 inactive_anon:92312 isolated_anon:0
active_file:55513 inactive_file:50038 isolated_file:0
unevictable:585 dirty:46 writeback:0 unstable:0
slab_reclaimable:13087 slab_unreclaimable:23878 slab_unreclaimable_Order:2154
mapped:74780 shmem:6497 pagetables:11885 bounce:0
[cma] driver:46165 anon:36301 file:10015 isolate:1939456 total:242688
 free:158325 free_pcp:1023 free_cma:145990
//分别显示不同node的统计信息,只有一个node
Node 0 active_anon:611512kB inactive_anon:369248kB active_file:222052kB inactive_file:200152kB 
unevictable:2340kB isolated(anon):0kB isolated(file):0kB mapped:299120kB dirty:184kB writeback:0kB 
shmem:25988kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
//分别显示所有zone的统计信息
DMA32 free:633300kB min:10240kB low:21996kB high:32740kB active_anon:611512kB inactive_anon:369248kB 
active_file:222052kB inactive_file:200152kB unevictable:2340kB writepending:184kB present:3670016kB 
managed:3582536kB mlocked:2340kB kernel_stack:32112kB shadow_call_stack:2008kB pagetables:47540kB 
bounce:0kB free_pcp:4092kB local_pcp:860kB free_cma:583960kB
lowmem_reserve[]: 0 0 0
//显示所有zone下不同order空闲数目统计信息
//'U':不可移动
//'M':可移动
//'E':可回收
//'H':等同于MIGRATE_PCPTYPES
//'C':CMA区域页面
DMA32: 1587*4kB (UMEC) 1959*8kB (UMEC) 1803*16kB (UMEC) 676*32kB (UMEC) 305*64kB (C) 86*128kB (C) 
19*256kB (C) 10*512kB (C) 26*1024kB (C) 5*2048kB (C) 118*4096kB (C) = 633204kB
//AML添加的各migratetype空闲的内存
Free_Unmovable:7440
Free_Movable:4775
Free_Reclaimable:96
Free_CMA:145990
Free_HighAtomic:0
Free_Isolate:0
//总的文件缓存页面数量
112537 total pagecache pages
//显示swap cache统计信息
52 pages in swap cache
Swap cache stats: add 17548, delete 17497, find 26/64
Free swap  = 453884kB
Total swap = 524284kB
//整个平台的页面统计信息:所有页面数、reserved、cma等等
917504 pages RAM
0 pages HighMem/MovableOnly
21870 pages reserved
242688 pages cma reserved

7、内存分配失败原因

从log中可以看到: kworker/0:0: page allocation failure: order:4, mode:0xa20(GFP_ATOMIC), nodemask=(null),cpuset=/,mems_allowed=0

系统通过kmalloc分配内存时,不大于8KB的请求由slub分配器处理;大于8KB的请求则直接交给伙伴系统(buddy)分配。这里在分配order:4(4KB * 2^4 = 64KB)的内存时,buddy分配失败。

查看zone下内存的空闲情况: DMA32: 1587*4kB (UMEC) 1959*8kB (UMEC) 1803*16kB (UMEC) 676*32kB (UMEC) 305*64kB (C) 86*128kB (C) 19*256kB (C) 10*512kB (C) 26*1024kB (C) 5*2048kB (C) 118*4096kB (C) = 633204kB

在DMA32中,64kb的内存块剩下305,但是都是"C",表示都是CMA的内存。

CMA还剩下这么大的内存,为什么申请还是失败呢?

在CMA存在的情况下根据migratetype决定是否可用CMA区域,而gfp_mask决定了申请页面的migratetype。在__alloc_pages_slowpath()中,gfp_to_alloc_flags()来进行gfp_mask和migrate转换。此问题中,

/*
 * Translate the caller's gfp_mask into internal ALLOC_* flags.  Key point
 * for this bug: ALLOC_CMA is only set when the mask maps to
 * MIGRATE_MOVABLE, so GFP_ATOMIC (and GFP_KERNEL) requests can never be
 * satisfied from CMA pageblocks.
 */
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
	//carry __GFP_HIGH straight over as ALLOC_HIGH (same bit value)
	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);

	if (gfp_mask & __GFP_ATOMIC) {
		if (!(gfp_mask & __GFP_NOMEMALLOC))
			alloc_flags |= ALLOC_HARDER;
		alloc_flags &= ~ALLOC_CPUSET;
	} else if (unlikely(rt_task(current)) && !in_interrupt())
		alloc_flags |= ALLOC_HARDER;
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		alloc_flags |= ALLOC_KSWAPD;
#ifdef CONFIG_CMA
        //Convert gfp_mask to a migratetype and check for MIGRATE_MOVABLE;
        //only then may the allocation fall back to the CMA area.  In other
        //words, gfp_mask must contain __GFP_MOVABLE to use CMA memory.
        //MIGRATE_MOVABLE is 1; in this bug gfpflags_to_migratetype()
        //returns 0, so ALLOC_CMA is never set.
	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;
#endif
	return alloc_flags;
}

/*
 * Page-block mobility classes; buddy free lists are kept per migratetype,
 * which is why /proc/pagetypeinfo and the show_mem() output above break
 * free pages down by these categories.
 */
enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RECLAIMABLE,
#ifdef CONFIG_CMA
	MIGRATE_CMA,
#endif
	MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_MEMORY_ISOLATION
	MIGRATE_ISOLATE,	/* can't allocate from here */
#endif
	MIGRATE_TYPES
};

#define ___GFP_RECLAIMABLE	        0x10u
#define ___GFP_MOVABLE		0x08u
#define __GFP_RECLAIMABLE          ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_MOVABLE	        ((__force gfp_t)___GFP_MOVABLE)
#define GFP_MOVABLE_MASK           (__GFP_RECLAIMABLE|__GFP_MOVABLE) //0x18u
#define GFP_MOVABLE_SHIFT          3
//in this failure: gfp_flags = GFP_ATOMIC = 0xa20u
/*
 * Extract the migratetype encoded in the mobility bits (0x18) of
 * gfp_flags.  GFP_ATOMIC (0xa20u) has neither __GFP_MOVABLE nor
 * __GFP_RECLAIMABLE set, so it maps to MIGRATE_UNMOVABLE (0).
 */
static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
	VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
	BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
	BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

       //(0xa20u & 0x18u) >> 3 = 0 -> MIGRATE_UNMOVABLE
	return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}

从DMA32区域空闲页面可以看出,有305个64kb块空闲,但是属于CMA区域。所以申请不成功。

那么GFP_KERNEL申请CMA的区域会成功吗?

如果要gfpflags_to_migratetype()返回1(MIGRATE_MOVABLE),则gfp_flags需要包含___GFP_MOVABLE。而GFP_KERNEL = (__GFP_RECLAIM | __GFP_IO | __GFP_FS) = 0xcc0u,同样不包含___GFP_MOVABLE,所以改用GFP_KERNEL申请也无法使用CMA区域,同样会分配失败。

故问题的根本原因为:虽然存在很多64kb大小的空闲内存,但是这些内存全是CMA区域,kmalloc()无法使用。

抓取正常开机时候的pagetypeinfo,可看到开机时order为4的内存块是有内存可供分配的。所以这里怀疑是内存泄漏,可通过脚本定期跟踪MemFree的变化来确认。

console:/ # cat /proc/pagetypeinfo
Page block order: 10
Pages per block:  1024

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
Node    0, zone    DMA32, type    Unmovable    123    149     95     66     45     23     13      5      9      0      0
Node    0, zone    DMA32, type      Movable     14     10     23      9      1      2      6      2      2      0      0
Node    0, zone    DMA32, type  Reclaimable      0      0      1     27     30     16      7      0      0      0      0
Node    0, zone    DMA32, type          CMA      0      1      0      1      1      1      1      3      3      2    170
Node    0, zone    DMA32, type   HighAtomic      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone    DMA32, type      Isolate      0      0      0      0      0      0      0      0      0      0      0

Number of blocks type     Unmovable      Movable  Reclaimable          CMA   HighAtomic      Isolate
Node 0, zone    DMA32          150          495           14          237            0            0

针对此问题措施如下:

  • 临时措施:在开机时预先分配好该内存,避免发生内存泄漏后再插入模块时申请不到;同时在每次申请后判断返回的指针是否为NULL,为NULL则及时return,防止后续解引用空指针导致系统崩溃。
  • 永久措施:找到内存泄漏的地方。