Understanding the Linux Kernel's "Physical Memory Model" in One Article (worth bookmarking)


I. Architecture and Memory Models

1. Architecture

  • Multiprocessor systems today come in two architectures:
  1. Non-Uniform Memory Access (NUMA): physical memory is divided into multiple memory nodes, and the time needed to access a node depends on the distance between the processor and that node. NUMA is the mainstream architecture for mid-range and high-end servers.

  2. Symmetric Multi-Processing (SMP), i.e. Uniform Memory Access (UMA): all processors take the same time to access memory. Every processor is an equal peer; they differ only during kernel initialization: "processor 0 acts as the boot processor and initializes the kernel, while the other processors wait for initialization to finish."

  • In practice the two can be combined into a hybrid: SMP is used inside each NUMA node. A minimal sketch of querying the node topology follows this list.
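As a concrete illustration, here is a minimal sketch (not from the original article) of a kernel module that walks the online NUMA nodes and prints the inter-node distance matrix. It assumes a kernel built with CONFIG_NUMA; node_distance() reports the SLIT-style distance used by the scheduler and the page allocator.

    #include <linux/module.h>
    #include <linux/nodemask.h>
    #include <linux/topology.h>

    static int __init numa_dump_init(void)
    {
        int a, b;

        for_each_online_node(a)
            for_each_online_node(b)
                pr_info("node %d -> node %d: distance %d\n",
                        a, b, node_distance(a, b));
        return 0;
    }

    static void __exit numa_dump_exit(void)
    {
    }

    module_init(numa_dump_init);
    module_exit(numa_dump_exit);
    MODULE_LICENSE("GPL");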


2. Memory Models

  • A memory model is the layout of physical memory as seen from the processor's point of view, and the kernel manages each model differently. The memory management subsystem supports three memory models:
  1. Flat Memory: the physical address space is contiguous, with no holes.

  2. Discontiguous Memory: the physical address space contains holes; this model handles the holes efficiently.

  3. Sparse Memory: the physical address space contains holes; if memory hot(un)plug must be supported, sparse memory is the only choice. The pfn_to_page sketch after this list shows how the models differ in practice.
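The practical difference between the three models is how a page frame number (PFN) is converted to its page descriptor. The macros below are condensed from include/asm-generic/memory_model.h as it looked in the kernel era this article quotes (the discontiguous variant has since been removed upstream):

    /* Flat memory: one global mem_map array, plain offset arithmetic. */
    #if defined(CONFIG_FLATMEM)
    #define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))

    /* Discontiguous memory: find the node, then index its node_mem_map. */
    #elif defined(CONFIG_DISCONTIGMEM)
    #define __pfn_to_page(pfn)					\
    ({	unsigned long __pfn = (pfn);				\
    	unsigned long __nid = arch_pfn_to_nid(__pfn);		\
    	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid); \
    })

    /* Sparse memory with vmemmap: a virtually contiguous page array
       restores plain arithmetic even when physical memory has holes. */
    #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
    #define __pfn_to_page(pfn)	(vmemmap + (pfn))

    /* Classic sparse memory: look up the mem_section covering the PFN. */
    #else /* CONFIG_SPARSEMEM */
    #define __pfn_to_page(pfn)					\
    ({	unsigned long __pfn = (pfn);				\
    	struct mem_section *__sec = __pfn_to_section(__pfn);	\
    	__section_mem_map_addr(__sec) + __pfn;			\
    })
    #endif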

II. The Three-Level Structure (Node/Zone/Page)

  • The memory management subsystem describes physical memory with a three-level structure: node, zone, and page.

1. Memory Nodes

  • In a NUMA system, memory nodes are divided according to the distance between processors and memory. In a UMA system with discontiguous memory, a node is one level above the zone: memory is divided by whether its physical addresses are contiguous, and each physically contiguous block of memory is one memory node. The memory layout of a node is described by a pglist_data structure.

  • The member node_mem_map points to the array of page descriptors, one descriptor per physical page. node_mem_map may not point to the first element of the array, because the page descriptor array must be aligned to 2 to the power (MAX_ORDER-1), where MAX_ORDER-1 is the largest order the page allocator can allocate. The corresponding kernel source for pglist_data is analyzed below:

    typedef struct pglist_data {
        struct zone node_zones[MAX_NR_ZONES];           // zones of this node
        struct zonelist node_zonelists[MAX_ZONELISTS];  // fallback zone lists
        int nr_zones;                                   // number of zones
    #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
        struct page *node_mem_map;      // page descriptor array (all models except sparse)
    #ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext; // extended page attributes
    #endif
    #endif
    #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
    #endif
        unsigned long node_start_pfn;       // first physical page frame number
        unsigned long node_present_pages;   // total physical pages, excluding holes
        unsigned long node_spanned_pages;   // total physical pages, including holes
        int node_id;                        // node identifier
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd;     /* Protected by mem_hotplug_begin/end() */
        int kswapd_order;
        enum zone_type kswapd_classzone_idx;
        int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
    #ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_classzone_idx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
    #endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long totalreserve_pages;
    #ifdef CONFIG_NUMA
        /*
         * zone reclaim becomes active if more unmapped pages exist.
         */
        unsigned long min_unmapped_pages;
        unsigned long min_slab_pages;
    #endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        ZONE_PADDING(_pad1_)
        spinlock_t lru_lock;
    #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
    #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        spinlock_t split_queue_lock;
        struct list_head split_queue;
        unsigned long split_queue_len;
    #endif

        /* Fields commonly accessed by the page reclaim scanner */
        struct lruvec lruvec;
        unsigned long flags;

        ZONE_PADDING(_pad2_)

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
    } pg_data_t;
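A minimal sketch (not from the original article) of how these fields are typically consumed: walk every online node through NODE_DATA() and print its PFN range. On a UMA kernel there is a single node 0; dump_nodes is a made-up name for illustration.

    #include <linux/mmzone.h>
    #include <linux/nodemask.h>
    #include <linux/printk.h>

    static void dump_nodes(void)
    {
        int nid;

        for_each_online_node(nid) {
            pg_data_t *pgdat = NODE_DATA(nid);  /* this node's pglist_data */

            pr_info("node %d: start pfn %lu, present %lu, spanned %lu\n",
                    pgdat->node_id, pgdat->node_start_pfn,
                    pgdat->node_present_pages, pgdat->node_spanned_pages);
        }
    }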

2. Memory Zones

A memory node is divided into zones. The kernel defines the zone types as follows:

    enum zone_type {
    #ifdef CONFIG_ZONE_DMA
        /*
         * ZONE_DMA is used when there are devices that are not able
         * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
         * carve out the portion of memory that is needed for these devices.
         * The range is arch specific.
         *
         * Some examples
         *
         * Architecture         Limit
         * ---------------------------
         * parisc, ia64, sparc  <4G
         * s390, powerpc        <2G
         * arm                  Various
         * alpha                Unlimited or 0-16MB.
         *
         * i386, x86_64 and multiple other arches
         *                      <16M.
         */
        ZONE_DMA,       // DMA zone: for devices doing direct memory access
    #endif
    #ifdef CONFIG_ZONE_DMA32
        /*
         * x86_64 needs two ZONE_DMAs because it supports devices that are
         * only able to do DMA to the lower 16M but also 32 bit devices that
         * can only do DMA areas below 4G.
         */
        ZONE_DMA32,     // 64-bit systems only
    #endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,    // normal zone: the linearly mapped region (ARM needs
                        // page tables to map it, MIPS does not)
    #ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,   // highmem zone: unnecessary on 64-bit systems, whose
                        // kernel virtual address space is large enough
    #endif
        ZONE_MOVABLE,   // movable zone: a pseudo zone used to prevent memory
                        // fragmentation
    #ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,    // device zone: supports persistent memory (added via
                        // hotplug)
    #endif
        __MAX_NR_ZONES
    };
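Which zone an allocation is served from is selected by the GFP zone modifier bits in the allocation flags. A minimal sketch, assuming the standard gfp_zone() helper from <linux/gfp.h> (show_zone_selection is a made-up name):

    #include <linux/gfp.h>
    #include <linux/printk.h>

    static void show_zone_selection(void)
    {
        /* no zone modifier: highest regular zone, typically ZONE_NORMAL */
        pr_info("GFP_KERNEL   -> zone %d\n", gfp_zone(GFP_KERNEL));
        /* __GFP_DMA restricts the allocation to ZONE_DMA */
        pr_info("GFP_DMA      -> zone %d\n", gfp_zone(GFP_DMA));
        /* __GFP_HIGHMEM allows ZONE_HIGHMEM on 32-bit kernels */
        pr_info("GFP_HIGHUSER -> zone %d\n", gfp_zone(GFP_HIGHUSER));
    }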

Each memory zone is described by a zone structure; the corresponding kernel source is:

    struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];     // watermarks used by the page allocator
        unsigned long watermark_boost;
        unsigned long nr_reserved_highatomic;
        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones). This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];      // pages this zone holds back and will
                                                // not lend to higher zone types
    #ifdef CONFIG_NUMA
        int node;
    #endif
        struct pglist_data *zone_pgdat;         // the node's pglist_data instance
        struct per_cpu_pageset __percpu *pageset;   // per-CPU page sets
    #ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long *pageblock_flags;
    #endif /* CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long zone_start_pfn;
        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *      spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *      present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *      managed_pages = present_pages - reserved_pages;
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path. But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock. It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
         * present_pages should get_online_mems() to get a stable value.
         */
        atomic_long_t managed_pages;    // physical pages managed by the buddy allocator
        unsigned long spanned_pages;    // total pages spanned by the zone, including holes
        unsigned long present_pages;    // physical pages present in the zone, excluding holes
        const char *name;               // zone name
    #ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long nr_isolate_pageblock;
    #endif
    #ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t span_seqlock;
    #endif
        int initialized;

        /* Write-intensive fields used from the page allocator */
        ZONE_PADDING(_pad1_)

        /* free areas of different orders */
        struct free_area free_area[MAX_ORDER];

        /* zone flags, see below */
        unsigned long flags;

        /* Primarily protects free_area */
        spinlock_t lock;

        /* Write-intensive fields used by compaction and vmstats. */
        ZONE_PADDING(_pad2_)

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

    #if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long compact_cached_free_pfn;
        /* pfn where async and sync compaction migration scanner should start */
        unsigned long compact_cached_migrate_pfn[2];
    #endif
    #ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         */
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
        int compact_order_failed;
    #endif
    #if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool compact_blockskip_flush;
    #endif
        bool contiguous;

        ZONE_PADDING(_pad3_)
        /* Zone statistics */
        atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
    } ____cacheline_internodealigned_in_smp;
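A minimal sketch (not from the original article) of walking a node's populated zones and reading the watermarks through the *_wmark_pages() accessors mentioned in the struct comment above. It assumes a kernel new enough to have the atomic managed_pages counter shown above, so zone_managed_pages() exists; dump_zones is a made-up name:

    #include <linux/mmzone.h>
    #include <linux/printk.h>

    static void dump_zones(pg_data_t *pgdat)
    {
        struct zone *zone;

        for (zone = pgdat->node_zones;
             zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
            if (!populated_zone(zone))      /* skip zones with no pages */
                continue;
            pr_info("%s: min %lu low %lu high %lu managed %lu\n",
                    zone->name, min_wmark_pages(zone),
                    low_wmark_pages(zone), high_wmark_pages(zone),
                    zone_managed_pages(zone));
        }
    }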

3. Physical Pages

Each physical page is described by a page structure, called the page descriptor. The node_mem_map member of a node's pglist_data instance points to the array of page descriptors for all physical pages contained in that node.

The kernel function page_to_nid returns the number of the memory node a physical page belongs to; its source is:

    #ifdef NODE_NOT_IN_PAGE_FLAGS
    extern int page_to_nid(const struct page *page);
    #else
    static inline int page_to_nid(const struct page *page)
    {
        struct page *p = (struct page *)page;

        return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
    }
    #endif

page_zonenum returns the zone type a physical page belongs to:

    static inline enum zone_type page_zonenum(const struct page *page)
    {
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
    }
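Combining the two helpers: the kernel's page_zone() uses exactly these bit fields in page->flags to locate the struct zone a page belongs to. A minimal usage sketch (describe_page is a made-up name for illustration):

    #include <linux/mm.h>
    #include <linux/printk.h>

    static void describe_page(struct page *page)
    {
        struct zone *zone = page_zone(page);    /* derived from page->flags */

        pr_info("pfn %lu: node %d, zone %s\n",
                page_to_pfn(page), page_to_nid(page), zone->name);
    }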

III. The Bootmem and Memblock Allocators

  • The kernel needs to allocate memory during initialization, so it provides a temporary boot memory allocator. Once the page allocator and the block (slab) allocator have been initialized, the free physical pages are handed over to the page allocator and the boot allocator is discarded.

1. Data structure used by the bootmem allocator:

// The member node_bootmem_map of this structure points to a bitmap with
// one bit per physical page; a set bit means the page has been allocated.
struct bootmem_data;

  • Older kernels contain the bootmem_data structure; newer kernels keep only the memblock structure. The toy sketch below illustrates the bitmap idea.
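A toy illustration (plain C, not kernel code) of the bitmap behind node_bootmem_map: one bit per physical page, a set bit meaning "allocated", and a first-fit scan for allocation. All names and sizes here are made up for the example:

#include <limits.h>

#define NR_PAGES	1024
#define BITS_PER_WORD	(sizeof(unsigned long) * CHAR_BIT)

static unsigned long bootmem_map[NR_PAGES / BITS_PER_WORD];

static void set_allocated(unsigned long pfn)
{
	bootmem_map[pfn / BITS_PER_WORD] |= 1UL << (pfn % BITS_PER_WORD);
}

static int is_allocated(unsigned long pfn)
{
	return !!(bootmem_map[pfn / BITS_PER_WORD] &
		  (1UL << (pfn % BITS_PER_WORD)));
}

/* first-fit allocation of a single page, the way bootmem scanned its map */
static long alloc_one_page(void)
{
	unsigned long pfn;

	for (pfn = 0; pfn < NR_PAGES; pfn++) {
		if (!is_allocated(pfn)) {
			set_allocated(pfn);
			return (long)pfn;
		}
	}
	return -1;	/* out of memory */
}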

2. Data structures used by the memblock allocator:

/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
// Data structure describing a memory block type
struct memblock_type {
	unsigned long cnt;			// number of regions
	unsigned long max;			// size of the allocated regions array
	phys_addr_t total_size;			// total length of all regions
	struct memblock_region *regions;	// array of memory block regions
	char *name;				// symbolic name of the memory type
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 * @physmem: all physical memory
 */
struct memblock {
	bool bottom_up;			// allocation direction: true = allocate from low
					// addresses upward, false = from high addresses downward
	phys_addr_t current_limit;	// highest physical address that may be allocated
	struct memblock_type memory;	// memory type (covers allocated and free memory)
	struct memblock_type reserved;	// reserved type
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	struct memblock_type physmem;	// physical memory type
#endif
};

Difference between the physical memory type and the memory type:

The memory type is a subset of the physical memory type. When booting the kernel, a kernel parameter can limit the amount of memory available to the kernel, which shrinks the memory type; the physical memory type always covers all physical memory ranges.

/**
 * enum memblock_flags - definition of memory region attributes
 * @MEMBLOCK_NONE: no special request
 * @MEMBLOCK_HOTPLUG: hotpluggable region
 * @MEMBLOCK_MIRROR: mirrored region
 * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
 */
enum memblock_flags {
	MEMBLOCK_NONE		= 0x0,	/* region with no special requirements */
	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
	MEMBLOCK_NOMAP		= 0x4,	/* don't add to the kernel direct mapping (linear mapping) */
};

/**
 * struct memblock_region - represents a memory region
 * @base: physical address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id
 */
// Data structure describing a memory block region
struct memblock_region {
	phys_addr_t base;		// starting physical address
	phys_addr_t size;		// length
	enum memblock_flags flags;	// flags
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	int nid;			// node id
#endif
};
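A minimal sketch (not from the original article) of typical early-boot memblock usage. The addresses and sizes are made up for illustration; memblock_alloc() is the modern interface that returns zeroed, direct-mapped memory:

#include <linux/memblock.h>
#include <linux/sizes.h>

void __init example_early_setup(void)
{
	void *buf;

	/* register a hypothetical 1 GiB RAM bank starting at 2 GiB */
	memblock_add(0x80000000, SZ_1G);

	/* carve out a hypothetical 16 MiB firmware region */
	memblock_reserve(0x80000000, SZ_16M);

	/* allocate a zeroed page of boot memory before the buddy allocator exists */
	buf = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!buf)
		panic("%s: boot allocation failed\n", __func__);
}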

On ARM64 the kernel initializes the memblock allocator as follows:

a. Parse the /memory node of the flattened device tree and add all physical memory ranges to memblock.memory.
b. Initialize memblock in the kernel function arm64_memblock_init; the source is analyzed below.

void __init arm64_memblock_init(void)
{
	const s64 linear_region_size = -(s64)PAGE_OFFSET;

	/* Handle linux,usable-memory-range property */
	fdt_enforce_memory_region();

	/* Remove memory above our supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/*
	 * Ensure that the linear region takes up exactly half of the kernel
	 * virtual address space. This way, we can distinguish a linear address
	 * from a kernel/module/vmalloc address by testing a single bit.
	 */
	BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));

	/*
	 * Select a suitable value for the base of physical memory.
	 */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if it results in the
		 * initrd to become inaccessible via the linear mapping.
		 * Otherwise, this is a no-op
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_size);

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			initrd_start = 0;
		} else {
			memblock_remove(base, size); /* clear MEMBLOCK_ flags */
			memblock_add(base, size);
			memblock_reserve(base, size);
		}
	}

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		extern u16 memstart_offset_seed;
		u64 range = linear_region_size -
			    (memblock_end_of_DRAM() - memblock_start_of_DRAM());

		/*
		 * If the size of the linear region exceeds, by a sufficient
		 * margin, the size of the region that the available physical
		 * memory spans, randomize the linear region as well.
		 */
		if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
			range /= ARM64_MEMSTART_ALIGN;
			memstart_addr -= ARM64_MEMSTART_ALIGN *
					 ((range * memstart_offset_seed) >> 16);
		}
	}

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	memblock_reserve(__pa_symbol(_text), _end - _text);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();

	/* 4GB maximum for 32-bit only capable devices */
	if (IS_ENABLED(CONFIG_ZONE_DMA32))
		arm64_dma_phys_limit = max_zone_dma_phys();
	else
		arm64_dma_phys_limit = PHYS_MASK + 1;

	reserve_crashkernel();
	reserve_elfcorehdr();

	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

	dma_contiguous_reserve(arm64_dma_phys_limit);
}
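Once the page allocator is up, the boot allocator's job is done: the kernel hands all free memblock ranges over to the buddy allocator and stops using memblock for allocation. A simplified sketch of that hand-off (the exact call site is the architecture's mem_init(); details vary by kernel version):

/*
 * Simplified sketch: memblock_free_all() walks the free ranges in
 * memblock.memory (minus memblock.reserved) and releases them page by
 * page to the buddy allocator.
 */
void __init mem_init(void)
{
	/* ... architecture-specific setup elided ... */

	memblock_free_all();	/* free pages now belong to the page allocator */
}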