I. Architecture and Memory Model
1. Architecture
- Current multiprocessor systems use one of two architectures:
- Non-Uniform Memory Access (NUMA): memory is divided into multiple memory nodes, and the time needed to access memory depends on the distance between the processor and the node. NUMA is the mainstream architecture for mid-range and high-end servers.
- Symmetric Multi-Processing (SMP), i.e. Uniform Memory Access (UMA): all processors take the same time to access memory. Every processor has equal status; they are unequal only during kernel initialization: "processor 0 acts as the bootstrap processor and initializes the kernel, while the other processors wait for initialization to finish."
- In practice the two can be combined into a hybrid architecture, with SMP used inside each NUMA node; the sketch below shows what per-node behavior looks like in code.
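A minimal sketch of what "per-node" means in practice, assuming a kernel-module context: alloc_pages_node(), node_distance() and pr_info() are real kernel interfaces, while alloc_on_node() is just an illustrative helper.

#include <linux/gfp.h>
#include <linux/topology.h>
#include <linux/printk.h>

/* Allocate one page (order 0) from a specific NUMA node and report
 * its distance to node 0. */
static struct page *alloc_on_node(int nid)
{
    struct page *page = alloc_pages_node(nid, GFP_KERNEL, 0);

    if (page)
        pr_info("page on node %d, distance to node 0: %d\n",
                nid, node_distance(nid, 0));
    return page;
}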
2. Memory Model
- The memory model describes the layout of physical memory as seen from the processor, and the kernel manages each model differently. The memory management subsystem supports three memory models:
- Flat Memory: the physical address space is contiguous, with no holes.
- Discontiguous Memory: the physical address space contains holes; this model can handle the holes efficiently.
- Sparse Memory: the physical address space contains holes; if memory hot-plug must be supported, sparse memory is the only possible choice.
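The models differ most visibly in how a page frame number (pfn) is converted to its page descriptor. For flat memory this is plain array indexing; the macros below mirror the kernel's FLATMEM implementation in include/asm-generic/memory_model.h (mem_map is the single global page array, ARCH_PFN_OFFSET the pfn of the first page), whereas sparse memory must first look up the mem_section that contains the pfn.

#define __pfn_to_page(pfn)   (mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)  ((unsigned long)((page) - mem_map) + \
                              ARCH_PFN_OFFSET)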
II. Three-Level Structure (Node/Zone/Page)
- The memory management subsystem describes physical memory with a three-level structure: node, zone, and page.
1. Memory Nodes
- In a NUMA system, memory nodes are divided according to the distance between processors and memory. In a UMA system with discontiguous memory, a node is the level above the zone: memory is divided by whether physical addresses are contiguous, and each block of physically contiguous memory is one memory node. Every node's memory layout is described by a pglist_data structure.
- The member node_mem_map points to the array of page descriptors, one descriptor per physical page. node_mem_map may not point to the first element of the array, because the array must be aligned to 2 to the power (MAX_ORDER-1), where (MAX_ORDER-1) is the largest order the page allocator can allocate. The annotated kernel source of pglist_data follows:
typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];          /* array of zones in this node */
    struct zonelist node_zonelists[MAX_ZONELISTS]; /* fallback zone lists */
    int nr_zones;                                  /* number of zones */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    struct page *node_mem_map;                     /* page descriptor array (all models except sparse memory) */
#ifdef CONFIG_PAGE_EXTENSION
    struct page_ext *node_page_ext;                /* extended page attributes */
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
    /*
     * Must be held any time you expect node_start_pfn,
     * node_present_pages, node_spanned_pages or nr_zones to stay constant.
     *
     * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
     * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
     * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
     *
     * Nests above zone->lock and zone->span_seqlock
     */
    spinlock_t node_size_lock;
#endif
    unsigned long node_start_pfn;                  /* starting physical page number */
    unsigned long node_present_pages;              /* total physical pages (excluding holes) */
    unsigned long node_spanned_pages;              /* total physical pages (including holes) */
    int node_id;                                   /* node identifier */
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    struct task_struct *kswapd;  /* Protected by mem_hotplug_begin/end() */
    int kswapd_order;
    enum zone_type kswapd_classzone_idx;
    int kswapd_failures;         /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
    int kcompactd_max_order;
    enum zone_type kcompactd_classzone_idx;
    wait_queue_head_t kcompactd_wait;
    struct task_struct *kcompactd;
#endif
    /*
     * This is a per-node reserve of pages that are not available
     * to userspace allocations.
     */
    unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long min_unmapped_pages;
    unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
    /* Write-intensive fields used by page reclaim */
    ZONE_PADDING(_pad1_)
    spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    /*
     * If memory initialisation on large machines is deferred then this
     * is the first PFN that needs to be initialised.
     */
    unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    spinlock_t split_queue_lock;
    struct list_head split_queue;
    unsigned long split_queue_len;
#endif
    /* Fields commonly accessed by the page reclaim scanner */
    struct lruvec lruvec;
    unsigned long flags;
    ZONE_PADDING(_pad2_)
    /* Per-node vmstats */
    struct per_cpu_nodestat __percpu *per_cpu_nodestats;
    atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
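A sketch of how pglist_data is consumed elsewhere in the kernel: for_each_online_node() and NODE_DATA() are the standard accessors (dump_nodes() itself is illustrative only):

#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

/* Walk every online node and print the page counts discussed above. */
static void dump_nodes(void)
{
    int nid;

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        pr_info("node %d: start pfn %lu, present %lu, spanned %lu\n",
                pgdat->node_id, pgdat->node_start_pfn,
                pgdat->node_present_pages, pgdat->node_spanned_pages);
    }
}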
2. Memory Zones
Each memory node is divided into memory zones. The kernel defines the zone types as follows:
enum zone_type {
#ifdef CONFIG_ZONE_DMA
    /*
     * ZONE_DMA is used when there are devices that are not able
     * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     * carve out the portion of memory that is needed for these devices.
     * The range is arch specific.
     *
     * Some examples
     *
     * Architecture         Limit
     * ---------------------------
     * parisc, ia64, sparc  <4G
     * s390, powerpc        <2G
     * arm                  Various
     * alpha                Unlimited or 0-16MB.
     *
     * i386, x86_64 and multiple other arches
     *                      <16M.
     */
    ZONE_DMA,       /* DMA zone: for direct memory access */
#endif
#ifdef CONFIG_ZONE_DMA32
    /*
     * x86_64 needs two ZONE_DMAs because it supports devices that are
     * only able to do DMA to the lower 16M but also 32 bit devices that
     * can only do DMA areas below 4G.
     */
    ZONE_DMA32,     /* only on 64-bit systems */
#endif
    /*
     * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     * performed on pages in ZONE_NORMAL if the DMA devices support
     * transfers to all addressable memory.
     */
    ZONE_NORMAL,    /* normal zone: the linearly mapped region (ARM processors need page
                       tables for this mapping, MIPS processors do not) */
#ifdef CONFIG_HIGHMEM
    /*
     * A memory area that is only addressable by the kernel through
     * mapping portions into its own address space. This is for example
     * used by i386 to allow the kernel to address the memory beyond
     * 900MB. The kernel will set up special mappings (page
     * table entries on i386) for each page that the kernel needs to
     * access.
     */
    ZONE_HIGHMEM,   /* high memory zone; on 64-bit systems the kernel virtual address
                       space is huge, so this zone is no longer needed */
#endif
    ZONE_MOVABLE,   /* movable zone: a pseudo zone used to prevent memory fragmentation */
#ifdef CONFIG_ZONE_DEVICE
    ZONE_DEVICE,    /* device zone: supports persistent memory (memory added by hot plug) */
#endif
    __MAX_NR_ZONES
};
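Callers never name a zone directly; the zone is selected by the GFP flags passed to the page allocator. A minimal sketch using the real alloc_pages()/__free_pages() APIs (grab_pages() itself is just illustrative):

#include <linux/gfp.h>

static struct page *grab_pages(void)
{
    /* __GFP_DMA constrains the allocation to ZONE_DMA */
    struct page *dma  = alloc_pages(GFP_KERNEL | __GFP_DMA, 0);
    /* plain GFP_KERNEL is normally served from ZONE_NORMAL */
    struct page *norm = alloc_pages(GFP_KERNEL, 0);

    if (dma)
        __free_pages(dma, 0);
    return norm;
}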
Each memory zone is described by a zone structure; the annotated kernel source follows:
struct zone {
    /* Read-mostly fields */
    /* zone watermarks, access with *_wmark_pages(zone) macros */
    unsigned long _watermark[NR_WMARK];  /* watermarks used by the page allocator */
    unsigned long watermark_boost;
    unsigned long nr_reserved_highatomic;
    /*
     * We don't know if the memory that we're going to allocate will be
     * freeable or/and it will be released eventually, so to avoid totally
     * wasting several GB of ram we must reserve some of the lower zone
     * memory (otherwise we risk to run OOM on the lower zones despite
     * there being tons of freeable ram on the higher zones). This array is
     * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
     * changes.
     */
    long lowmem_reserve[MAX_NR_ZONES];   /* used by the page allocator: pages this zone
                                            keeps in reserve and must not lend to higher
                                            zone types */
#ifdef CONFIG_NUMA
    int node;
#endif
    struct pglist_data *zone_pgdat;           /* points to the node's pglist_data instance */
    struct per_cpu_pageset __percpu *pageset; /* per-CPU page sets */
#ifndef CONFIG_SPARSEMEM
    /*
     * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     * In SPARSEMEM, this map is stored in struct mem_section
     */
    unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long zone_start_pfn;
    /*
     * spanned_pages is the total pages spanned by the zone, including
     * holes, which is calculated as:
     *     spanned_pages = zone_end_pfn - zone_start_pfn;
     *
     * present_pages is physical pages existing within the zone, which
     * is calculated as:
     *     present_pages = spanned_pages - absent_pages(pages in holes);
     *
     * managed_pages is present pages managed by the buddy system, which
     * is calculated as (reserved_pages includes pages allocated by the
     * bootmem allocator):
     *     managed_pages = present_pages - reserved_pages;
     *
     * So present_pages may be used by memory hotplug or memory power
     * management logic to figure out unmanaged pages by checking
     * (present_pages - managed_pages). And managed_pages should be used
     * by page allocator and vm scanner to calculate all kinds of watermarks
     * and thresholds.
     *
     * Locking rules:
     *
     * zone_start_pfn and spanned_pages are protected by span_seqlock.
     * It is a seqlock because it has to be read outside of zone->lock,
     * and it is done in the main allocator path. But, it is written
     * quite infrequently.
     *
     * The span_seq lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock. It's good to
     * give them a chance of being in the same cacheline.
     *
     * Write access to present_pages at runtime should be protected by
     * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
     * present_pages should get_online_mems() to get a stable value.
     */
    atomic_long_t managed_pages;   /* number of physical pages managed by the buddy allocator */
    unsigned long spanned_pages;   /* total pages spanned by this zone, including holes */
    unsigned long present_pages;   /* physical pages present in this zone, excluding holes */
    const char *name;              /* zone name */
#ifdef CONFIG_MEMORY_ISOLATION
    /*
     * Number of isolated pageblock. It is used to solve incorrect
     * freepage counting problem due to racy retrieving migratetype
     * of pageblock. Protected by zone->lock.
     */
    unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t span_seqlock;
#endif
    int initialized;
    /* Write-intensive fields used from the page allocator */
    ZONE_PADDING(_pad1_)
    /* free areas of different orders */
    struct free_area free_area[MAX_ORDER];
    /* zone flags, see below */
    unsigned long flags;
    /* Primarily protects free_area */
    spinlock_t lock;
    /* Write-intensive fields used by compaction and vmstats. */
    ZONE_PADDING(_pad2_)
    /*
     * When free pages are below this point, additional steps are taken
     * when reading the number of free pages to avoid per-cpu counter
     * drift allowing watermarks to be breached
     */
    unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* pfn where compaction free scanner should start */
    unsigned long compact_cached_free_pfn;
    /* pfn where async and sync compaction migration scanner should start */
    unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
    /*
     * On compaction failure, 1<<compact_defer_shift compactions
     * are skipped before trying again. The number attempted since
     * last failure is tracked with compact_considered.
     */
    unsigned int compact_considered;
    unsigned int compact_defer_shift;
    int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* Set to true when the PG_migrate_skip bits should be cleared */
    bool compact_blockskip_flush;
#endif
    bool contiguous;
    ZONE_PADDING(_pad3_)
    /* Zone statistics */
    atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
    atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
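A sketch of how these fields are read in practice: for_each_populated_zone() iterates the populated zones of all nodes, and zone_managed_pages() and min_wmark_pages() are real kernel helpers wrapping the managed_pages counter and _watermark array shown above (dump_zones() itself is illustrative only):

#include <linux/mmzone.h>
#include <linux/printk.h>

static void dump_zones(void)
{
    struct zone *zone;

    for_each_populated_zone(zone) {
        pr_info("%s: start pfn %lu, managed %lu, min watermark %lu\n",
                zone->name, zone->zone_start_pfn,
                zone_managed_pages(zone), min_wmark_pages(zone));
    }
}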
3. Physical Pages
Each physical page is described by a page structure, called the page descriptor. The node_mem_map member of a node's pglist_data instance points to the array of page descriptors for all physical pages contained in that node.
The kernel function page_to_nid returns the number of the memory node a physical page belongs to:
#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
    struct page *p = (struct page *)page;

    return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
page_zonenum returns the type of the memory zone a physical page belongs to:
static inline enum zone_type page_zonenum(const struct page *page)
{
    return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
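Combining the two helpers locates a page's zone inside its node's zone array; this is how the kernel's own page_zone() in include/linux/mm.h is defined:

static inline struct zone *page_zone(const struct page *page)
{
    /* index the node's node_zones array with the page's zone type */
    return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}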
III. Bootmem/Memblock Allocators
- The Linux kernel needs to allocate memory during early initialization, so it provides a temporary boot memory allocator. After the page allocator and block allocator have been initialized, the remaining free physical pages are handed over to the page allocator and the boot memory allocator is discarded.
1. The data structure used by the bootmem allocator:
// The member node_bootmem_map of this structure points to a bitmap with
// one bit per physical page; when a physical page is allocated, its bit
// is set to 1.
struct bootmem_data;
- Older kernel versions have the bootmem_data structure; newer versions keep only the memblock structures.
2. The data structures used by the memblock allocator:
/**
* struct memblock_type - collection of memory regions of certain type
* @cnt: number of regions
* @max: size of the allocated array
* @total_size: size of all regions
* @regions: array of regions
* @name: the memory type symbolic name
*/
// memblock type structure
struct memblock_type {
    unsigned long cnt;               /* number of regions */
    unsigned long max;               /* size of the allocated regions array */
    phys_addr_t total_size;          /* total size of all regions */
    struct memblock_region *regions; /* array of memblock regions */
    char *name;                      /* symbolic name of this memblock type */
};
/**
* struct memblock - memblock allocator metadata
* @bottom_up: is bottom up direction?
* @current_limit: physical address of the current allocation limit
* @memory: usable memory regions
* @reserved: reserved memory regions
* @physmem: all physical memory
*/
struct memblock {
    bool bottom_up;                /* allocation direction: true means allocate bottom-up from
                                      low addresses, false means top-down from high addresses */
    phys_addr_t current_limit;     /* maximum physical address that may be allocated */
    struct memblock_type memory;   /* memory type (both allocated and free memory) */
    struct memblock_type reserved; /* reserved type */
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
    struct memblock_type physmem;  /* physical memory type */
#endif
};
The difference between the physical memory type and the memory type: the memory type is a subset of the physical memory type. When booting the kernel, a kernel parameter can be used to limit the amount of usable memory; the physical memory type always contains all memory ranges, whereas the memory type contains only the usable ranges.
/**
* enum memblock_flags - definition of memory region attributes
* @MEMBLOCK_NONE: no special request
* @MEMBLOCK_HOTPLUG: hotpluggable region
* @MEMBLOCK_MIRROR: mirrored region
* @MEMBLOCK_NOMAP: don't add to kernel direct mapping
*/
enum memblock_flags {
    MEMBLOCK_NONE    = 0x0,  /* region with no special requirements */
    MEMBLOCK_HOTPLUG = 0x1,  /* hotpluggable region */
    MEMBLOCK_MIRROR  = 0x2,  /* mirrored region */
    MEMBLOCK_NOMAP   = 0x4,  /* don't add to the kernel direct mapping (the linear mapping region) */
};
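The flags are applied to ranges through dedicated helpers. A short sketch (memblock_mark_hotplug() and memblock_mark_nomap() are real memblock APIs; the address ranges and mark_regions() itself are made up for illustration):

#include <linux/memblock.h>
#include <linux/sizes.h>

static void __init mark_regions(void)
{
    /* keep a hot-pluggable bank out of permanent kernel allocations */
    memblock_mark_hotplug(0x80000000, SZ_512M);

    /* firmware-owned range: keep it out of the linear mapping */
    memblock_mark_nomap(0xfe000000, SZ_16M);
}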
/**
* struct memblock_region - represents a memory region
* @base: physical address of the region
* @size: size of the region
* @flags: memory region attributes
* @nid: NUMA node id
*/
// memblock region structure
struct memblock_region {
    phys_addr_t base;          /* starting physical address */
    phys_addr_t size;          /* size */
    enum memblock_flags flags; /* flags */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
    int nid;                   /* node number */
#endif
};
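Putting the structures together: early boot code manipulates memblock through a small set of functions. A hedged sketch of typical usage (memblock_add(), memblock_reserve() and memblock_alloc() are real APIs; the addresses, sizes and early_mem_setup() itself are made up for illustration):

#include <linux/memblock.h>
#include <linux/sizes.h>

static void __init early_mem_setup(void)
{
    void *buf;

    memblock_add(0x40000000, SZ_1G);     /* register a (hypothetical) DRAM bank as usable */
    memblock_reserve(0x40080000, SZ_2M); /* mark a range (e.g. the kernel image) as allocated */

    /* boot-time allocation: returns zeroed memory, PAGE_SIZE aligned */
    buf = memblock_alloc(SZ_64K, PAGE_SIZE);
    if (!buf)
        panic("early allocation failed\n");
}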
The ARM64 kernel initializes the memblock allocator in two steps (annotated kernel source below):
a. Parse the /memory node of the flattened device tree and add all physical memory ranges to memblock.memory.
b. Initialize memblock in the kernel function arm64_memblock_init.
void __init arm64_memblock_init(void)
{
    const s64 linear_region_size = -(s64)PAGE_OFFSET;

    /* Handle linux,usable-memory-range property */
    fdt_enforce_memory_region();

    /* Remove memory above our supported physical address size */
    memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

    /*
     * Ensure that the linear region takes up exactly half of the kernel
     * virtual address space. This way, we can distinguish a linear address
     * from a kernel/module/vmalloc address by testing a single bit.
     */
    BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));

    /*
     * Select a suitable value for the base of physical memory.
     */
    memstart_addr = round_down(memblock_start_of_DRAM(),
                               ARM64_MEMSTART_ALIGN);

    /*
     * Remove the memory that we will not be able to cover with the
     * linear mapping. Take care not to clip the kernel which may be
     * high in memory.
     */
    memblock_remove(max_t(u64, memstart_addr + linear_region_size,
                          __pa_symbol(_end)), ULLONG_MAX);
    if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
        /* ensure that memstart_addr remains sufficiently aligned */
        memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
                                 ARM64_MEMSTART_ALIGN);
        memblock_remove(0, memstart_addr);
    }

    /*
     * Apply the memory limit if it was set. Since the kernel may be loaded
     * high up in memory, add back the kernel region that must be accessible
     * via the linear mapping.
     */
    if (memory_limit != PHYS_ADDR_MAX) {
        memblock_mem_limit_remove_map(memory_limit);
        memblock_add(__pa_symbol(_text), (u64)(_end - _text));
    }

    if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
        /*
         * Add back the memory we just removed if it results in the
         * initrd to become inaccessible via the linear mapping.
         * Otherwise, this is a no-op
         */
        u64 base = phys_initrd_start & PAGE_MASK;
        u64 size = PAGE_ALIGN(phys_initrd_size);

        /*
         * We can only add back the initrd memory if we don't end up
         * with more memory than we can address via the linear mapping.
         * It is up to the bootloader to position the kernel and the
         * initrd reasonably close to each other (i.e., within 32 GB of
         * each other) so that all granule/#levels combinations can
         * always access both.
         */
        if (WARN(base < memblock_start_of_DRAM() ||
                 base + size > memblock_start_of_DRAM() +
                               linear_region_size,
                 "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
            initrd_start = 0;
        } else {
            memblock_remove(base, size); /* clear MEMBLOCK_ flags */
            memblock_add(base, size);
            memblock_reserve(base, size);
        }
    }

    if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
        extern u16 memstart_offset_seed;
        u64 range = linear_region_size -
                    (memblock_end_of_DRAM() - memblock_start_of_DRAM());

        /*
         * If the size of the linear region exceeds, by a sufficient
         * margin, the size of the region that the available physical
         * memory spans, randomize the linear region as well.
         */
        if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
            range /= ARM64_MEMSTART_ALIGN;
            memstart_addr -= ARM64_MEMSTART_ALIGN *
                             ((range * memstart_offset_seed) >> 16);
        }
    }

    /*
     * Register the kernel text, kernel data, initrd, and initial
     * pagetables with memblock.
     */
    memblock_reserve(__pa_symbol(_text), _end - _text);
    if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
        /* the generic initrd code expects virtual addresses */
        initrd_start = __phys_to_virt(phys_initrd_start);
        initrd_end = initrd_start + phys_initrd_size;
    }

    early_init_fdt_scan_reserved_mem();

    /* 4GB maximum for 32-bit only capable devices */
    if (IS_ENABLED(CONFIG_ZONE_DMA32))
        arm64_dma_phys_limit = max_zone_dma_phys();
    else
        arm64_dma_phys_limit = PHYS_MASK + 1;

    reserve_crashkernel();

    reserve_elfcorehdr();

    high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

    dma_contiguous_reserve(arm64_dma_phys_limit);
}
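As noted at the start of this section, memblock is only a stopgap: once the node/zone/page structures are ready, mem_init() hands all free ranges over to the buddy allocator. A simplified sketch of that hand-over (memblock_free_all() is a real kernel function; the actual arm64 mem_init() in arch/arm64/mm/init.c does additional work):

void __init mem_init(void)
{
    /* release every free range in memblock.memory to the page allocator;
     * after this point memblock is no longer used for allocation */
    memblock_free_all();
}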