本文基于Linux 5.0, 涉及源码如下
include/linux/gfp.h
include/linux/mmzone.h
mm/page_alloc.c
kernel/sysctl.c
1. init_per_zone_wmark_min
// min_free_kbytes defaults to 1024 KiB (1 MiB); it is the total minimum-watermark
// budget that gets split across the zones.
// init_per_zone_wmark_min() clamps the computed value to the range 128 KiB ~ 64 MiB.
int min_free_kbytes = 1024;
// Value last written through the min_free_kbytes sysctl handler; stays -1 until then.
int user_min_free_kbytes = -1;
/*
 * Choose the boot-time value of min_free_kbytes, then derive the per-zone
 * watermarks and lowmem reserves from it.
 *
 * Formula: min_free_kbytes = sqrt(lowmem_kbytes * 16), clamped to
 * 128 KiB ~ 65536 KiB (64 MiB).
 *
 * Always returns 0.
 */
int __meminit init_per_zone_wmark_min(void)
{
	unsigned long lowmem_kbytes;
	int new_min_free_kbytes;

	// Page count reported by nr_free_buffer_pages() [see section 1.1],
	// converted from pages to KiB.
	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	// Integer square root of lowmem_kbytes * 16.
	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	// user_min_free_kbytes is -1 by default, so the computed value normally wins.
	if (new_min_free_kbytes > user_min_free_kbytes) {
		// Adopt the computed value.
		min_free_kbytes = new_min_free_kbytes;
		// Clamp min_free_kbytes to 128 KiB ~ 64 MiB.
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;
	} else {
		// A user-supplied value at least as large as the computed one is kept.
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
			new_min_free_kbytes, user_min_free_kbytes);
	}

	// Derive all watermarks of every zone from the total budget [see section 1.2].
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	// Set up each zone's lowmem_reserve array [see section 1.3].
	setup_per_zone_lowmem_reserve();
	setup_min_unmapped_ratio();
	setup_min_slab_ratio();

	return 0;
}
// Registered to run as a core initcall.
core_initcall(init_per_zone_wmark_min)
1.1 nr_free_buffer_pages
unsigned long nr_free_buffer_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_USER));
}
1.1.1 gfp_zone
// GFP_USER is the flag combination nr_free_buffer_pages() hands to gfp_zone().
// It names none of the four zone-modifier flags collected in GFP_ZONEMASK below.
#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
// Zone-modifier flags: each is its ___GFP_* value cast to gfp_t.
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE)
// Union of all zone-modifier flags; gfp_zone() ANDs the caller's flags with it.
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
// Zone indices. Entries inside #ifdef exist only when the matching config
// option is enabled, so the numeric value of the later entries depends on
// the kernel configuration. __MAX_NR_ZONES is the number of entries.
enum zone_type {
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};
// ZONES_SHIFT: number of bits needed to hold a zone index for the configured
// MAX_NR_ZONES (0 bits for one zone, 1 for two, 2 for up to four, 3 for up
// to eight); more than eight zones is a build error.
#if MAX_NR_ZONES < 2
#define ZONES_SHIFT 0
#elif MAX_NR_ZONES <= 2
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
#define ZONES_SHIFT 2
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif
// GFP_ZONES_SHIFT: width in bits of one GFP_ZONE_TABLE slot. With
// CONFIG_ZONE_DEVICE and at most four other zones, two bits are used
// instead of ZONES_SHIFT.
// NOTE(review): presumably because ZONE_DEVICE is never selected through
// GFP flags -- confirm against include/linux/gfp.h.
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif
// Packed lookup table from zone-modifier bits to a zone index. The slot for
// a given combination of ___GFP_* bits starts at bit offset
// (combination * GFP_ZONES_SHIFT); gfp_zone() shifts and masks to read it.
// Slot 0 (no zone modifier) holds ZONE_NORMAL.
// NOTE(review): the OPT_ZONE_* names are defined outside this excerpt --
// presumably they fall back to another zone when the named one is not
// configured; confirm.
#define GFP_ZONE_TABLE ( \
(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \
| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \
| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \
| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \
| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \
| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \
| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)
/*
 * Map the zone-modifier bits of a GFP mask to a zone index.
 *
 * The modifier bits select one GFP_ZONES_SHIFT-wide slot of GFP_ZONE_TABLE.
 * VM_BUG_ON() checks the same bits against the GFP_ZONE_BAD bitmap.
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
	const int modifiers = (__force int) (flags & GFP_ZONEMASK);
	const int slot_mask = (1 << GFP_ZONES_SHIFT) - 1;
	enum zone_type selected;

	selected = (GFP_ZONE_TABLE >> (modifiers * GFP_ZONES_SHIFT)) & slot_mask;
	VM_BUG_ON((GFP_ZONE_BAD >> modifiers) & 1);

	return selected;
}
1.1.2 nr_free_zone_pages
// Indices into zone->_watermark[]; NR_WMARK is the number of watermark levels.
enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
NR_WMARK
};
// Effective watermark of a zone: the stored level plus the zone's current
// watermark_boost. wmark_pages() takes the level index as its second argument.
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
/*
 * Walk the current node's GFP_KERNEL zonelist, limited by the zone index
 * @offset, and add up how many managed pages each visited zone has above
 * its high watermark. Zones at or below their high watermark contribute
 * nothing.
 */
static unsigned long nr_free_zone_pages(int offset)
{
	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
	unsigned long total = 0;
	struct zoneref *ref;
	struct zone *zone;

	for_each_zone_zonelist(zone, ref, zonelist, offset) {
		unsigned long managed = zone_managed_pages(zone);
		unsigned long reserved = high_wmark_pages(zone);

		if (managed <= reserved)
			continue;
		total += managed - reserved;
	}

	return total;
}
1.2 setup_per_zone_wmarks
/*
 * Recompute the watermarks of every zone.
 * A function-local static spinlock is held around the call to
 * __setup_per_zone_wmarks(), so callers of this wrapper run it one at a time.
 */
void setup_per_zone_wmarks(void)
{
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
__setup_per_zone_wmarks();
spin_unlock(&lock);
}
// Default 15000; not referenced by the other code in this excerpt.
int watermark_boost_factor __read_mostly = 15000;
// In units of 1/10000 of a zone's managed pages: __setup_per_zone_wmarks()
// uses managed_pages * watermark_scale_factor / 10000 as a lower bound for
// the gap between watermarks. The default of 10 is 0.1%.
int watermark_scale_factor = 10;
/*
 * Split min_free_kbytes across the zones and set each zone's MIN, LOW and
 * HIGH watermarks, then refresh the reserved-page totals.
 */
static void __setup_per_zone_wmarks(void)
{
// min_free_kbytes converted from KiB to pages.
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
// First pass: total managed pages of all non-highmem zones.
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone_managed_pages(zone);
}
// Second pass: set the watermarks of each zone under its zone->lock.
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
// This zone's share of pages_min, proportional to its managed pages
// relative to lowmem_pages.
tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
// Highmem zones do not use the proportional share for WMARK_MIN:
// they get managed_pages / 1024, clamped to [SWAP_CLUSTER_MAX, 128].
unsigned long min_pages;
min_pages = zone_managed_pages(zone) / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->_watermark[WMARK_MIN] = min_pages;
} else {
// Non-highmem zones use their proportional share as WMARK_MIN.
zone->_watermark[WMARK_MIN] = tmp;
}
// Gap between consecutive watermarks: the larger of a quarter of the
// proportional share and managed_pages * watermark_scale_factor / 10000.
tmp = max_t(u64, tmp >> 2,
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
// LOW = min + gap, HIGH = min + 2 * gap.
// NOTE(review): min_wmark_pages() includes zone->watermark_boost, which is
// only reset to 0 on the line after these two assignments.
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
// The high watermarks changed, so recompute totalreserve_pages [see section 1.2.2].
calculate_totalreserve_pages();
}
1.2.1 is_highmem
// Index of a zone within its node's node_zones[] array (pointer difference).
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
/*
 * Return 1 when the zone index denotes highmem: ZONE_HIGHMEM itself, or
 * ZONE_MOVABLE when zone_movable_is_highmem() says so. Return 0 otherwise,
 * and always 0 when CONFIG_HIGHMEM is not set.
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
	if (idx == ZONE_HIGHMEM)
		return 1;

	return idx == ZONE_MOVABLE && zone_movable_is_highmem();
#else
	return 0;
#endif
}
/*
 * Tell whether @zone is a highmem zone by passing its index within the node
 * to is_highmem_idx(). Always 0 when CONFIG_HIGHMEM is not set.
 */
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
	enum zone_type idx = zone_idx(zone);

	return is_highmem_idx(idx);
#else
	return 0;
#endif
}
1.2.2 calculate_totalreserve_pages
// System-wide total of reserved pages; written by calculate_totalreserve_pages().
unsigned long totalreserve_pages __read_mostly;
// 当sysctl_lowmem_reserve_ratio或者min_free_kbytes发生变化时计算总的预留内存
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat
unsigned long reserve_pages = 0
enum zone_type i, j
// 遍历每个节点
for_each_online_pgdat(pgdat) {
// pglist_data->totalreserve_pages统计节点总的预留内存
pgdat->totalreserve_pages = 0
// 遍历每个zone
for (i = 0
struct zone *zone = pgdat->node_zones + i
long max = 0
// zone->managed_pages
unsigned long managed_pages = zone_managed_pages(zone)
/* Find valid and maximum lowmem_reserve in the zone */
// 计算zone->lowmem_reserve数组中的最大值
for (j = i
if (zone->lowmem_reserve[j] > max)
max = zone->lowmem_reserve[j]
}
/* we treat the high watermark as reserved pages. */
// zone->lowmem_reserve最大值 + zone->_watermark[WMARK_HIGH]作为总的预留值
max += high_wmark_pages(zone)
// 总的预留值不能大于zone->managed_pages
if (max > managed_pages)
max = managed_pages
// 更新pglist_data->totalreserve_pages
pgdat->totalreserve_pages += max
reserve_pages += max
}
}
// 返回总的预留内存
totalreserve_pages = reserve_pages
}
1.3 setup_per_zone_lowmem_reserve
// Tunable through /proc/sys/vm/lowmem_reserve_ratio.
// Entry [idx] is the divisor setup_per_zone_lowmem_reserve() uses when it
// computes the lowmem_reserve values of zone idx; a value below 1 disables
// the reserve. Designators for config-optional zones are guarded the same
// way as the corresponding enum zone_type entries.
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};
// 设置每个zone的lowmem_reserve数组
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat
enum zone_type j, idx
// 遍历每1个node
for_each_online_pgdat(pgdat) {
// 遍历每1个zone
for (j = 0
struct zone *zone = pgdat->node_zones + j
// 每个zone可管理的页数
unsigned long managed_pages = zone_managed_pages(zone)
// 每个zone都由1个lowmem_reserve数组, 用于预留低端zone的内存
zone->lowmem_reserve[j] = 0
idx = j
// idx = 0即ZONE_DMA时不会进入while循环
// idx = 1即ZONE_NORMAL时会进入while循环
// idx = 2即ZONE_MOVEABLE时会进入while循环
while (idx) {
struct zone *lower_zone
idx--
// idx等于0: 低端内存代表ZONE_DMA
// idx等于1: 低端内存代表ZONE_NORMAL
lower_zone = pgdat->node_zones + idx
if (sysctl_lowmem_reserve_ratio[idx] < 1) {
sysctl_lowmem_reserve_ratio[idx] = 0
lower_zone->lowmem_reserve[j] = 0
} else {
// DMA zone->lowmem_reserve[1] = Normal zone->managed_pages / 256
// Normal zone->lowmem_reserve[2] = Movable zone->managed_pages / 32 = 0
// DMA zone->lowmem_reserve[2] = Normal zone->managed_pages / 256
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx]
}
// managed_pages += Normal zone->managed_pages
managed_pages += zone_managed_pages(lower_zone)
}
}
}
/* update totalreserve_pages */
// zone->lowmem_reserve发生改变时需要更新总的预留内存[同1.2.2节]
calculate_totalreserve_pages()
}
2. /proc/sys/vm/min_free_kbytes
// Excerpt of the sysctl table; the "..." lines stand for elided entries.
static struct ctl_table vm_table[] = {
...
{
// /proc/sys/vm exposes a min_free_kbytes node; writing it re-derives the
// three watermarks of every zone.
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
// Handler callback [see section 2.1].
.proc_handler = min_free_kbytes_sysctl_handler,
// Minimum accepted value is 0; no .extra2 upper bound is set here.
.extra1 = &zero,
},
...
}
2.1 min_free_kbytes_sysctl_handler
/*
 * sysctl handler for min_free_kbytes.
 *
 * Lets proc_dointvec_minmax() do the read or write and returns its error
 * code on failure. After a successful write the new value is recorded in
 * user_min_free_kbytes and the zone watermarks are recomputed. Returns 0
 * on success.
 */
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (rc != 0)
		return rc;
	if (!write)
		return 0;

	user_min_free_kbytes = min_free_kbytes;
	setup_per_zone_wmarks();
	return 0;
}
3. /proc/sys/vm/watermark_scale_factor
// Excerpt of the sysctl table; the "..." lines stand for elided entries.
static struct ctl_table vm_table[] = {
...
{
// /proc/sys/vm exposes a watermark_scale_factor node for adjusting the
// low and high watermarks of the zones.
.procname = "watermark_scale_factor",
.data = &watermark_scale_factor,
.maxlen = sizeof(watermark_scale_factor),
.mode = 0644,
// Handler callback [see section 3.1].
.proc_handler = watermark_scale_factor_sysctl_handler,
// Minimum accepted value is 1.
.extra1 = &one,
// Maximum accepted value is 1000.
.extra2 = &one_thousand,
},
...
}
3.1 watermark_scale_factor_sysctl_handler
/*
 * sysctl handler for watermark_scale_factor.
 *
 * Lets proc_dointvec_minmax() do the read or write. After a successful
 * write the zone watermarks are recomputed. Returns the error code from
 * proc_dointvec_minmax(), which is 0 on success.
 */
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (rc == 0 && write)
		setup_per_zone_wmarks();

	return rc;
}