1. Zone reclaim
As described in mm-buddy_allocator (page-frame allocation), when get_page_from_freelist finds that a zone fails its watermark check (the zone is treated as full) and the zone permits page reclaim, it calls zone_reclaim to try to free some page frames, and then acts on the returned status code, as sketched below.
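For context, here is a rough, condensed sketch (not a verbatim copy) of how get_page_from_freelist() in mm/page_alloc.c reacts to the value returned by zone_reclaim in kernels of this generation; mark, classzone_idx and alloc_flags are the caller's local variables, and the surrounding loop over zones is omitted.

	/* Condensed sketch of the caller's handling of zone_reclaim(), simplified
	 * from mm/page_alloc.c; bookkeeping and other branches are omitted. */
	ret = zone_reclaim(zone, gfp_mask, order);
	switch (ret) {
	case ZONE_RECLAIM_NOSCAN:
		/* reclaim did not even scan: try the next zone */
		continue;
	case ZONE_RECLAIM_FULL:
		/* scanned, but nothing (more) is reclaimable here: next zone */
		continue;
	default:
		/* some pages were freed: re-check the watermark and, if it now
		 * passes, allocate from this zone */
		if (zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags))
			goto try_this_zone;
		continue;
	}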
zone_reclaim is defined in mm/vmscan.c. Before doing any actual reclaim it runs a series of checks, and in the following cases it performs no reclaim and simply returns a status code:
- The number of unmapped page-cache pages in the zone is less than or equal to min_unmapped_pages, and the number of NR_SLAB_RECLAIMABLE pages is less than or equal to min_slab_pages. Returns ZONE_RECLAIM_FULL.
- Since the last reclaim, the number of pages scanned in the zone is at least six times the number of reclaimable pages. Reclaimable pages are NR_ACTIVE_FILE plus NR_INACTIVE_FILE and, if pages can still be swapped out, the anonymous pages as well. Returns ZONE_RECLAIM_FULL.
- __GFP_WAIT is not set, or the current process has the PF_MEMALLOC flag, i.e. it is already performing reclaim itself. Returns ZONE_RECLAIM_NOSCAN.
- The node the zone belongs to has CPUs and its node id differs from numa_node_id(), i.e. it is not the local node. Returns ZONE_RECLAIM_NOSCAN.
- The zone already has the ZONE_RECLAIM_LOCKED flag set, meaning another reclaim of this zone is in progress. Returns ZONE_RECLAIM_NOSCAN.
Otherwise zone_reclaim calls __zone_reclaim to do the actual work and clears the zone's ZONE_RECLAIM_LOCKED flag afterwards. A condensed sketch of the whole function follows.
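Putting the checks together, this is a condensed sketch of zone_reclaim, loosely following mm/vmscan.c of the kernel generation discussed here; statistics and vm-event counting are left out, and helper names such as zone_test_and_set_flag()/zone_clear_flag() are the ones used by 3.x kernels.

	int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
	{
		int node_id, ret;

		/* Too little page cache and too few reclaimable slab pages: give up. */
		if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
		    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
			return ZONE_RECLAIM_FULL;

		/* Already scanned far more than is reclaimable since the last reclaim. */
		if (!zone_reclaimable(zone))
			return ZONE_RECLAIM_FULL;

		/* The allocation must not sleep, or the caller is itself reclaiming. */
		if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
			return ZONE_RECLAIM_NOSCAN;

		/* Only reclaim on the local node (or on memory-only nodes). */
		node_id = zone_to_nid(zone);
		if (node_state(node_id, N_CPU) && node_id != numa_node_id())
			return ZONE_RECLAIM_NOSCAN;

		/* Someone else is already reclaiming from this zone. */
		if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
			return ZONE_RECLAIM_NOSCAN;

		ret = __zone_reclaim(zone, gfp_mask, order);
		zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

		return ret;
	}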
1.1. Data structures used by the reclaim path
Before walking through __zone_reclaim, let us look at the two data structures it uses. The first, struct scan_control, holds the state needed while scanning a zone during reclaim:
struct scan_control {
	/* Number of inactive pages scanned so far */
	unsigned long nr_scanned;
	/* Number of pages freed so far during this call to shrink_zone() */
	unsigned long nr_reclaimed;
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;
	/* Set when reclaiming for hibernation (shrink_all_memory()) */
	unsigned long hibernation_mode;
	/* This allocation context's GFP mask */
	gfp_t gfp_mask;
	/* May the reclaim write out (dirty) pages? */
	int may_writepage;
	/* May mapped pages be unmapped and reclaimed? */
	int may_unmap;
	/* May pages be swapped out as part of reclaim? */
	int may_swap;
	/* Allocation order being reclaimed for */
	int order;
	/* Scan (total_size >> priority) pages per pass */
	int priority;
	/* Anon vs. file LRU scanning ratio */
	int swappiness;
	/* The memory cgroup that hit its limit, i.e. the primary target of this reclaim */
	struct mem_cgroup *target_mem_cgroup;
	/* Nodemask of nodes allowed by the caller; NULL means all nodes may be scanned */
	nodemask_t *nodemask;
};
The second structure, struct shrink_control, carries the parameters of a slab shrink operation. __zone_reclaim fills it in and passes it to shrink_slab(), which in turn hands it to each registered shrinker, so that slab caches can be shrunk to free page frames.
struct shrink_control {
	/* GFP flags of the allocation request that triggered this shrink */
	gfp_t gfp_mask;
	/* How many objects scan_objects should scan and try to reclaim */
	unsigned long nr_to_scan;
	/* Memory nodes to shrink from */
	nodemask_t nodes_to_scan;
	/* Node currently being shrunk (for NUMA-aware shrinkers) */
	int nid;
};
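To make the role of struct shrink_control concrete, below is a minimal, purely illustrative shrinker that consumes it through the count_objects/scan_objects callbacks invoked by shrink_slab (section 3). The names demo_nr_objects, demo_cache_count, demo_cache_scan and demo_shrinker are invented for this sketch; struct shrinker, register_shrinker(), DEFAULT_SEEKS and SHRINK_STOP are the real kernel interfaces of this era.

	#include <linux/atomic.h>
	#include <linux/shrinker.h>

	/* Hypothetical object count of some cache we want to shrink under memory pressure. */
	static atomic_long_t demo_nr_objects;

	/* Report how many objects could be freed. */
	static unsigned long demo_cache_count(struct shrinker *s, struct shrink_control *sc)
	{
		return atomic_long_read(&demo_nr_objects);
	}

	/* Try to free up to sc->nr_to_scan objects and report how many were freed. */
	static unsigned long demo_cache_scan(struct shrinker *s, struct shrink_control *sc)
	{
		unsigned long freed = 0;

		while (freed < sc->nr_to_scan && atomic_long_read(&demo_nr_objects) > 0) {
			/* ... drop one cached object here ... */
			atomic_long_dec(&demo_nr_objects);
			freed++;
		}
		return freed ? freed : SHRINK_STOP;	/* SHRINK_STOP: nothing more to do */
	}

	static struct shrinker demo_shrinker = {
		.count_objects = demo_cache_count,
		.scan_objects  = demo_cache_scan,
		.seeks         = DEFAULT_SEEKS,
	};

	/* Registered shrinkers are the ones shrink_slab() walks: */
	/* register_shrinker(&demo_shrinker); */

Roughly speaking, shrink_slab() derives nr_to_scan from the value returned by count_objects() and the scan pressure (the sc.nr_scanned and lru_pages arguments seen in __zone_reclaim below), then calls scan_objects() in batches until the target is met or the shrinker reports SHRINK_STOP.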
1.2. __zone_reclaim
__zone_reclaim starts by initializing the two structures, struct scan_control and struct shrink_control; it then reclaims the zone's page cache first and the slab caches second.
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
		/* Writing out pages is allowed only with RECLAIM_WRITE */
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		/* Unmapping (and reclaiming) mapped pages is allowed only with RECLAIM_SWAP */
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		/* Swapping pages out as part of reclaim is allowed */
		.may_swap = 1,
		/* SWAP_CLUSTER_MAX = 32 */
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
		/* ZONE_RECLAIM_PRIORITY = 4: each pass scans lru_size >> 4, i.e. 1/16 of the LRU */
		.priority = ZONE_RECLAIM_PRIORITY,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
	unsigned long nr_slab_pages0, nr_slab_pages1;

	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	lockdep_set_current_reclaim_state(gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	/* More reclaimable page cache in the zone than the min_unmapped_pages threshold */
	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink zone with increasing
		 * priorities until we have enough memory freed.
		 */
		/* The lower the priority, the more pages each pass scans */
		do {
			shrink_zone(zone, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	/* More reclaimable slab pages in the zone than the min_slab_pages threshold */
	if (nr_slab_pages0 > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone. So we take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 */
		nodes_clear(shrink.nodes_to_scan);
		node_set(zone_to_nid(zone), shrink.nodes_to_scan);
		for (;;) {
			unsigned long lru_pages = zone_reclaimable_pages(zone);

			/* No reclaimable slab or very low memory pressure */
			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
				break;

			/* Freed enough memory */
			nr_slab_pages1 = zone_page_state(zone,
							NR_SLAB_RECLAIMABLE);
			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
				break;
		}

		/*
		 * Update nr_reclaimed by the number of slab pages we
		 * reclaimed from this zone.
		 */
		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
		if (nr_slab_pages1 < nr_slab_pages0)
			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	lockdep_clear_current_reclaim_state();
	return sc.nr_reclaimed >= nr_pages;
}
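As a quick illustration of the priority loop above: each pass scans lru_size >> priority pages, so every decrement of priority doubles the scan target. The small userspace program below is purely illustrative (it is not kernel code); 65536 is an arbitrary example for the zone's reclaimable LRU size.

	#include <stdio.h>

	/* Illustrative only: mirrors the "scan (total_size >> priority) pages per pass"
	 * rule of the reclaim loop; 65536 is an arbitrary example LRU size. */
	int main(void)
	{
		unsigned long lru_size = 65536;	/* reclaimable pages in the zone */
		int start_priority = 4;		/* ZONE_RECLAIM_PRIORITY in the kernel */

		for (int priority = start_priority; priority >= 0; priority--)
			printf("priority %d -> scan %lu pages\n",
			       priority, lru_size >> priority);
		return 0;
	}
	/* Output: 4096, 8192, 16384, 32768, 65536 pages -- doubling on every pass,
	 * which is why the next section says each call scans twice as many pages. */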
2. shrink_zone
When the zone's reclaimable page cache exceeds the threshold, shrink_zone is called repeatedly to reclaim page frames; because priority is decremented on every iteration, each call scans twice as many pages as the previous one.
static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;

	do {
		struct mem_cgroup *root = sc->target_mem_cgroup;
		struct mem_cgroup_reclaim_cookie reclaim = {
			.zone = zone,
			.priority = sc->priority,
		};
		struct mem_cgroup *memcg;

		nr_reclaimed = sc->nr_reclaimed;
		nr_scanned = sc->nr_scanned;

		/* Walk the memcg hierarchy below the reclaim target */
		memcg = mem_cgroup_iter(root, NULL, &reclaim);
		do {
			struct lruvec *lruvec;

			/* LRU lists of this zone for the current memcg */
			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
			sc->swappiness = mem_cgroup_swappiness(memcg);

			/* Reclaim pages from this lruvec */
			shrink_lruvec(lruvec, sc);
			/*
			 * Direct reclaim and kswapd have to scan all memory
			 * cgroups to fulfill the overall scan target for the
			 * zone.
			 *
			 * Limit reclaim, on the other hand, only cares about
			 * nr_to_reclaim pages to be reclaimed and it will
			 * retry with decreasing priority if one round over the
			 * whole hierarchy is not sufficient.
			 */
			if (!global_reclaim(sc) &&
			    sc->nr_reclaimed >= sc->nr_to_reclaim) {
				mem_cgroup_iter_break(root, memcg);
				break;
			}
			memcg = mem_cgroup_iter(root, memcg, &reclaim);
		} while (memcg);

		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
			   sc->nr_scanned - nr_scanned,
			   sc->nr_reclaimed - nr_reclaimed);
	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}
3. shrink_slab