1. 内存区域的回收操作

mm-buddy_allocator 分配页框中讲到的 get_page_from_freelist 函数,在 zone 的空闲页框数不满足 watermark 被视为 full 时,并且 zone 允许进行页框回收,就会调用 zone_reclaim 试着释放一些页框。

zone_reclaim 定义在 mm/vmscan.c 中,在执行回收操作之前,会进行一系列的判断。属于以下情况时,不进行回收操作,直接返回状态码:

  1. zone 中未映射的页框数小于等于 min_unmapped_pages ,并且属于 NR_SLAB_RECLAIMABLE 的页框数小于等于 min_slab_pages
    这种情况返回 ZONE_RECLAIM_FULL

  2. 自从上次回收操作过后,zone 中扫描过的页面数不小于可回收的页面数的 6 倍。可回收的页面数包括 NR_ACTIVE_FILE 和 NR_INACTIVE_FILE ;如果有页面被 swapped-out,还包括匿名页。
    这种情况直接返回 ZONE_RECLAIM_FULL

  3. 没有设置 __GFP_WAIT,或者当前进程设置了 PF_MEMALLOC 标志(说明当前进程本身已处于内存回收路径中,避免递归回收)。
    这种情况返回 ZONE_RECLAIM_NOSCAN

  4. zone 所属的节点有 CPU,并且节点的 id 不等于 numa_node_id
    返回 ZONE_RECLAIM_NOSCAN

  5. zone 设置了 ZONE_RECLAIM_LOCKED 标志。
    返回 ZONE_RECLAIM_NOSCAN

否则, zone_reclaim 先设置 zone 的 ZONE_RECLAIM_LOCKED 标志,再调用 __zone_reclaim 函数执行回收操作,回收完成后清除该标志。

1.1. 回收操作所需数据结构

介绍 __zone_reclaim 函数之前,先介绍一下函数执行过程中用到的两个数据结构。

第一个数据结构 struct scan_control 保存回收过程中,扫描 zone 时所需要的信息:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * Per-invocation state for a reclaim scan.  Filled in by the caller
 * (e.g. __zone_reclaim) and threaded through shrink_zone/shrink_lruvec.
 */
struct scan_control {
    /* Number of inactive page frames scanned so far */
    unsigned long nr_scanned;

    /* Number of page frames freed during shrink_zone */
    unsigned long nr_reclaimed;

    /* Number of page frames shrink_list should reclaim */
    unsigned long nr_to_reclaim;

    /* NOTE(review): presumably set when reclaiming for hibernation;
     * not used in the zone-reclaim path shown here — confirm */
    unsigned long hibernation_mode;

    /* Allocation flags of the request that triggered reclaim */
    gfp_t gfp_mask;

    /* May the reclaim operation write dirty pages out? */
    int may_writepage;

    /* May unmapped page frames be reclaimed? */
    int may_unmap;

    /* May pages be swapped out during reclaim?
     * (initialized to 1 by __zone_reclaim) */
    int may_swap;

    /* Allocation order being reclaimed for */
    int order;

    /* Each pass scans ( total_size >> priority ) page frames */
    int priority;

    /* Scan balance between the anonymous and file LRU lists */
    int swappiness;

    /* Memory cgroup that hit its limit, i.e. the target of this
     * reclaim operation (NULL for global reclaim) */
    struct mem_cgroup *target_mem_cgroup;

    /* Node mask the caller allows; NULL means all nodes are usable */
    nodemask_t *nodemask;
};

第二个数据结构 struct shrink_control 保存 shrink 操作所需的信息,由 __zone_reclaim 函数传递给两个收缩操作,达到回收页框的目的。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
/*
 * Parameters handed to the slab shrinkers (shrink_slab and the
 * per-cache scan_objects callbacks).
 */
struct shrink_control {
    /* Allocation flags of the current page-frame request */
    gfp_t gfp_mask;

    /* Number of objects scan_objects should scan and reclaim */
    unsigned long nr_to_scan;

    /* Memory nodes on which to perform the shrink operation */
    nodemask_t nodes_to_scan;

    /* Node currently being shrunk */
    int nid;
};

1.2. __zone_reclaim

__zone_reclaim 函数执行的一开始,先初始化两个所需的数据结构 struct scan_control 和 struct shrink_control ;然后先回收 zone 的页高速缓存,再回收 slab 高速缓存。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
 * Try to reclaim at least nr_pages (= 1 << order) page frames from @zone:
 * first by shrinking the zone's page cache, then by shrinking reclaimable
 * slab caches on the zone's node.  Returns nonzero when at least nr_pages
 * frames were reclaimed.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) {
    /* Minimum pages needed in order to stay on node */
    const unsigned long nr_pages = 1 << order;
    struct task_struct *p = current;
    struct reclaim_state reclaim_state;
    struct scan_control sc = {

        /* May dirty pages be written out during reclaim? */
        .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),

        /* Unmapped pages may be reclaimed only if swap-out is allowed */
        .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),

        /* Swap-out is permitted during this reclaim */
        .may_swap = 1,
        /* SWAP_CLUSTER_MAX = 32 */
        .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
        .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
        .order = order,

        /* ZONE_RECLAIM_PRIORITY = 4: each shrink_zone pass scans about
         * 1/16 of the zone's reclaimable LRU pages */
        .priority = ZONE_RECLAIM_PRIORITY,
    };
    struct shrink_control shrink = {
        .gfp_mask = sc.gfp_mask,
    };
    unsigned long nr_slab_pages0, nr_slab_pages1;

    cond_resched();
    /*
     * We need to be able to allocate from the reserves for RECLAIM_SWAP
     * and we also need to be able to write out pages for RECLAIM_WRITE
     * and RECLAIM_SWAP.
     */
    p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
    lockdep_set_current_reclaim_state(gfp_mask);
    reclaim_state.reclaimed_slab = 0;
    p->reclaim_state = &reclaim_state;

    /* Reclaimable page cache in the zone exceeds the minimum
     * unmapped-page threshold */
    if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
        /*
         * Free memory by calling shrink zone with increasing
         * priorities until we have enough memory freed.
         */
        /* A lower priority value means more page frames are scanned
         * on each pass */
        do {
            shrink_zone(zone, &sc);
        } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
    }

    nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    
    /* Reclaimable slab pages in the zone exceed the minimum slab
     * threshold */
    if (nr_slab_pages0 > zone->min_slab_pages) {
        /*
         * shrink_slab() does not currently allow us to determine how
         * many pages were freed in this zone. So we take the current
         * number of slab pages and shake the slab until it is reduced
         * by the same nr_pages that we used for reclaiming unmapped
         * pages.
         */
        nodes_clear(shrink.nodes_to_scan);
        node_set(zone_to_nid(zone), shrink.nodes_to_scan);
        for (;;) {
            unsigned long lru_pages = zone_reclaimable_pages(zone);

            /* No reclaimable slab or very low memory pressure */
            if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
                break;

            /* Freed enough memory */
            nr_slab_pages1 = zone_page_state(zone,
                            NR_SLAB_RECLAIMABLE);
            if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
                break;
        }

        /*
         * Update nr_reclaimed by the number of slab pages we
         * reclaimed from this zone.
         */
        nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        if (nr_slab_pages1 < nr_slab_pages0)
            sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
        }

    p->reclaim_state = NULL;
    current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
    lockdep_clear_current_reclaim_state();
    return sc.nr_reclaimed >= nr_pages;
}

2. shrink_zone

zone 中可回收的页高速缓存数量大于阈值时,多次调用 shrink_zone 回收页框。每次调用回收函数时扫描的页面数都会翻倍。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/*
 * Reclaim pages from @zone by walking every memory cgroup in the
 * hierarchy rooted at sc->target_mem_cgroup (the whole system when the
 * target is NULL) and shrinking each cgroup's LRU lists, repeating until
 * should_continue_reclaim says progress is no longer worthwhile.
 */
static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
    unsigned long nr_reclaimed, nr_scanned;

    do {
        struct mem_cgroup *root = sc->target_mem_cgroup;
        struct mem_cgroup_reclaim_cookie reclaim = {
            .zone = zone,
            .priority = sc->priority,
        };
        struct mem_cgroup *memcg;

        /* Snapshot the counters so per-round progress can be measured */
        nr_reclaimed = sc->nr_reclaimed;
        nr_scanned = sc->nr_scanned;

        /* Start iterating the memcg hierarchy below root */
        memcg = mem_cgroup_iter(root, NULL, &reclaim);
        do {
            struct lruvec *lruvec;

            /* LRU lists for this zone/memcg pair */
            lruvec = mem_cgroup_zone_lruvec(zone, memcg);

            sc->swappiness = mem_cgroup_swappiness(memcg);

            /* Reclaim page frames from this lruvec */
            shrink_lruvec(lruvec, sc);

            /*
             * Direct reclaim and kswapd have to scan all memory
             * cgroups to fulfill the overall scan target for the
             * zone.
             *
             * Limit reclaim, on the other hand, only cares about
             * nr_to_reclaim pages to be reclaimed and it will
             * retry with decreasing priority if one round over the
             * whole hierarchy is not sufficient.
             */
            if (!global_reclaim(sc) &&
                    sc->nr_reclaimed >= sc->nr_to_reclaim) {
                        mem_cgroup_iter_break(root, memcg);
                break;
            }
            memcg = mem_cgroup_iter(root, memcg, &reclaim);
        } while (memcg);

        /* Report this round's scan/reclaim deltas to vmpressure */
        vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
               sc->nr_scanned - nr_scanned,
               sc->nr_reclaimed - nr_reclaimed);

    } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,                           
        sc->nr_scanned - nr_scanned, sc));
}

3. shrink_slab