1. 背景介绍

内核的 Documentation 目录中没有关于 memory compaction 的详细介绍。本文从伙伴分配器（buddy allocator）的慢路径分配函数 __alloc_pages_direct_compact 所调用的主要函数 try_to_compact_pages 入手，介绍内存压缩的相关内容。

压缩操作用到的一个关键数据结构是 struct compact_control ，内核中的代码注释如下：

compact_control 用于记录内存压缩过程中被迁移的页，以及作为迁移目标的空闲页。 free_pfn 从 zone 的结束位置开始， migrate_pfn 从 zone 的起始位置开始。压缩过程中，可移动的内存页被移动到 zone 的末尾，当 free_pfn <= migrate_pfn （即两个扫描器相遇）时，本次内存压缩结束。


/*
 * compact_control tracks the pages being migrated and the free pages they
 * are being migrated to during memory compaction. free_pfn starts at the
 * end of the zone and migrate_pfn starts at its beginning; movable pages
 * are moved towards the end of the zone, and a compaction run completes
 * once free_pfn <= migrate_pfn.
 */
struct compact_control {
	struct list_head freepages;	/* List of free pages to migrate to */
	struct list_head migratepages;	/* List of pages being migrated */
	unsigned long nr_freepages;	/* Number of isolated free pages */
	unsigned long nr_migratepages;	/* Number of pages to migrate */
	unsigned long free_pfn;		/* isolate_freepages search base */
	unsigned long migrate_pfn;	/* isolate_migratepages search base */
	enum migrate_mode mode;		/* Async or sync migration mode */
	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
	bool finished_update_free;	/* True when the zone cached pfns are
					 * no longer being updated
					 */
	bool finished_update_migrate;	/* NOTE(review): presumably the same
					 * flag for the migrate scanner's
					 * cached pfns -- confirm
					 */

	int order;			/* order a direct compactor needs */
	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
	struct zone *zone;		/* Zone this compaction run works on */
	bool contended;			/* True if a lock was contended, or
					 * need_resched() true during async
					 * compaction
					 */
};

__alloc_pages_direct_compact 函数执行时会涉及 compaction 推迟（defer）的逻辑， struct zone 结构中与之相关的成员变量如下：


/*
 * Excerpt of struct zone: only the members related to memory compaction
 * are listed here; every other member is omitted.
 */
struct zone {
    /* Cached pfn from which the free scanner resumes */
    unsigned long compact_cached_free_pfn;
    /*
     * Cached pfns from which the migrate scanner resumes:
     * index 0 is used for async compaction, index 1 for sync compaction.
     */
    unsigned long compact_cached_migrate_pfn[2];

    /*
     * Number of compaction attempts that have been deferred so far
     * (not the number of failures).
     */
    unsigned long compact_considered;

    /* 1 << compact_defer_shift is the upper limit for compact_considered */
    unsigned int compact_defer_shift;

    /*
     * Order at which compaction failed; requests for a smaller order
     * are never deferred.
     */
    int compact_order_failed;

    /*
     * NOTE(review): not referenced by the code quoted in this article;
     * presumably requests a flush of the pageblock skip hints -- confirm.
     */
    bool compact_blockskip_flush;
};

如果当前请求的 order 小于 zone 的 compact_order_failed ，或者 compact_considered （已推迟的次数）已经达到上限 1 << compact_defer_shift ，就不会再推迟 compaction 。

2. try_to_compact_pages

内核注释对该函数的简介为：通过直接压缩（direct compaction）来满足高阶（high-order）的分配请求。


/*
 * try_to_compact_pages - direct compaction to satisfy a high-order allocation
 * @zonelist: zonelist whose zones are compacted in turn
 * @order:    order of the allocation; 0 means no compaction is attempted
 * @gfp_mask: GFP mask of the allocation; must allow both FS and IO
 * @nodemask: node mask restricting which zones of @zonelist are visited
 * @mode:     migration mode, handed on to compact_zone_order()
 * @contended: handed on to compact_zone_order() unchanged
 *
 * Returns COMPACT_SKIPPED when compaction is not allowed for this request,
 * otherwise the highest status returned by compact_zone_order() over the
 * zones that were visited.
 */
unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask,
			enum migrate_mode mode, bool *contended)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_SKIPPED;
	int alloc_flags = 0;

	/* Check if the GFP flags allow compaction */
	/*
	 * Compaction is attempted only when all of the following hold:
	 * 1. the requested order is greater than 0
	 * 2. the allocation may call into the filesystem (__GFP_FS)
	 * 3. the allocation may perform I/O (__GFP_IO)
	 */
	if (!order || !may_enter_fs || !may_perform_io)
		return rc;

	/* Account a COMPACTSTALL event in the VM compaction counters */
	count_compact_event(COMPACTSTALL);

#ifdef CONFIG_CMA
	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;
#endif
	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
								nodemask) {
		int status;

		/*
		 * compact_zone_order() creates a struct compact_control cc
		 * initialised from the arguments passed in and then calls
		 * compact_zone(zone, &cc) to do the actual compaction.
		 */
		status = compact_zone_order(zone, order, gfp_mask, mode,
						contended);
		/* Keep the highest status seen across all zones */
		rc = max(status, rc);

		/* If a normal allocation would succeed, stop compacting */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
				      alloc_flags))
			break;
	}

	return rc;
}

3. compact_zone

compact_zone_order 负责压缩指定的 zone ：它先创建一个 struct compact_control 结构体并按如下方式初始化各个成员，然后调用 compact_zone 执行具体的压缩操作：


/*
 * Control block built by compact_zone_order() for a single zone.
 * order, zone and mode are taken from the caller's arguments and
 * migratetype is derived from gfp_mask; members not named here are
 * zero-initialised by the designated initializer.
 */
struct compact_control cc = {
    .nr_freepages = 0,
    .nr_migratepages = 0,
    .order = order,
    .migratetype = allocflags_to_migratetype(gfp_mask),
    .zone = zone,
    .mode = mode,
};

/*
 * compact_zone - run one compaction pass over @zone as described by @cc.
 *
 * Returns early with the verdict of compaction_suitable() when that is
 * COMPACT_PARTIAL or COMPACT_SKIPPED. Otherwise it keeps isolating and
 * migrating pages for as long as compact_finished() reports
 * COMPACT_CONTINUE and returns the final status. COMPACT_PARTIAL is
 * returned when isolation is aborted, or when migration fails with
 * -ENOMEM while the two scanners have not met yet.
 */
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	/* Used below as index (0 = async, 1 = sync) into the cached pfns */
	const bool sync = cc->mode != MIGRATE_ASYNC;

	/*
	 * Check whether the zone is suitable for compaction:
	 * 1. based on the order and the watermarks
	 * 2. based on the fragmentation index
	 */
	ret = compaction_suitable(zone, cc->order);
	switch (ret) {
	case COMPACT_PARTIAL:
	case COMPACT_SKIPPED:
		/* Compaction is likely to fail */
		return ret;
	case COMPACT_CONTINUE:
		/* Fall through to compaction */
		;
	}

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred. kswapd does not do
	 * this reset as it'll reset the cached information when going to sleep.
	 */
	/*
	 * compaction_restarting() decides whether compaction should be
	 * restarted:
	 * 1. cc->order < zone->compact_order_failed: deferral does not
	 *    apply, returns false
	 * 2. zone->compact_defer_shift is at its maximum and
	 *    compact_considered has reached its limit: returns true
	 *
	 * __reset_isolation_suitable() resets the compaction-related members
	 * of the zone and clears the PB_migrate_skip flag of every pageblock
	 * in the zone.
	 */
	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Used cached
	 * information on where the scanners should start but check that it
	 * is initialised by ensuring the values are within zone boundaries.
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
	cc->free_pfn = zone->compact_cached_free_pfn;
	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
		/*
		 * Restart the free scanner at end_pfn with the low bits
		 * masked off (assumes pageblock_nr_pages is a power of
		 * two -- TODO confirm).
		 */
		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
		zone->compact_cached_free_pfn = cc->free_pfn;
	}
	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
		/* Restart the migrate scanner at the start of the zone */
		cc->migrate_pfn = start_pfn;
		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
	}

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

	migrate_prep_local();

	/*
	 * According to the accompanying article (compact_finished() itself
	 * is not shown here), it returns COMPACT_CONTINUE when:
	 * 1. cc->order == -1, i.e. compaction was triggered through
	 *    /proc/sys/vm/compact_memory
	 * 2. the watermark is not met yet
	 * 3. cc->free_pfn > cc->migrate_pfn and the zone has no free_area
	 *    of an order large enough for the request
	 */
	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
		int err;

		/*
		 * Isolate the pages to migrate, i.e. remove them from the
		 * lruvec and add them to cc->migratepages.
		 */
		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			/* Give the isolated pages back before bailing out */
			ret = COMPACT_PARTIAL;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			continue;
		case ISOLATE_SUCCESS:
			;
		}

		if (!cc->nr_migratepages)
			continue;

		/*
		 * Migrate the isolated pages; compaction_alloc() and
		 * compaction_free() are the callbacks that hand out and take
		 * back pages of cc->freepages.
		 */
		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
				ret = COMPACT_PARTIAL;
				goto out;
			}
		}
	}

out:
	/* Release free pages and check accounting */
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);

	trace_mm_compaction_end(ret);

	return ret;
}

3.1. compaction_alloc

compaction_alloc 是传给 migrate_pages 的分配回调，它从 compact_control 的空闲页链表（ cc->freepages ）中取出一个内存页，作为页面迁移的目标页。

如果空闲页链表为空，并且没有设置 cc->contended ，则先调用 isolate_freepages 来补充空闲页。

3.2. compaction_free

compaction_free 将传入的 page 添加到 cc->freepages 链表中。

mm-buddy_allocator_memory_compaction

Contents

1. 背景介绍

2. try_to_compact_pages

3. compact_zone

3.1. compaction_alloc

3.2. compaction_free