static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int classzone_idx, int migratetype)
{
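/* Nonzero when the caller may sleep (__GFP_WAIT is set). */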
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
bool contended_compaction = false;
/* Allocation requests of order MAX_ORDER or above fail immediately. */
if (order >= MAX_ORDER) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/*
* On NUMA systems, GFP_THISNODE = (__GFP_THISNODE |
* __GFP_NORETRY | __GFP_NOWARN): there is no fallback node, no
* memory policy applies, and a failed allocation is not retried.
* Jump straight to nopage in that case, so that one node does not
* have too much memory allocated from it.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
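/*
* Two retry targets follow: restart re-enters the slow path from the
* top (kswapd wakeup, alloc_flags recomputation), while rebalance only
* repeats the allocation attempts themselves.
*/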
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
/*
* Wake the kswapd_wait queue of every zone in the zonelist whose
* zone type is not higher than high_zoneidx.
*/
wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
/*
* Unlike the get_page_from_freelist call made from
* __alloc_pages_nodemask, the slow path derives its alloc_flags
* from the gfp mask of the allocation request.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
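/*
* In 3.x kernels the result is typically ALLOC_WMARK_MIN | ALLOC_CPUSET
* as a base, plus ALLOC_HIGH for __GFP_HIGH, ALLOC_HARDER for atomic or
* realtime callers, and ALLOC_NO_WATERMARKS for PF_MEMALLOC or
* TIF_MEMDIE contexts (recalled from the 3.16 source, not shown here).
*/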
/*
* If the original request had neither a cpuset restriction nor a
* nodemask, recompute preferred_zone with no restrictions at all.
*/
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
struct zoneref *preferred_zoneref;
preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
NULL, &preferred_zone);
classzone_idx = zonelist_zone_idx(preferred_zoneref);
}
rebalance:
/* This is the last chance, in general, before the goto nopage. */
/*
* Call get_page_from_freelist again, this time enforcing the
* watermarks (ALLOC_NO_WATERMARKS is masked out) and using the
* possibly updated preferred_zone.
*/
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, classzone_idx, migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
/*
* Memory policies are ignored here: a request that may ignore the
* watermarks is high priority and more likely to come from the
* kernel itself than on behalf of user space.
*/
zonelist = node_zonelist(numa_node_id(), gfp_mask);
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocation: the caller must not wait. */
if (!wait) {
/* Warn if a request that must not fail (__GFP_NOFAIL) is failing. */
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
}
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
/* A task killed by the OOM killer has TIF_MEMDIE set. */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
high_zoneidx, nodemask, alloc_flags,
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
/*
* It can become very expensive to allocate transparent hugepages at
* fault, so use asynchronous memory compaction for THP unless it is
* khugepaged trying to collapse.
*/
/*
* If kswapd may be woken for this request, or if the current task
* is a kernel thread, switch the migration mode to light sync:
* most operations may block, except ->writepage, which could
* stall for a long time.
*/
if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
migration_mode = MIGRATE_SYNC_LIGHT;
/*
* If compaction was deferred for this high-order allocation,
* synchronous compaction has just failed. If the caller requested
* movable memory, which would not heavily disrupt the system,
* fail right away instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
(gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* The direct-reclaim path mentioned in the mm-buddy_allocator initialization section. */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
classzone_idx, migratetype,
&did_some_progress);
if (page)
goto got_pg;
/*
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
// did_some_progress is set by the two direct_* calls above to the number of pages reclaimed
if (!did_some_progress) {
// __GFP_FS is set and __GFP_NORETRY is not
if (oom_gfp_allowed(gfp_mask)) {
/* The OOM killer has been disabled. */
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
classzone_idx, migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
* The oom killer is not called for high-order
* allocations that may fail, so if no progress
* is being made, there are no other options and
* retrying is unlikely to help.
*/
/* Allocations above this order are considered very expensive. */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
* The oom killer is not called for lowmem
* allocations to prevent needlessly killing
* innocent tasks.
*/
if (high_zoneidx < ZONE_NORMAL)
goto nopage;
}
goto restart;
}
}
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
/*
* should_alloc_retry decides from gfp_mask, order,
* did_some_progress and pages_reclaimed whether this allocation
* should be retried; before retrying, wait for a while so that
* pending writes in the preferred zone can complete.
*/
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
} else {
/*
* High-order allocations do not necessarily loop after
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
/* If no retry is warranted, attempt the allocation once more after a reclaim/compaction pass. */
page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
high_zoneidx, nodemask, alloc_flags,
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
}
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
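/* kmemcheck, when enabled, shadows newly allocated pages so that later reads of uninitialized memory can be detected. */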
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
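
To recap the control flow, here is a minimal userspace model of the decision ladder above. This is a sketch, not kernel code: the try_* and should_retry functions are hypothetical stubs standing in for the kernel helpers named in their comments, and the real function loops through the restart and rebalance labels rather than a single for (;;).

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical stubs standing in for the kernel helpers named in the
 * comments; here they all fail, so the whole ladder can be traced.
 */
static void *try_freelist(void)      { return NULL; } /* get_page_from_freelist */
static void *try_no_watermarks(void) { return NULL; } /* __alloc_pages_high_priority */
static void *try_compact(void)       { return NULL; } /* __alloc_pages_direct_compact */
static void *try_reclaim(long *n)    { *n = 0; return NULL; } /* __alloc_pages_direct_reclaim */
static void *try_oom(void)           { return NULL; } /* __alloc_pages_may_oom */
static bool should_retry(long n)     { return n > 0; } /* should_alloc_retry, simplified */

/* The ladder: each rung runs only when every rung above it has failed. */
static void *slowpath_model(bool may_wait, bool may_ignore_watermarks)
{
	long progress;
	void *page;

	for (;;) {                            /* restart / rebalance loop */
		if ((page = try_freelist()))
			return page;          /* watermark-respecting retry */
		if (may_ignore_watermarks && (page = try_no_watermarks()))
			return page;          /* high-priority attempt */
		if (!may_wait)
			return NULL;          /* atomic context: give up now */
		if ((page = try_compact()))
			return page;          /* async first, sync-light later */
		if ((page = try_reclaim(&progress)))
			return page;
		if (!progress && (page = try_oom()))
			return page;          /* OOM killer as a last resort */
		if (!should_retry(progress))
			return try_compact(); /* one final compaction pass */
	}
}

int main(void)
{
	printf("GFP_ATOMIC-like request: %p\n", slowpath_model(false, true));
	printf("GFP_KERNEL-like request: %p\n", slowpath_model(true, false));
	return 0;
}

Both calls print a null pointer, since every stub fails, but the order in which the rungs are tried mirrors the kernel function above.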