• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/mm.h>
3 #include <linux/mmzone.h>
4 #include <linux/memblock.h>
5 #include <linux/page_ext.h>
6 #include <linux/memory.h>
7 #include <linux/vmalloc.h>
8 #include <linux/kmemleak.h>
9 #include <linux/page_owner.h>
10 #include <linux/page_pinner.h>
11 #include <linux/page_idle.h>
12 #include <linux/rcupdate.h>
/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, we had to modify struct page itself to store extra data per page.
 * This requires rebuilding the kernel, which is a really time-consuming
 * process. And sometimes a rebuild is impossible due to third-party module
 * dependencies. Finally, enlarging struct page could cause unwanted changes
 * in system behaviour.
 *
 * This feature is intended to overcome the above-mentioned problems. It
 * allocates memory for extended data per page in a separate place rather than
 * in struct page itself. This memory can be accessed by the accessor
 * functions provided by this code. During the boot process, it checks whether
 * allocation of a huge chunk of memory is needed or not. If not, it avoids
 * allocating memory at all. With this advantage, we can include this feature
 * in the kernel by default and can avoid rebuilds and solve related problems.
 *
 * To help these things work well, there are two callbacks for clients. One
 * is the need callback, which is mandatory if the user wants to avoid useless
 * memory allocation at boot time. The other is the optional init callback,
 * which is used to do proper initialization after memory is allocated.
 *
 * The need callback is used to decide whether extended memory allocation is
 * needed or not. Sometimes users want to deactivate some features in this
 * boot and extra memory would be unnecessary. In this case, to avoid
 * allocating a huge chunk of memory, each client represents its need for
 * extra memory through the need callback. If one of the need callbacks
 * returns true, it means that someone needs extra memory, so the
 * page extension core should allocate memory for page extension. If
 * none of the need callbacks returns true, memory isn't needed at all in this
 * boot and the page extension core can skip allocating memory. As a result,
 * no memory is wasted.
 *
 * When a need callback returns true, page_ext checks if there is a request for
 * extra memory through size in struct page_ext_operations. If it is non-zero,
 * extra space is allocated for each page_ext entry and the offset is returned
 * to the user through offset in struct page_ext_operations.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely initialized. In a sparse memory system, extra memory is
 * allocated some time later than memmap is allocated. In other words, the
 * lifetime of memory for page extension isn't the same as that of memmap for
 * struct page. Therefore, clients can't store extra data until page extension
 * is initialized, even if pages are allocated and used freely. This could
 * leave the per-page extra data in an inadequate state, so, to prevent it,
 * a client can utilize this callback to initialize its state correctly.
 */
61 
#ifdef CONFIG_SPARSEMEM
/*
 * Tag added to a section's page_ext pointer by __invalidate_page_ext() to
 * mark it as invalid. page_ext_invalid() tests for it, and lookup_page_ext()
 * returns NULL for a pointer that carries it.
 */
#define PAGE_EXT_INVALID       (0x1)
#endif
65 
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
/* Need callback for the page idle client: it always requests page_ext. */
static bool need_page_idle(void)
{
	return true;
}
/* Page idle client descriptor; only the need callback is set here. */
struct page_ext_operations page_idle_ops = {
	.need = need_page_idle,
};
#endif
75 
/*
 * Table of page_ext clients. Each entry is compiled in only when its
 * config option is enabled. The table is walked by invoke_need_callbacks()
 * and invoke_init_callbacks().
 */
static struct page_ext_operations *page_ext_ops[] = {
#ifdef CONFIG_PAGE_OWNER
	&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_PINNER
	&page_pinner_ops,
#endif
};

/*
 * Size in bytes of one page_ext entry. Starts as sizeof(struct page_ext)
 * and is grown by invoke_need_callbacks() for every client that needs
 * extra space.
 */
unsigned long page_ext_size = sizeof(struct page_ext);

/* Running total of bytes allocated for page_ext tables (reported at init). */
static unsigned long total_usage;
91 
invoke_need_callbacks(void)92 static bool __init invoke_need_callbacks(void)
93 {
94 	int i;
95 	int entries = ARRAY_SIZE(page_ext_ops);
96 	bool need = false;
97 
98 	for (i = 0; i < entries; i++) {
99 		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
100 			page_ext_ops[i]->offset = page_ext_size;
101 			page_ext_size += page_ext_ops[i]->size;
102 			need = true;
103 		}
104 	}
105 
106 	return need;
107 }
108 
invoke_init_callbacks(void)109 static void __init invoke_init_callbacks(void)
110 {
111 	int i;
112 	int entries = ARRAY_SIZE(page_ext_ops);
113 
114 	for (i = 0; i < entries; i++) {
115 		if (page_ext_ops[i]->init)
116 			page_ext_ops[i]->init();
117 	}
118 }
119 
#ifndef CONFIG_SPARSEMEM
/*
 * Non-sparsemem only: run the clients' init callbacks. This is a separate
 * entry point from page_ext_init_flatmem(), which only allocates the tables.
 */
void __init page_ext_init_flatmem_late(void)
{
	invoke_init_callbacks();
}
#endif
126 
get_entry(void * base,unsigned long index)127 static inline struct page_ext *get_entry(void *base, unsigned long index)
128 {
129 	return base + page_ext_size * index;
130 }
131 
132 /**
133  * page_ext_get() - Get the extended information for a page.
134  * @page: The page we're interested in.
135  *
136  * Ensures that the page_ext will remain valid until page_ext_put()
137  * is called.
138  *
139  * Return: NULL if no page_ext exists for this page.
140  * Context: Any context.  Caller may not sleep until they have called
141  * page_ext_put().
142  */
page_ext_get(struct page * page)143 struct page_ext *page_ext_get(struct page *page)
144 {
145 	struct page_ext *page_ext;
146 
147 	rcu_read_lock();
148 	page_ext = lookup_page_ext(page);
149 	if (!page_ext) {
150 		rcu_read_unlock();
151 		return NULL;
152 	}
153 
154 	return page_ext;
155 }
156 
/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * Drops the RCU read lock taken by a successful page_ext_get(). A NULL
 * @page_ext means page_ext_get() already dropped the lock, so nothing is
 * done. The page extended information may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context with corresponding page_ext_get() is called.
 */
void page_ext_put(struct page_ext *page_ext)
{
	if (likely(page_ext))
		rcu_read_unlock();
}
174 
175 #if !defined(CONFIG_SPARSEMEM)
176 
/*
 * Non-sparsemem variant: start the node with no page_ext table.
 * lookup_page_ext() returns NULL while node_page_ext is NULL.
 */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
	pgdat->node_page_ext = NULL;
}
181 
lookup_page_ext(const struct page * page)182 struct page_ext *lookup_page_ext(const struct page *page)
183 {
184 	unsigned long pfn = page_to_pfn(page);
185 	unsigned long index;
186 	struct page_ext *base;
187 
188 	WARN_ON_ONCE(!rcu_read_lock_held());
189 	base = NODE_DATA(page_to_nid(page))->node_page_ext;
190 	/*
191 	 * The sanity checks the page allocator does upon freeing a
192 	 * page can reach here before the page_ext arrays are
193 	 * allocated when feeding a range of pages to the allocator
194 	 * for the first time during bootup or memory hotplug.
195 	 */
196 	if (unlikely(!base))
197 		return NULL;
198 	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
199 					MAX_ORDER_NR_PAGES);
200 	return get_entry(base, index);
201 }
202 EXPORT_SYMBOL_NS_GPL(lookup_page_ext, MINIDUMP);
203 
alloc_node_page_ext(int nid)204 static int __init alloc_node_page_ext(int nid)
205 {
206 	struct page_ext *base;
207 	unsigned long table_size;
208 	unsigned long nr_pages;
209 
210 	nr_pages = NODE_DATA(nid)->node_spanned_pages;
211 	if (!nr_pages)
212 		return 0;
213 
214 	/*
215 	 * Need extra space if node range is not aligned with
216 	 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
217 	 * checks buddy's status, range could be out of exact node range.
218 	 */
219 	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
220 		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
221 		nr_pages += MAX_ORDER_NR_PAGES;
222 
223 	table_size = page_ext_size * nr_pages;
224 
225 	base = memblock_alloc_try_nid(
226 			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
227 			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
228 	if (!base)
229 		return -ENOMEM;
230 	NODE_DATA(nid)->node_page_ext = base;
231 	total_usage += table_size;
232 	return 0;
233 }
234 
page_ext_init_flatmem(void)235 void __init page_ext_init_flatmem(void)
236 {
237 
238 	int nid, fail;
239 
240 	if (!invoke_need_callbacks())
241 		return;
242 
243 	for_each_online_node(nid)  {
244 		fail = alloc_node_page_ext(nid);
245 		if (fail)
246 			goto fail;
247 	}
248 	pr_info("allocated %ld bytes of page_ext\n", total_usage);
249 	return;
250 
251 fail:
252 	pr_crit("allocation of page_ext failed.\n");
253 	panic("Out of memory");
254 }
255 
#else /* CONFIG_SPARSEMEM */
page_ext_invalid(struct page_ext * page_ext)257 static bool page_ext_invalid(struct page_ext *page_ext)
258 {
259 	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
260 }
261 
lookup_page_ext(const struct page * page)262 struct page_ext *lookup_page_ext(const struct page *page)
263 {
264 	unsigned long pfn = page_to_pfn(page);
265 	struct mem_section *section = __pfn_to_section(pfn);
266 	struct page_ext *page_ext = READ_ONCE(section->page_ext);
267 
268 	WARN_ON_ONCE(!rcu_read_lock_held());
269 	/*
270 	 * The sanity checks the page allocator does upon freeing a
271 	 * page can reach here before the page_ext arrays are
272 	 * allocated when feeding a range of pages to the allocator
273 	 * for the first time during bootup or memory hotplug.
274 	 */
275 	if (page_ext_invalid(page_ext))
276 		return NULL;
277 	return get_entry(page_ext, pfn);
278 }
279 EXPORT_SYMBOL_NS_GPL(lookup_page_ext, MINIDUMP);
280 
alloc_page_ext(size_t size,int nid)281 static void *__meminit alloc_page_ext(size_t size, int nid)
282 {
283 	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
284 	void *addr = NULL;
285 
286 	addr = alloc_pages_exact_nid(nid, size, flags);
287 	if (addr) {
288 		kmemleak_alloc(addr, size, 1, flags);
289 		return addr;
290 	}
291 
292 	addr = vzalloc_node(size, nid);
293 
294 	return addr;
295 }
296 
init_section_page_ext(unsigned long pfn,int nid)297 static int __meminit init_section_page_ext(unsigned long pfn, int nid)
298 {
299 	struct mem_section *section;
300 	struct page_ext *base;
301 	unsigned long table_size;
302 
303 	section = __pfn_to_section(pfn);
304 
305 	if (section->page_ext)
306 		return 0;
307 
308 	table_size = page_ext_size * PAGES_PER_SECTION;
309 	base = alloc_page_ext(table_size, nid);
310 
311 	/*
312 	 * The value stored in section->page_ext is (base - pfn)
313 	 * and it does not point to the memory block allocated above,
314 	 * causing kmemleak false positives.
315 	 */
316 	kmemleak_not_leak(base);
317 
318 	if (!base) {
319 		pr_err("page ext allocation failure\n");
320 		return -ENOMEM;
321 	}
322 
323 	/*
324 	 * The passed "pfn" may not be aligned to SECTION.  For the calculation
325 	 * we need to apply a mask.
326 	 */
327 	pfn &= PAGE_SECTION_MASK;
328 	section->page_ext = (void *)base - page_ext_size * pfn;
329 	total_usage += table_size;
330 	return 0;
331 }
332 
free_page_ext(void * addr)333 static void free_page_ext(void *addr)
334 {
335 	if (is_vmalloc_addr(addr)) {
336 		vfree(addr);
337 	} else {
338 		struct page *page = virt_to_page(addr);
339 		size_t table_size;
340 
341 		table_size = page_ext_size * PAGES_PER_SECTION;
342 
343 		BUG_ON(PageReserved(page));
344 		kmemleak_free(addr);
345 		free_pages_exact(addr, table_size);
346 	}
347 }
348 
__free_page_ext(unsigned long pfn)349 static void __free_page_ext(unsigned long pfn)
350 {
351 	struct mem_section *ms;
352 	struct page_ext *base;
353 
354 	ms = __pfn_to_section(pfn);
355 	if (!ms || !ms->page_ext)
356 		return;
357 
358 	base = READ_ONCE(ms->page_ext);
359 	/*
360 	 * page_ext here can be valid while doing the roll back
361 	 * operation in online_page_ext().
362 	 */
363 	if (page_ext_invalid(base))
364 		base = (void *)base - PAGE_EXT_INVALID;
365 	WRITE_ONCE(ms->page_ext, NULL);
366 
367 	base = get_entry(base, pfn);
368 	free_page_ext(base);
369 }
370 
__invalidate_page_ext(unsigned long pfn)371 static void __invalidate_page_ext(unsigned long pfn)
372 {
373 	struct mem_section *ms;
374 	void *val;
375 
376 	ms = __pfn_to_section(pfn);
377 	if (!ms || !ms->page_ext)
378 		return;
379 	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
380 	WRITE_ONCE(ms->page_ext, val);
381 }
382 
online_page_ext(unsigned long start_pfn,unsigned long nr_pages,int nid)383 static int __meminit online_page_ext(unsigned long start_pfn,
384 				unsigned long nr_pages,
385 				int nid)
386 {
387 	unsigned long start, end, pfn;
388 	int fail = 0;
389 
390 	start = SECTION_ALIGN_DOWN(start_pfn);
391 	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
392 
393 	if (nid == NUMA_NO_NODE) {
394 		/*
395 		 * In this case, "nid" already exists and contains valid memory.
396 		 * "start_pfn" passed to us is a pfn which is an arg for
397 		 * online__pages(), and start_pfn should exist.
398 		 */
399 		nid = pfn_to_nid(start_pfn);
400 		VM_BUG_ON(!node_state(nid, N_ONLINE));
401 	}
402 
403 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
404 		fail = init_section_page_ext(pfn, nid);
405 	if (!fail)
406 		return 0;
407 
408 	/* rollback */
409 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
410 		__free_page_ext(pfn);
411 
412 	return -ENOMEM;
413 }
414 
/*
 * Tear down page_ext for every section overlapping
 * [@start_pfn, @start_pfn + @nr_pages). @nid is not used here.
 *
 * Return: always 0.
 */
static int __meminit offline_page_ext(unsigned long start_pfn,
				unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	/*
	 * Freeing of page_ext is done in 3 steps to avoid
	 * use-after-free of it:
	 * 1) Traverse all the sections and mark their page_ext
	 *    as invalid.
	 * 2) Wait for all the existing users of page_ext who
	 *    started before invalidation to finish.
	 * 3) Free the page_ext.
	 */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__invalidate_page_ext(pfn);

	/* Readers hold the RCU read lock between page_ext_get() and _put(). */
	synchronize_rcu();

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;

}
442 
page_ext_callback(struct notifier_block * self,unsigned long action,void * arg)443 static int __meminit page_ext_callback(struct notifier_block *self,
444 			       unsigned long action, void *arg)
445 {
446 	struct memory_notify *mn = arg;
447 	int ret = 0;
448 
449 	switch (action) {
450 	case MEM_GOING_ONLINE:
451 		ret = online_page_ext(mn->start_pfn,
452 				   mn->nr_pages, mn->status_change_nid);
453 		break;
454 	case MEM_OFFLINE:
455 		offline_page_ext(mn->start_pfn,
456 				mn->nr_pages, mn->status_change_nid);
457 		break;
458 	case MEM_CANCEL_ONLINE:
459 		offline_page_ext(mn->start_pfn,
460 				mn->nr_pages, mn->status_change_nid);
461 		break;
462 	case MEM_GOING_OFFLINE:
463 		break;
464 	case MEM_ONLINE:
465 	case MEM_CANCEL_OFFLINE:
466 		break;
467 	}
468 
469 	return notifier_from_errno(ret);
470 }
471 
page_ext_init(void)472 void __init page_ext_init(void)
473 {
474 	unsigned long pfn;
475 	int nid;
476 
477 	if (!invoke_need_callbacks())
478 		return;
479 
480 	for_each_node_state(nid, N_MEMORY) {
481 		unsigned long start_pfn, end_pfn;
482 
483 		start_pfn = node_start_pfn(nid);
484 		end_pfn = node_end_pfn(nid);
485 		/*
486 		 * start_pfn and end_pfn may not be aligned to SECTION and the
487 		 * page->flags of out of node pages are not initialized.  So we
488 		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
489 		 */
490 		for (pfn = start_pfn; pfn < end_pfn;
491 			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
492 
493 			if (!pfn_valid(pfn))
494 				continue;
495 			/*
496 			 * Nodes's pfns can be overlapping.
497 			 * We know some arch can have a nodes layout such as
498 			 * -------------pfn-------------->
499 			 * N0 | N1 | N2 | N0 | N1 | N2|....
500 			 */
501 			if (pfn_to_nid(pfn) != nid)
502 				continue;
503 			if (init_section_page_ext(pfn, nid))
504 				goto oom;
505 			cond_resched();
506 		}
507 	}
508 	hotplug_memory_notifier(page_ext_callback, 0);
509 	pr_info("allocated %ld bytes of page_ext\n", total_usage);
510 	invoke_init_callbacks();
511 	return;
512 
513 oom:
514 	panic("Out of memory");
515 }
516 
/*
 * Sparsemem variant: intentionally empty. In this configuration page_ext
 * is reached through each mem_section (see lookup_page_ext()), so there is
 * no per-node field to initialize here.
 */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}
520 
521 #endif
522