• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/debugfs.h>
3 #include <linux/mm.h>
4 #include <linux/slab.h>
5 #include <linux/uaccess.h>
6 #include <linux/memblock.h>
7 #include <linux/stacktrace.h>
8 #include <linux/page_pinner.h>
9 #include <linux/jump_label.h>
10 #include <linux/migrate.h>
11 #include <linux/stackdepot.h>
12 #include <linux/seq_file.h>
13 #include <linux/sched/clock.h>
14 
15 #include "internal.h"
16 
17 #define PAGE_PINNER_STACK_DEPTH 16
18 #define LONGTERM_PIN_BUCKETS	4096
19 
/* Per-page tracking state, stored in the page_ext area of each page. */
struct page_pinner {
	/* stackdepot handle of the call path that last pinned the page */
	depot_stack_handle_t handle;
	/* boottime timestamp (usec) when the page was last pinned */
	s64 ts_usec;
	/* number of pin events recorded against this page */
	atomic_t count;
};
25 
/*
 * Snapshot of a pinned page, copied into a ring buffer when either a
 * long-term pin or an alloc_contig/migration failure is detected.
 */
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		/* event timestamp (usec, boottime) — migration-failure records */
		s64 ts_usec;
		/* pin duration (usec) — long-term pin records */
		s64 elapsed;
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
};
40 
/* Fixed-size ring buffer of captured records; @lock guards @index and @pinner. */
struct longterm_pinner {
	spinlock_t lock;
	/* next slot to overwrite; wraps at LONGTERM_PIN_BUCKETS */
	unsigned int index;
	struct captured_pinner pinner[LONGTERM_PIN_BUCKETS];
};
46 
/* Ring of pages that stayed pinned longer than threshold_usec. */
static struct longterm_pinner lt_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(lt_pinner.lock),
};

/* Minimum pin duration (usec) before a pin is recorded; debugfs-tunable. */
static s64 threshold_usec = 300000;

/* alloc_contig failed pinner */
static struct longterm_pinner acf_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(acf_pinner.lock),
};
57 
/* Set by the "page_pinner" early param; gates page_ext space allocation. */
static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
EXPORT_SYMBOL(page_pinner_inited);

/* Defaults to on: track pages that caused contig allocation failures. */
DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL(failure_tracking);

/* Fallback handle used when stack_depot_save() fails to allocate. */
static depot_stack_handle_t failure_handle;
66 
/*
 * "page_pinner" kernel command line parameter: its mere presence enables
 * the feature; any value passed in @buf is ignored.
 */
static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);
73 
/* page_ext callback: reserve per-page storage only when enabled at boot. */
static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}
78 
/*
 * Depot-save our own call stack once at init so save_stack() has a
 * sentinel handle to fall back on when stack_depot_save() fails later.
 */
static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}
87 
/* page_ext callback: finish setup and flip the global enable static key. */
static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}
96 
/* Hooks the page_pinner storage into the generic page_ext framework. */
struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};
102 
/* Locate this feature's private data inside a page's page_ext area. */
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}
107 
/*
 * Capture and depot-save the current call stack (skipping the two
 * innermost frames, i.e. this function and its caller).
 *
 * Returns the depot handle, or the pre-registered failure_handle when
 * stack_depot_save() cannot allocate under @flags.
 */
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}
121 
/* Snapshot the volatile struct page fields into @record for later printing. */
static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	record->flags = page->flags;
	record->mapping = page_mapping(page);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = page_mapcount(page);
}
131 
/*
 * Called when a pin is released: if the page stayed pinned longer than
 * threshold_usec, record a snapshot of it in the lt_pinner ring buffer.
 */
static void check_longterm_pin(struct page_pinner *page_pinner,
			      struct page *page)
{
	s64 now, delta = 0;
	unsigned long flags;
	unsigned int idx;
	struct captured_pinner record;

	now = ktime_to_us(ktime_get_boottime());

	/* get/put_page can be raced. Ignore that case */
	if (page_pinner->ts_usec < now)
		delta = now - page_pinner->ts_usec;

	if (delta <= threshold_usec)
		return;

	/* Build the record outside the lock to keep the critical section short. */
	record.handle = page_pinner->handle;
	record.elapsed = delta;
	capture_page_state(page, &record);

	spin_lock_irqsave(&lt_pinner.lock, flags);
	idx = lt_pinner.index++;
	lt_pinner.index %= LONGTERM_PIN_BUCKETS;
	lt_pinner.pinner[idx] = record;
	spin_unlock_irqrestore(&lt_pinner.lock, flags);
}
159 
/*
 * Clear pinner state for all 2^order subpages of @page. Called when the
 * pin drops (@free == false, check for long-term pins) or the page is
 * freed (@free == true, record the free path and wipe state).
 */
void __reset_page_pinner(struct page *page, unsigned int order, bool free)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	for (i = 0; i < (1 << order); i++) {
		/* Skip subpages that were never marked pinned or migration-failed. */
		if (!test_bit(PAGE_EXT_GET, &page_ext->flags) &&
			!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
				  &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);
		if (free) {
			/* record page free call path */
			__page_pinner_migration_failed(page);
			atomic_set(&page_pinner->count, 0);
			__clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		} else {
			check_longterm_pin(page_pinner, page);
		}
		clear_bit(PAGE_EXT_GET, &page_ext->flags);
		page_ext = page_ext_next(page_ext);
	}
	/*
	 * NOTE(review): page_ext has been advanced past the last subpage
	 * here; this relies on page_ext_put() only dropping the RCU/ref
	 * state rather than using the pointer value — confirm.
	 */
	page_ext_put(page_ext);
}
190 
/*
 * Stamp all 2^order subpages with the pinning stack @handle and the
 * current boottime timestamp, and bump each subpage's pin count.
 */
static inline void __set_page_pinner_handle(struct page *page,
	struct page_ext *page_ext, depot_stack_handle_t handle,
	unsigned int order)
{
	struct page_pinner *page_pinner;
	int i;
	s64 usec = ktime_to_us(ktime_get_boottime());

	for (i = 0; i < (1 << order); i++) {
		page_pinner = get_page_pinner(page_ext);
		page_pinner->handle = handle;
		page_pinner->ts_usec = usec;
		set_bit(PAGE_EXT_GET, &page_ext->flags);
		atomic_inc(&page_pinner->count);
		page_ext = page_ext_next(page_ext);
	}
}
208 
/*
 * Record that @page (a 2^order block) has just been pinned, capturing
 * the caller's stack. Atomic-safe: uses GFP_NOWAIT for depot storage.
 */
noinline void __set_page_pinner(struct page *page, unsigned int order)
{
	struct page_ext *page_ext;
	depot_stack_handle_t handle;

	handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;
	__set_page_pinner_handle(page, page_ext, handle, order);
	page_ext_put(page_ext);
}
222 
/*
 * Format one captured record into a kernel buffer and copy it to
 * userspace. @longterm selects the header line: pin duration for
 * long-term pin records, absolute timestamp for migration-failure ones.
 *
 * Returns the number of bytes copied, -EFAULT on copy_to_user failure,
 * or -ENOMEM on allocation failure (also returned, by convention, when
 * the record does not fit in min(count, PAGE_SIZE) bytes).
 */
static ssize_t
print_page_pinner(bool longterm, char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (longterm) {
		ret = snprintf(kbuf, count, "Page pinned for %lld us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "Page pinned ts [%5lu.%06lu]\n",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}
283 
__dump_page_pinner(struct page * page)284 void __dump_page_pinner(struct page *page)
285 {
286 	struct page_ext *page_ext = page_ext_get(page);
287 	struct page_pinner *page_pinner;
288 	depot_stack_handle_t handle;
289 	unsigned long *entries;
290 	unsigned int nr_entries;
291 	int pageblock_mt;
292 	unsigned long pfn;
293 	int count;
294 	unsigned long rem_usec;
295 	u64 ts_usec;
296 
297 	if (unlikely(!page_ext)) {
298 		pr_alert("There is not page extension available.\n");
299 		return;
300 	}
301 
302 	page_pinner = get_page_pinner(page_ext);
303 
304 	count = atomic_read(&page_pinner->count);
305 	if (!count) {
306 		pr_alert("page_pinner info is not present (never set?)\n");
307 		page_ext_put(page_ext);
308 		return;
309 	}
310 
311 	pfn = page_to_pfn(page);
312 	ts_usec = page_pinner->ts_usec;
313 	rem_usec = do_div(ts_usec, 1000000);
314 	pr_alert("page last pinned %5lu.%06lu] count %d\n",
315 		 (unsigned long)ts_usec, rem_usec, count);
316 
317 	pageblock_mt = get_pageblock_migratetype(page);
318 	pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
319 			pfn,
320 			pfn >> pageblock_order,
321 			migratetype_names[pageblock_mt],
322 			page->flags, &page->flags);
323 
324 	handle = READ_ONCE(page_pinner->handle);
325 	if (!handle) {
326 		pr_alert("page_pinner allocation stack trace missing\n");
327 	} else {
328 		nr_entries = stack_depot_fetch(handle, &entries);
329 		stack_trace_print(entries, nr_entries, 0);
330 	}
331 	page_ext_put(page_ext);
332 }
333 
/*
 * Record a migration-failure event for @page into the acf_pinner ring:
 * capture the current stack, timestamp, and page state. Does nothing
 * unless the page was previously flagged PAGE_EXT_PINNER_MIGRATION_FAILED.
 */
void __page_pinner_migration_failed(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct captured_pinner record;
	unsigned long flags;
	unsigned int idx;

	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	/* Drop the page_ext reference before the (possibly slow) stack save. */
	page_ext_put(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = ktime_to_us(ktime_get_boottime());
	capture_page_state(page, &record);

	spin_lock_irqsave(&acf_pinner.lock, flags);
	idx = acf_pinner.index++;
	acf_pinner.index %= LONGTERM_PIN_BUCKETS;
	acf_pinner.pinner[idx] = record;
	spin_unlock_irqrestore(&acf_pinner.lock, flags);
}
EXPORT_SYMBOL(__page_pinner_migration_failed);
360 EXPORT_SYMBOL(__page_pinner_migration_failed);
361 
/*
 * Flag every still-referenced page on @page_list as having failed
 * migration and record the failure event for each.
 */
void __page_pinner_mark_migration_failed_pages(struct list_head *page_list)
{
	struct page *page;
	struct page_ext *page_ext;

	list_for_each_entry(page, page_list, lru) {
		/* The page will be freed by putback_movable_pages soon */
		if (page_count(page) == 1)
			continue;
		page_ext = page_ext_get(page);
		if (unlikely(!page_ext))
			continue;
		__set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		page_ext_put(page_ext);
		__page_pinner_migration_failed(page);
	}
}
379 
380 static ssize_t
read_longterm_page_pinner(struct file * file,char __user * buf,size_t count,loff_t * ppos)381 read_longterm_page_pinner(struct file *file, char __user *buf, size_t count,
382 			  loff_t *ppos)
383 {
384 	loff_t i, idx;
385 	struct captured_pinner record;
386 	unsigned long flags;
387 
388 	if (!static_branch_unlikely(&page_pinner_inited))
389 		return -EINVAL;
390 
391 	if (*ppos >= LONGTERM_PIN_BUCKETS)
392 		return 0;
393 
394 	i = *ppos;
395 	*ppos = i + 1;
396 
397 	/*
398 	 * reading the records in the reverse order with newest one
399 	 * being read first followed by older ones
400 	 */
401 	idx = (lt_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
402 	       LONGTERM_PIN_BUCKETS;
403 	spin_lock_irqsave(&lt_pinner.lock, flags);
404 	record = lt_pinner.pinner[idx];
405 	spin_unlock_irqrestore(&lt_pinner.lock, flags);
406 	if (!record.handle)
407 		return 0;
408 
409 	return print_page_pinner(true, buf, count, &record);
410 }
411 
/* debugfs file ops for "longterm_pinner" (read-only). */
static const struct file_operations proc_longterm_pinner_operations = {
	.read		= read_longterm_page_pinner,
};
415 
read_alloc_contig_failed(struct file * file,char __user * buf,size_t count,loff_t * ppos)416 static ssize_t read_alloc_contig_failed(struct file *file, char __user *buf,
417 					size_t count, loff_t *ppos)
418 {
419 	loff_t i, idx;
420 	struct captured_pinner record;
421 	unsigned long flags;
422 
423 	if (!static_branch_unlikely(&failure_tracking))
424 		return -EINVAL;
425 
426 	if (*ppos >= LONGTERM_PIN_BUCKETS)
427 		return 0;
428 
429 	i = *ppos;
430 	*ppos = i + 1;
431 
432 	/*
433 	 * reading the records in the reverse order with newest one
434 	 * being read first followed by older ones
435 	 */
436 	idx = (acf_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
437 	       LONGTERM_PIN_BUCKETS;
438 
439 	spin_lock_irqsave(&acf_pinner.lock, flags);
440 	record = acf_pinner.pinner[idx];
441 	spin_unlock_irqrestore(&acf_pinner.lock, flags);
442 	if (!record.handle)
443 		return 0;
444 
445 	return print_page_pinner(false, buf, count, &record);
446 }
447 
/* debugfs file ops for "alloc_contig_failed" (read-only). */
static const struct file_operations proc_alloc_contig_failed_operations = {
	.read		= read_alloc_contig_failed,
};
451 
/*
 * debugfs "threshold" write: set the long-term pin reporting threshold
 * (usec) and flush existing records so entries captured under the old
 * threshold do not mix with new ones.
 */
static int pp_threshold_set(void *data, unsigned long long val)
{
	unsigned long flags;

	threshold_usec = (s64)val;

	spin_lock_irqsave(&lt_pinner.lock, flags);
	memset(lt_pinner.pinner, 0,
	       sizeof(struct captured_pinner) * LONGTERM_PIN_BUCKETS);
	lt_pinner.index = 0;
	spin_unlock_irqrestore(&lt_pinner.lock, flags);
	return 0;
}
465 
/* debugfs "threshold" read: report the current threshold in usec. */
static int pp_threshold_get(void *data, unsigned long long *val)
{
	*val = (unsigned long long)threshold_usec;

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get,
			 pp_threshold_set, "%lld\n");
474 
/*
 * debugfs "failure_tracking" write: any non-zero value enables the
 * failure_tracking static branch, zero disables it.
 */
static int failure_tracking_set(void *data, u64 val)
{
	if (val)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}
486 
/* debugfs "failure_tracking" read: report whether tracking is enabled. */
static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");
495 
/*
 * Create the debugfs interface under /sys/kernel/debug/page_pinner.
 * Per kernel convention, debugfs_create_* return values are not checked.
 */
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("longterm_pinner", 0444, pp_debugfs_root, NULL,
			    &proc_longterm_pinner_operations);

	debugfs_create_file("threshold", 0644, pp_debugfs_root, NULL,
			    &pp_threshold_fops);

	debugfs_create_file("alloc_contig_failed", 0444,
			    pp_debugfs_root, NULL,
			    &proc_alloc_contig_failed_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);
	return 0;
}
late_initcall(page_pinner_init)
523