// SPDX-License-Identifier: GPL-2.0
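/*
 * page_pinner: track who pins pages and for how long.
 *
 * Pins that outlive a configurable threshold, and pages whose pinning
 * made migration (e.g. alloc_contig_range()) fail, are captured with
 * their stack traces into ring buffers readable via debugfs.
 */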
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>

#include "internal.h"

#define PAGE_PINNER_STACK_DEPTH 16
#define LONGTERM_PIN_BUCKETS	4096

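/*
 * Per-page tracking state, stored in the page's page_ext: the stack
 * that took the pin, the pin timestamp (boottime, usec) and a count of
 * outstanding pins.
 */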
struct page_pinner {
	depot_stack_handle_t handle;
	s64 ts_usec;
	atomic_t count;
};

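/*
 * A snapshot of a tracked page, copied out of page_ext so it can be
 * reported later: ts_usec holds the capture time for migration-failure
 * records, elapsed the pin duration for longterm-pin records.
 */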
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		s64 ts_usec;
		s64 elapsed;
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
};

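/*
 * Ring buffer of the most recent records; once full, the newest entry
 * overwrites the oldest.
 */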
struct longterm_pinner {
	spinlock_t lock;
	unsigned int index;
	struct captured_pinner pinner[LONGTERM_PIN_BUCKETS];
};

static struct longterm_pinner lt_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(lt_pinner.lock),
};

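/* report pins outliving this threshold (default 300 ms); tunable via debugfs */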
static s64 threshold_usec = 300000;

/* alloc_contig failed pinner */
static struct longterm_pinner acf_pinner = {
	.lock = __SPIN_LOCK_UNLOCKED(acf_pinner.lock),
};

static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
EXPORT_SYMBOL(page_pinner_inited);

DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL(failure_tracking);

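/* fallback stack recorded at init, used when stack depot allocation fails */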
static depot_stack_handle_t failure_handle;

static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);

static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}

static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}

struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};

static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}

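/*
 * Save the current call stack to the stack depot. The skip count of 2
 * drops the page_pinner internals from the trace; if the depot cannot
 * allocate, fall back to the pre-recorded failure_handle.
 */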
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

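/* snapshot the volatile struct page fields for later reporting */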
static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	record->flags = page->flags;
	record->mapping = page_mapping(page);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = page_mapcount(page);
}

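/*
 * Called when a pin is released: if the page stayed pinned for longer
 * than threshold_usec, push a record into the lt_pinner ring buffer.
 */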
static void check_longterm_pin(struct page_pinner *page_pinner,
			       struct page *page)
{
	s64 now, delta = 0;
	unsigned long flags;
	unsigned int idx;
	struct captured_pinner record;

	now = ktime_to_us(ktime_get_boottime());

	/* get_page/put_page can race; ignore that case */
	if (page_pinner->ts_usec < now)
		delta = now - page_pinner->ts_usec;

	if (delta <= threshold_usec)
		return;

	record.handle = page_pinner->handle;
	record.elapsed = delta;
	capture_page_state(page, &record);

	spin_lock_irqsave(&lt_pinner.lock, flags);
	idx = lt_pinner.index++;
	lt_pinner.index %= LONGTERM_PIN_BUCKETS;
	lt_pinner.pinner[idx] = record;
	spin_unlock_irqrestore(&lt_pinner.lock, flags);
}

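/*
 * Called when a tracked page is unpinned or freed. On free, record the
 * free call path for the migration-failure log and clear the state; on
 * a plain unpin, check whether the pin exceeded the longterm threshold.
 */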
void __reset_page_pinner(struct page *page, unsigned int order, bool free)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	for (i = 0; i < (1 << order); i++, page_ext = page_ext_next(page_ext)) {
		if (!test_bit(PAGE_EXT_GET, &page_ext->flags) &&
		    !test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
			      &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);
		if (free) {
			/* record the page free call path */
			__page_pinner_migration_failed(page);
			atomic_set(&page_pinner->count, 0);
			__clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
				    &page_ext->flags);
		} else {
			check_longterm_pin(page_pinner, page);
		}
		clear_bit(PAGE_EXT_GET, &page_ext->flags);
	}
	page_ext_put(page_ext);
}

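/* stamp every sub-page of the allocation with the pinner stack and timestamp */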
static inline void __set_page_pinner_handle(struct page *page,
		struct page_ext *page_ext, depot_stack_handle_t handle,
		unsigned int order)
{
	struct page_pinner *page_pinner;
	int i;
	s64 usec = ktime_to_us(ktime_get_boottime());

	for (i = 0; i < (1 << order); i++) {
		page_pinner = get_page_pinner(page_ext);
		page_pinner->handle = handle;
		page_pinner->ts_usec = usec;
		set_bit(PAGE_EXT_GET, &page_ext->flags);
		atomic_inc(&page_pinner->count);
		page_ext = page_ext_next(page_ext);
	}
}

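/* record the current call stack as the pinner of @page */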
noinline void __set_page_pinner(struct page *page, unsigned int order)
{
	struct page_ext *page_ext;
	depot_stack_handle_t handle;

	handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;
	__set_page_pinner_handle(page, page_ext, handle, order);
	page_ext_put(page_ext);
}

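/*
 * Format one captured record for a debugfs read: a header line (elapsed
 * pin time for longterm records, pin timestamp otherwise), the captured
 * page state and the saved stack trace.
 */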
static ssize_t
print_page_pinner(bool longterm, char __user *buf, size_t count,
		  struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (longterm) {
		ret = snprintf(kbuf, count, "Page pinned for %lld us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "Page pinned ts [%5lu.%06lu]\n",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

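/* dump the pinner state of @page to the kernel log */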
void __dump_page_pinner(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_pinner *page_pinner;
	depot_stack_handle_t handle;
	unsigned long *entries;
	unsigned int nr_entries;
	int pageblock_mt;
	unsigned long pfn;
	int count;
	unsigned long rem_usec;
	u64 ts_usec;

	if (unlikely(!page_ext)) {
		pr_alert("There is no page extension available.\n");
		return;
	}

	page_pinner = get_page_pinner(page_ext);

	count = atomic_read(&page_pinner->count);
	if (!count) {
		pr_alert("page_pinner info is not present (never set?)\n");
		page_ext_put(page_ext);
		return;
	}

	pfn = page_to_pfn(page);
	ts_usec = page_pinner->ts_usec;
	rem_usec = do_div(ts_usec, 1000000);
	pr_alert("page last pinned [%5lu.%06lu] count %d\n",
		 (unsigned long)ts_usec, rem_usec, count);

	pageblock_mt = get_pageblock_migratetype(page);
	pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
		 pfn,
		 pfn >> pageblock_order,
		 migratetype_names[pageblock_mt],
		 page->flags, &page->flags);

	handle = READ_ONCE(page_pinner->handle);
	if (!handle) {
		pr_alert("page_pinner allocation stack trace missing\n");
	} else {
		nr_entries = stack_depot_fetch(handle, &entries);
		stack_trace_print(entries, nr_entries, 0);
	}
	page_ext_put(page_ext);
}

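/*
 * Record the current call stack for a page that failed to migrate (or,
 * via __reset_page_pinner(), the path that finally freed it) into the
 * acf_pinner ring buffer.
 */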
void __page_pinner_migration_failed(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct captured_pinner record;
	unsigned long flags;
	unsigned int idx;

	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	page_ext_put(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = ktime_to_us(ktime_get_boottime());
	capture_page_state(page, &record);

	spin_lock_irqsave(&acf_pinner.lock, flags);
	idx = acf_pinner.index++;
	acf_pinner.index %= LONGTERM_PIN_BUCKETS;
	acf_pinner.pinner[idx] = record;
	spin_unlock_irqrestore(&acf_pinner.lock, flags);
}
EXPORT_SYMBOL(__page_pinner_migration_failed);

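/*
 * Tag every page still pinned on a migration failure list so that later
 * frees and dumps report it, and capture the failure stack right away.
 */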
void __page_pinner_mark_migration_failed_pages(struct list_head *page_list)
{
	struct page *page;
	struct page_ext *page_ext;

	list_for_each_entry(page, page_list, lru) {
		/* The page will be freed by putback_movable_pages soon */
		if (page_count(page) == 1)
			continue;
		page_ext = page_ext_get(page);
		if (unlikely(!page_ext))
			continue;
		__set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		page_ext_put(page_ext);
		__page_pinner_migration_failed(page);
	}
}

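/*
 * debugfs read: each read() call returns one record, newest first;
 * *ppos indexes backwards through the ring buffer.
 */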
static ssize_t
read_longterm_page_pinner(struct file *file, char __user *buf, size_t count,
			  loff_t *ppos)
{
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&page_pinner_inited))
		return -EINVAL;

	if (*ppos >= LONGTERM_PIN_BUCKETS)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * Read the records in reverse order, newest first followed by
	 * older ones.
	 */
	idx = (lt_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
	      LONGTERM_PIN_BUCKETS;
	spin_lock_irqsave(&lt_pinner.lock, flags);
	record = lt_pinner.pinner[idx];
	spin_unlock_irqrestore(&lt_pinner.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(true, buf, count, &record);
}

static const struct file_operations proc_longterm_pinner_operations = {
	.read = read_longterm_page_pinner,
};

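/* debugfs read for the alloc_contig (migration) failure records, newest first */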
static ssize_t read_alloc_contig_failed(struct file *file, char __user *buf,
					size_t count, loff_t *ppos)
{
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= LONGTERM_PIN_BUCKETS)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * Read the records in reverse order, newest first followed by
	 * older ones.
	 */
	idx = (acf_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
	      LONGTERM_PIN_BUCKETS;

	spin_lock_irqsave(&acf_pinner.lock, flags);
	record = acf_pinner.pinner[idx];
	spin_unlock_irqrestore(&acf_pinner.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(false, buf, count, &record);
}

static const struct file_operations proc_alloc_contig_failed_operations = {
	.read = read_alloc_contig_failed,
};

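/*
 * Changing the threshold invalidates the existing records, so writing
 * it also flushes the longterm ring buffer.
 */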
static int pp_threshold_set(void *data, unsigned long long val)
{
	unsigned long flags;

	threshold_usec = (s64)val;

	spin_lock_irqsave(&lt_pinner.lock, flags);
	memset(lt_pinner.pinner, 0,
	       sizeof(struct captured_pinner) * LONGTERM_PIN_BUCKETS);
	lt_pinner.index = 0;
	spin_unlock_irqrestore(&lt_pinner.lock, flags);
	return 0;
}

static int pp_threshold_get(void *data, unsigned long long *val)
{
	*val = (unsigned long long)threshold_usec;

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get,
			 pp_threshold_set, "%lld\n");

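/* runtime on/off switch for migration-failure tracking */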
static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");

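/* create the debugfs interface under /sys/kernel/debug/page_pinner */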
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("longterm_pinner", 0444, pp_debugfs_root, NULL,
			    &proc_longterm_pinner_operations);

	debugfs_create_file("threshold", 0644, pp_debugfs_root, NULL,
			    &pp_threshold_fops);

	debugfs_create_file("alloc_contig_failed", 0444,
			    pp_debugfs_root, NULL,
			    &proc_alloc_contig_failed_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);
	return 0;
}
late_initcall(page_pinner_init)