// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>
#include <linux/page_ref.h>

#include "internal.h"

#define PAGE_PINNER_STACK_DEPTH 16
static unsigned long pp_buf_size = 4096;

/* Per-page tracking state kept in page_ext. */
struct page_pinner {
	depot_stack_handle_t handle;
	u64 ts_usec;
	atomic_t count;
};

/* Kind of event a captured record describes. */
enum pp_state {
	PP_PUT,
	PP_FREE,
	PP_FAIL_DETECTED,
};

/* Snapshot of one event, copied into the global ring buffer. */
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		u64 ts_usec;	/* PP_FREE / PP_FAIL_DETECTED: event time */
		u64 elapsed;	/* PP_PUT: how long the page stayed pinned */
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
	enum pp_state state;
};

struct page_pinner_buffer {
	spinlock_t lock;
	unsigned long index;
	struct captured_pinner *buffer;
};

/* Ring buffer of records for pages that made alloc_contig/migration fail. */
static struct page_pinner_buffer pp_buffer;

static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
EXPORT_SYMBOL_GPL(page_pinner_inited);

DEFINE_STATIC_KEY_TRUE(failure_tracking);

static depot_stack_handle_t failure_handle;

static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);
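
/*
 * Usage sketch (assuming standard early_param() semantics): passing
 * "page_pinner" on the kernel command line, e.g.
 *
 *	page_pinner=on
 *
 * sets page_pinner_enabled before page_ext is initialized. The value after
 * '=' is currently ignored by the handler above; the parameter's presence
 * alone enables the feature.
 */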

static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}

static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer),
					  GFP_KERNEL);
	if (!pp_buffer.buffer) {
		pr_info("page_pinner disabled due to buffer allocation failure\n");
		return;
	}

	spin_lock_init(&pp_buffer.lock);
	pp_buffer.index = 0;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}

struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};
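
/*
 * Sketch of how the hooks above are expected to be used (an assumption based
 * on the generic page_ext machinery, not something this file enforces): the
 * page_ext core calls ->need() during boot to decide whether to reserve
 * ->size extra bytes per page, fills in page_pinner_ops.offset, and then
 * calls ->init() once the per-page storage is available.
 */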

static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}

static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	struct folio *folio = page_folio(page);

	record->flags = page->flags;
	record->mapping = folio_mapping(folio);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = folio_mapcount(folio);
}

static void add_record(struct page_pinner_buffer *pp_buf,
		       struct captured_pinner *record)
{
	unsigned long flags;
	unsigned int idx;

	spin_lock_irqsave(&pp_buf->lock, flags);
	idx = pp_buf->index++;
	pp_buf->index %= pp_buf_size;
	pp_buf->buffer[idx] = *record;
	spin_unlock_irqrestore(&pp_buf->lock, flags);
}
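
/*
 * Worked example of the wrap-around in add_record(): with the default
 * pp_buf_size of 4096, records land in slots 0..4095 in order, the 4097th
 * record overwrites slot 0 again, and so on; the buffer therefore always
 * holds the most recent pp_buf_size records.
 */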

void __free_page_pinner(struct page *page, unsigned int order)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	/* free_page could be called before buffer is initialized */
	if (!pp_buffer.buffer)
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	for (i = 0; i < (1 << order); i++) {
		struct captured_pinner record;

		if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
			page_pinner = get_page_pinner(page_ext);

			record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
			record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
			record.state = PP_FREE;
			capture_page_state(page, &record);

			add_record(&pp_buffer, &record);

			atomic_set(&page_pinner->count, 0);
			page_pinner->ts_usec = 0;
			clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		}
		/* Advance to the next sub-page's page_ext on every iteration. */
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}

static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (record->state == PP_PUT) {
		ret = snprintf(kbuf, count, "Pinned for at least %llu us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "%s [%5lu.%06lu]\n",
			       record->state == PP_FREE ? "Freed at" :
							  "Failure detected at",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}
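
/*
 * Illustrative output of the function above for a single PP_FREE record.
 * The values are made up, the block number assumes pageblock_order == 9,
 * and the printed flag names depend on the kernel configuration:
 *
 *	Freed at [  123.456789]
 *	PFN 0x10000 Block 128 count 1 mapcount 0 mapping ... Flags ...(uptodate|lru)
 *	<stack trace as emitted by stack_trace_snprint()>
 *
 * A PP_PUT record instead starts with "Pinned for at least <N> us".
 */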

void __page_pinner_failure_detect(struct page *page)
{
	struct page_ext *page_ext;
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now;

	if (!static_branch_unlikely(&failure_tracking))
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	now = (u64)ktime_to_us(ktime_get_boottime());
	page_pinner = get_page_pinner(page_ext);
	if (!page_pinner->ts_usec)
		page_pinner->ts_usec = now;
	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = now;
	record.state = PP_FAIL_DETECTED;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);

void __page_pinner_put_page(struct page *page)
{
	struct page_ext *page_ext;
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now, ts_usec;

	if (!static_branch_unlikely(&failure_tracking))
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	page_pinner = get_page_pinner(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	now = (u64)ktime_to_us(ktime_get_boottime());
	ts_usec = page_pinner->ts_usec;

	if (now > ts_usec)
		record.elapsed = now - ts_usec;
	else
		record.elapsed = 0;
	record.state = PP_PUT;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);
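
/*
 * How the two hooks above fit together (a summary of the code in this file,
 * not a specification): __page_pinner_failure_detect() stamps the page's
 * page_ext with the time a migration failure was observed and logs a
 * PP_FAIL_DETECTED record; __page_pinner_put_page() later logs a PP_PUT
 * record whose "elapsed" field is how long the page stayed pinned after that
 * failure. Callers are assumed to go through the wrappers declared in
 * <linux/page_pinner.h>; in any case nothing is recorded unless the
 * failure_tracking static key is enabled.
 */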

static ssize_t read_buffer(struct file *file, char __user *buf,
			   size_t count, loff_t *ppos)
{
	u64 tmp;
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= pp_buf_size)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * Read the records in reverse order: the newest record is
	 * returned first, followed by progressively older ones.
	 */
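	/*
	 * Example (assuming the default pp_buf_size of 4096): if
	 * pp_buffer.index is 10, the read at *ppos == 0 maps to
	 * (10 - 1 - 0 + 4096) % 4096 == 9, i.e. the most recently
	 * written slot, and each subsequent read steps one slot back.
	 */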
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);

	spin_lock_irqsave(&pp_buffer.lock, flags);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(buf, count, &record);
}

static const struct file_operations proc_buffer_operations = {
	.read = read_buffer,
};

static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");

static int buffer_size_set(void *data, u64 val)
{
	unsigned long flags;
	struct captured_pinner *new, *old;

	/* A zero-sized buffer would make the index wrap-around in add_record() divide by zero. */
	if (!val)
		return -EINVAL;

	new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* Swap in the new buffer under the lock; free the old one outside it. */
	spin_lock_irqsave(&pp_buffer.lock, flags);
	old = pp_buffer.buffer;
	pp_buffer.buffer = new;
	pp_buffer.index = 0;
	pp_buf_size = val;
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	kvfree(old);

	return 0;
}

static int buffer_size_get(void *data, u64 *val)
{
	*val = pp_buf_size;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
			 buffer_size_get,
			 buffer_size_set, "%llu\n");
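
/*
 * Example of exercising the interface from user space (a sketch; the paths
 * assume debugfs is mounted at the usual /sys/kernel/debug location):
 *
 *	echo 1    > /sys/kernel/debug/page_pinner/failure_tracking
 *	echo 8192 > /sys/kernel/debug/page_pinner/buffer_size
 *	cat         /sys/kernel/debug/page_pinner/buffer
 */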

static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("buffer", 0444,
			    pp_debugfs_root, NULL,
			    &proc_buffer_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);

	debugfs_create_file("buffer_size", 0644,
			    pp_debugfs_root, NULL,
			    &buffer_size_fops);
	return 0;
}
late_initcall(page_pinner_init)