// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>

#include "internal.h"

#define PAGE_PINNER_STACK_DEPTH 16
static unsigned long pp_buf_size = 4096;
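/*
 * Per-page state, kept in the page_ext area reserved via page_pinner_ops.
 * ts_usec records when a migration failure was first detected for the page.
 */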
struct page_pinner {
	depot_stack_handle_t handle;
	u64 ts_usec;
	atomic_t count;
};

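/* The event a captured_pinner record describes */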
enum pp_state {
	PP_PUT,
	PP_FREE,
	PP_FAIL_DETECTED,
};

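/*
 * Snapshot pushed into the global ring buffer on each event. The union
 * holds the event timestamp for PP_FREE/PP_FAIL_DETECTED records and
 * the pin duration for PP_PUT records.
 */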
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		u64 ts_usec;
		u64 elapsed;
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
	enum pp_state state;
};

struct page_pinner_buffer {
	spinlock_t lock;
	unsigned long index;
	struct captured_pinner *buffer;
};

/* alloc_contig failed pinner */
static struct page_pinner_buffer pp_buffer;

static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);

DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL_GPL(failure_tracking);

static depot_stack_handle_t failure_handle;

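/* Enable the feature by passing "page_pinner" on the kernel command line */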
static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);

static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}

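/*
 * Pre-record a canned stack so save_stack() has a valid handle to fall
 * back on when stack_depot_save() fails under memory pressure.
 */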
static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	/* Zeroed so read_buffer() can detect never-used slots via !handle */
	pp_buffer.buffer = kvcalloc(pp_buf_size, sizeof(*pp_buffer.buffer),
				    GFP_KERNEL);
	if (!pp_buffer.buffer) {
		pr_info("page_pinner disabled due to failure of buffer allocation\n");
		return;
	}

	spin_lock_init(&pp_buffer.lock);
	pp_buffer.index = 0;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}

struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};

static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}

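/*
 * Save the current stack into the stack depot, skipping the two
 * page_pinner-internal frames; fall back to the canned failure stack
 * if the depot cannot allocate.
 */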
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	record->flags = page->flags;
	record->mapping = page_mapping(page);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = page_mapcount(page);
}

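/* Append a record to the ring buffer, overwriting the oldest slot */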
static void add_record(struct page_pinner_buffer *pp_buf,
		       struct captured_pinner *record)
{
	unsigned long flags;
	unsigned int idx;

	spin_lock_irqsave(&pp_buf->lock, flags);
	idx = pp_buf->index++;
	pp_buf->index %= pp_buf_size;
	pp_buf->buffer[idx] = *record;
	spin_unlock_irqrestore(&pp_buf->lock, flags);
}

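/*
 * Called from the page free path: emit a PP_FREE record for each
 * sub-page that still has a pending migration failure, then reset its
 * page_pinner state.
 */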
void __free_page_pinner(struct page *page, unsigned int order)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	/* free_page could be called before buffer is initialized */
	if (!pp_buffer.buffer)
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	/* Advance page_ext on every iteration; "continue" must not skip it */
	for (i = 0; i < (1 << order); i++, page_ext = page_ext_next(page_ext)) {
		struct captured_pinner record;

		if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);

		record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
		record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
		record.state = PP_FREE;
		capture_page_state(page, &record);

		add_record(&pp_buffer, &record);

		atomic_set(&page_pinner->count, 0);
		page_pinner->ts_usec = 0;
		clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	}
	page_ext_put(page_ext);
}

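/*
 * Format one record into a kernel buffer and copy it to userspace.
 * Returns -ENOMEM when the record does not fit in min(count, PAGE_SIZE).
 */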
static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (record->state == PP_PUT) {
		ret = snprintf(kbuf, count, "Pinned for at least %llu us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "%s [%5lu.%06lu]\n",
			       record->state == PP_FREE ? "Freed at" :
							  "Failure detected at",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

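/*
 * Called when a page fails to migrate (the "alloc_contig failed pinner"
 * case above): stamp the first failure time in the page's page_pinner
 * and queue a PP_FAIL_DETECTED record.
 */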
void __page_pinner_failure_detect(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now;

	if (unlikely(!page_ext))
		return;

	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	now = (u64)ktime_to_us(ktime_get_boottime());
	page_pinner = get_page_pinner(page_ext);
	if (!page_pinner->ts_usec)
		page_pinner->ts_usec = now;
	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = now;
	record.state = PP_FAIL_DETECTED;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);

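/*
 * Called when a page marked with a migration failure is unpinned:
 * queue a PP_PUT record carrying how long the page remained pinned
 * after the failure was first detected.
 */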
void __page_pinner_put_page(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now, ts_usec;

	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	page_pinner = get_page_pinner(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	now = (u64)ktime_to_us(ktime_get_boottime());
	ts_usec = page_pinner->ts_usec;

	if (now > ts_usec)
		record.elapsed = now - ts_usec;
	else
		record.elapsed = 0;
	record.state = PP_PUT;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);

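/*
 * debugfs read: emit one record per read() call, newest first.
 * *ppos counts how many records have been returned so far.
 */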
static ssize_t read_buffer(struct file *file, char __user *buf,
			   size_t count, loff_t *ppos)
{
	u64 tmp;
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= pp_buf_size)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * Read the records in reverse order: the newest one first,
	 * followed by older ones.
	 */
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);

	spin_lock_irqsave(&pp_buffer.lock, flags);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(buf, count, &record);
}

static const struct file_operations proc_buffer_operations = {
	.read = read_buffer,
};

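/* debugfs knob: toggle the failure_tracking static branch at runtime */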
static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");

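/*
 * debugfs knob: resize the ring buffer. The old buffer is freed, so
 * previously captured records are discarded on resize.
 */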
static int buffer_size_set(void *data, u64 val)
{
	unsigned long flags;
	struct captured_pinner *new, *old;

	/* A zero-sized buffer would make the index arithmetic divide by zero */
	if (!val)
		return -EINVAL;

	/* Zeroed for the same reason as in init_page_pinner() */
	new = kvcalloc(val, sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	spin_lock_irqsave(&pp_buffer.lock, flags);
	old = pp_buffer.buffer;
	pp_buffer.buffer = new;
	pp_buffer.index = 0;
	pp_buf_size = val;
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	kvfree(old);

	return 0;
}

static int buffer_size_get(void *data, u64 *val)
{
	*val = pp_buf_size;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
			 buffer_size_get,
			 buffer_size_set, "%llu\n");

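/* Create the debugfs interface once early init has succeeded */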
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("buffer", 0444,
			    pp_debugfs_root, NULL,
			    &proc_buffer_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);

	debugfs_create_file("buffer_size", 0644,
			    pp_debugfs_root, NULL,
			    &buffer_size_fops);
	return 0;
}
late_initcall(page_pinner_init)