// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>
#include <linux/page_ref.h>

#include "internal.h"

#define PAGE_PINNER_STACK_DEPTH 16
static unsigned long pp_buf_size = 4096;

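/*
 * Per-page tracking state, stored in the page's page_ext data.
 * ts_usec holds the boottime timestamp (in microseconds) of the first
 * detected migration failure of the page; it is reset when the page is
 * freed.
 */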
struct page_pinner {
	depot_stack_handle_t handle;
	u64 ts_usec;
	atomic_t count;
};

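/*
 * Event types captured in the ring buffer:
 * PP_PUT:           a reference to a previously-failing page was dropped
 * PP_FREE:          a previously-failing page was freed
 * PP_FAIL_DETECTED: a migration failure was detected on the page
 */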
enum pp_state {
	PP_PUT,
	PP_FREE,
	PP_FAIL_DETECTED,
};

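/*
 * Snapshot of one page_pinner event, copied into the ring buffer so it
 * can be reported later, even after the page itself has been reused.
 */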
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		u64 ts_usec;
		u64 elapsed;
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
	enum pp_state state;
};

struct page_pinner_buffer {
	spinlock_t lock;
	unsigned long index;
	struct captured_pinner *buffer;
};

/* Ring buffer of records for pages that failed migration (e.g. during alloc_contig) */
static struct page_pinner_buffer pp_buffer;

static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
EXPORT_SYMBOL_GPL(page_pinner_inited);

DEFINE_STATIC_KEY_TRUE(failure_tracking);

static depot_stack_handle_t failure_handle;

static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);

static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}

static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	/* Zero the buffer: readers treat a zero stack handle as an empty slot */
	pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer),
				GFP_KERNEL | __GFP_ZERO);
	if (!pp_buffer.buffer) {
		pr_info("page_pinner disabled due to failure of buffer allocation\n");
		return;
	}

	spin_lock_init(&pp_buffer.lock);
	pp_buffer.index = 0;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}

struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};

static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}

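/*
 * Capture the current stack into the stack depot, skipping the two
 * innermost frames.  Falls back to the pre-registered failure_handle
 * when the depot cannot allocate.
 */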
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

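/* Snapshot the fields of @page that are useful for the report */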
static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	struct folio *folio = page_folio(page);

	record->flags = page->flags;
	record->mapping = folio_mapping(folio);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = folio_mapcount(folio);
}

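/*
 * Append a record to the ring buffer under the buffer lock.  Once the
 * buffer wraps around, the oldest record is overwritten.
 */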
static void add_record(struct page_pinner_buffer *pp_buf,
		       struct captured_pinner *record)
{
	unsigned long flags;
	unsigned int idx;

	spin_lock_irqsave(&pp_buf->lock, flags);
	idx = pp_buf->index++;
	pp_buf->index %= pp_buf_size;
	pp_buf->buffer[idx] = *record;
	spin_unlock_irqrestore(&pp_buf->lock, flags);
}

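/*
 * Called when pages are freed.  For each base page that was flagged by
 * __page_pinner_failure_detect(), record a PP_FREE event and clear the
 * per-page tracking state.
 */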
void __free_page_pinner(struct page *page, unsigned int order)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	/* free_page could be called before buffer is initialized */
	if (!pp_buffer.buffer)
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	/* advance page_ext on every iteration, even when the flag is clear */
	for (i = 0; i < (1 << order); i++, page_ext = page_ext_next(page_ext)) {
		struct captured_pinner record;

		if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);

		record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
		record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
		record.state = PP_FREE;
		capture_page_state(page, &record);

		add_record(&pp_buffer, &record);

		atomic_set(&page_pinner->count, 0);
		page_pinner->ts_usec = 0;
		clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	}
	page_ext_put(page_ext);
}

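/*
 * Format one captured record, including its stack trace, into a kernel
 * buffer and copy it to userspace.  Returns the number of bytes copied
 * or a negative errno (-ENOMEM if the record does not fit in @count).
 */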
static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (record->state == PP_PUT) {
		ret = snprintf(kbuf, count, "Pinned for at least %llu us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "%s [%5lu.%06lu]\n",
			       record->state == PP_FREE ? "Freed at" :
							  "Failure detected at",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

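/*
 * Called when migration of @page fails, presumably because the page is
 * pinned.  Flags the page, stamps the time of the first failure and
 * logs a PP_FAIL_DETECTED record with the current stack.
 */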
void __page_pinner_failure_detect(struct page *page)
{
	struct page_ext *page_ext;
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now;

	if (!static_branch_unlikely(&failure_tracking))
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	now = (u64)ktime_to_us(ktime_get_boottime());
	page_pinner = get_page_pinner(page_ext);
	if (!page_pinner->ts_usec)
		page_pinner->ts_usec = now;
	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = now;
	record.state = PP_FAIL_DETECTED;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);

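/*
 * Records a PP_PUT event when a reference to a flagged page is dropped,
 * noting the time elapsed since the first detected migration failure.
 * The tracking flag itself is cleared only when the page is freed.
 */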
void __page_pinner_put_page(struct page *page)
{
	struct page_ext *page_ext;
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now, ts_usec;

	if (!static_branch_unlikely(&failure_tracking))
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	page_pinner = get_page_pinner(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	now = (u64)ktime_to_us(ktime_get_boottime());
	ts_usec = page_pinner->ts_usec;

	if (now > ts_usec)
		record.elapsed = now - ts_usec;
	else
		record.elapsed = 0;
	record.state = PP_PUT;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);

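/*
 * debugfs read handler: returns one record per read() call, newest
 * first.  *ppos indexes records rather than bytes, so readers should
 * keep issuing reads until one returns 0.
 */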
static ssize_t read_buffer(struct file *file, char __user *buf,
					size_t count, loff_t *ppos)
{
	u64 tmp;
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= pp_buf_size)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * Read the records in reverse order: the newest one first,
	 * followed by older ones.
	 */
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);

	spin_lock_irqsave(&pp_buffer.lock, flags);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(buf, count, &record);
}

static const struct file_operations proc_buffer_operations = {
	.read		= read_buffer,
};

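/*
 * debugfs knob to switch failure tracking on and off at runtime via a
 * static branch.
 */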
static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");

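/*
 * debugfs knob to resize the ring buffer.  Allocates the new buffer
 * before taking the lock, swaps it in and discards all old records.
 */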
static int buffer_size_set(void *data, u64 val)
{
	unsigned long flags;
	struct captured_pinner *new, *old;

	/* a zero-sized buffer would make the index arithmetic divide by zero */
	if (!val)
		return -EINVAL;

	new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL | __GFP_ZERO);
	if (!new)
		return -ENOMEM;

	spin_lock_irqsave(&pp_buffer.lock, flags);
	old = pp_buffer.buffer;
	pp_buffer.buffer = new;
	pp_buffer.index = 0;
	pp_buf_size = val;
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	kvfree(old);

	return 0;
}

static int buffer_size_get(void *data, u64 *val)
{
	*val = pp_buf_size;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
			 buffer_size_get,
			 buffer_size_set, "%llu\n");

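/*
 * Expose the debugfs interface under /sys/kernel/debug/page_pinner/:
 * "buffer" (read captured records), "failure_tracking" (on/off) and
 * "buffer_size" (resize the ring buffer).
 */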
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("buffer", 0444,
			    pp_debugfs_root, NULL,
			    &proc_buffer_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);

	debugfs_create_file("buffer_size", 0644,
			    pp_debugfs_root, NULL,
			    &buffer_size_fops);
	return 0;
}
late_initcall(page_pinner_init)