// SPDX-License-Identifier: GPL-2.0
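/*
 * page_pinner: track pages whose migration failed and find out who is
 * still pinning them.
 *
 * Once __page_pinner_failure_detect() flags a page with
 * PAGE_EXT_PINNER_MIGRATION_FAILED, every subsequent put and the final
 * free of that page capture a stack trace plus a snapshot of the page
 * state into a global ring buffer, readable from userspace via debugfs
 * (see page_pinner_init() below).
 */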
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>

#include "internal.h"

#define PAGE_PINNER_STACK_DEPTH 16
static unsigned long pp_buf_size = 4096;

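/*
 * Per-page tracking state, stored in the page's page_ext area.
 * ts_usec records (in us, boottime) when a migration failure was first
 * detected; it and count are cleared again when the page is freed.
 */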
struct page_pinner {
	depot_stack_handle_t handle;
	u64 ts_usec;
	atomic_t count;
};

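/*
 * Which event produced a captured_pinner record:
 * PP_PUT           - a reference on a tracked page was dropped
 * PP_FREE          - a tracked page was freed
 * PP_FAIL_DETECTED - migration of the page has just failed
 */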
enum pp_state {
	PP_PUT,
	PP_FREE,
	PP_FAIL_DETECTED,
};

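/*
 * One ring buffer entry: the stack that triggered the event, either an
 * absolute timestamp (PP_FREE/PP_FAIL_DETECTED) or the elapsed pin time
 * (PP_PUT), and a snapshot of the page state at capture time.
 */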
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		u64 ts_usec;
		u64 elapsed;
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
	enum pp_state state;
};

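/* Global ring buffer of captured events, protected by @lock. */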
struct page_pinner_buffer {
	spinlock_t lock;
	unsigned long index;
	struct captured_pinner *buffer;
};

/* alloc_contig failed pinner */
static struct page_pinner_buffer pp_buffer;

static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);

DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL_GPL(failure_tracking);

static depot_stack_handle_t failure_handle;

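/* "page_pinner" on the kernel command line (any value) enables tracking. */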
static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);

static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}

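/*
 * Pre-register a sentinel stack so save_stack() always has a valid depot
 * handle to fall back on when stack_depot_save() fails, e.g. under the
 * GFP_NOWAIT allocations used in the tracking paths.
 */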
static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

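/*
 * page_ext init hook: allocate the ring buffer and flip the static key.
 * If the allocation fails, the feature simply stays disabled.
 */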
static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer),
					  GFP_KERNEL);
	if (!pp_buffer.buffer) {
		pr_info("page_pinner disabled due to failure of buffer allocation\n");
		return;
	}

	spin_lock_init(&pp_buffer.lock);
	pp_buffer.index = 0;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}

struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};

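/* Locate our per-page state inside the page_ext blob. */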
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}

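/*
 * Capture the current stack into the stack depot, skipping the two
 * innermost tracking frames; fall back to the sentinel handle when the
 * depot cannot allocate.
 */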
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	record->flags = page->flags;
	record->mapping = page_mapping(page);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = page_mapcount(page);
}

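/* Append a record, overwriting the oldest entry once the buffer wraps. */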
static void add_record(struct page_pinner_buffer *pp_buf,
		       struct captured_pinner *record)
{
	unsigned long flags;
	unsigned int idx;

	spin_lock_irqsave(&pp_buf->lock, flags);
	idx = pp_buf->index++;
	pp_buf->index %= pp_buf_size;
	pp_buf->buffer[idx] = *record;
	spin_unlock_irqrestore(&pp_buf->lock, flags);
}

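/*
 * Called when a (possibly compound) page is freed: emit a PP_FREE record
 * for every sub-page that was flagged as a migration failure and clear
 * its tracking state.
 */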
void __free_page_pinner(struct page *page, unsigned int order)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	/* free_page could be called before buffer is initialized */
	if (!pp_buffer.buffer)
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	/*
	 * Advance page_ext in the loop header so that "continue" on an
	 * unflagged sub-page cannot skip the advance and re-test the same
	 * page_ext on every iteration.
	 */
	for (i = 0; i < (1 << order); i++, page_ext = page_ext_next(page_ext)) {
		struct captured_pinner record;

		if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);

		record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
		record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
		record.state = PP_FREE;
		capture_page_state(page, &record);

		add_record(&pp_buffer, &record);

		atomic_set(&page_pinner->count, 0);
		page_pinner->ts_usec = 0;
		clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	}
	page_ext_put(page_ext);
}

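/*
 * Format one record (event header, page state, stack trace) into a
 * kernel buffer and copy it to userspace. Returns the number of bytes
 * written, or -ENOMEM if the record does not fit in @count bytes.
 */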
static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (record->state == PP_PUT) {
		ret = snprintf(kbuf, count, "Pinned for at least %llu us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "%s [%5lu.%06lu]\n",
			       record->state == PP_FREE ? "Freed at" :
							  "Failure detected at",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);

	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

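/*
 * Mark @page as a migration failure: stamp the first-failure time, set
 * PAGE_EXT_PINNER_MIGRATION_FAILED so later puts and the free are traced,
 * and log a PP_FAIL_DETECTED record with the current stack.
 */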
void __page_pinner_failure_detect(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now;

	if (unlikely(!page_ext))
		return;

	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	now = (u64)ktime_to_us(ktime_get_boottime());
	page_pinner = get_page_pinner(page_ext);
	if (!page_pinner->ts_usec)
		page_pinner->ts_usec = now;
	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = now;
	record.state = PP_FAIL_DETECTED;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);

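/*
 * Called when a reference on a tracked page is dropped: log a PP_PUT
 * record carrying how long the page stayed pinned after the failure was
 * first detected.
 */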
void __page_pinner_put_page(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now, ts_usec;

	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	page_pinner = get_page_pinner(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	now = (u64)ktime_to_us(ktime_get_boottime());
	ts_usec = page_pinner->ts_usec;

	if (now > ts_usec)
		record.elapsed = now - ts_usec;
	else
		record.elapsed = 0;
	record.state = PP_PUT;
	capture_page_state(page, &record);

	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);

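/*
 * debugfs read: each read() call returns one formatted record, newest
 * first; *ppos counts how far back from the latest slot we have read.
 */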
static ssize_t read_buffer(struct file *file, char __user *buf,
			   size_t count, loff_t *ppos)
{
	u64 tmp;
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= pp_buf_size)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * Read the records in reverse order: the newest one first,
	 * followed by older ones.
	 */
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);

	spin_lock_irqsave(&pp_buffer.lock, flags);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(buf, count, &record);
}

static const struct file_operations proc_buffer_operations = {
	.read		= read_buffer,
};

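/* debugfs knob: toggle the failure_tracking static key at runtime. */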
static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");

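/*
 * debugfs knob: resize the ring buffer. Swapping in a fresh buffer
 * discards all records captured so far.
 */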
static int buffer_size_set(void *data, u64 val)
{
	unsigned long flags;
	struct captured_pinner *new, *old;

	/* A zero pp_buf_size would divide by zero in add_record(). */
	if (!val)
		return -EINVAL;

	new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	spin_lock_irqsave(&pp_buffer.lock, flags);
	old = pp_buffer.buffer;
	pp_buffer.buffer = new;
	pp_buffer.index = 0;
	pp_buf_size = val;
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	kvfree(old);

	return 0;
}

static int buffer_size_get(void *data, u64 *val)
{
	*val = pp_buf_size;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
			 buffer_size_get,
			 buffer_size_set, "%llu\n");

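/*
 * Create the debugfs interface. Assuming debugfs is mounted at the usual
 * /sys/kernel/debug, this exposes:
 *
 *   page_pinner/buffer           - read captured records (newest first)
 *   page_pinner/failure_tracking - 0/1 runtime switch
 *   page_pinner/buffer_size      - number of ring buffer entries
 *
 * e.g.:
 *   echo 8192 > /sys/kernel/debug/page_pinner/buffer_size
 *   cat /sys/kernel/debug/page_pinner/buffer
 */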
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("buffer", 0444,
			    pp_debugfs_root, NULL,
			    &proc_buffer_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);

	debugfs_create_file("buffer_size", 0644,
			    pp_debugfs_root, NULL,
			    &buffer_size_fops);
	return 0;
}
late_initcall(page_pinner_init)