1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/pagewalk.h>
3 #include <linux/mm_inline.h>
4 #include <linux/hugetlb.h>
5 #include <linux/huge_mm.h>
6 #include <linux/mount.h>
7 #include <linux/ksm.h>
8 #include <linux/seq_file.h>
9 #include <linux/highmem.h>
10 #include <linux/ptrace.h>
11 #include <linux/slab.h>
12 #include <linux/pagemap.h>
13 #include <linux/pgsize_migration.h>
14 #include <linux/mempolicy.h>
15 #include <linux/rmap.h>
16 #include <linux/swap.h>
17 #include <linux/sched/mm.h>
18 #include <linux/swapops.h>
19 #include <linux/mmu_notifier.h>
20 #include <linux/page_idle.h>
21 #include <linux/page_size_compat.h>
22 #include <linux/shmem_fs.h>
23 #include <linux/uaccess.h>
24 #include <linux/pkeys.h>
25 #include <linux/minmax.h>
26 #include <linux/overflow.h>
27 #include <linux/buildid.h>
28 #include <trace/hooks/mm.h>
29
30 #include <asm/elf.h>
31 #include <asm/tlb.h>
32 #include <asm/tlbflush.h>
33 #include "internal.h"
34
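/*
 * Sentinel *ppos values used by the maps seq_file iterator below:
 * SENTINEL_VMA_GATE means the gate vma is the next (and last) entry to
 * report, SENTINEL_VMA_END means iteration is finished.
 */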
35 #define SENTINEL_VMA_END -1
36 #define SENTINEL_VMA_GATE -2
37
38 #define SEQ_PUT_DEC(str, val) \
39 seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
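/*
 * Print the memory-usage summary lines of /proc/<pid>/status (VmPeak,
 * VmSize, VmRSS, RssAnon, ..., VmSwap). SEQ_PUT_DEC converts a page
 * count to kB before printing it in an 8-character-wide field.
 */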
40 void task_mem(struct seq_file *m, struct mm_struct *mm)
41 {
42 unsigned long text, lib, swap, anon, file, shmem;
43 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
44
45 anon = get_mm_counter_sum(mm, MM_ANONPAGES);
46 file = get_mm_counter_sum(mm, MM_FILEPAGES);
47 shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
48
49 /*
50 * Note: to minimize their overhead, mm maintains hiwater_vm and
51 * hiwater_rss only when about to *lower* total_vm or rss. Any
52 * collector of these hiwater stats must therefore get total_vm
53 * and rss too, which will usually be the higher. Barriers? not
54 * worth the effort, such snapshots can always be inconsistent.
55 */
56 hiwater_vm = total_vm = mm->total_vm;
57 if (hiwater_vm < mm->hiwater_vm)
58 hiwater_vm = mm->hiwater_vm;
59 hiwater_rss = total_rss = anon + file + shmem;
60 if (hiwater_rss < mm->hiwater_rss)
61 hiwater_rss = mm->hiwater_rss;
62
63 /* split executable areas between text and lib */
64 text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
65 text = min(text, mm->exec_vm << PAGE_SHIFT);
66 lib = (mm->exec_vm << PAGE_SHIFT) - text;
67
68 swap = get_mm_counter_sum(mm, MM_SWAPENTS);
69 SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
70 SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
71 SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
72 SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
73 SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
74 SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
75 SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
76 SEQ_PUT_DEC(" kB\nRssFile:\t", file);
77 SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
78 SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
79 SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
80 seq_put_decimal_ull_width(m,
81 " kB\nVmExe:\t", text >> 10, 8);
82 seq_put_decimal_ull_width(m,
83 " kB\nVmLib:\t", lib >> 10, 8);
84 seq_put_decimal_ull_width(m,
85 " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
86 SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
87 seq_puts(m, " kB\n");
88 hugetlb_report_usage(m, mm);
89 }
90 #undef SEQ_PUT_DEC
91
92 unsigned long task_vsize(struct mm_struct *mm)
93 {
94 return PAGE_SIZE * mm->total_vm;
95 }
96
97 unsigned long task_statm(struct mm_struct *mm,
98 unsigned long *shared, unsigned long *text,
99 unsigned long *data, unsigned long *resident)
100 {
101 *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
102 get_mm_counter_sum(mm, MM_SHMEMPAGES);
103 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
104 >> PAGE_SHIFT;
105 *data = mm->data_vm + mm->stack_vm;
106 *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
107 return mm->total_vm;
108 }
109
110 #ifdef CONFIG_NUMA
111 /*
112 * Save get_task_policy() for show_numa_map().
113 */
114 static void hold_task_mempolicy(struct proc_maps_private *priv)
115 {
116 struct task_struct *task = priv->task;
117
118 task_lock(task);
119 priv->task_mempolicy = get_task_policy(task);
120 mpol_get(priv->task_mempolicy);
121 task_unlock(task);
122 }
123 static void release_task_mempolicy(struct proc_maps_private *priv)
124 {
125 mpol_put(priv->task_mempolicy);
126 }
127 #else
128 static void hold_task_mempolicy(struct proc_maps_private *priv)
129 {
130 }
131 static void release_task_mempolicy(struct proc_maps_private *priv)
132 {
133 }
134 #endif
135
136 #ifdef CONFIG_PER_VMA_LOCK
137
138 static void unlock_vma(struct proc_maps_private *priv)
139 {
140 if (priv->locked_vma) {
141 vma_end_read(priv->locked_vma);
142 priv->locked_vma = NULL;
143 }
144 }
145
146 static const struct seq_operations proc_pid_maps_op;
147
148 static inline bool lock_vma_range(struct seq_file *m,
149 struct proc_maps_private *priv)
150 {
151 /*
152 * smaps and numa_maps perform a page table walk and therefore
153 * require mmap_lock, but maps can be read by locking just the vma
154 * and walking the vma tree under RCU read protection.
155 */
156 if (m->op != &proc_pid_maps_op) {
157 if (mmap_read_lock_killable(priv->mm))
158 return false;
159
160 priv->mmap_locked = true;
161 } else {
162 rcu_read_lock();
163 priv->locked_vma = NULL;
164 priv->mmap_locked = false;
165 }
166
167 return true;
168 }
169
170 static inline void unlock_vma_range(struct proc_maps_private *priv)
171 {
172 if (priv->mmap_locked) {
173 mmap_read_unlock(priv->mm);
174 } else {
175 unlock_vma(priv);
176 rcu_read_unlock();
177 }
178 }
179
180 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
181 loff_t last_pos)
182 {
183 struct vm_area_struct *vma;
184
185 if (priv->mmap_locked)
186 return vma_next(&priv->iter);
187
188 unlock_vma(priv);
189 vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
190 if (!IS_ERR_OR_NULL(vma))
191 priv->locked_vma = vma;
192
193 return vma;
194 }
195
196 static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
197 loff_t pos)
198 {
199 if (priv->mmap_locked)
200 return false;
201
202 rcu_read_unlock();
203 mmap_read_lock(priv->mm);
204 /* Reinitialize the iterator after taking mmap_lock */
205 vma_iter_set(&priv->iter, pos);
206 priv->mmap_locked = true;
207
208 return true;
209 }
210
211 #else /* CONFIG_PER_VMA_LOCK */
212
213 static inline bool lock_vma_range(struct seq_file *m,
214 struct proc_maps_private *priv)
215 {
216 return mmap_read_lock_killable(priv->mm) == 0;
217 }
218
219 static inline void unlock_vma_range(struct proc_maps_private *priv)
220 {
221 mmap_read_unlock(priv->mm);
222 }
223
224 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
225 loff_t last_pos)
226 {
227 return vma_next(&priv->iter);
228 }
229
230 static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
231 loff_t pos)
232 {
233 return false;
234 }
235
236 #endif /* CONFIG_PER_VMA_LOCK */
237
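/*
 * Fetch the next vma to report. *ppos tracks progress as the end address
 * of the last reported vma, so a restart after lseek or after dropping the
 * lock resumes from there; once the regular vmas are exhausted, the gate
 * vma (if any) is reported and *ppos switches to the sentinels above.
 */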
238 static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
239 {
240 struct proc_maps_private *priv = m->private;
241 struct vm_area_struct *vma;
242
243 retry:
244 vma = get_next_vma(priv, *ppos);
245 /* EINTR or EAGAIN is possible */
246 if (IS_ERR(vma)) {
247 if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
248 goto retry;
249
250 return vma;
251 }
252
253 /* Store previous position to be able to restart if needed */
254 priv->last_pos = *ppos;
255 if (vma) {
256 /*
257 * Track the end of the reported vma to ensure position changes
258 * even if previous vma was merged with the next vma and we
259 * found the extended vma with the same vm_start.
260 */
261 *ppos = vma->vm_end;
262 } else {
263 *ppos = SENTINEL_VMA_GATE;
264 vma = get_gate_vma(priv->mm);
265 }
266
267 return vma;
268 }
269
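/*
 * seq_file ->start(): pin the task and mm, take the vma-range lock (mmap
 * lock or per-vma locking) and return the first vma at or after *ppos.
 */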
270 static void *m_start(struct seq_file *m, loff_t *ppos)
271 {
272 struct proc_maps_private *priv = m->private;
273 loff_t last_addr = *ppos;
274 struct mm_struct *mm;
275
276 /* See m_next(). Zero at the start or after lseek. */
277 if (last_addr == SENTINEL_VMA_END)
278 return NULL;
279
280 priv->task = get_proc_task(priv->inode);
281 if (!priv->task)
282 return ERR_PTR(-ESRCH);
283
284 mm = priv->mm;
285 if (!mm || !mmget_not_zero(mm)) {
286 put_task_struct(priv->task);
287 priv->task = NULL;
288 return NULL;
289 }
290
291 if (!lock_vma_range(m, priv)) {
292 mmput(mm);
293 put_task_struct(priv->task);
294 priv->task = NULL;
295 return ERR_PTR(-EINTR);
296 }
297
298 /*
299 * Reset current position if last_addr was set before
300 * and it's not a sentinel.
301 */
302 if (last_addr > 0)
303 *ppos = last_addr = priv->last_pos;
304 vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
305 hold_task_mempolicy(priv);
306 if (last_addr == SENTINEL_VMA_GATE)
307 return get_gate_vma(mm);
308
309 return proc_get_vma(m, ppos);
310 }
311
312 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
313 {
314 if (*ppos == SENTINEL_VMA_GATE) {
315 *ppos = SENTINEL_VMA_END;
316 return NULL;
317 }
318 return proc_get_vma(m, ppos);
319 }
320
321 static void m_stop(struct seq_file *m, void *v)
322 {
323 struct proc_maps_private *priv = m->private;
324 struct mm_struct *mm = priv->mm;
325
326 if (!priv->task)
327 return;
328
329 release_task_mempolicy(priv);
330 unlock_vma_range(priv);
331 mmput(mm);
332 put_task_struct(priv->task);
333 priv->task = NULL;
334 }
335
336 static int proc_maps_open(struct inode *inode, struct file *file,
337 const struct seq_operations *ops, int psize)
338 {
339 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
340
341 if (!priv)
342 return -ENOMEM;
343
344 priv->inode = inode;
345 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
346 if (IS_ERR(priv->mm)) {
347 int err = PTR_ERR(priv->mm);
348
349 seq_release_private(inode, file);
350 return err;
351 }
352
353 return 0;
354 }
355
356 static int proc_map_release(struct inode *inode, struct file *file)
357 {
358 struct seq_file *seq = file->private_data;
359 struct proc_maps_private *priv = seq->private;
360
361 if (priv->mm)
362 mmdrop(priv->mm);
363
364 return seq_release_private(inode, file);
365 }
366
367 static int do_maps_open(struct inode *inode, struct file *file,
368 const struct seq_operations *ops)
369 {
370 return proc_maps_open(inode, file, ops,
371 sizeof(struct proc_maps_private));
372 }
373
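/*
 * Resolve the name shown in the last column of a maps line, in priority
 * order: file path (or "[anon_shmem:<name>]" for named anon shared
 * memory), vm_ops->name(), arch_vma_name(), "[vdso]" for mm-less vmas,
 * "[heap]", "[stack]", and finally "[anon:<name>]".
 */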
374 static void get_vma_name(struct vm_area_struct *vma,
375 const struct path **path,
376 const char **name,
377 const char **name_fmt)
378 {
379 struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;
380
381 *name = NULL;
382 *path = NULL;
383 *name_fmt = NULL;
384
385 /*
386 * Print the dentry name for named mappings, and a
387 * special [heap] marker for the heap:
388 */
389 if (vma->vm_file) {
390 /*
391 * If the user named this anon shared memory via
392 * prctl(PR_SET_VMA, ...), use the provided name.
393 */
394 if (anon_name) {
395 *name_fmt = "[anon_shmem:%s]";
396 *name = anon_name->name;
397 } else {
398 *path = file_user_path(vma->vm_file);
399 }
400 return;
401 }
402
403 if (vma->vm_ops && vma->vm_ops->name) {
404 *name = vma->vm_ops->name(vma);
405 if (*name)
406 return;
407 }
408
409 *name = arch_vma_name(vma);
410 if (*name)
411 return;
412
413 if (!vma->vm_mm) {
414 *name = "[vdso]";
415 return;
416 }
417
418 if (vma_is_initial_heap(vma)) {
419 *name = "[heap]";
420 return;
421 }
422
423 if (vma_is_initial_stack(vma)) {
424 *name = "[stack]";
425 return;
426 }
427
428 if (anon_name) {
429 *name_fmt = "[anon:%s]";
430 *name = anon_name->name;
431 return;
432 }
433 }
434
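/*
 * Emit the fixed-width prefix of a maps line: start-end, permissions,
 * file offset, device and inode, e.g. (illustrative values):
 *
 *	7f43a4a39000-7f43a4bcd000 r-xp 00000000 08:01 1835008
 */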
435 static void show_vma_header_prefix(struct seq_file *m,
436 unsigned long start, unsigned long end,
437 vm_flags_t flags, unsigned long long pgoff,
438 dev_t dev, unsigned long ino)
439 {
440 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
441 seq_put_hex_ll(m, NULL, start, 8);
442 seq_put_hex_ll(m, "-", end, 8);
443 seq_putc(m, ' ');
444 seq_putc(m, flags & VM_READ ? 'r' : '-');
445 seq_putc(m, flags & VM_WRITE ? 'w' : '-');
446 seq_putc(m, flags & VM_EXEC ? 'x' : '-');
447 seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
448 seq_put_hex_ll(m, " ", pgoff, 8);
449 seq_put_hex_ll(m, " ", MAJOR(dev), 2);
450 seq_put_hex_ll(m, ":", MINOR(dev), 2);
451 seq_put_decimal_ull(m, " ", ino);
452 seq_putc(m, ' ');
453 }
454
455 static void
456 show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
457 {
458 const struct path *path;
459 const char *name_fmt, *name;
460 vm_flags_t flags = vma->vm_flags;
461 unsigned long ino = 0;
462 unsigned long long pgoff = 0;
463 unsigned long start, end;
464 dev_t dev = 0;
465
466 if (vma->vm_file) {
467 const struct inode *inode = file_user_inode(vma->vm_file);
468
469 dev = inode->i_sb->s_dev;
470 ino = inode->i_ino;
471 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
472 }
473
474 start = vma->vm_start;
475 end = VMA_PAD_START(vma);
476
477 __fold_filemap_fixup_entry(&((struct proc_maps_private *)m->private)->iter, &end);
478
479 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
480
481 get_vma_name(vma, &path, &name, &name_fmt);
482 if (path) {
483 seq_pad(m, ' ');
484 seq_path(m, path, "\n");
485 } else if (name_fmt) {
486 seq_pad(m, ' ');
487 seq_printf(m, name_fmt, name);
488 } else if (name) {
489 seq_pad(m, ' ');
490 seq_puts(m, name);
491 }
492 seq_putc(m, '\n');
493 }
494
495 static int show_map(struct seq_file *m, void *v)
496 {
497 struct vm_area_struct *vma = v;
498
499 if (vma_pages(vma))
500 show_map_vma(m, vma);
501
502 show_map_pad_vma(vma, m, show_map_vma, false);
503
504 return 0;
505 }
506
507 static const struct seq_operations proc_pid_maps_op = {
508 .start = m_start,
509 .next = m_next,
510 .stop = m_stop,
511 .show = show_map
512 };
513
514 static int pid_maps_open(struct inode *inode, struct file *file)
515 {
516 return do_maps_open(inode, file, &proc_pid_maps_op);
517 }
518
519 #define PROCMAP_QUERY_VMA_FLAGS ( \
520 PROCMAP_QUERY_VMA_READABLE | \
521 PROCMAP_QUERY_VMA_WRITABLE | \
522 PROCMAP_QUERY_VMA_EXECUTABLE | \
523 PROCMAP_QUERY_VMA_SHARED \
524 )
525
526 #define PROCMAP_QUERY_VALID_FLAGS_MASK ( \
527 PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \
528 PROCMAP_QUERY_FILE_BACKED_VMA | \
529 PROCMAP_QUERY_VMA_FLAGS \
530 )
531
532 static int query_vma_setup(struct mm_struct *mm)
533 {
534 return mmap_read_lock_killable(mm);
535 }
536
537 static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
538 {
539 mmap_read_unlock(mm);
540 }
541
542 static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
543 {
544 return find_vma(mm, addr);
545 }
546
547 static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
548 unsigned long addr, u32 flags)
549 {
550 struct vm_area_struct *vma;
551
552 next_vma:
553 vma = query_vma_find_by_addr(mm, addr);
554 if (!vma)
555 goto no_vma;
556
557 /* user requested only file-backed VMA, keep iterating */
558 if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
559 goto skip_vma;
560
561 /* VMA permissions should satisfy query flags */
562 if (flags & PROCMAP_QUERY_VMA_FLAGS) {
563 u32 perm = 0;
564
565 if (flags & PROCMAP_QUERY_VMA_READABLE)
566 perm |= VM_READ;
567 if (flags & PROCMAP_QUERY_VMA_WRITABLE)
568 perm |= VM_WRITE;
569 if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
570 perm |= VM_EXEC;
571 if (flags & PROCMAP_QUERY_VMA_SHARED)
572 perm |= VM_MAYSHARE;
573
574 if ((vma->vm_flags & perm) != perm)
575 goto skip_vma;
576 }
577
578 /* found covering VMA or user is OK with the matching next VMA */
579 if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
580 return vma;
581
582 skip_vma:
583 /*
584 * If the user needs closest matching VMA, keep iterating.
585 */
586 addr = vma->vm_end;
587 if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
588 goto next_vma;
589
590 no_vma:
591 return ERR_PTR(-ENOENT);
592 }
593
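/*
 * Handler for the PROCMAP_QUERY ioctl issued on an open /proc/<pid>/maps
 * file. A minimal userspace sketch (assuming the UAPI declarations of
 * struct procmap_query and PROCMAP_QUERY; error handling omitted):
 *
 *	struct procmap_query q = { .size = sizeof(q) };
 *
 *	q.query_addr  = (__u64)(uintptr_t)addr;
 *	q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;
 *	if (ioctl(maps_fd, PROCMAP_QUERY, &q) == 0)
 *		printf("%llx-%llx\n", q.vma_start, q.vma_end);
 */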
594 static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
595 {
596 struct procmap_query karg;
597 struct vm_area_struct *vma;
598 struct mm_struct *mm;
599 const char *name = NULL;
600 char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
601 __u64 usize;
602 int err;
603
604 if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
605 return -EFAULT;
606 /* argument struct can never be that large, reject abuse */
607 if (usize > PAGE_SIZE)
608 return -E2BIG;
609 /* argument struct should have at least query_flags and query_addr fields */
610 if (usize < offsetofend(struct procmap_query, query_addr))
611 return -EINVAL;
612 err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
613 if (err)
614 return err;
615
616 /* reject unknown flags */
617 if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
618 return -EINVAL;
619 /* either both buffer address and size are set, or both should be zero */
620 if (!!karg.vma_name_size != !!karg.vma_name_addr)
621 return -EINVAL;
622 if (!!karg.build_id_size != !!karg.build_id_addr)
623 return -EINVAL;
624
625 mm = priv->mm;
626 if (!mm || !mmget_not_zero(mm))
627 return -ESRCH;
628
629 err = query_vma_setup(mm);
630 if (err) {
631 mmput(mm);
632 return err;
633 }
634
635 vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
636 if (IS_ERR(vma)) {
637 err = PTR_ERR(vma);
638 vma = NULL;
639 goto out;
640 }
641
642 karg.vma_start = vma->vm_start;
643 karg.vma_end = vma->vm_end;
644
645 karg.vma_flags = 0;
646 if (vma->vm_flags & VM_READ)
647 karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
648 if (vma->vm_flags & VM_WRITE)
649 karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
650 if (vma->vm_flags & VM_EXEC)
651 karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
652 if (vma->vm_flags & VM_MAYSHARE)
653 karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;
654
655 karg.vma_page_size = vma_kernel_pagesize(vma);
656
657 if (vma->vm_file) {
658 const struct inode *inode = file_user_inode(vma->vm_file);
659
660 karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
661 karg.dev_major = MAJOR(inode->i_sb->s_dev);
662 karg.dev_minor = MINOR(inode->i_sb->s_dev);
663 karg.inode = inode->i_ino;
664 } else {
665 karg.vma_offset = 0;
666 karg.dev_major = 0;
667 karg.dev_minor = 0;
668 karg.inode = 0;
669 }
670
671 if (karg.build_id_size) {
672 __u32 build_id_sz;
673
674 err = build_id_parse(vma, build_id_buf, &build_id_sz);
675 if (err) {
676 karg.build_id_size = 0;
677 } else {
678 if (karg.build_id_size < build_id_sz) {
679 err = -ENAMETOOLONG;
680 goto out;
681 }
682 karg.build_id_size = build_id_sz;
683 }
684 }
685
686 if (karg.vma_name_size) {
687 size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
688 const struct path *path;
689 const char *name_fmt;
690 size_t name_sz = 0;
691
692 get_vma_name(vma, &path, &name, &name_fmt);
693
694 if (path || name_fmt || name) {
695 name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
696 if (!name_buf) {
697 err = -ENOMEM;
698 goto out;
699 }
700 }
701 if (path) {
702 name = d_path(path, name_buf, name_buf_sz);
703 if (IS_ERR(name)) {
704 err = PTR_ERR(name);
705 goto out;
706 }
707 name_sz = name_buf + name_buf_sz - name;
708 } else if (name || name_fmt) {
709 name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
710 name = name_buf;
711 }
712 if (name_sz > name_buf_sz) {
713 err = -ENAMETOOLONG;
714 goto out;
715 }
716 karg.vma_name_size = name_sz;
717 }
718
719 /* unlock vma or mmap_lock, and put mm_struct before copying data to user */
720 query_vma_teardown(mm, vma);
721 mmput(mm);
722
723 if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
724 name, karg.vma_name_size)) {
725 kfree(name_buf);
726 return -EFAULT;
727 }
728 kfree(name_buf);
729
730 if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
731 build_id_buf, karg.build_id_size))
732 return -EFAULT;
733
734 if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
735 return -EFAULT;
736
737 return 0;
738
739 out:
740 query_vma_teardown(mm, vma);
741 mmput(mm);
742 kfree(name_buf);
743 return err;
744 }
745
746 static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
747 {
748 struct seq_file *seq = file->private_data;
749 struct proc_maps_private *priv = seq->private;
750
751 switch (cmd) {
752 case PROCMAP_QUERY:
753 return do_procmap_query(priv, (void __user *)arg);
754 default:
755 return -ENOIOCTLCMD;
756 }
757 }
758
759 const struct file_operations proc_pid_maps_operations = {
760 .open = pid_maps_open,
761 .read = seq_read,
762 .llseek = seq_lseek,
763 .release = proc_map_release,
764 .unlocked_ioctl = procfs_procmap_ioctl,
765 .compat_ioctl = compat_ptr_ioctl,
766 };
767
768 /*
769 * Proportional Set Size (PSS): my share of RSS.
770 *
771 * PSS of a process is the count of pages it has in memory, where each
772 * page is divided by the number of processes sharing it. So if a
773 * process has 1000 pages all to itself, and 1000 shared with one other
774 * process, its PSS will be 1500.
775 *
776 * To keep (accumulated) division errors low, we adopt a 64-bit
777 * fixed-point pss counter, so (pss >> PSS_SHIFT) is the real
778 * byte count.
779 *
780 * A shift of 12 before division means (assuming 4K page size):
781 * - 1M 3-user-pages add up to 8KB errors;
782 * - supports mapcount up to 2^24, or 16M;
783 * - supports PSS up to 2^52 bytes, or 4PB.
784 */
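/*
 * Example with PSS_SHIFT = 12: a single 4 KiB page shared by three
 * processes contributes (4096 << 12) / 3 = 5592405 to pss, which is
 * 5592405 >> 12 = 1365 bytes once shifted back down.
 */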
785 #define PSS_SHIFT 12
786
787 #ifdef CONFIG_PROC_PAGE_MONITOR
788 struct mem_size_stats {
789 unsigned long resident;
790 unsigned long shared_clean;
791 unsigned long shared_dirty;
792 unsigned long private_clean;
793 unsigned long private_dirty;
794 unsigned long referenced;
795 unsigned long anonymous;
796 unsigned long lazyfree;
797 unsigned long anonymous_thp;
798 unsigned long shmem_thp;
799 unsigned long file_thp;
800 unsigned long swap;
801 unsigned long swap_shared;
802 unsigned long writeback;
803 unsigned long same;
804 unsigned long huge;
805 unsigned long shared_hugetlb;
806 unsigned long private_hugetlb;
807 unsigned long ksm;
808 u64 pss;
809 u64 pss_anon;
810 u64 pss_file;
811 u64 pss_shmem;
812 u64 pss_dirty;
813 u64 pss_locked;
814 u64 swap_pss;
815 };
816
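/*
 * Fold one mapped chunk into the totals: @size bytes into the rss and
 * clean/dirty counters, and @pss (already in PSS_SHIFT fixed point) into
 * the matching anon/file/shmem, dirty and locked PSS buckets.
 */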
817 static void smaps_page_accumulate(struct mem_size_stats *mss,
818 struct folio *folio, unsigned long size, unsigned long pss,
819 bool dirty, bool locked, bool private)
820 {
821 mss->pss += pss;
822
823 if (folio_test_anon(folio))
824 mss->pss_anon += pss;
825 else if (folio_test_swapbacked(folio))
826 mss->pss_shmem += pss;
827 else
828 mss->pss_file += pss;
829
830 if (locked)
831 mss->pss_locked += pss;
832
833 if (dirty || folio_test_dirty(folio)) {
834 mss->pss_dirty += pss;
835 if (private)
836 mss->private_dirty += size;
837 else
838 mss->shared_dirty += size;
839 } else {
840 if (private)
841 mss->private_clean += size;
842 else
843 mss->shared_clean += size;
844 }
845 }
846
847 static void smaps_account(struct mem_size_stats *mss, struct page *page,
848 bool compound, bool young, bool dirty, bool locked,
849 bool present)
850 {
851 struct folio *folio = page_folio(page);
852 int i, nr = compound ? compound_nr(page) : 1;
853 unsigned long size = nr * PAGE_SIZE;
854
855 /*
856 * First accumulate quantities that depend only on |size| and the type
857 * of the compound page.
858 */
859 if (folio_test_anon(folio)) {
860 mss->anonymous += size;
861 if (!folio_test_swapbacked(folio) && !dirty &&
862 !folio_test_dirty(folio))
863 mss->lazyfree += size;
864 }
865
866 if (folio_test_ksm(folio))
867 mss->ksm += size;
868
869 mss->resident += size;
870 /* Accumulate the size in pages that have been accessed. */
871 if (young || folio_test_young(folio) || folio_test_referenced(folio))
872 mss->referenced += size;
873
874 /*
875 * Then accumulate quantities that may depend on sharing, or that may
876 * differ page-by-page.
877 *
878 * refcount == 1 for present entries guarantees that the folio is mapped
879 * exactly once. For large folios this implies that exactly one
880 * PTE/PMD/... maps (a part of) this folio.
881 *
882 * Treat all non-present entries (where relying on the mapcount and
883 * refcount doesn't make sense) as "maybe shared, but not sure how
884 * often". We treat device private entries as being fake-present.
885 *
886 * Note that it would not be safe to read the mapcount especially for
887 * pages referenced by migration entries, even with the PTL held.
888 */
889 if (folio_ref_count(folio) == 1 || !present) {
890 smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
891 dirty, locked, present);
892 return;
893 }
894 /*
895 * We obtain a snapshot of the mapcount. Without holding the folio lock
896 * this snapshot can be slightly wrong as we cannot always read the
897 * mapcount atomically.
898 */
899 for (i = 0; i < nr; i++, page++) {
900 int mapcount = folio_precise_page_mapcount(folio, page);
901 unsigned long pss = PAGE_SIZE << PSS_SHIFT;
902 if (mapcount >= 2)
903 pss /= mapcount;
904 smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
905 dirty, locked, mapcount < 2);
906 }
907 }
908
909 #ifdef CONFIG_SHMEM
910 static int smaps_pte_hole(unsigned long addr, unsigned long end,
911 __always_unused int depth, struct mm_walk *walk)
912 {
913 struct mem_size_stats *mss = walk->private;
914 struct vm_area_struct *vma = walk->vma;
915
916 mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
917 linear_page_index(vma, addr),
918 linear_page_index(vma, end));
919
920 return 0;
921 }
922 #else
923 #define smaps_pte_hole NULL
924 #endif /* CONFIG_SHMEM */
925
926 static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
927 {
928 #ifdef CONFIG_SHMEM
929 if (walk->ops->pte_hole) {
930 /* depth is not used */
931 smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
932 }
933 #endif
934 }
935
936 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
937 struct mm_walk *walk)
938 {
939 struct mem_size_stats *mss = walk->private;
940 struct vm_area_struct *vma = walk->vma;
941 bool locked = !!(vma->vm_flags & VM_LOCKED);
942 struct page *page = NULL;
943 bool present = false, young = false, dirty = false;
944 pte_t ptent = ptep_get(pte);
945
946 if (pte_present(ptent)) {
947 page = vm_normal_page(vma, addr, ptent);
948 young = pte_young(ptent);
949 dirty = pte_dirty(ptent);
950 present = true;
951 } else if (is_swap_pte(ptent)) {
952 swp_entry_t swpent = pte_to_swp_entry(ptent);
953
954 if (!non_swap_entry(swpent)) {
955 int mapcount;
956
957 mss->swap += PAGE_SIZE;
958 mapcount = swp_swapcount(swpent);
959 if (mapcount >= 2) {
960 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
961
962 do_div(pss_delta, mapcount);
963 mss->swap_pss += pss_delta;
964 } else {
965 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
966 }
967 trace_android_vh_smaps_pte_entry(swpent, mapcount,
968 &mss->swap_shared, &mss->writeback,
969 &mss->same, &mss->huge);
970 } else if (is_pfn_swap_entry(swpent)) {
971 if (is_device_private_entry(swpent))
972 present = true;
973 page = pfn_swap_entry_to_page(swpent);
974 }
975 } else {
976 smaps_pte_hole_lookup(addr, walk);
977 return;
978 }
979
980 if (!page)
981 return;
982
983 smaps_account(mss, page, false, young, dirty, locked, present);
984 }
985
986 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
987 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
988 struct mm_walk *walk)
989 {
990 struct mem_size_stats *mss = walk->private;
991 struct vm_area_struct *vma = walk->vma;
992 bool locked = !!(vma->vm_flags & VM_LOCKED);
993 struct page *page = NULL;
994 bool present = false;
995 struct folio *folio;
996
997 if (pmd_present(*pmd)) {
998 page = vm_normal_page_pmd(vma, addr, *pmd);
999 present = true;
1000 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
1001 swp_entry_t entry = pmd_to_swp_entry(*pmd);
1002
1003 if (is_pfn_swap_entry(entry))
1004 page = pfn_swap_entry_to_page(entry);
1005 }
1006 if (IS_ERR_OR_NULL(page))
1007 return;
1008 folio = page_folio(page);
1009 if (folio_test_anon(folio))
1010 mss->anonymous_thp += HPAGE_PMD_SIZE;
1011 else if (folio_test_swapbacked(folio))
1012 mss->shmem_thp += HPAGE_PMD_SIZE;
1013 else if (folio_is_zone_device(folio))
1014 /* pass */;
1015 else
1016 mss->file_thp += HPAGE_PMD_SIZE;
1017
1018 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
1019 locked, present);
1020 }
1021 #else
1022 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
1023 struct mm_walk *walk)
1024 {
1025 }
1026 #endif
1027
1028 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1029 struct mm_walk *walk)
1030 {
1031 struct vm_area_struct *vma = walk->vma;
1032 pte_t *pte;
1033 spinlock_t *ptl;
1034
1035 ptl = pmd_trans_huge_lock(pmd, vma);
1036 if (ptl) {
1037 smaps_pmd_entry(pmd, addr, walk);
1038 spin_unlock(ptl);
1039 goto out;
1040 }
1041
1042 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1043 if (!pte) {
1044 walk->action = ACTION_AGAIN;
1045 return 0;
1046 }
1047 for (; addr != end; pte++, addr += PAGE_SIZE)
1048 smaps_pte_entry(pte, addr, walk);
1049 pte_unmap_unlock(pte - 1, ptl);
1050 out:
1051 cond_resched();
1052 return 0;
1053 }
1054
1055 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
1056 {
1057 /*
1058 * Don't forget to update Documentation/ on changes.
1059 *
1060 * The length of the second dimension of mnemonics[]
1061 * needs to be 3 instead of the previously used 2
1062 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
1063 * to avoid spurious
1064 * -Werror=unterminated-string-initialization warning
1065 * with GCC 15
1066 */
1067 static const char mnemonics[BITS_PER_LONG][3] = {
1068 /*
1069 * In case we meet a flag we don't know about.
1070 */
1071 [0 ... (BITS_PER_LONG-1)] = "??",
1072
1073 [ilog2(VM_READ)] = "rd",
1074 [ilog2(VM_WRITE)] = "wr",
1075 [ilog2(VM_EXEC)] = "ex",
1076 [ilog2(VM_SHARED)] = "sh",
1077 [ilog2(VM_MAYREAD)] = "mr",
1078 [ilog2(VM_MAYWRITE)] = "mw",
1079 [ilog2(VM_MAYEXEC)] = "me",
1080 [ilog2(VM_MAYSHARE)] = "ms",
1081 [ilog2(VM_GROWSDOWN)] = "gd",
1082 [ilog2(VM_PFNMAP)] = "pf",
1083 [ilog2(VM_LOCKED)] = "lo",
1084 [ilog2(VM_IO)] = "io",
1085 [ilog2(VM_SEQ_READ)] = "sr",
1086 [ilog2(VM_RAND_READ)] = "rr",
1087 [ilog2(VM_DONTCOPY)] = "dc",
1088 [ilog2(VM_DONTEXPAND)] = "de",
1089 [ilog2(VM_LOCKONFAULT)] = "lf",
1090 [ilog2(VM_ACCOUNT)] = "ac",
1091 [ilog2(VM_NORESERVE)] = "nr",
1092 [ilog2(VM_HUGETLB)] = "ht",
1093 [ilog2(VM_SYNC)] = "sf",
1094 [ilog2(VM_ARCH_1)] = "ar",
1095 [ilog2(VM_WIPEONFORK)] = "wf",
1096 [ilog2(VM_DONTDUMP)] = "dd",
1097 #ifdef CONFIG_ARM64_BTI
1098 [ilog2(VM_ARM64_BTI)] = "bt",
1099 #endif
1100 #ifdef CONFIG_MEM_SOFT_DIRTY
1101 [ilog2(VM_SOFTDIRTY)] = "sd",
1102 #endif
1103 [ilog2(VM_MIXEDMAP)] = "mm",
1104 [ilog2(VM_HUGEPAGE)] = "hg",
1105 [ilog2(VM_NOHUGEPAGE)] = "nh",
1106 [ilog2(VM_MERGEABLE)] = "mg",
1107 [ilog2(VM_UFFD_MISSING)]= "um",
1108 [ilog2(VM_UFFD_WP)] = "uw",
1109 #ifdef CONFIG_ARM64_MTE
1110 [ilog2(VM_MTE)] = "mt",
1111 [ilog2(VM_MTE_ALLOWED)] = "",
1112 #endif
1113 #ifdef CONFIG_ARCH_HAS_PKEYS
1114 /* These come out via ProtectionKey: */
1115 [ilog2(VM_PKEY_BIT0)] = "",
1116 [ilog2(VM_PKEY_BIT1)] = "",
1117 [ilog2(VM_PKEY_BIT2)] = "",
1118 #if VM_PKEY_BIT3
1119 [ilog2(VM_PKEY_BIT3)] = "",
1120 #endif
1121 #if VM_PKEY_BIT4
1122 [ilog2(VM_PKEY_BIT4)] = "",
1123 #endif
1124 #endif /* CONFIG_ARCH_HAS_PKEYS */
1125 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1126 [ilog2(VM_UFFD_MINOR)] = "ui",
1127 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
1128 #ifdef CONFIG_X86_USER_SHADOW_STACK
1129 [ilog2(VM_SHADOW_STACK)] = "ss",
1130 #endif
1131 #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
1132 [ilog2(VM_DROPPABLE)] = "dp",
1133 #endif
1134 #ifdef CONFIG_64BIT
1135 [ilog2(VM_SEALED)] = "sl",
1136 #endif
1137 };
1138 unsigned long pad_pages = vma_pad_pages(vma);
1139 size_t i;
1140
1141 seq_puts(m, "VmFlags: ");
1142 for (i = 0; i < BITS_PER_LONG; i++) {
1143 if (!mnemonics[i][0])
1144 continue;
1145 if ((1UL << i) & VM_PAD_MASK)
1146 continue;
1147 if (vma->vm_flags & (1UL << i))
1148 seq_printf(m, "%s ", mnemonics[i]);
1149 }
1150 if (pad_pages)
1151 seq_printf(m, "pad=%lukB", pad_pages << (PAGE_SHIFT - 10));
1152
1153 seq_putc(m, '\n');
1154 }
1155
1156 #ifdef CONFIG_HUGETLB_PAGE
1157 static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
1158 unsigned long addr, unsigned long end,
1159 struct mm_walk *walk)
1160 {
1161 struct mem_size_stats *mss = walk->private;
1162 struct vm_area_struct *vma = walk->vma;
1163 struct folio *folio = NULL;
1164 bool present = false;
1165 spinlock_t *ptl;
1166 pte_t ptent;
1167
1168 ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
1169 ptent = huge_ptep_get(walk->mm, addr, pte);
1170 if (pte_present(ptent)) {
1171 folio = page_folio(pte_page(ptent));
1172 present = true;
1173 } else if (is_swap_pte(ptent)) {
1174 swp_entry_t swpent = pte_to_swp_entry(ptent);
1175
1176 if (is_pfn_swap_entry(swpent))
1177 folio = pfn_swap_entry_folio(swpent);
1178 }
1179
1180 if (folio) {
1181 /* We treat non-present entries as "maybe shared". */
1182 if (!present || folio_likely_mapped_shared(folio) ||
1183 hugetlb_pmd_shared(pte))
1184 mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
1185 else
1186 mss->private_hugetlb += huge_page_size(hstate_vma(vma));
1187 }
1188 spin_unlock(ptl);
1189 return 0;
1190 }
1191 #else
1192 #define smaps_hugetlb_range NULL
1193 #endif /* HUGETLB_PAGE */
1194
1195 static const struct mm_walk_ops smaps_walk_ops = {
1196 .pmd_entry = smaps_pte_range,
1197 .hugetlb_entry = smaps_hugetlb_range,
1198 .walk_lock = PGWALK_RDLOCK,
1199 };
1200
1201 static const struct mm_walk_ops smaps_shmem_walk_ops = {
1202 .pmd_entry = smaps_pte_range,
1203 .hugetlb_entry = smaps_hugetlb_range,
1204 .pte_hole = smaps_pte_hole,
1205 .walk_lock = PGWALK_RDLOCK,
1206 };
1207
1208 /*
1209 * Gather mem stats from @vma with the indicated beginning
1210 * address @start, and keep them in @mss.
1211 *
1212 * Use vm_start of @vma as the beginning address if @start is 0.
1213 */
1214 static void smap_gather_stats(struct vm_area_struct *vma,
1215 struct mem_size_stats *mss, unsigned long start)
1216 {
1217 const struct mm_walk_ops *ops = &smaps_walk_ops;
1218 unsigned long end = VMA_PAD_START(vma);
1219
1220 /* Invalid start */
1221 if (start >= end)
1222 return;
1223
1224 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
1225 /*
1226 * For shared or readonly shmem mappings we know that all
1227 * swapped out pages belong to the shmem object, and we can
1228 * obtain the swap value much more efficiently. For private
1229 * writable mappings, we might have COW pages that are
1230 * not affected by the parent swapped out pages of the shmem
1231 * object, so we have to distinguish them during the page walk.
1232 * Unless we know that the shmem object (or the part mapped by
1233 * our VMA) has no swapped out pages at all.
1234 */
1235 unsigned long shmem_swapped = shmem_swap_usage(vma);
1236
1237 if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
1238 !(vma->vm_flags & VM_WRITE)) &&
1239 /*
1240 * Only if we don't have padding can we use the fast path
1241 * shmem_inode_info->swapped for shmem_swapped.
1242 *
1243 * Else we'll walk the page table to calculate
1244 * shmem_swapped (excluding the padding region).
1245 */
1246 end == vma->vm_end) {
1247 mss->swap += shmem_swapped;
1248 } else {
1249 ops = &smaps_shmem_walk_ops;
1250 }
1251 }
1252
1253 /* mmap_lock is held in m_start */
1254 if (!start)
1255 walk_page_range(vma->vm_mm, vma->vm_start, end, ops, mss);
1256 else
1257 walk_page_range(vma->vm_mm, start, end, ops, mss);
1258 }
1259
1260 #define SEQ_PUT_DEC(str, val) \
1261 seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
1262
1263 /* Show the contents common for smaps and smaps_rollup */
1264 static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
1265 bool rollup_mode)
1266 {
1267 SEQ_PUT_DEC("Rss: ", mss->resident);
1268 SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
1269 SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
1270 if (rollup_mode) {
1271 /*
1272 * These are meaningful only for smaps_rollup, otherwise two of
1273 * them are zero, and the other one is the same as Pss.
1274 */
1275 SEQ_PUT_DEC(" kB\nPss_Anon: ",
1276 mss->pss_anon >> PSS_SHIFT);
1277 SEQ_PUT_DEC(" kB\nPss_File: ",
1278 mss->pss_file >> PSS_SHIFT);
1279 SEQ_PUT_DEC(" kB\nPss_Shmem: ",
1280 mss->pss_shmem >> PSS_SHIFT);
1281 }
1282 SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
1283 SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
1284 SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
1285 SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
1286 SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
1287 SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
1288 SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
1289 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
1290 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
1291 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
1292 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
1293 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
1294 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
1295 mss->private_hugetlb >> 10, 7);
1296 SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
1297 SEQ_PUT_DEC(" kB\nSwapPss: ",
1298 mss->swap_pss >> PSS_SHIFT);
1299 SEQ_PUT_DEC(" kB\nLocked: ",
1300 mss->pss_locked >> PSS_SHIFT);
1301 seq_puts(m, " kB\n");
1302 trace_android_vh_show_smap(m, mss->swap_shared, mss->writeback,
1303 mss->same, mss->huge);
1304 }
1305
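/*
 * Emit one /proc/<pid>/smaps record: the maps-style header line followed
 * by Size/KernelPageSize/MMUPageSize, the common counters from
 * __show_smap(), THPeligible, ProtectionKey (if enabled) and VmFlags.
 */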
1306 static int show_smap(struct seq_file *m, void *v)
1307 {
1308 struct vm_area_struct *vma = v;
1309 struct mem_size_stats mss = {};
1310
1311 if (!vma_pages(vma))
1312 goto show_pad;
1313
1314 smap_gather_stats(vma, &mss, 0);
1315
1316 show_map_vma(m, vma);
1317
1318 SEQ_PUT_DEC("Size: ", VMA_PAD_START(vma) - vma->vm_start);
1319 SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
1320 SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
1321 seq_puts(m, " kB\n");
1322
1323 __show_smap(m, &mss, false);
1324
1325 seq_printf(m, "THPeligible: %8u\n",
1326 !!thp_vma_allowable_orders(vma, vma->vm_flags,
1327 TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
1328
1329 if (arch_pkeys_enabled())
1330 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
1331 show_smap_vma_flags(m, vma);
1332
1333 show_pad:
1334 show_map_pad_vma(vma, m, show_smap, true);
1335
1336 return 0;
1337 }
1338
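/*
 * /proc/<pid>/smaps_rollup: accumulate a single mem_size_stats over every
 * vma in the mm (dropping and retaking mmap_lock if it is contended) and
 * print one "[rollup]" summary instead of a record per vma.
 */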
1339 static int show_smaps_rollup(struct seq_file *m, void *v)
1340 {
1341 struct proc_maps_private *priv = m->private;
1342 struct mem_size_stats mss = {};
1343 struct mm_struct *mm = priv->mm;
1344 struct vm_area_struct *vma;
1345 unsigned long vma_start = 0, last_vma_end = 0;
1346 int ret = 0;
1347 VMA_ITERATOR(vmi, mm, 0);
1348
1349 priv->task = get_proc_task(priv->inode);
1350 if (!priv->task)
1351 return -ESRCH;
1352
1353 if (!mm || !mmget_not_zero(mm)) {
1354 ret = -ESRCH;
1355 goto out_put_task;
1356 }
1357
1358 ret = mmap_read_lock_killable(mm);
1359 if (ret)
1360 goto out_put_mm;
1361
1362 hold_task_mempolicy(priv);
1363 vma = vma_next(&vmi);
1364
1365 if (unlikely(!vma))
1366 goto empty_set;
1367
1368 vma_start = vma->vm_start;
1369 do {
1370 smap_gather_stats(vma, &mss, 0);
1371 last_vma_end = vma->vm_end;
1372
1373 /*
1374 * Release mmap_lock temporarily if someone wants to
1375 * access it for write request.
1376 */
1377 if (mmap_lock_is_contended(mm)) {
1378 vma_iter_invalidate(&vmi);
1379 mmap_read_unlock(mm);
1380 ret = mmap_read_lock_killable(mm);
1381 if (ret) {
1382 release_task_mempolicy(priv);
1383 goto out_put_mm;
1384 }
1385
1386 /*
1387 * After dropping the lock, there are four cases to
1388 * consider. See the following example for explanation.
1389 *
1390 * +------+------+-----------+
1391 * | VMA1 | VMA2 | VMA3 |
1392 * +------+------+-----------+
1393 * | | | |
1394 * 4k 8k 16k 400k
1395 *
1396 * Suppose we drop the lock after reading VMA2 due to
1397 * contention, then we get:
1398 *
1399 * last_vma_end = 16k
1400 *
1401 * 1) VMA2 is freed, but VMA3 exists:
1402 *
1403 * vma_next(vmi) will return VMA3.
1404 * In this case, just continue from VMA3.
1405 *
1406 * 2) VMA2 still exists:
1407 *
1408 * vma_next(vmi) will return VMA3.
1409 * In this case, just continue from VMA3.
1410 *
1411 * 3) No more VMAs can be found:
1412 *
1413 * vma_next(vmi) will return NULL.
1414 * No more things to do, just break.
1415 *
1416 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
1417 *
1418 * vma_next(vmi) will return VMA' whose range
1419 * contains last_vma_end.
1420 * Iterate VMA' from last_vma_end.
1421 */
1422 vma = vma_next(&vmi);
1423 /* Case 3 above */
1424 if (!vma)
1425 break;
1426
1427 /* Case 1 and 2 above */
1428 if (vma->vm_start >= last_vma_end) {
1429 smap_gather_stats(vma, &mss, 0);
1430 last_vma_end = vma->vm_end;
1431 continue;
1432 }
1433
1434 /* Case 4 above */
1435 if (vma->vm_end > last_vma_end) {
1436 smap_gather_stats(vma, &mss, last_vma_end);
1437 last_vma_end = vma->vm_end;
1438 }
1439 }
1440 } for_each_vma(vmi, vma);
1441
1442 empty_set:
1443 show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
1444 seq_pad(m, ' ');
1445 seq_puts(m, "[rollup]\n");
1446
1447 __show_smap(m, &mss, true);
1448
1449 release_task_mempolicy(priv);
1450 mmap_read_unlock(mm);
1451
1452 out_put_mm:
1453 mmput(mm);
1454 out_put_task:
1455 put_task_struct(priv->task);
1456 priv->task = NULL;
1457
1458 return ret;
1459 }
1460 #undef SEQ_PUT_DEC
1461
1462 static const struct seq_operations proc_pid_smaps_op = {
1463 .start = m_start,
1464 .next = m_next,
1465 .stop = m_stop,
1466 .show = show_smap
1467 };
1468
1469 static int pid_smaps_open(struct inode *inode, struct file *file)
1470 {
1471 return do_maps_open(inode, file, &proc_pid_smaps_op);
1472 }
1473
1474 static int smaps_rollup_open(struct inode *inode, struct file *file)
1475 {
1476 int ret;
1477 struct proc_maps_private *priv;
1478
1479 priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
1480 if (!priv)
1481 return -ENOMEM;
1482
1483 ret = single_open(file, show_smaps_rollup, priv);
1484 if (ret)
1485 goto out_free;
1486
1487 priv->inode = inode;
1488 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
1489 if (IS_ERR(priv->mm)) {
1490 ret = PTR_ERR(priv->mm);
1491
1492 single_release(inode, file);
1493 goto out_free;
1494 }
1495
1496 return 0;
1497
1498 out_free:
1499 kfree(priv);
1500 return ret;
1501 }
1502
1503 static int smaps_rollup_release(struct inode *inode, struct file *file)
1504 {
1505 struct seq_file *seq = file->private_data;
1506 struct proc_maps_private *priv = seq->private;
1507
1508 if (priv->mm)
1509 mmdrop(priv->mm);
1510
1511 kfree(priv);
1512 return single_release(inode, file);
1513 }
1514
1515 const struct file_operations proc_pid_smaps_operations = {
1516 .open = pid_smaps_open,
1517 .read = seq_read,
1518 .llseek = seq_lseek,
1519 .release = proc_map_release,
1520 };
1521
1522 const struct file_operations proc_pid_smaps_rollup_operations = {
1523 .open = smaps_rollup_open,
1524 .read = seq_read,
1525 .llseek = seq_lseek,
1526 .release = smaps_rollup_release,
1527 };
1528
1529 enum clear_refs_types {
1530 CLEAR_REFS_ALL = 1,
1531 CLEAR_REFS_ANON,
1532 CLEAR_REFS_MAPPED,
1533 CLEAR_REFS_SOFT_DIRTY,
1534 CLEAR_REFS_MM_HIWATER_RSS,
1535 CLEAR_REFS_LAST,
1536 };
1537
1538 struct clear_refs_private {
1539 enum clear_refs_types type;
1540 };
1541
1542 #ifdef CONFIG_MEM_SOFT_DIRTY
1543
1544 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1545 {
1546 struct folio *folio;
1547
1548 if (!pte_write(pte))
1549 return false;
1550 if (!is_cow_mapping(vma->vm_flags))
1551 return false;
1552 if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
1553 return false;
1554 folio = vm_normal_folio(vma, addr, pte);
1555 if (!folio)
1556 return false;
1557 return folio_maybe_dma_pinned(folio);
1558 }
1559
1560 static inline void clear_soft_dirty(struct vm_area_struct *vma,
1561 unsigned long addr, pte_t *pte)
1562 {
1563 /*
1564 * The soft-dirty tracker uses #PF-s to catch writes
1565 * to pages, so write-protect the pte as well. See the
1566 * Documentation/admin-guide/mm/soft-dirty.rst for full description
1567 * of how soft-dirty works.
1568 */
1569 pte_t ptent = ptep_get(pte);
1570
1571 if (pte_present(ptent)) {
1572 pte_t old_pte;
1573
1574 if (pte_is_pinned(vma, addr, ptent))
1575 return;
1576 old_pte = ptep_modify_prot_start(vma, addr, pte);
1577 ptent = pte_wrprotect(old_pte);
1578 ptent = pte_clear_soft_dirty(ptent);
1579 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
1580 } else if (is_swap_pte(ptent)) {
1581 ptent = pte_swp_clear_soft_dirty(ptent);
1582 set_pte_at(vma->vm_mm, addr, pte, ptent);
1583 }
1584 }
1585 #else
1586 static inline void clear_soft_dirty(struct vm_area_struct *vma,
1587 unsigned long addr, pte_t *pte)
1588 {
1589 }
1590 #endif
1591
1592 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1593 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1594 unsigned long addr, pmd_t *pmdp)
1595 {
1596 pmd_t old, pmd = *pmdp;
1597
1598 if (pmd_present(pmd)) {
1599 /* See comment in change_huge_pmd() */
1600 old = pmdp_invalidate(vma, addr, pmdp);
1601 if (pmd_dirty(old))
1602 pmd = pmd_mkdirty(pmd);
1603 if (pmd_young(old))
1604 pmd = pmd_mkyoung(pmd);
1605
1606 pmd = pmd_wrprotect(pmd);
1607 pmd = pmd_clear_soft_dirty(pmd);
1608
1609 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1610 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1611 pmd = pmd_swp_clear_soft_dirty(pmd);
1612 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1613 }
1614 }
1615 #else
1616 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1617 unsigned long addr, pmd_t *pmdp)
1618 {
1619 }
1620 #endif
1621
1622 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
1623 unsigned long end, struct mm_walk *walk)
1624 {
1625 struct clear_refs_private *cp = walk->private;
1626 struct vm_area_struct *vma = walk->vma;
1627 pte_t *pte, ptent;
1628 spinlock_t *ptl;
1629 struct folio *folio;
1630
1631 ptl = pmd_trans_huge_lock(pmd, vma);
1632 if (ptl) {
1633 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1634 clear_soft_dirty_pmd(vma, addr, pmd);
1635 goto out;
1636 }
1637
1638 if (!pmd_present(*pmd))
1639 goto out;
1640
1641 folio = pmd_folio(*pmd);
1642
1643 /* Clear accessed and referenced bits. */
1644 pmdp_test_and_clear_young(vma, addr, pmd);
1645 folio_test_clear_young(folio);
1646 folio_clear_referenced(folio);
1647 out:
1648 spin_unlock(ptl);
1649 return 0;
1650 }
1651
1652 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1653 if (!pte) {
1654 walk->action = ACTION_AGAIN;
1655 return 0;
1656 }
1657 for (; addr != end; pte++, addr += PAGE_SIZE) {
1658 ptent = ptep_get(pte);
1659
1660 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1661 clear_soft_dirty(vma, addr, pte);
1662 continue;
1663 }
1664
1665 if (!pte_present(ptent))
1666 continue;
1667
1668 folio = vm_normal_folio(vma, addr, ptent);
1669 if (!folio)
1670 continue;
1671
1672 /* Clear accessed and referenced bits. */
1673 ptep_test_and_clear_young(vma, addr, pte);
1674 folio_test_clear_young(folio);
1675 folio_clear_referenced(folio);
1676 }
1677 pte_unmap_unlock(pte - 1, ptl);
1678 cond_resched();
1679 return 0;
1680 }
1681
1682 static int clear_refs_test_walk(unsigned long start, unsigned long end,
1683 struct mm_walk *walk)
1684 {
1685 struct clear_refs_private *cp = walk->private;
1686 struct vm_area_struct *vma = walk->vma;
1687
1688 if (vma->vm_flags & VM_PFNMAP)
1689 return 1;
1690
1691 /*
1692 * Writing 1 to /proc/pid/clear_refs affects all pages.
1693 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
1694 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
1695 * Writing 4 to /proc/pid/clear_refs affects all pages.
1696 */
1697 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
1698 return 1;
1699 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
1700 return 1;
1701 return 0;
1702 }
1703
1704 static const struct mm_walk_ops clear_refs_walk_ops = {
1705 .pmd_entry = clear_refs_pte_range,
1706 .test_walk = clear_refs_test_walk,
1707 .walk_lock = PGWALK_WRLOCK,
1708 };
1709
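/*
 * Write handler for /proc/<pid>/clear_refs. For example, soft-dirty
 * tracking is typically reset with:
 *
 *	echo 4 > /proc/<pid>/clear_refs
 *
 * (4 == CLEAR_REFS_SOFT_DIRTY; values 1-3 clear the referenced bits and
 * 5 resets the peak RSS, as described in the comments above and below.)
 */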
1710 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1711 size_t count, loff_t *ppos)
1712 {
1713 struct task_struct *task;
1714 char buffer[PROC_NUMBUF] = {};
1715 struct mm_struct *mm;
1716 struct vm_area_struct *vma;
1717 enum clear_refs_types type;
1718 int itype;
1719 int rv;
1720
1721 if (count > sizeof(buffer) - 1)
1722 count = sizeof(buffer) - 1;
1723 if (copy_from_user(buffer, buf, count))
1724 return -EFAULT;
1725 rv = kstrtoint(strstrip(buffer), 10, &itype);
1726 if (rv < 0)
1727 return rv;
1728 type = (enum clear_refs_types)itype;
1729 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
1730 return -EINVAL;
1731
1732 task = get_proc_task(file_inode(file));
1733 if (!task)
1734 return -ESRCH;
1735 mm = get_task_mm(task);
1736 if (mm) {
1737 VMA_ITERATOR(vmi, mm, 0);
1738 struct mmu_notifier_range range;
1739 struct clear_refs_private cp = {
1740 .type = type,
1741 };
1742
1743 if (mmap_write_lock_killable(mm)) {
1744 count = -EINTR;
1745 goto out_mm;
1746 }
1747 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
1748 /*
1749 * Writing 5 to /proc/pid/clear_refs resets the peak
1750 * resident set size to this mm's current rss value.
1751 */
1752 reset_mm_hiwater_rss(mm);
1753 goto out_unlock;
1754 }
1755
1756 if (type == CLEAR_REFS_SOFT_DIRTY) {
1757 for_each_vma(vmi, vma) {
1758 if (!(vma->vm_flags & VM_SOFTDIRTY))
1759 continue;
1760 vm_flags_clear(vma, VM_SOFTDIRTY);
1761 vma_set_page_prot(vma);
1762 }
1763
1764 inc_tlb_flush_pending(mm);
1765 mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
1766 0, mm, 0, -1UL);
1767 mmu_notifier_invalidate_range_start(&range);
1768 }
1769 walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
1770 if (type == CLEAR_REFS_SOFT_DIRTY) {
1771 mmu_notifier_invalidate_range_end(&range);
1772 flush_tlb_mm(mm);
1773 dec_tlb_flush_pending(mm);
1774 }
1775 out_unlock:
1776 mmap_write_unlock(mm);
1777 out_mm:
1778 mmput(mm);
1779 }
1780 put_task_struct(task);
1781
1782 return count;
1783 }
1784
1785 const struct file_operations proc_clear_refs_operations = {
1786 .write = clear_refs_write,
1787 .llseek = noop_llseek,
1788 };
1789
1790 typedef struct {
1791 u64 pme;
1792 } pagemap_entry_t;
1793
1794 struct pagemapread {
1795 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
1796 pagemap_entry_t *buffer;
1797 bool show_pfn;
1798 };
1799
1800 #define PAGEMAP_WALK_SIZE (PMD_SIZE)
1801 #define PAGEMAP_WALK_MASK (PMD_MASK)
1802
1803 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
1804 #define PM_PFRAME_BITS 55
1805 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1806 #define PM_SOFT_DIRTY BIT_ULL(55)
1807 #define PM_MMAP_EXCLUSIVE BIT_ULL(56)
1808 #define PM_UFFD_WP BIT_ULL(57)
1809 #define PM_GUARD_REGION BIT_ULL(58)
1810 #define PM_FILE BIT_ULL(61)
1811 #define PM_SWAP BIT_ULL(62)
1812 #define PM_PRESENT BIT_ULL(63)
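/*
 * Each pagemap entry is a u64: bits 0-54 hold the page frame number for
 * present pages (or the swap type and offset for swapped ones), and the
 * bits above flag soft-dirty, exclusively-mapped, uffd-wp, guard-region,
 * file-backed, swapped and present state as defined above.
 */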
1813
1814 #define PM_END_OF_BUFFER 1
1815
1816 static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
1817 {
1818 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
1819 }
1820
1821 static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
1822 {
1823 pm->buffer[pm->pos++] = *pme;
1824 if (pm->pos >= pm->len)
1825 return PM_END_OF_BUFFER;
1826 return 0;
1827 }
1828
1829 static int pagemap_pte_hole(unsigned long start, unsigned long end,
1830 __always_unused int depth, struct mm_walk *walk)
1831 {
1832 struct pagemapread *pm = walk->private;
1833 unsigned long addr = start;
1834 int err = 0;
1835
1836 while (addr < end) {
1837 struct vm_area_struct *vma = find_vma(walk->mm, addr);
1838 pagemap_entry_t pme = make_pme(0, 0);
1839 /* End of address space hole, which we mark as non-present. */
1840 unsigned long hole_end;
1841
1842 if (vma)
1843 hole_end = min(end, vma->vm_start);
1844 else
1845 hole_end = end;
1846
1847 for (; addr < hole_end; addr += PAGE_SIZE) {
1848 err = add_to_pagemap(&pme, pm);
1849 if (err)
1850 goto out;
1851 }
1852
1853 if (!vma)
1854 break;
1855
1856 /* Addresses in the VMA. */
1857 if (vma->vm_flags & VM_SOFTDIRTY)
1858 pme = make_pme(0, PM_SOFT_DIRTY);
1859 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1860 err = add_to_pagemap(&pme, pm);
1861 if (err)
1862 goto out;
1863 }
1864 }
1865 out:
1866 return err;
1867 }
1868
1869 static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1870 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1871 {
1872 u64 frame = 0, flags = 0;
1873 struct page *page = NULL;
1874 struct folio *folio;
1875
1876 if (pte_present(pte)) {
1877 if (pm->show_pfn)
1878 frame = pte_pfn(pte);
1879 flags |= PM_PRESENT;
1880 page = vm_normal_page(vma, addr, pte);
1881 if (pte_soft_dirty(pte))
1882 flags |= PM_SOFT_DIRTY;
1883 if (pte_uffd_wp(pte))
1884 flags |= PM_UFFD_WP;
1885 } else if (is_swap_pte(pte)) {
1886 swp_entry_t entry;
1887 if (pte_swp_soft_dirty(pte))
1888 flags |= PM_SOFT_DIRTY;
1889 if (pte_swp_uffd_wp(pte))
1890 flags |= PM_UFFD_WP;
1891 entry = pte_to_swp_entry(pte);
1892 if (pm->show_pfn) {
1893 pgoff_t offset;
1894 /*
1895 * For PFN swap offsets, keep the offset field as the PFN
1896 * to stay compatible with old smaps.
1897 */
1898 if (is_pfn_swap_entry(entry))
1899 offset = swp_offset_pfn(entry);
1900 else
1901 offset = swp_offset(entry);
1902 frame = swp_type(entry) |
1903 (offset << MAX_SWAPFILES_SHIFT);
1904 }
1905 flags |= PM_SWAP;
1906 if (is_pfn_swap_entry(entry))
1907 page = pfn_swap_entry_to_page(entry);
1908 if (pte_marker_entry_uffd_wp(entry))
1909 flags |= PM_UFFD_WP;
1910 if (is_guard_swp_entry(entry))
1911 flags |= PM_GUARD_REGION;
1912 }
1913
1914 if (page) {
1915 folio = page_folio(page);
1916 if (!folio_test_anon(folio))
1917 flags |= PM_FILE;
1918 if ((flags & PM_PRESENT) &&
1919 folio_precise_page_mapcount(folio, page) == 1)
1920 flags |= PM_MMAP_EXCLUSIVE;
1921 }
1922 if (vma->vm_flags & VM_SOFTDIRTY)
1923 flags |= PM_SOFT_DIRTY;
1924
1925 return make_pme(frame, flags);
1926 }
1927
1928 static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1929 struct mm_walk *walk)
1930 {
1931 struct vm_area_struct *vma = walk->vma;
1932 struct pagemapread *pm = walk->private;
1933 spinlock_t *ptl;
1934 pte_t *pte, *orig_pte;
1935 int err = 0;
1936 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1937
1938 ptl = pmd_trans_huge_lock(pmdp, vma);
1939 if (ptl) {
1940 unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
1941 u64 flags = 0, frame = 0;
1942 pmd_t pmd = *pmdp;
1943 struct page *page = NULL;
1944 struct folio *folio = NULL;
1945
1946 if (vma->vm_flags & VM_SOFTDIRTY)
1947 flags |= PM_SOFT_DIRTY;
1948
1949 if (pmd_present(pmd)) {
1950 page = pmd_page(pmd);
1951
1952 flags |= PM_PRESENT;
1953 if (pmd_soft_dirty(pmd))
1954 flags |= PM_SOFT_DIRTY;
1955 if (pmd_uffd_wp(pmd))
1956 flags |= PM_UFFD_WP;
1957 if (pm->show_pfn)
1958 frame = pmd_pfn(pmd) + idx;
1959 }
1960 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1961 else if (is_swap_pmd(pmd)) {
1962 swp_entry_t entry = pmd_to_swp_entry(pmd);
1963 unsigned long offset;
1964
1965 if (pm->show_pfn) {
1966 if (is_pfn_swap_entry(entry))
1967 offset = swp_offset_pfn(entry) + idx;
1968 else
1969 offset = swp_offset(entry) + idx;
1970 frame = swp_type(entry) |
1971 (offset << MAX_SWAPFILES_SHIFT);
1972 }
1973 flags |= PM_SWAP;
1974 if (pmd_swp_soft_dirty(pmd))
1975 flags |= PM_SOFT_DIRTY;
1976 if (pmd_swp_uffd_wp(pmd))
1977 flags |= PM_UFFD_WP;
1978 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1979 page = pfn_swap_entry_to_page(entry);
1980 }
1981 #endif
1982
1983 if (page) {
1984 folio = page_folio(page);
1985 if (!folio_test_anon(folio))
1986 flags |= PM_FILE;
1987 }
1988
1989 for (; addr != end; addr += PAGE_SIZE, idx++) {
1990 u64 cur_flags = flags;
1991 pagemap_entry_t pme;
1992
1993 if (folio && (flags & PM_PRESENT) &&
1994 folio_precise_page_mapcount(folio, page + idx) == 1)
1995 cur_flags |= PM_MMAP_EXCLUSIVE;
1996
1997 pme = make_pme(frame, cur_flags);
1998 err = add_to_pagemap(&pme, pm);
1999 if (err)
2000 break;
2001 if (pm->show_pfn) {
2002 if (flags & PM_PRESENT)
2003 frame++;
2004 else if (flags & PM_SWAP)
2005 frame += (1 << MAX_SWAPFILES_SHIFT);
2006 }
2007 }
2008 spin_unlock(ptl);
2009 return err;
2010 }
2011 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2012
2013 /*
2014 * We can assume that @vma always points to a valid one and @end never
2015 * goes beyond vma->vm_end.
2016 */
2017 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
2018 if (!pte) {
2019 walk->action = ACTION_AGAIN;
2020 return err;
2021 }
2022 for (; addr < end; pte++, addr += PAGE_SIZE) {
2023 pagemap_entry_t pme;
2024
2025 pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
2026 err = add_to_pagemap(&pme, pm);
2027 if (err)
2028 break;
2029 }
2030 pte_unmap_unlock(orig_pte, ptl);
2031
2032 cond_resched();
2033
2034 return err;
2035 }
2036
2037 #ifdef CONFIG_HUGETLB_PAGE
2038 /* This function handles a single hugetlb entry per call, emitting one pagemap entry per base page within it */
2039 static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
2040 unsigned long addr, unsigned long end,
2041 struct mm_walk *walk)
2042 {
2043 struct pagemapread *pm = walk->private;
2044 struct vm_area_struct *vma = walk->vma;
2045 u64 flags = 0, frame = 0;
2046 int err = 0;
2047 pte_t pte;
2048
2049 if (vma->vm_flags & VM_SOFTDIRTY)
2050 flags |= PM_SOFT_DIRTY;
2051
2052 pte = huge_ptep_get(walk->mm, addr, ptep);
2053 if (pte_present(pte)) {
2054 struct folio *folio = page_folio(pte_page(pte));
2055
2056 if (!folio_test_anon(folio))
2057 flags |= PM_FILE;
2058
2059 if (!folio_likely_mapped_shared(folio) &&
2060 !hugetlb_pmd_shared(ptep))
2061 flags |= PM_MMAP_EXCLUSIVE;
2062
2063 if (huge_pte_uffd_wp(pte))
2064 flags |= PM_UFFD_WP;
2065
2066 flags |= PM_PRESENT;
2067 if (pm->show_pfn)
2068 frame = pte_pfn(pte) +
2069 ((addr & ~hmask) >> PAGE_SHIFT);
2070 } else if (pte_swp_uffd_wp_any(pte)) {
2071 flags |= PM_UFFD_WP;
2072 }
2073
2074 for (; addr != end; addr += PAGE_SIZE) {
2075 pagemap_entry_t pme = make_pme(frame, flags);
2076
2077 err = add_to_pagemap(&pme, pm);
2078 if (err)
2079 return err;
2080 if (pm->show_pfn && (flags & PM_PRESENT))
2081 frame++;
2082 }
2083
2084 cond_resched();
2085
2086 return err;
2087 }
2088 #else
2089 #define pagemap_hugetlb_range NULL
2090 #endif /* CONFIG_HUGETLB_PAGE */
2091
2092 static const struct mm_walk_ops pagemap_ops = {
2093 .pmd_entry = pagemap_pmd_range,
2094 .pte_hole = pagemap_pte_hole,
2095 .hugetlb_entry = pagemap_hugetlb_range,
2096 .walk_lock = PGWALK_RDLOCK,
2097 };
2098
2099 static inline void __collapse_pagemap_result(pagemap_entry_t *src_vec,
2100 pagemap_entry_t *res_vec,
2101 unsigned int entries,
2102 unsigned int nr_subpages)
2103 {
2104 unsigned int i;
2105
2106 if (nr_subpages == 1)
2107 return;
2108
2109 for (i = 0; i < entries; i++) {
2110 /*
2111 * Zero the PFN since there is no guarantee that the PFNs are contiguous.
2112 * Zero the flags - applicable flags are derived from the sub-entries,
2113 * inapplicable flags are kept zeroed.
2114 */
2115 if (i % nr_subpages == 0)
2116 res_vec[i / nr_subpages] = make_pme(0, 0);
2117
2118 res_vec[i / nr_subpages].pme
2119 |= src_vec[i].pme & (PM_SOFT_DIRTY|PM_MMAP_EXCLUSIVE|PM_FILE|PM_PRESENT);
2120 }
2121 }
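/*
 * Worked example (a sketch, assuming an emulated __PAGE_SIZE of 16 KiB on a
 * 4 KiB kernel PAGE_SIZE, i.e. nr_subpages == 4): src_vec[0..3] are folded
 * into res_vec[0], src_vec[4..7] into res_vec[1], and so on.  A collapsed
 * entry reports PM_PRESENT, PM_FILE, PM_SOFT_DIRTY or PM_MMAP_EXCLUSIVE if
 * any of its sub-entries did, and its PFN field stays zero because the
 * underlying 4 KiB frames need not be contiguous.
 */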
2122
2123 /*
2124 * /proc/pid/pagemap - an array mapping virtual pages to pfns
2125 *
2126 * For each page in the address space, this file contains one 64-bit entry
2127 * consisting of the following:
2128 *
2129 * Bits 0-54 page frame number (PFN) if present
2130 * Bits 0-4 swap type if swapped
2131 * Bits 5-54 swap offset if swapped
2132 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
2133 * Bit 56 page exclusively mapped
2134 * Bit 57 pte is uffd-wp write-protected
2135 * Bit 58 pte is a guard region
2136 * Bits 59-60 zero
2137 * Bit 61 page is file-page or shared-anon
2138 * Bit 62 page swapped
2139 * Bit 63 page present
2140 *
2141 * If the page is not present but in swap, then the PFN contains an
2142 * encoding of the swap file number and the page's offset into the
2143 * swap. Unmapped pages return a null PFN. This allows determining
2144 * precisely which pages are mapped (or in swap) and comparing mapped
2145 * pages between processes.
2146 *
2147 * Efficient users of this interface will use /proc/pid/maps to
2148 * determine which areas of memory are actually mapped and llseek to
2149 * skip over unmapped regions.
2150 */
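/*
 * Illustrative userspace sketch (not part of this file; error handling
 * omitted, "vaddr" is a placeholder, needs <stdio.h>, <stdint.h>, <fcntl.h>
 * and <unistd.h>): read the pagemap entry for one virtual address and decode
 * it with the bit layout documented above.  Note the PFN field reads back as
 * zero unless the opener has CAP_SYS_ADMIN.
 *
 *	long psz = sysconf(_SC_PAGESIZE);
 *	int fd = open("/proc/self/pagemap", O_RDONLY);
 *	uint64_t ent;
 *	pread(fd, &ent, sizeof(ent), (vaddr / psz) * sizeof(ent));
 *	if (ent & (1ULL << 63))		// present: bits 0-54 are the PFN
 *		printf("pfn %llu\n",
 *		       (unsigned long long)(ent & ((1ULL << 55) - 1)));
 *	else if (ent & (1ULL << 62))	// swapped: type in bits 0-4, offset in 5-54
 *		printf("swap type %llu off %llu\n",
 *		       (unsigned long long)(ent & 0x1f),
 *		       (unsigned long long)((ent >> 5) & ((1ULL << 50) - 1)));
 *	close(fd);
 */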
2151 static ssize_t pagemap_read(struct file *file, char __user *buf,
2152 size_t count, loff_t *ppos)
2153 {
2154 struct mm_struct *mm = file->private_data;
2155 struct pagemapread pm;
2156 unsigned long src;
2157 unsigned long svpfn;
2158 unsigned long start_vaddr;
2159 unsigned long end_vaddr;
2160 int ret = 0, copied = 0;
2161 unsigned int nr_subpages = __PAGE_SIZE / PAGE_SIZE;
2162 pagemap_entry_t *res = NULL;
2163
2164 if (!mm || !mmget_not_zero(mm))
2165 goto out;
2166
2167 ret = -EINVAL;
2168 /* file position must be aligned */
2169 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
2170 goto out_mm;
2171
2172 ret = 0;
2173 if (!count)
2174 goto out_mm;
2175
2176 /* do not disclose physical addresses: attack vector */
2177 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
2178
2179 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
2180 pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
2181 ret = -ENOMEM;
2182 if (!pm.buffer)
2183 goto out_mm;
2184
2185 if (unlikely(nr_subpages > 1)) {
2186 /*
2187 * Userspace thinks the pages are larger than they actually are, so adjust
2188 * the count to compensate.
2189 */
2190 count *= nr_subpages;
2191
2192 res = kcalloc(pm.len / nr_subpages, PM_ENTRY_BYTES, GFP_KERNEL);
2193 if (!res) {
2194 ret = -ENOMEM;
2195 goto out_free;
2196 }
2197 } else
2198 res = pm.buffer;
2199
2200 src = *ppos;
2201 svpfn = src / PM_ENTRY_BYTES;
2202 end_vaddr = mm->task_size;
2203
2204 /* watch out for wraparound */
2205 start_vaddr = end_vaddr;
2206 if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
2207 unsigned long end;
2208
2209 ret = mmap_read_lock_killable(mm);
2210 if (ret)
2211 goto out_free;
2212 start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
2213 mmap_read_unlock(mm);
2214
2215 end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
2216 if (end >= start_vaddr && end < mm->task_size)
2217 end_vaddr = end;
2218 }
2219
2220 /* Ensure the address is inside the task */
2221 if (start_vaddr > mm->task_size)
2222 start_vaddr = end_vaddr;
2223
2224 ret = 0;
2225 while (count && (start_vaddr < end_vaddr)) {
2226 int len;
2227 unsigned long end;
2228
2229 pm.pos = 0;
2230 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
2231 /* overflow ? */
2232 if (end < start_vaddr || end > end_vaddr)
2233 end = end_vaddr;
2234 ret = mmap_read_lock_killable(mm);
2235 if (ret)
2236 goto out_free;
2237 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
2238 mmap_read_unlock(mm);
2239 start_vaddr = end;
2240
2241 len = min(count, PM_ENTRY_BYTES * pm.pos);
2242
2243 __collapse_pagemap_result(pm.buffer, res, len / PM_ENTRY_BYTES, nr_subpages);
2244
2245 if (copy_to_user(buf, res, len / nr_subpages)) {
2246 ret = -EFAULT;
2247 goto out_free;
2248 }
2249
2250 /*
2251 * If emulating the page size, clear the old results, to avoid
2252 * corrupting the next __collapse_pagemap_result()
2253 */
2254 if (unlikely(nr_subpages > 1))
2255 memset(res, 0, len / nr_subpages);
2256
2257 copied += len;
2258 buf += len / nr_subpages;
2259 count -= len;
2260 }
2261 *ppos += copied;
2262 if (!ret || ret == PM_END_OF_BUFFER)
2263 ret = copied / nr_subpages;
2264
2265 out_free:
2266 /* Avoid double free, as res = pm.buffer if nr_subpages == 1 */
2267 if (unlikely(nr_subpages > 1))
2268 kfree(res);
2269 kfree(pm.buffer);
2270 out_mm:
2271 mmput(mm);
2272 out:
2273 return ret;
2274 }
2275
2276 static int pagemap_open(struct inode *inode, struct file *file)
2277 {
2278 struct mm_struct *mm;
2279
2280 mm = proc_mem_open(inode, PTRACE_MODE_READ);
2281 if (IS_ERR(mm))
2282 return PTR_ERR(mm);
2283 file->private_data = mm;
2284 return 0;
2285 }
2286
2287 static int pagemap_release(struct inode *inode, struct file *file)
2288 {
2289 struct mm_struct *mm = file->private_data;
2290
2291 if (mm)
2292 mmdrop(mm);
2293 return 0;
2294 }
2295
2296 #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
2297 PAGE_IS_FILE | PAGE_IS_PRESENT | \
2298 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
2299 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
2300 #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
2301
2302 struct pagemap_scan_private {
2303 struct pm_scan_arg arg;
2304 unsigned long masks_of_interest, cur_vma_category;
2305 struct page_region *vec_buf;
2306 unsigned long vec_buf_len, vec_buf_index, found_pages;
2307 struct page_region __user *vec_out;
2308 };
2309
2310 static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
2311 struct vm_area_struct *vma,
2312 unsigned long addr, pte_t pte)
2313 {
2314 unsigned long categories = 0;
2315
2316 if (pte_present(pte)) {
2317 struct page *page;
2318
2319 categories |= PAGE_IS_PRESENT;
2320 if (!pte_uffd_wp(pte))
2321 categories |= PAGE_IS_WRITTEN;
2322
2323 if (p->masks_of_interest & PAGE_IS_FILE) {
2324 page = vm_normal_page(vma, addr, pte);
2325 if (page && !PageAnon(page))
2326 categories |= PAGE_IS_FILE;
2327 }
2328
2329 if (is_zero_pfn(pte_pfn(pte)))
2330 categories |= PAGE_IS_PFNZERO;
2331 if (pte_soft_dirty(pte))
2332 categories |= PAGE_IS_SOFT_DIRTY;
2333 } else if (is_swap_pte(pte)) {
2334 swp_entry_t swp;
2335
2336 categories |= PAGE_IS_SWAPPED;
2337 if (!pte_swp_uffd_wp_any(pte))
2338 categories |= PAGE_IS_WRITTEN;
2339
2340 if (p->masks_of_interest & PAGE_IS_FILE) {
2341 swp = pte_to_swp_entry(pte);
2342 if (is_pfn_swap_entry(swp) &&
2343 !folio_test_anon(pfn_swap_entry_folio(swp)))
2344 categories |= PAGE_IS_FILE;
2345 }
2346 if (pte_swp_soft_dirty(pte))
2347 categories |= PAGE_IS_SOFT_DIRTY;
2348 }
2349
2350 return categories;
2351 }
2352
2353 static void make_uffd_wp_pte(struct vm_area_struct *vma,
2354 unsigned long addr, pte_t *pte, pte_t ptent)
2355 {
2356 if (pte_present(ptent)) {
2357 pte_t old_pte;
2358
2359 old_pte = ptep_modify_prot_start(vma, addr, pte);
2360 ptent = pte_mkuffd_wp(old_pte);
2361 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
2362 } else if (is_swap_pte(ptent)) {
2363 ptent = pte_swp_mkuffd_wp(ptent);
2364 set_pte_at(vma->vm_mm, addr, pte, ptent);
2365 } else {
2366 set_pte_at(vma->vm_mm, addr, pte,
2367 make_pte_marker(PTE_MARKER_UFFD_WP));
2368 }
2369 }
2370
2371 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2372 static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
2373 struct vm_area_struct *vma,
2374 unsigned long addr, pmd_t pmd)
2375 {
2376 unsigned long categories = PAGE_IS_HUGE;
2377
2378 if (pmd_present(pmd)) {
2379 struct page *page;
2380
2381 categories |= PAGE_IS_PRESENT;
2382 if (!pmd_uffd_wp(pmd))
2383 categories |= PAGE_IS_WRITTEN;
2384
2385 if (p->masks_of_interest & PAGE_IS_FILE) {
2386 page = vm_normal_page_pmd(vma, addr, pmd);
2387 if (page && !PageAnon(page))
2388 categories |= PAGE_IS_FILE;
2389 }
2390
2391 if (is_huge_zero_pmd(pmd))
2392 categories |= PAGE_IS_PFNZERO;
2393 if (pmd_soft_dirty(pmd))
2394 categories |= PAGE_IS_SOFT_DIRTY;
2395 } else if (is_swap_pmd(pmd)) {
2396 swp_entry_t swp;
2397
2398 categories |= PAGE_IS_SWAPPED;
2399 if (!pmd_swp_uffd_wp(pmd))
2400 categories |= PAGE_IS_WRITTEN;
2401 if (pmd_swp_soft_dirty(pmd))
2402 categories |= PAGE_IS_SOFT_DIRTY;
2403
2404 if (p->masks_of_interest & PAGE_IS_FILE) {
2405 swp = pmd_to_swp_entry(pmd);
2406 if (is_pfn_swap_entry(swp) &&
2407 !folio_test_anon(pfn_swap_entry_folio(swp)))
2408 categories |= PAGE_IS_FILE;
2409 }
2410 }
2411
2412 return categories;
2413 }
2414
2415 static void make_uffd_wp_pmd(struct vm_area_struct *vma,
2416 unsigned long addr, pmd_t *pmdp)
2417 {
2418 pmd_t old, pmd = *pmdp;
2419
2420 if (pmd_present(pmd)) {
2421 old = pmdp_invalidate_ad(vma, addr, pmdp);
2422 pmd = pmd_mkuffd_wp(old);
2423 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
2424 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
2425 pmd = pmd_swp_mkuffd_wp(pmd);
2426 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
2427 }
2428 }
2429 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2430
2431 #ifdef CONFIG_HUGETLB_PAGE
2432 static unsigned long pagemap_hugetlb_category(pte_t pte)
2433 {
2434 unsigned long categories = PAGE_IS_HUGE;
2435
2436 /*
2437 * According to pagemap_hugetlb_range(), file-backed HugeTLB
2438 * pages cannot be swapped, so PAGE_IS_FILE is not checked for
2439 * swapped pages.
2440 */
2441 if (pte_present(pte)) {
2442 categories |= PAGE_IS_PRESENT;
2443 if (!huge_pte_uffd_wp(pte))
2444 categories |= PAGE_IS_WRITTEN;
2445 if (!PageAnon(pte_page(pte)))
2446 categories |= PAGE_IS_FILE;
2447 if (is_zero_pfn(pte_pfn(pte)))
2448 categories |= PAGE_IS_PFNZERO;
2449 if (pte_soft_dirty(pte))
2450 categories |= PAGE_IS_SOFT_DIRTY;
2451 } else if (is_swap_pte(pte)) {
2452 categories |= PAGE_IS_SWAPPED;
2453 if (!pte_swp_uffd_wp_any(pte))
2454 categories |= PAGE_IS_WRITTEN;
2455 if (pte_swp_soft_dirty(pte))
2456 categories |= PAGE_IS_SOFT_DIRTY;
2457 }
2458
2459 return categories;
2460 }
2461
2462 static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
2463 unsigned long addr, pte_t *ptep,
2464 pte_t ptent)
2465 {
2466 unsigned long psize;
2467
2468 if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
2469 return;
2470
2471 psize = huge_page_size(hstate_vma(vma));
2472
2473 if (is_hugetlb_entry_migration(ptent))
2474 set_huge_pte_at(vma->vm_mm, addr, ptep,
2475 pte_swp_mkuffd_wp(ptent), psize);
2476 else if (!huge_pte_none(ptent))
2477 huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
2478 huge_pte_mkuffd_wp(ptent));
2479 else
2480 set_huge_pte_at(vma->vm_mm, addr, ptep,
2481 make_pte_marker(PTE_MARKER_UFFD_WP), psize);
2482 }
2483 #endif /* CONFIG_HUGETLB_PAGE */
2484
2485 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
2486 static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
2487 unsigned long addr, unsigned long end)
2488 {
2489 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
2490
2491 if (!p->vec_buf)
2492 return;
2493
2494 if (cur_buf->start != addr)
2495 cur_buf->end = addr;
2496 else
2497 cur_buf->start = cur_buf->end = 0;
2498
2499 p->found_pages -= (end - addr) / PAGE_SIZE;
2500 }
2501 #endif
2502
2503 static bool pagemap_scan_is_interesting_page(unsigned long categories,
2504 const struct pagemap_scan_private *p)
2505 {
2506 categories ^= p->arg.category_inverted;
2507 if ((categories & p->arg.category_mask) != p->arg.category_mask)
2508 return false;
2509 if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
2510 return false;
2511
2512 return true;
2513 }
2514
2515 static bool pagemap_scan_is_interesting_vma(unsigned long categories,
2516 const struct pagemap_scan_private *p)
2517 {
2518 unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
2519
2520 categories ^= p->arg.category_inverted;
2521 if ((categories & required) != required)
2522 return false;
2523
2524 return true;
2525 }
2526
2527 static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
2528 struct mm_walk *walk)
2529 {
2530 struct pagemap_scan_private *p = walk->private;
2531 struct vm_area_struct *vma = walk->vma;
2532 unsigned long vma_category = 0;
2533 bool wp_allowed = userfaultfd_wp_async(vma) &&
2534 userfaultfd_wp_use_markers(vma);
2535
2536 if (!wp_allowed) {
2537 /* User requested explicit failure over wp-async capability */
2538 if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
2539 return -EPERM;
2540 /*
2541 * User requires wr-protect, and allows silently skipping
2542 * unsupported vmas.
2543 */
2544 if (p->arg.flags & PM_SCAN_WP_MATCHING)
2545 return 1;
2546 /*
2547 * Otherwise the request doesn't involve wr-protects at all;
2548 * fall through to the remaining checks and allow the vma walk.
2549 */
2550 }
2551
2552 if (vma->vm_flags & VM_PFNMAP)
2553 return 1;
2554
2555 if (wp_allowed)
2556 vma_category |= PAGE_IS_WPALLOWED;
2557
2558 if (vma->vm_flags & VM_SOFTDIRTY)
2559 vma_category |= PAGE_IS_SOFT_DIRTY;
2560
2561 if (!pagemap_scan_is_interesting_vma(vma_category, p))
2562 return 1;
2563
2564 p->cur_vma_category = vma_category;
2565
2566 return 0;
2567 }
2568
2569 static bool pagemap_scan_push_range(unsigned long categories,
2570 struct pagemap_scan_private *p,
2571 unsigned long addr, unsigned long end)
2572 {
2573 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
2574
2575 /*
2576 * When there is no output buffer provided at all, the sentinel values
2577 * won't match here. `cur_buf->end` can only be non-zero if the current
2578 * entry is non-empty.
2579 */
2580 if (addr == cur_buf->end && categories == cur_buf->categories) {
2581 cur_buf->end = end;
2582 return true;
2583 }
2584
2585 if (cur_buf->end) {
2586 if (p->vec_buf_index >= p->vec_buf_len - 1)
2587 return false;
2588
2589 cur_buf = &p->vec_buf[++p->vec_buf_index];
2590 }
2591
2592 cur_buf->start = addr;
2593 cur_buf->end = end;
2594 cur_buf->categories = categories;
2595
2596 return true;
2597 }
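/*
 * For example (illustrative addresses): three consecutive pages at 0x1000,
 * 0x2000 and 0x3000 reported with identical categories are merged into a
 * single region { .start = 0x1000, .end = 0x4000 }; a page with a different
 * category set starts a new vec_buf entry instead.
 */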
2598
2599 static int pagemap_scan_output(unsigned long categories,
2600 struct pagemap_scan_private *p,
2601 unsigned long addr, unsigned long *end)
2602 {
2603 unsigned long n_pages, total_pages;
2604 int ret = 0;
2605
2606 if (!p->vec_buf)
2607 return 0;
2608
2609 categories &= p->arg.return_mask;
2610
2611 n_pages = (*end - addr) / PAGE_SIZE;
2612 if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
2613 total_pages > p->arg.max_pages) {
2614 size_t n_too_much = total_pages - p->arg.max_pages;
2615 *end -= n_too_much * PAGE_SIZE;
2616 n_pages -= n_too_much;
2617 ret = -ENOSPC;
2618 }
2619
2620 if (!pagemap_scan_push_range(categories, p, addr, *end)) {
2621 *end = addr;
2622 n_pages = 0;
2623 ret = -ENOSPC;
2624 }
2625
2626 p->found_pages += n_pages;
2627 if (ret)
2628 p->arg.walk_end = *end;
2629
2630 return ret;
2631 }
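/*
 * Clamping example (a sketch with made-up numbers): with max_pages == 8,
 * found_pages == 6 and a 4-page range passed in, total_pages would be 10,
 * so n_too_much == 2, *end is pulled back by two pages, only two pages are
 * accounted, and -ENOSPC tells the caller to stop after this range.
 */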
2632
2633 static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
2634 unsigned long end, struct mm_walk *walk)
2635 {
2636 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2637 struct pagemap_scan_private *p = walk->private;
2638 struct vm_area_struct *vma = walk->vma;
2639 unsigned long categories;
2640 spinlock_t *ptl;
2641 int ret = 0;
2642
2643 ptl = pmd_trans_huge_lock(pmd, vma);
2644 if (!ptl)
2645 return -ENOENT;
2646
2647 categories = p->cur_vma_category |
2648 pagemap_thp_category(p, vma, start, *pmd);
2649
2650 if (!pagemap_scan_is_interesting_page(categories, p))
2651 goto out_unlock;
2652
2653 ret = pagemap_scan_output(categories, p, start, &end);
2654 if (start == end)
2655 goto out_unlock;
2656
2657 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2658 goto out_unlock;
2659 if (~categories & PAGE_IS_WRITTEN)
2660 goto out_unlock;
2661
2662 /*
2663 * Break huge page into small pages if the WP operation
2664 * needs to be performed on a portion of the huge page.
2665 */
2666 if (end != start + HPAGE_SIZE) {
2667 spin_unlock(ptl);
2668 split_huge_pmd(vma, pmd, start);
2669 pagemap_scan_backout_range(p, start, end);
2670 /* Report as if there was no THP */
2671 return -ENOENT;
2672 }
2673
2674 make_uffd_wp_pmd(vma, start, pmd);
2675 flush_tlb_range(vma, start, end);
2676 out_unlock:
2677 spin_unlock(ptl);
2678 return ret;
2679 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
2680 return -ENOENT;
2681 #endif
2682 }
2683
2684 static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
2685 unsigned long end, struct mm_walk *walk)
2686 {
2687 struct pagemap_scan_private *p = walk->private;
2688 struct vm_area_struct *vma = walk->vma;
2689 unsigned long addr, flush_end = 0;
2690 pte_t *pte, *start_pte;
2691 spinlock_t *ptl;
2692 int ret;
2693
2694 arch_enter_lazy_mmu_mode();
2695
2696 ret = pagemap_scan_thp_entry(pmd, start, end, walk);
2697 if (ret != -ENOENT) {
2698 arch_leave_lazy_mmu_mode();
2699 return ret;
2700 }
2701
2702 ret = 0;
2703 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
2704 if (!pte) {
2705 arch_leave_lazy_mmu_mode();
2706 walk->action = ACTION_AGAIN;
2707 return 0;
2708 }
2709
2710 if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
2711 /* Fast path for performing exclusive WP */
2712 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
2713 pte_t ptent = ptep_get(pte);
2714
2715 if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
2716 pte_swp_uffd_wp_any(ptent))
2717 continue;
2718 make_uffd_wp_pte(vma, addr, pte, ptent);
2719 if (!flush_end)
2720 start = addr;
2721 flush_end = addr + PAGE_SIZE;
2722 }
2723 goto flush_and_return;
2724 }
2725
2726 if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
2727 p->arg.category_mask == PAGE_IS_WRITTEN &&
2728 p->arg.return_mask == PAGE_IS_WRITTEN) {
2729 for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
2730 unsigned long next = addr + PAGE_SIZE;
2731 pte_t ptent = ptep_get(pte);
2732
2733 if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
2734 pte_swp_uffd_wp_any(ptent))
2735 continue;
2736 ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
2737 p, addr, &next);
2738 if (next == addr)
2739 break;
2740 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2741 continue;
2742 make_uffd_wp_pte(vma, addr, pte, ptent);
2743 if (!flush_end)
2744 start = addr;
2745 flush_end = next;
2746 }
2747 goto flush_and_return;
2748 }
2749
2750 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
2751 pte_t ptent = ptep_get(pte);
2752 unsigned long categories = p->cur_vma_category |
2753 pagemap_page_category(p, vma, addr, ptent);
2754 unsigned long next = addr + PAGE_SIZE;
2755
2756 if (!pagemap_scan_is_interesting_page(categories, p))
2757 continue;
2758
2759 ret = pagemap_scan_output(categories, p, addr, &next);
2760 if (next == addr)
2761 break;
2762
2763 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2764 continue;
2765 if (~categories & PAGE_IS_WRITTEN)
2766 continue;
2767
2768 make_uffd_wp_pte(vma, addr, pte, ptent);
2769 if (!flush_end)
2770 start = addr;
2771 flush_end = next;
2772 }
2773
2774 flush_and_return:
2775 if (flush_end)
2776 flush_tlb_range(vma, start, addr);
2777
2778 pte_unmap_unlock(start_pte, ptl);
2779 arch_leave_lazy_mmu_mode();
2780
2781 cond_resched();
2782 return ret;
2783 }
2784
2785 #ifdef CONFIG_HUGETLB_PAGE
2786 static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
2787 unsigned long start, unsigned long end,
2788 struct mm_walk *walk)
2789 {
2790 struct pagemap_scan_private *p = walk->private;
2791 struct vm_area_struct *vma = walk->vma;
2792 unsigned long categories;
2793 spinlock_t *ptl;
2794 int ret = 0;
2795 pte_t pte;
2796
2797 if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
2798 /* Go the short route when not write-protecting pages. */
2799
2800 pte = huge_ptep_get(walk->mm, start, ptep);
2801 categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
2802
2803 if (!pagemap_scan_is_interesting_page(categories, p))
2804 return 0;
2805
2806 return pagemap_scan_output(categories, p, start, &end);
2807 }
2808
2809 i_mmap_lock_write(vma->vm_file->f_mapping);
2810 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
2811
2812 pte = huge_ptep_get(walk->mm, start, ptep);
2813 categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
2814
2815 if (!pagemap_scan_is_interesting_page(categories, p))
2816 goto out_unlock;
2817
2818 ret = pagemap_scan_output(categories, p, start, &end);
2819 if (start == end)
2820 goto out_unlock;
2821
2822 if (~categories & PAGE_IS_WRITTEN)
2823 goto out_unlock;
2824
2825 if (end != start + HPAGE_SIZE) {
2826 /* Partial HugeTLB page WP isn't possible. */
2827 pagemap_scan_backout_range(p, start, end);
2828 p->arg.walk_end = start;
2829 ret = 0;
2830 goto out_unlock;
2831 }
2832
2833 make_uffd_wp_huge_pte(vma, start, ptep, pte);
2834 flush_hugetlb_tlb_range(vma, start, end);
2835
2836 out_unlock:
2837 spin_unlock(ptl);
2838 i_mmap_unlock_write(vma->vm_file->f_mapping);
2839
2840 return ret;
2841 }
2842 #else
2843 #define pagemap_scan_hugetlb_entry NULL
2844 #endif
2845
2846 static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
2847 int depth, struct mm_walk *walk)
2848 {
2849 struct pagemap_scan_private *p = walk->private;
2850 struct vm_area_struct *vma = walk->vma;
2851 int ret, err;
2852
2853 if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
2854 return 0;
2855
2856 ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
2857 if (addr == end)
2858 return ret;
2859
2860 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2861 return ret;
2862
2863 err = uffd_wp_range(vma, addr, end - addr, true);
2864 if (err < 0)
2865 ret = err;
2866
2867 return ret;
2868 }
2869
2870 static const struct mm_walk_ops pagemap_scan_ops = {
2871 .test_walk = pagemap_scan_test_walk,
2872 .pmd_entry = pagemap_scan_pmd_entry,
2873 .pte_hole = pagemap_scan_pte_hole,
2874 .hugetlb_entry = pagemap_scan_hugetlb_entry,
2875 };
2876
2877 static int pagemap_scan_get_args(struct pm_scan_arg *arg,
2878 unsigned long uarg)
2879 {
2880 if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
2881 return -EFAULT;
2882
2883 if (arg->size != sizeof(struct pm_scan_arg))
2884 return -EINVAL;
2885
2886 /* Validate requested features */
2887 if (arg->flags & ~PM_SCAN_FLAGS)
2888 return -EINVAL;
2889 if ((arg->category_inverted | arg->category_mask |
2890 arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
2891 return -EINVAL;
2892
2893 arg->start = untagged_addr((unsigned long)arg->start);
2894 arg->end = untagged_addr((unsigned long)arg->end);
2895 arg->vec = untagged_addr((unsigned long)arg->vec);
2896
2897 /* Validate memory pointers */
2898 if (!IS_ALIGNED(arg->start, PAGE_SIZE))
2899 return -EINVAL;
2900 if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
2901 return -EFAULT;
2902 if (!arg->vec && arg->vec_len)
2903 return -EINVAL;
2904 if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
2905 return -EINVAL;
2906 if (arg->vec && !access_ok((void __user *)(long)arg->vec,
2907 size_mul(arg->vec_len, sizeof(struct page_region))))
2908 return -EFAULT;
2909
2910 /* Fixup default values */
2911 arg->end = ALIGN(arg->end, PAGE_SIZE);
2912 arg->walk_end = 0;
2913 if (!arg->max_pages)
2914 arg->max_pages = ULONG_MAX;
2915
2916 return 0;
2917 }
2918
2919 static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
2920 unsigned long uargl)
2921 {
2922 struct pm_scan_arg __user *uarg = (void __user *)uargl;
2923
2924 if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
2925 return -EFAULT;
2926
2927 return 0;
2928 }
2929
2930 static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
2931 {
2932 if (!p->arg.vec_len)
2933 return 0;
2934
2935 p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
2936 p->arg.vec_len);
2937 p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
2938 GFP_KERNEL);
2939 if (!p->vec_buf)
2940 return -ENOMEM;
2941
2942 p->vec_buf->start = p->vec_buf->end = 0;
2943 p->vec_out = (struct page_region __user *)(long)p->arg.vec;
2944
2945 return 0;
2946 }
2947
2948 static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
2949 {
2950 const struct page_region *buf = p->vec_buf;
2951 long n = p->vec_buf_index;
2952
2953 if (!p->vec_buf)
2954 return 0;
2955
2956 if (buf[n].end != buf[n].start)
2957 n++;
2958
2959 if (!n)
2960 return 0;
2961
2962 if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
2963 return -EFAULT;
2964
2965 p->arg.vec_len -= n;
2966 p->vec_out += n;
2967
2968 p->vec_buf_index = 0;
2969 p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
2970 p->vec_buf->start = p->vec_buf->end = 0;
2971
2972 return n;
2973 }
2974
2975 static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
2976 {
2977 struct pagemap_scan_private p = {0};
2978 unsigned long walk_start;
2979 size_t n_ranges_out = 0;
2980 int ret;
2981
2982 ret = pagemap_scan_get_args(&p.arg, uarg);
2983 if (ret)
2984 return ret;
2985
2986 p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
2987 p.arg.return_mask;
2988 ret = pagemap_scan_init_bounce_buffer(&p);
2989 if (ret)
2990 return ret;
2991
2992 for (walk_start = p.arg.start; walk_start < p.arg.end;
2993 walk_start = p.arg.walk_end) {
2994 struct mmu_notifier_range range;
2995 long n_out;
2996
2997 if (fatal_signal_pending(current)) {
2998 ret = -EINTR;
2999 break;
3000 }
3001
3002 ret = mmap_read_lock_killable(mm);
3003 if (ret)
3004 break;
3005
3006 /* Protection change for the range is going to happen. */
3007 if (p.arg.flags & PM_SCAN_WP_MATCHING) {
3008 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
3009 mm, walk_start, p.arg.end);
3010 mmu_notifier_invalidate_range_start(&range);
3011 }
3012
3013 ret = walk_page_range(mm, walk_start, p.arg.end,
3014 &pagemap_scan_ops, &p);
3015
3016 if (p.arg.flags & PM_SCAN_WP_MATCHING)
3017 mmu_notifier_invalidate_range_end(&range);
3018
3019 mmap_read_unlock(mm);
3020
3021 n_out = pagemap_scan_flush_buffer(&p);
3022 if (n_out < 0)
3023 ret = n_out;
3024 else
3025 n_ranges_out += n_out;
3026
3027 if (ret != -ENOSPC)
3028 break;
3029
3030 if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
3031 break;
3032 }
3033
3034 /* ENOSPC signifies early stop (buffer full) from the walk. */
3035 if (!ret || ret == -ENOSPC)
3036 ret = n_ranges_out;
3037
3038 /* The walk_end isn't set when ret is zero */
3039 if (!p.arg.walk_end)
3040 p.arg.walk_end = p.arg.end;
3041 if (pagemap_scan_writeback_args(&p.arg, uarg))
3042 ret = -EFAULT;
3043
3044 kfree(p.vec_buf);
3045 return ret;
3046 }
3047
3048 static long do_pagemap_cmd(struct file *file, unsigned int cmd,
3049 unsigned long arg)
3050 {
3051 struct mm_struct *mm = file->private_data;
3052
3053 switch (cmd) {
3054 case PAGEMAP_SCAN:
3055 return do_pagemap_scan(mm, arg);
3056
3057 default:
3058 return -EINVAL;
3059 }
3060 }
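/*
 * Illustrative userspace sketch of the PAGEMAP_SCAN ioctl (not part of this
 * file; error handling omitted, start_vaddr/end_vaddr/pagemap_fd are
 * placeholders, needs <sys/ioctl.h> and <linux/fs.h>): report ranges of
 * written pages and re-write-protect them in the same call, commonly used as
 * a uffd-wp-async based alternative to soft-dirty tracking.  It assumes the
 * range is registered for userfaultfd write-protect in async mode; otherwise
 * PM_SCAN_CHECK_WPASYNC makes the ioctl fail (see pagemap_scan_test_walk()).
 *
 *	struct page_region vec[128];
 *	struct pm_scan_arg arg = {
 *		.size          = sizeof(arg),
 *		.flags         = PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
 *		.start         = start_vaddr,
 *		.end           = end_vaddr,
 *		.vec           = (uintptr_t)vec,
 *		.vec_len       = 128,
 *		.category_mask = PAGE_IS_WRITTEN,
 *		.return_mask   = PAGE_IS_WRITTEN,
 *	};
 *	long n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
 *	On success, vec[0..n-1] holds the written ranges and arg.walk_end is
 *	where the walk stopped.
 */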
3061
3062 loff_t __pagemap_lseek(struct file *file, loff_t offset, int orig)
3063 {
3064 unsigned long nr_subpages = __PAGE_SIZE / PAGE_SIZE;
3065 loff_t ret;
3066
3067 /*
3068 * Userspace thinks the pages are larger than they actually are, so adjust the
3069 * offset to compensate.
3070 */
3071 offset *= nr_subpages;
3072
3073 ret = mem_lseek(file, offset, orig); /* reuse mem_lseek() for the raw seek */
3074 if (ret < 0)
3075 return ret;
3076
3077 /* Re-adjust the offset to reflect the larger userspace page size. */
3078 return ret / nr_subpages;
3079 }
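/*
 * Example of the scaling (a sketch assuming __PAGE_SIZE == 16 KiB and
 * PAGE_SIZE == 4 KiB, so nr_subpages == 4): a userspace lseek() to offset 8
 * (its second emulated-page entry) becomes a kernel file offset of 32, i.e.
 * the entry for the fifth real 4 KiB page, which indeed starts at virtual
 * address 16 KiB; the result is divided back down so userspace still sees 8.
 */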
3080
3081 const struct file_operations proc_pagemap_operations = {
3082 .llseek = __pagemap_lseek,
3083 .read = pagemap_read,
3084 .open = pagemap_open,
3085 .release = pagemap_release,
3086 .unlocked_ioctl = do_pagemap_cmd,
3087 .compat_ioctl = do_pagemap_cmd,
3088 };
3089
3090 bool __is_emulated_pagemap_file(struct file *file)
3091 {
3092 return __PAGE_SIZE != PAGE_SIZE && file->f_op == &proc_pagemap_operations;
3093 }
3094 #endif /* CONFIG_PROC_PAGE_MONITOR */
3095
3096 #ifdef CONFIG_NUMA
3097
3098 struct numa_maps {
3099 unsigned long pages;
3100 unsigned long anon;
3101 unsigned long active;
3102 unsigned long writeback;
3103 unsigned long mapcount_max;
3104 unsigned long dirty;
3105 unsigned long swapcache;
3106 unsigned long node[MAX_NUMNODES];
3107 };
3108
3109 struct numa_maps_private {
3110 struct proc_maps_private proc_maps;
3111 struct numa_maps md;
3112 };
3113
3114 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
3115 unsigned long nr_pages)
3116 {
3117 struct folio *folio = page_folio(page);
3118 int count = folio_precise_page_mapcount(folio, page);
3119
3120 md->pages += nr_pages;
3121 if (pte_dirty || folio_test_dirty(folio))
3122 md->dirty += nr_pages;
3123
3124 if (folio_test_swapcache(folio))
3125 md->swapcache += nr_pages;
3126
3127 if (folio_test_active(folio) || folio_test_unevictable(folio))
3128 md->active += nr_pages;
3129
3130 if (folio_test_writeback(folio))
3131 md->writeback += nr_pages;
3132
3133 if (folio_test_anon(folio))
3134 md->anon += nr_pages;
3135
3136 if (count > md->mapcount_max)
3137 md->mapcount_max = count;
3138
3139 md->node[folio_nid(folio)] += nr_pages;
3140 }
3141
3142 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
3143 unsigned long addr)
3144 {
3145 struct page *page;
3146 int nid;
3147
3148 if (!pte_present(pte))
3149 return NULL;
3150
3151 page = vm_normal_page(vma, addr, pte);
3152 if (!page || is_zone_device_page(page))
3153 return NULL;
3154
3155 if (PageReserved(page))
3156 return NULL;
3157
3158 nid = page_to_nid(page);
3159 if (!node_isset(nid, node_states[N_MEMORY]))
3160 return NULL;
3161
3162 return page;
3163 }
3164
3165 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3166 static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
3167 struct vm_area_struct *vma,
3168 unsigned long addr)
3169 {
3170 struct page *page;
3171 int nid;
3172
3173 if (!pmd_present(pmd))
3174 return NULL;
3175
3176 page = vm_normal_page_pmd(vma, addr, pmd);
3177 if (!page)
3178 return NULL;
3179
3180 if (PageReserved(page))
3181 return NULL;
3182
3183 nid = page_to_nid(page);
3184 if (!node_isset(nid, node_states[N_MEMORY]))
3185 return NULL;
3186
3187 return page;
3188 }
3189 #endif
3190
3191 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
3192 unsigned long end, struct mm_walk *walk)
3193 {
3194 struct numa_maps *md = walk->private;
3195 struct vm_area_struct *vma = walk->vma;
3196 spinlock_t *ptl;
3197 pte_t *orig_pte;
3198 pte_t *pte;
3199
3200 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3201 ptl = pmd_trans_huge_lock(pmd, vma);
3202 if (ptl) {
3203 struct page *page;
3204
3205 page = can_gather_numa_stats_pmd(*pmd, vma, addr);
3206 if (page)
3207 gather_stats(page, md, pmd_dirty(*pmd),
3208 HPAGE_PMD_SIZE/PAGE_SIZE);
3209 spin_unlock(ptl);
3210 return 0;
3211 }
3212 #endif
3213 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
3214 if (!pte) {
3215 walk->action = ACTION_AGAIN;
3216 return 0;
3217 }
3218 do {
3219 pte_t ptent = ptep_get(pte);
3220 struct page *page = can_gather_numa_stats(ptent, vma, addr);
3221 if (!page)
3222 continue;
3223 gather_stats(page, md, pte_dirty(ptent), 1);
3224
3225 } while (pte++, addr += PAGE_SIZE, addr != end);
3226 pte_unmap_unlock(orig_pte, ptl);
3227 cond_resched();
3228 return 0;
3229 }
3230 #ifdef CONFIG_HUGETLB_PAGE
3231 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
3232 unsigned long addr, unsigned long end, struct mm_walk *walk)
3233 {
3234 pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
3235 struct numa_maps *md;
3236 struct page *page;
3237
3238 if (!pte_present(huge_pte))
3239 return 0;
3240
3241 page = pte_page(huge_pte);
3242
3243 md = walk->private;
3244 gather_stats(page, md, pte_dirty(huge_pte), 1);
3245 return 0;
3246 }
3247
3248 #else
3249 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
3250 unsigned long addr, unsigned long end, struct mm_walk *walk)
3251 {
3252 return 0;
3253 }
3254 #endif
3255
3256 static const struct mm_walk_ops show_numa_ops = {
3257 .hugetlb_entry = gather_hugetlb_stats,
3258 .pmd_entry = gather_pte_stats,
3259 .walk_lock = PGWALK_RDLOCK,
3260 };
3261
3262 /*
3263 * Display pages allocated per node and memory policy via /proc.
3264 */
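/*
 * An output line looks roughly like (illustrative values only):
 *   7f1a2c000000 default file=/usr/lib/libc.so.6 mapped=120 mapmax=35 N0=88 N1=32 kernelpagesize_kB=4
 * with the optional anon=/dirty=/swapcache=/active=/writeback= fields emitted
 * only when the corresponding counters are non-zero.
 */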
3265 static int show_numa_map(struct seq_file *m, void *v)
3266 {
3267 struct numa_maps_private *numa_priv = m->private;
3268 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
3269 struct vm_area_struct *vma = v;
3270 struct numa_maps *md = &numa_priv->md;
3271 struct file *file = vma->vm_file;
3272 struct mm_struct *mm = vma->vm_mm;
3273 char buffer[64];
3274 struct mempolicy *pol;
3275 pgoff_t ilx;
3276 int nid;
3277
3278 if (!mm)
3279 return 0;
3280
3281 /* Ensure we start with an empty set of numa_maps statistics. */
3282 memset(md, 0, sizeof(*md));
3283
3284 pol = __get_vma_policy(vma, vma->vm_start, &ilx);
3285 if (pol) {
3286 mpol_to_str(buffer, sizeof(buffer), pol);
3287 mpol_cond_put(pol);
3288 } else {
3289 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
3290 }
3291
3292 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
3293
3294 if (file) {
3295 seq_puts(m, " file=");
3296 seq_path(m, file_user_path(file), "\n\t= ");
3297 } else if (vma_is_initial_heap(vma)) {
3298 seq_puts(m, " heap");
3299 } else if (vma_is_initial_stack(vma)) {
3300 seq_puts(m, " stack");
3301 }
3302
3303 if (is_vm_hugetlb_page(vma))
3304 seq_puts(m, " huge");
3305
3306 /* mmap_lock is held by m_start */
3307 walk_page_vma(vma, &show_numa_ops, md);
3308
3309 if (!md->pages)
3310 goto out;
3311
3312 if (md->anon)
3313 seq_printf(m, " anon=%lu", md->anon);
3314
3315 if (md->dirty)
3316 seq_printf(m, " dirty=%lu", md->dirty);
3317
3318 if (md->pages != md->anon && md->pages != md->dirty)
3319 seq_printf(m, " mapped=%lu", md->pages);
3320
3321 if (md->mapcount_max > 1)
3322 seq_printf(m, " mapmax=%lu", md->mapcount_max);
3323
3324 if (md->swapcache)
3325 seq_printf(m, " swapcache=%lu", md->swapcache);
3326
3327 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
3328 seq_printf(m, " active=%lu", md->active);
3329
3330 if (md->writeback)
3331 seq_printf(m, " writeback=%lu", md->writeback);
3332
3333 for_each_node_state(nid, N_MEMORY)
3334 if (md->node[nid])
3335 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
3336
3337 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
3338 out:
3339 seq_putc(m, '\n');
3340 return 0;
3341 }
3342
3343 static const struct seq_operations proc_pid_numa_maps_op = {
3344 .start = m_start,
3345 .next = m_next,
3346 .stop = m_stop,
3347 .show = show_numa_map,
3348 };
3349
3350 static int pid_numa_maps_open(struct inode *inode, struct file *file)
3351 {
3352 return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
3353 sizeof(struct numa_maps_private));
3354 }
3355
3356 const struct file_operations proc_pid_numa_maps_operations = {
3357 .open = pid_numa_maps_open,
3358 .read = seq_read,
3359 .llseek = seq_lseek,
3360 .release = proc_map_release,
3361 };
3362
3363 #endif /* CONFIG_NUMA */
3364