// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/page_size_compat.h>

#include <linux/uaccess.h>
#include <trace/hooks/mm.h>
#include "internal.h"
#include "swap.h"

#ifndef __GENKSYMS__
#include <trace/hooks/syscall_check.h>
#include <trace/hooks/mm.h>
#endif

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strlen(s) + 1;
	buf = kmalloc_track_caller(len, gfp);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section, otherwise
 * fall back to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

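/*
 * Illustrative sketch only (not part of this file): pairing kstrdup_const()
 * with kfree_const() so that .rodata strings are never duplicated or freed.
 * The structure and function names below are hypothetical.
 */
#if 0
struct example_widget {
	const char *label;
};

static int example_widget_set_label(struct example_widget *w, const char *label)
{
	const char *copy = kstrdup_const(label, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	kfree_const(w->label);	/* never plain kfree() for kstrdup_const() strings */
	w->label = copy;
	return 0;
}
#endif
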
/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strnlen(s, max);
	buf = kmalloc_track_caller(len + 1, gfp);
	if (buf) {
		memcpy(buf, s, len);
		buf[len] = '\0';
	}
	return buf;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc_track_caller(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may not be physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kvmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	if (!s)
		return NULL;

	buf = kmalloc_track_caller(len + 1, gfp);
	if (buf) {
		memcpy(buf, s, len);
		buf[len] = '\0';
	}
	return buf;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result may not be
 * physically contiguous. Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kvmalloc(len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	/*
	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
	 * cause a page fault, which makes it pointless to use GFP_NOFS
	 * or GFP_ATOMIC.
	 */
	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

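/*
 * Illustrative sketch only (not part of this file): a typical procfs/debugfs
 * style write handler that duplicates and NUL-terminates a user buffer with
 * memdup_user_nul() before parsing it. The handler name and the parsed
 * variable are hypothetical.
 */
#if 0
static unsigned int example_threshold;

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	char *kbuf;
	int ret;

	kbuf = memdup_user_nul(ubuf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	ret = kstrtouint(strim(kbuf), 0, &example_threshold);
	kfree(kbuf);

	return ret ? ret : count;
}
#endif
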
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
#endif

unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= __PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return __PAGE_ALIGN(stack_top) + random_variable;
#else
	return __PAGE_ALIGN(stack_top) - random_variable;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start: The smallest acceptable address the caller will take.
 * @range: The size of the area, starting at @start, within which the
 *         random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned. We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range). On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (__offset_in_page(start)) {
		range -= __PAGE_ALIGN(start) - start;
		start = __PAGE_ALIGN(start);
	}

	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	range >>= __PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << __PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}

static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/* On parisc the stack always grows up - so an unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
	    !IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP / 6 * 5)

static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task. mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm->get_unmapped_area = arch_get_unmapped_area;
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 * @task: task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0 on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against, may be NULL
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0 on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

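/*
 * Illustrative sketch only (not part of this file): how a driver that pins
 * user pages might charge and uncharge RLIMIT_MEMLOCK via account_locked_vm().
 * example_pin_buffer()/example_pin_pages() are hypothetical helpers.
 */
#if 0
static int example_pin_buffer(struct mm_struct *mm, unsigned long npages)
{
	int ret;

	ret = account_locked_vm(mm, npages, true);	/* charge locked_vm */
	if (ret)
		return ret;

	ret = example_pin_pages(npages);
	if (ret)
		account_locked_vm(mm, npages, false);	/* roll back on failure */

	return ret;
}
#endif
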
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	trace_android_vh_check_mmap_file(file, prot, flag, ret);
	return ret;
}

unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * Uses kmalloc to get the memory but if the allocation fails then falls back
 * to the vmalloc allocator. Use kvfree for freeing the memory.
 *
 * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
	gfp_t kmalloc_flags = flags;
	void *ret;
	bool use_vmalloc = false;

	trace_android_vh_kvmalloc_node_use_vmalloc(size, &kmalloc_flags, &use_vmalloc);
	if (use_vmalloc)
		goto use_vmalloc_node;
	/*
	 * We want to attempt a large physically contiguous block first because
	 * it is less likely to fragment multiple larger blocks and therefore
	 * contribute to a long term fragmentation less than vmalloc fallback.
	 * However make sure that larger requests are not too disruptive - no
	 * OOM killer and no allocation failure warnings as we have a fallback.
	 */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;

		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;

		/* nofail semantic is implemented by the vmalloc fallback */
		kmalloc_flags &= ~__GFP_NOFAIL;
	}
	trace_android_vh_adjust_kvmalloc_flags(get_order(size), &kmalloc_flags);

	ret = kmalloc_node(size, kmalloc_flags, node);

	/*
	 * It doesn't really make sense to fallback to vmalloc for sub page
	 * requests
	 */
	if (ret || size <= PAGE_SIZE)
		return ret;

	/* non-sleeping allocations are not supported by vmalloc */
	if (!gfpflags_allow_blocking(flags))
		return NULL;

	/* Don't even allow crazy sizes */
	if (unlikely(size > INT_MAX)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
	 * since the callers already cannot assume anything
	 * about the resulting pointer, and cannot play
	 * protection games.
	 */
use_vmalloc_node:
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
			node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);

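/*
 * Illustrative sketch only (not part of this file): a caller that may need a
 * buffer too large for kmalloc() and therefore uses kvmalloc_array()/kvfree().
 * The table structure and element count are hypothetical.
 */
#if 0
static u32 *example_alloc_table(size_t nr_entries)
{
	u32 *table;

	/* Falls back to vmalloc transparently for large nr_entries. */
	table = kvmalloc_array(nr_entries, sizeof(*table), GFP_KERNEL | __GFP_ZERO);
	if (!table)
		return NULL;

	return table;
}

static void example_free_table(u32 *table)
{
	kvfree(table);	/* works for both the kmalloc and the vmalloc case */
}
#endif
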
/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * It is slightly more efficient to use kfree() or vfree() if you are certain
 * that you know which one to use.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
EXPORT_SYMBOL(kvfree);

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * Use the special memzero_explicit() function to clear the content of a
 * kvmalloc'ed object containing sensitive data to make sure that the
 * compiler won't optimize out the data clearing.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
	if (likely(!ZERO_OR_NULL_PTR(addr))) {
		memzero_explicit((void *)addr, len);
		kvfree(addr);
	}
}
EXPORT_SYMBOL(kvfree_sensitive);

void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
	void *newp;

	if (oldsize >= newsize)
		return (void *)p;
	newp = kvmalloc(newsize, flags);
	if (!newp)
		return NULL;
	memcpy(newp, p, oldsize);
	kvfree(p);
	return newp;
}
EXPORT_SYMBOL(kvrealloc);

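/*
 * Illustrative sketch only (not part of this file): growing a kvmalloc'ed
 * buffer with kvrealloc(). On failure the old buffer is left intact, so the
 * caller keeps its original pointer. The helper name below is hypothetical.
 */
#if 0
static int example_grow(void **bufp, size_t oldsize, size_t newsize)
{
	void *newbuf;

	newbuf = kvrealloc(*bufp, oldsize, newsize, GFP_KERNEL);
	if (!newbuf)
		return -ENOMEM;	/* *bufp is still valid and still owned by us */

	*bufp = newbuf;
	return 0;
}
#endif
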
/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;
	return __vmalloc(bytes, flags);
}
EXPORT_SYMBOL(__vmalloc_array);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vmalloc_array(size_t n, size_t size)
{
	return __vmalloc_array(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vcalloc(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vcalloc(size_t n, size_t size)
{
	return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc);

struct anon_vma *folio_anon_vma(struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	return (void *)(mapping - PAGE_MAPPING_ANON);
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to. Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policy
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid possible race condition (even
	 * though the user usually won't switch to OVERCOMMIT_NEVER too
	 * frequently), the switch is done in the following order:
	 * 1. changing the batch
	 * 2. sync percpu count on each CPU
	 * 3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}

/*
 * Make sure vm_committed_as is in one cacheline and is not shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platforms like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
			    __func__, current->pid, current->comm);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task: the task whose cmdline value to copy.
 * @buffer: the buffer to copy to.
 * @buflen: the length of the buffer. Larger cmdline values are truncated
 * to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee a trailing NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate. The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

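/*
 * Illustrative sketch only (not part of this file): mem_dump_obj() expects
 * the caller to print its own preamble first, since the provenance line is
 * emitted with pr_cont(). The debug helper below is hypothetical.
 */
#if 0
static void example_report_bad_object(void *obj)
{
	pr_err("example: unexpected object %px:", obj);
	mem_dump_obj(obj);	/* appends e.g. the slab name or vmalloc info */
}
#endif
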
/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}

void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}

void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

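/*
 * Illustrative sketch only (not part of this file): a PFN walker bracketing
 * its PageOffline() checks with page_offline_freeze()/page_offline_thaw() so
 * that no page can transition to PageOffline() while it is being inspected.
 * example_walk_one_pfn() is a hypothetical helper.
 */
#if 0
static void example_walk_pfns(unsigned long start_pfn, unsigned long nr)
{
	unsigned long pfn;

	page_offline_freeze();
	for (pfn = start_pfn; pfn < start_pfn + nr; pfn++) {
		struct page *page = pfn_to_online_page(pfn);

		if (!page || PageOffline(page))
			continue;	/* cannot become PageOffline() while frozen */
		example_walk_one_pfn(page);
	}
	page_offline_thaw();
}
#endif
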
#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif