// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/page_size_compat.h>

#include <linux/uaccess.h>
#include <trace/hooks/mm.h>
#include "internal.h"
#include "swap.h"

#ifndef __GENKSYMS__
#include <trace/hooks/syscall_check.h>
#include <trace/hooks/mm.h>
#endif

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strlen(s) + 1;
	buf = kmalloc_track_caller(len, gfp);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section otherwise
 * fallback to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);
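
/*
 * Illustrative sketch, not part of the original file: the intended pairing of
 * kstrdup_const() and kfree_const() for a name that may point into .rodata.
 * The structure and function below are hypothetical, invented for the example.
 */
struct kstrdup_const_demo {
	const char *name;
};

static int __maybe_unused kstrdup_const_demo_set_name(struct kstrdup_const_demo *d,
						      const char *name)
{
	const char *copy = kstrdup_const(name, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	/* Safe even if the old name pointed into .rodata (or was NULL). */
	kfree_const(d->name);
	d->name = copy;
	return 0;
}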

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strnlen(s, max);
	buf = kmalloc_track_caller(len+1, gfp);
	if (buf) {
		memcpy(buf, s, len);
		buf[len] = '\0';
	}
	return buf;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc_track_caller(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may not be physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kvmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	if (!s)
		return NULL;

	buf = kmalloc_track_caller(len + 1, gfp);
	if (buf) {
		memcpy(buf, s, len);
		buf[len] = '\0';
	}
	return buf;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.  Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);
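
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * ioctl-style helper that pulls a fixed-size argument block from user space
 * with memdup_user() and propagates the ERR_PTR() as an errno.
 */
static int __maybe_unused memdup_user_demo(const void __user *uarg, size_t size)
{
	void *karg = memdup_user(uarg, size);

	if (IS_ERR(karg))
		return PTR_ERR(karg);

	/* ... operate on the kernel copy ... */

	kfree(karg);	/* the copy is physically contiguous, plain kfree() */
	return 0;
}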

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.  Result may not be
 * physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kvmalloc(len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	/*
	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
	 * cause pagefault, which makes it pointless to use GFP_NOFS
	 * or GFP_ATOMIC.
	 */
	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);
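
/*
 * Illustrative sketch, not part of the original file: the usual shape of a
 * write handler that needs a NUL-terminated kernel copy of a user buffer.
 * The function name is invented for the example.
 */
static ssize_t __maybe_unused memdup_user_nul_demo(const char __user *ubuf,
						   size_t count)
{
	char *kbuf = memdup_user_nul(ubuf, count);

	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/* kbuf is NUL-terminated, so string parsers such as sscanf() are safe */

	kfree(kbuf);
	return count;
}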

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= __PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return __PAGE_ALIGN(stack_top) + random_variable;
#else
	return __PAGE_ALIGN(stack_top) - random_variable;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:	The smallest acceptable address the caller will take.
 * @range:	The size of the area, starting at @start, within which the
 *		random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (__offset_in_page(start)) {
		range -= __PAGE_ALIGN(start) - start;
		start = __PAGE_ALIGN(start);
	}

	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	range >>= __PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << __PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}

static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/* On parisc the stack always grows up - so an unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
		!IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP		(SZ_128M)
#define MAX_GAP		(STACK_TOP / 6 * 5)

static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task. mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm->get_unmapped_area = arch_get_unmapped_area;
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);
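
/*
 * Illustrative sketch, not part of the original file: how a driver that
 * long-term pins user pages might charge them against RLIMIT_MEMLOCK and
 * give the accounting back on teardown. The function name is invented for
 * the example.
 */
static int __maybe_unused locked_vm_demo(struct mm_struct *mm,
					 unsigned long nr_pages)
{
	int ret;

	/* Takes and drops mmap_lock internally; -ENOMEM if over the limit. */
	ret = account_locked_vm(mm, nr_pages, true);
	if (ret)
		return ret;

	/* ... pin the pages, set up DMA, and so on ... */

	/* On teardown (or on error), undo the accounting. */
	account_locked_vm(mm, nr_pages, false);
	return 0;
}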

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	trace_android_vh_check_mmap_file(file, prot, flag, ret);
	return ret;
}

unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);
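
/*
 * Illustrative sketch, not part of the original file: mapping the first @len
 * bytes of a file read/write into the calling task, the way a driver or
 * loader might use vm_mmap(). The wrapper name is invented for the example.
 */
static unsigned long __maybe_unused vm_mmap_demo(struct file *file, size_t len)
{
	unsigned long addr = vm_mmap(file, 0, len, PROT_READ | PROT_WRITE,
				     MAP_SHARED, 0);

	/* On failure the return value encodes a negative errno. */
	if (IS_ERR_VALUE(addr))
		pr_debug("vm_mmap failed: %ld\n", (long)addr);

	return addr;
}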

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: NUMA node to allocate from
 *
 * Uses kmalloc to get the memory, but if the allocation fails it falls back
 * to the vmalloc allocator. Use kvfree for freeing the memory.
 *
 * GFP_NOWAIT and GFP_ATOMIC are not supported, nor is the __GFP_NORETRY modifier.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
	gfp_t kmalloc_flags = flags;
	void *ret;
	bool use_vmalloc = false;

	trace_android_vh_kvmalloc_node_use_vmalloc(size, &kmalloc_flags, &use_vmalloc);
	if (use_vmalloc)
		goto use_vmalloc_node;
	/*
	 * We want to attempt a large physically contiguous block first because
	 * it is less likely to fragment multiple larger blocks and therefore
	 * contributes less to long-term fragmentation than the vmalloc
	 * fallback would. However, make sure that larger requests are not too
	 * disruptive - no OOM killer and no allocation failure warnings as we
	 * have a fallback.
	 */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;

		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;

		/* nofail semantic is implemented by the vmalloc fallback */
		kmalloc_flags &= ~__GFP_NOFAIL;
	}
	trace_android_vh_adjust_kvmalloc_flags(get_order(size), &kmalloc_flags);

	ret = kmalloc_node(size, kmalloc_flags, node);

	/*
	 * It doesn't really make sense to fallback to vmalloc for sub page
	 * requests
	 */
	if (ret || size <= PAGE_SIZE)
		return ret;

	/* non-sleeping allocations are not supported by vmalloc */
	if (!gfpflags_allow_blocking(flags))
		return NULL;

	/* Don't even allow crazy sizes */
	if (unlikely(size > INT_MAX)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
	 * since the callers already cannot assume anything
	 * about the resulting pointer, and cannot play
	 * protection games.
	 */
use_vmalloc_node:
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
			node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * It is slightly more efficient to use kfree() or vfree() if you are certain
 * that you know which one to use.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
EXPORT_SYMBOL(kvfree);
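
/*
 * Illustrative sketch, not part of the original file: a large table where
 * physical contiguity is not required, so the vmalloc fallback of kvmalloc()
 * is acceptable. The names are invented for the example.
 */
static __maybe_unused u32 *kvmalloc_demo_alloc_table(size_t nr_entries)
{
	/* Overflow-checked n * size allocation, zeroed, kmalloc attempted first. */
	return kvmalloc_array(nr_entries, sizeof(u32), GFP_KERNEL | __GFP_ZERO);
}

static void __maybe_unused kvmalloc_demo_free_table(u32 *table)
{
	kvfree(table);	/* correct for both the kmalloc and the vmalloc case */
}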

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * Use the special memzero_explicit() function to clear the content of a
 * kvmalloc'ed object containing sensitive data to make sure that the
 * compiler won't optimize out the data clearing.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
	if (likely(!ZERO_OR_NULL_PTR(addr))) {
		memzero_explicit((void *)addr, len);
		kvfree(addr);
	}
}
EXPORT_SYMBOL(kvfree_sensitive);
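
/*
 * Illustrative sketch, not part of the original file: holding secret material
 * in a kvmalloc'ed buffer and wiping it before the memory is returned. The
 * function name is invented for the example.
 */
static void __maybe_unused kvfree_sensitive_demo(size_t keylen)
{
	u8 *key = kvmalloc(keylen, GFP_KERNEL);

	if (!key)
		return;

	/* ... derive or copy the secret into 'key' and use it ... */

	/* The clearing is guaranteed not to be optimised away. */
	kvfree_sensitive(key, keylen);
}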

void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
	void *newp;

	if (oldsize >= newsize)
		return (void *)p;
	newp = kvmalloc(newsize, flags);
	if (!newp)
		return NULL;
	memcpy(newp, p, oldsize);
	kvfree(p);
	return newp;
}
EXPORT_SYMBOL(kvrealloc);
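
/*
 * Illustrative sketch, not part of the original file: doubling a kvmalloc'ed
 * buffer. Note that the old size must be passed in and that the old buffer
 * is left untouched when the new allocation fails. Names are invented for
 * the example.
 */
static __maybe_unused void *kvrealloc_demo_grow(void *buf, size_t *size)
{
	void *bigger = kvrealloc(buf, *size, *size * 2, GFP_KERNEL);

	if (!bigger)
		return buf;	/* keep the original buffer on failure */

	*size *= 2;
	return bigger;
}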

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;
	return __vmalloc(bytes, flags);
}
EXPORT_SYMBOL(__vmalloc_array);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vmalloc_array(size_t n, size_t size)
{
	return __vmalloc_array(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vcalloc(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vcalloc(size_t n, size_t size)
{
	return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc);
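
/*
 * Illustrative sketch, not part of the original file: the overflow-checked
 * way to get a zeroed, virtually contiguous array instead of open-coding
 * vmalloc(n * size). The structure and function are invented for the example.
 */
struct vcalloc_demo_entry {
	u64 key;
	u64 value;
};

static __maybe_unused struct vcalloc_demo_entry *vcalloc_demo_alloc(size_t nr)
{
	/* Returns NULL if nr * sizeof(entry) would overflow. */
	return vcalloc(nr, sizeof(struct vcalloc_demo_entry));
}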

struct anon_vma *folio_anon_vma(struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	return (void *)(mapping - PAGE_MAPPING_ANON);
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policies
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When switching to the
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid a possible race condition
	 * (even though the user usually won't switch policies frequently),
	 * the switch is done in the following order:
	 *	1. change the batch
	 *	2. sync the percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}
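
/*
 * Worked example with illustrative numbers only: with 8 GiB of RAM, no
 * hugetlb pages, 2 GiB of swap and the default overcommit_ratio of 50, the
 * OVERCOMMIT_NEVER limit above is 8 GiB * 50% + 2 GiB = 6 GiB of committed
 * address space. A non-zero overcommit_kbytes bypasses the ratio entirely.
 */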

/*
 * Make sure vm_committed_as is in its own cacheline and not shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platform like a 2S/36C/72T Skylake server, in worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
			    __func__, current->pid, current->comm);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NULL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
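
/*
 * Illustrative sketch, not part of the original file: because mem_dump_obj()
 * finishes the line with pr_cont(), the caller prints its own preamble first.
 * The wrapper name is invented for the example.
 */
static void __maybe_unused mem_dump_obj_demo(void *object)
{
	pr_info("suspect object %p:", object);
	mem_dump_obj(object);	/* appends the provenance, e.g. the slab cache name */
}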
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- e.g., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}

void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}

void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
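
/*
 * Illustrative sketch, not part of the original file: how a PFN walker is
 * expected to bracket its PageOffline() check so that no page can newly
 * become PageOffline() while its contents are being read. The function name
 * is invented for the example.
 */
static void __maybe_unused page_offline_walker_demo(struct page *page)
{
	page_offline_freeze();
	if (!PageOffline(page)) {
		/* ... safe to read the page contents here ... */
	}
	page_offline_thaw();
}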

#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
1158