// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/page_size_compat.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

/* Mark the VMA as sealed; the caller must hold the mmap write lock. */
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/* madvise() behaviors treated as destructive for sealing purposes. */
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
	case MADV_GUARD_INSTALL:
		return true;
	}

	return false;
}

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* Check for an anonymous, private mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * Check for a non-writable mapping: either PROT_WRITE is not
	 * set, or the protection key (e.g. PKRU on x86) denies writes.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
		!arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if a vma is allowed to be modified by madvise. Only the
 * discarding behaviors are restricted, and only for sealed,
 * read-only anonymous mappings whose contents the discard would
 * effectively destroy.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (!is_madv_discard(behavior))
		return true;

	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Allow by default. */
	return true;
}
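
/*
 * Illustrative userspace sketch (not part of this file) of the check
 * above: a destructive madvise() on a sealed, read-only anonymous
 * mapping is rejected, so the mapping's contents cannot be wiped.
 * Assumes a kernel with mseal(2) and a libc without a wrapper, hence
 * the raw syscall(__NR_mseal, ...).
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		size_t len = (size_t)sysconf(_SC_PAGESIZE);
 *		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		memset(p, 'A', len);
 *		mprotect(p, len, PROT_READ);
 *		syscall(__NR_mseal, p, len, 0);
 *
 *		// Discard is refused with EPERM; the data survives.
 *		if (madvise(p, len, MADV_DONTNEED) == -1)
 *			perror("madvise");
 *		printf("first byte: %c\n", p[0]);	// still 'A'
 *		return 0;
 *	}
 */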

/*
 * Split or merge the VMA as needed, then set VM_SEALED on the
 * resulting [start, end) range.
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}

/*
 * Check preconditions for do_mseal():
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* Walk each VMA in the range and check for gaps. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* Unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}
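
/*
 * Illustrative userspace sketch (not part of this file) of the no-gap
 * rule enforced above: sealing a range containing an unmapped hole
 * fails with ENOMEM and seals nothing. Reuses the headers from the
 * sketch following can_modify_vma_madv() above.
 *
 *	size_t page = (size_t)sysconf(_SC_PAGESIZE);
 *	char *p = mmap(NULL, 3 * page, PROT_READ,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	munmap(p + page, page);			// punch a hole
 *	// Fails with ENOMEM: [p, p + 3 * page) is not fully mapped.
 *	syscall(__NR_mseal, p, 3 * page, 0);
 */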

/*
 * Apply sealing to the range.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() should have already checked the ENOMEM
	 * case, so vma should not be NULL; the same holds for the other
	 * ENOMEM cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
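
/*
 * Illustrative userspace sketch (not part of this file): sealing only
 * part of a mapping splits the VMA at the range boundary, so the
 * unsealed remainder stays mutable. Reuses the headers from the
 * sketch following can_modify_vma_madv() above.
 *
 *	size_t page = (size_t)sysconf(_SC_PAGESIZE);
 *	char *p = mmap(NULL, 2 * page, PROT_READ,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	syscall(__NR_mseal, p, page, 0);	// seal the first page only
 *	// Sealed: fails with EPERM.
 *	mprotect(p, page, PROT_READ | PROT_WRITE);
 *	// Unsealed remainder: succeeds.
 *	mprotect(p + page, page, PROT_READ | PROT_WRITE);
 */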

/*
 * mseal(2) seals the VM's metadata from selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range given by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *   The following VMA operations are blocked after sealing:
 *   1> Unmapping, moving to another location, and shrinking the size,
 *	via munmap() and mremap(). These can leave an empty space in
 *	the address range, which can then be replaced by a VMA with a
 *	new set of attributes.
 *   2> Moving or expanding a different vma into the current location,
 *	via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(). It does not appear to pose any
 *	specific risks to sealed VMAs, but is included anyway because
 *	the use case is unclear. In any case, users can rely on
 *	merging to expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *	for anonymous memory, when users don't have write permission
 *	to the memory. Those behaviors can alter region contents by
 *	discarding pages, effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   Address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 *
 * Note:
 *  Users can call mseal(2) multiple times; sealing an already sealed
 *  memory range is a no-op (no error).
 *
 *  unseal() is not supported.
 *
 *  An illustrative usage sketch appears at the end of this file.
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!__PAGE_ALIGNED(start))
		return -EINVAL;

	len = __PAGE_ALIGN(len_in);
	/* Check whether len was rounded up from a small negative value to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the address range. This avoids partial
	 * sealing when the input range is invalid, e.g. on ENOMEM.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: apply sealing. This should succeed unless
	 * vma_modify_flags() fails, e.g. on a merge/split error or the
	 * process reaching the maximum supported number of VMAs;
	 * however, those cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
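
/*
 * Illustrative userspace sketch (not part of this file) of typical
 * mseal(2) usage, as referenced in the comment above do_mseal():
 * seal a mapping, verify that mprotect() and munmap() are rejected
 * with EPERM, and confirm that sealing twice is a no-op. Assumes a
 * libc without an mseal() wrapper, hence the raw syscall().
 *
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 2 * (size_t)sysconf(_SC_PAGESIZE);
 *		void *p = mmap(NULL, len, PROT_READ,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		if (syscall(__NR_mseal, p, len, 0))
 *			return 1;	// e.g. ENOSYS on older kernels
 *
 *		// Both fail with EPERM once the mapping is sealed.
 *		if (mprotect(p, len, PROT_READ | PROT_WRITE) == -1)
 *			perror("mprotect");
 *		if (munmap(p, len) == -1)
 *			perror("munmap");
 *
 *		// Sealing an already sealed range is a no-op.
 *		return syscall(__NR_mseal, p, len, 0);
 *	}
 */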