// SPDX-License-Identifier: GPL-2.0
/*
 * Implement mseal() syscall.
 *
 * Copyright (c) 2023,2024 Google, Inc.
 *
 * Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/page_size_compat.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

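/* Return true if VM_SEALED has been set on this vma. */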
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_SEALED);
}

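/*
 * Mark the vma sealed. Sealing is one-way: nothing clears VM_SEALED again,
 * since unseal() is not supported.
 */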
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/*
 * Check if a vma is allowed to be modified.
 * Return true if modification is allowed.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
	if (unlikely(vma_is_sealed(vma)))
		return false;

	return true;
}

static bool is_madv_discard(int behavior)
{
	/* madvise() behaviors are enum values, not bit flags. */
	return behavior == MADV_FREE || behavior == MADV_DONTNEED ||
	       behavior == MADV_DONTNEED_LOCKED || behavior == MADV_REMOVE ||
	       behavior == MADV_DONTFORK || behavior == MADV_WIPEONFORK;
}

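/*
 * Check for a private anonymous mapping that the caller cannot write to;
 * this is the case where a destructive madvise behavior would effectively
 * wipe data the user has no other way to modify.
 */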
static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO, or the protection key does not permit write access.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
	    !arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (unlikely(!can_modify_vma(vma)))
			return false;
	}

	/* Allow by default. */
	return true;
}

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
		int behavior)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	if (!is_madv_discard(behavior))
		return true;

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end)
		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
			return false;

	/* Allow by default. */
	return true;
}

/*
 * mseal_fixup() is almost the same as mlock_fixup().
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(vmi, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(vmi, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	set_vma_sealed(vma);

out:
	*prev = vma;
	return ret;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() should have already checked the ENOMEM case,
	 * so vma should not be NULL here; the same applies to the other
	 * ENOMEM cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}

/*
 * mseal(2) seals the VM's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 * start (addr) must be in a valid VMA.
 * end (addr + len) must be in a valid VMA.
 * no gap (unallocated memory) between start and end.
 * start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * The following VMA operations are blocked after sealing.
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(). These can leave an empty space which
 *    can then be replaced with a VMA with a new set of attributes.
 * 2> Moving or expanding a different vma into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion, via mremap(), does not appear to pose any
 *    specific risks to sealed VMAs. It is included anyway because
 *    the use case is unclear. In any case, users can rely on
 *    merging to expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *    for anonymous memory, when users don't have write permission to the
 *    memory. Those behaviors can alter region contents by discarding pages,
 *    effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 * Note:
 *  user can call mseal(2) multiple times; sealing an already sealed
 *  region is a no-op (no error).
 *
 *  unseal() is not supported.
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!__PAGE_ALIGNED(start))
		return -EINVAL;

	len = __PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the address range. This helps to avoid
	 * partial sealing when the input range is invalid, e.g. on an
	 * ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed, unless the vma merge/split
	 * path fails, e.g. the process reaching the maximum number of
	 * supported VMAs; however, those cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}

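/*
 * Illustrative userspace sketch (comment only, not compiled here): how a
 * process might seal a private anonymous mapping. This assumes a libc
 * without an mseal() wrapper, hence the raw syscall(); __NR_mseal comes
 * from recent UAPI headers. Error handling is omitted for brevity.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	size_t len = sysconf(_SC_PAGESIZE);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	syscall(__NR_mseal, p, len, 0);
 *
 * After this, munmap(), mprotect(), mremap() and mmap(MAP_FIXED) over
 * [p, p + len) fail with EPERM, and the seal cannot be removed.
 */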