// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/page_size_compat.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_SEALED);
}

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/*
 * Check whether a vma is allowed to be modified.
 * Return true if modification is allowed.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
	if (unlikely(vma_is_sealed(vma)))
		return false;

	return true;
}

static bool is_madv_discard(int behavior)
{
	return behavior == MADV_FREE || behavior == MADV_DONTNEED ||
	       behavior == MADV_DONTNEED_LOCKED || behavior == MADV_REMOVE ||
	       behavior == MADV_DONTFORK || behavior == MADV_WIPEONFORK;
}

static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* Check for an anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * Check for non-writable memory:
	 * either PROT is read-only, or the protection key (PKRU)
	 * does not permit writes.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
		!arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (unlikely(!can_modify_vma(vma)))
			return false;
	}

	/* Allow by default. */
	return true;
}
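
/*
 * Illustrative sketch (not part of this file): callers that are about to
 * modify or destroy mappings are expected to consult can_modify_mm() while
 * holding the mmap write lock and bail out with -EPERM if any VMA in the
 * range is sealed. An munmap()/mprotect()-style path might do roughly the
 * following; the real call sites live in their own files, not here:
 *
 *	if (unlikely(!can_modify_mm(mm, start, end)))
 *		return -EPERM;
 */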

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
		int behavior)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	if (!is_madv_discard(behavior))
		return true;

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end)
		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
			return false;

	/* Allow by default. */
	return true;
}
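
/*
 * Illustrative sketch (not part of this file): the madvise() path is
 * expected to make the equivalent check before walking the VMAs, passing
 * the requested behavior so that non-discard behaviors stay allowed on
 * sealed memory. Hypothetically, a caller in mm/madvise.c could look
 * roughly like this:
 *
 *	error = -EPERM;
 *	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior)))
 *		goto out;
 */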

/*
 * mseal_fixup() is almost the same as mlock_fixup().
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(vmi, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(vmi, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	set_vma_sealed(vma);

out:
	*prev = vma;
	return ret;
}


/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() should have already checked the ENOMEM case,
	 * so vma should not be NULL; the same holds for the other ENOMEM
	 * cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}

/*
 * mseal(2) seals the VM's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range given by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *   The VMA operations below are blocked after sealing.
 *   1> Unmapping, moving to another location, and shrinking the size,
 *	via munmap() and mremap(): these can leave an empty space,
 *	which can then be replaced with a VMA carrying a new set of
 *	attributes.
 *   2> Moving or expanding a different vma into the current location,
 *	via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(), does not appear to pose any
 *	specific risks to sealed VMAs. It is included anyway because
 *	the use case is unclear. In any case, users can rely on
 *	merging to expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *	for anonymous memory, when users don't have write permission to
 *	the memory. Those behaviors can alter region contents by
 *	discarding pages, effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 * Note:
 *  users can call mseal(2) multiple times; sealing an already-sealed
 *  memory range is a no-op (no error).
 *
 *  unseal() is not supported.
 */
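
/*
 * Example (userspace, illustrative only): seal a private anonymous mapping
 * and observe that later attempts to unmap or change its protection are
 * rejected. This assumes the kernel headers in use define __NR_mseal;
 * a libc wrapper may not be available.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	size_t len = sysconf(_SC_PAGESIZE);
 *	void *p = mmap(NULL, len, PROT_READ,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	long ret = syscall(__NR_mseal, p, len, 0);
 *
 * After a successful call (ret == 0), munmap(p, len) and
 * mprotect(p, len, PROT_WRITE) both fail with EPERM.
 */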
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!__PAGE_ALIGNED(start))
		return -EINVAL;

	len = __PAGE_ALIGN(len_in);
	/* Check whether len was rounded up from a small negative value to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the address range up front. This helps to avoid
	 * partial sealing when the input address range is bad, e.g. on an
	 * ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed, unless mseal_fixup() fails on a
	 * merge/split error or the process reaches the maximum number of
	 * supported VMAs; those cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}