// SPDX-License-Identifier: GPL-2.0
/*  Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in the process of being reclaimed.
 * @encl:        Enclave to which PCMD page belongs
 * @start_addr:  Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested with
 * a check if an enclave page sharing the PCMD page is in the process of being
 * reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
				     unsigned long start_addr)
{
	int reclaimed = 0;
	int i;

	/*
	 * PCMD_FIRST_MASK is based on number of PCMD entries within
	 * PCMD page being 32.
	 */
	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

	for (i = 0; i < PCMDS_PER_PAGE; i++) {
		struct sgx_encl_page *entry;
		unsigned long addr;

		addr = start_addr + i * PAGE_SIZE;

		/*
		 * Stop when reaching the SECS page - it does not
		 * have a page_array entry and its reclaim is
		 * started and completed with enclave mutex held so
		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
		 * flag.
		 */
		if (addr == encl->base + encl->size)
			break;

		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
		if (!entry)
			continue;

		/*
		 * VA page slot ID uses same bit as the flag so it is important
		 * to ensure that the page is not already in backing store.
		 */
		if (entry->epc_page &&
		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
			reclaimed = 1;
			break;
		}
	}

	return reclaimed;
}

/*
 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
 * follow right after the EPC data in the backing storage. In addition to the
 * visible enclave pages, there's one extra page slot for SECS, before PCMD
 * structs.
 */
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							    unsigned long page_index)
{
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}

/*
 * Free a page from the backing storage at the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
	pcmd_page = kmap_atomic(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		ret = -EFAULT;
	}

	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above.  Check if the
	 * whole page is now empty meaning that all PCMDs have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_atomic(pcmd_page);
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

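	/*
	 * Take an extra reference on the PCMD page so that it can still be
	 * mapped and checked below after the backing pin is released and the
	 * backing storage is (possibly) truncated.
	 */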
	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		pcmd_page = kmap_atomic(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_atomic(pcmd_page);
	}

	put_page(b.pcmd);

	return ret;
}

static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{

	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}

static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
						unsigned long addr,
						unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_epc_page *epc_page;
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the faulted page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

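	/*
	 * ELDU of a regular enclave page needs the SECS page, so load
	 * the SECS back into the EPC first if it has been reclaimed.
	 */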
	if (!(encl->secs.epc_page)) {
		epc_page = sgx_encl_eldu(&encl->secs, NULL);
		if (IS_ERR(epc_page))
			return ERR_CAST(epc_page);
	}

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}

static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process when sgx_encl_mm_add() failed
	 * to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}


/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:		an enclave pointer
 * @start:		lower bound of the address range, inclusive
 * @end:		upper bound of the address range, exclusive
 * @vm_flags:		VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by @vm_flags (the subset of {VM_READ,
 * VM_WRITE, VM_EXEC}) do not exceed the build time permissions of any of the
 * enclave pages within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have permissions
 * identical to or weaker than the permissions declared earlier.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *page;
	unsigned long count = 0;
	int ret = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

	/*
	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
	 * conflict with the enclave page permissions.
	 */
	if (current->personality & READ_IMPLIES_EXEC)
		return -EACCES;

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
		if (~page->vm_max_prot_bits & vm_prot_bits) {
			ret = -EACCES;
			break;
		}

		/* Reschedule on every XA_CHECK_SCHED iteration. */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			cond_resched();

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);

	return ret;
}

static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long newflags)
{
	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
			       unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
				unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
						   unsigned long addr,
						   unsigned long vm_flags)
{
	struct sgx_encl_page *entry;

	for ( ; ; ) {
		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page(encl, addr, vm_flags);
		if (PTR_ERR(entry) != -EBUSY)
			break;

		mutex_unlock(&encl->lock);
	}

	if (IS_ERR(entry))
		mutex_unlock(&encl->lock);

	return entry;
}

static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
			  void *buf, int len, int write)
{
	struct sgx_encl *encl = vma->vm_private_data;
	struct sgx_encl_page *entry = NULL;
	char data[sizeof(unsigned long)];
	unsigned long align;
	int offset;
	int cnt;
	int ret = 0;
	int i;

	/*
	 * If process was forked, VMA is still there but vm_private_data is set
	 * to NULL.
	 */
	if (!encl)
		return -EFAULT;

	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
		return -EFAULT;

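	/*
	 * ENCLS[EDBGRD] and ENCLS[EDBGWR] transfer one aligned machine word
	 * at a time, so access the enclave through an aligned bounce buffer.
	 */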
	for (i = 0; i < len; i += cnt) {
		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
					      vma->vm_flags);
		if (IS_ERR(entry)) {
			ret = PTR_ERR(entry);
			break;
		}

		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
		offset = (addr + i) & (sizeof(unsigned long) - 1);
		cnt = sizeof(unsigned long) - offset;
		cnt = min(cnt, len - i);

		ret = sgx_encl_debug_read(encl, entry, align, data);
		if (ret)
			goto out;

		if (write) {
			memcpy(data + offset, buf + i, cnt);
			ret = sgx_encl_debug_write(encl, entry, align, data);
			if (ret)
				goto out;
		} else {
			memcpy(buf + i, data + offset, cnt);
		}

out:
		mutex_unlock(&encl->lock);

		if (ret)
			break;
	}

	return ret < 0 ? ret : i;
}

const struct vm_operations_struct sgx_vm_ops = {
	.fault = sgx_vma_fault,
	.mprotect = sgx_vma_mprotect,
	.open = sgx_vma_open,
	.access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref:	address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
	unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
	struct sgx_va_page *va_page;
	struct sgx_encl_page *entry;
	unsigned long count = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));

	xas_lock(&xas);
	xas_for_each(&xas, entry, max_page_index) {
		if (entry->epc_page) {
			/*
			 * The page and its radix tree entry cannot be freed
			 * if the page is being held by the reclaimer.
			 */
			if (sgx_unmark_page_reclaimable(entry->epc_page))
				continue;

			sgx_encl_free_epc_page(entry->epc_page);
			encl->secs_child_cnt--;
			entry->epc_page = NULL;
		}

		kfree(entry);
		/*
		 * Invoke scheduler on every XA_CHECK_SCHED iteration
		 * to prevent soft lockups.
		 */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);

			cond_resched();

			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);

	xa_destroy(&encl->page_array);

	if (!encl->secs_child_cnt && encl->secs.epc_page) {
		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;
	}

	while (!list_empty(&encl->va_pages)) {
		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
					   list);
		list_del(&va_page->list);
		sgx_encl_free_epc_page(va_page->epc_page);
		kfree(va_page);
	}

	if (encl->backing)
		fput(encl->backing);

	cleanup_srcu_struct(&encl->srcu);

	WARN_ON_ONCE(!list_empty(&encl->mm_list));

	/* Detect EPC page leaks. */
	WARN_ON_ONCE(encl->secs_child_cnt);
	WARN_ON_ONCE(encl->secs.epc_page);

	kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
	struct sgx_encl_mm *tmp = NULL;

	/*
	 * The enclave itself can remove encl_mm.  Note, objects can't be moved
	 * off an RCU protected list, but deletion is ok.
	 */
	spin_lock(&encl_mm->encl->mm_lock);
	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
		if (tmp == encl_mm) {
			list_del_rcu(&encl_mm->list);
			break;
		}
	}
	spin_unlock(&encl_mm->encl->mm_lock);

	if (tmp == encl_mm) {
		synchronize_srcu(&encl_mm->encl->srcu);
		mmu_notifier_put(mn);
	}
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

	/* 'encl_mm' is going away, put encl_mm->encl reference: */
	kref_put(&encl_mm->encl->refcount, sgx_encl_release);

	kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
	.release		= sgx_mmu_notifier_release,
	.free_notifier		= sgx_mmu_notifier_free,
};

static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
					    struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = NULL;
	struct sgx_encl_mm *tmp;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
		if (tmp->mm == mm) {
			encl_mm = tmp;
			break;
		}
	}

	srcu_read_unlock(&encl->srcu, idx);

	return encl_mm;
}

int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm;
	int ret;

	/*
	 * Even though a single enclave may be mapped into an mm more than once,
	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
	 * holding the mm's mmap lock for write before an mm can be added to or
	 * removed from an encl->mm_list.
	 */
	mmap_assert_write_locked(mm);

	/*
	 * It's possible that an entry already exists in the mm_list, because it
	 * is removed only on VFS release or process exit.
	 */
	if (sgx_encl_find_mm(encl, mm))
		return 0;

	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
	if (!encl_mm)
		return -ENOMEM;

	/* Grab a refcount for the encl_mm->encl reference: */
	kref_get(&encl->refcount);
	encl_mm->encl = encl;
	encl_mm->mm = mm;
	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
	if (ret) {
		kfree(encl_mm);
		return ret;
	}

	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	/* Pairs with smp_rmb() in sgx_reclaimer_block(). */
	smp_wmb();
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	return 0;
}

static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
					      pgoff_t index)
{
	struct inode *inode = encl->backing->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfpmask = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * sgx_encl_get_backing() - Pin the backing storage
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				struct sgx_backing *backing)
{
	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
	struct page *contents;
	struct page *pcmd;

	contents = sgx_encl_get_backing_page(encl, page_index);
	if (IS_ERR(contents))
		return PTR_ERR(contents);

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
	if (IS_ERR(pcmd)) {
		put_page(contents);
		return PTR_ERR(pcmd);
	}

	backing->page_index = page_index;
	backing->contents = contents;
	backing->pcmd = pcmd;
	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

	return 0;
}

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
	struct mem_cgroup *memcg = NULL;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * If called from normal task context, return the mem_cgroup
	 * of the current task's mm. The remainder of the handling is for
	 * ksgxd.
	 */
	if (!current_is_ksgxd())
		return get_mem_cgroup_from_mm(current->mm);

	/*
	 * Search the enclave's mm_list to find an mm associated with
	 * this enclave to charge the allocation to.
	 */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		memcg = get_mem_cgroup_from_mm(encl_mm->mm);

		mmput_async(encl_mm->mm);

		break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	/*
	 * In the rare case that there isn't an mm associated with
	 * the enclave, set memcg to the current active mem_cgroup.
	 * This will be the root mem_cgroup if there is no active
	 * mem_cgroup.
	 */
	if (!memcg)
		return get_mem_cgroup_from_mm(NULL);

	return memcg;
}

/**
 * sgx_encl_alloc_backing() - allocate a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
	int ret;

	ret = sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(memcg);
	mem_cgroup_put(encl_memcg);

	return ret;
}

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
			    struct sgx_backing *backing)
{
	return sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing:	data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
	put_page(backing->pcmd);
	put_page(backing->contents);
}

static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	pte_t pte;
	int ret;

	ret = pte_young(*ptep);
	if (ret) {
		pte = pte_mkold(*ptep);
		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
	}

	return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm:		mm_struct that is checked
 * @page:	enclave page to be tested for recent access
 *
 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
 * clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
				  struct sgx_encl_page *page)
{
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	struct vm_area_struct *vma;
	int ret;

	ret = sgx_encl_find(mm, addr, &vma);
	if (ret)
		return 0;

	if (encl != vma->vm_private_data)
		return 0;

	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
	if (ret < 0)
		return 0;

	return ret;
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(void)
{
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(NULL, true);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	ret = __epa(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(-EFAULT);
	}

	return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page:	a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	if (slot < SGX_VA_SLOT_COUNT)
		set_bit(slot, va_page->slots);

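	/* Each VA slot is 8 bytes, so convert the slot index to a byte offset. */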
	return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page:	a &struct sgx_va_page instance
 * @offset:	offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
	clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page:	a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	return slot == SGX_VA_SLOT_COUNT;
}

/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page:	EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success, it puts the page back to the free page list.  Otherwise,
 * it gives a WARNING to indicate the page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int ret;

	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	ret = __eremove(sgx_get_epc_virt_addr(page));
	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
		return;

	sgx_free_epc_page(page);
}
1011