// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)
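/*
 * For example, with the 128-byte struct sgx_pcmd a 4 KiB PCMD page holds
 * 4096 / 128 = 32 entries, so enclave page indices 32..63 all share one
 * PCMD page, and the page index backing its first entry is
 * page_index & ~PCMD_FIRST_MASK (e.g. 37 & ~0x1f = 32).
 */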

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in the process of being reclaimed.
 * @encl:        Enclave to which PCMD page belongs
 * @start_addr:  Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested by
 * checking whether any enclave page sharing the PCMD page is in the process
 * of being reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
				     unsigned long start_addr)
{
	int reclaimed = 0;
	int i;

	/*
	 * PCMD_FIRST_MASK is based on number of PCMD entries within
	 * PCMD page being 32.
	 */
	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

	for (i = 0; i < PCMDS_PER_PAGE; i++) {
		struct sgx_encl_page *entry;
		unsigned long addr;

		addr = start_addr + i * PAGE_SIZE;

		/*
		 * Stop when reaching the SECS page - it does not
		 * have a page_array entry and its reclaim is
		 * started and completed with enclave mutex held so
		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
		 * flag.
		 */
		if (addr == encl->base + encl->size)
			break;

		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
		if (!entry)
			continue;

		/*
		 * The VA page slot ID uses the same bit as the flag, so it
		 * is important to ensure that the page is not already in
		 * the backing store.
		 */
		if (entry->epc_page &&
		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
			reclaimed = 1;
			break;
		}
	}

	return reclaimed;
}

/*
 * Calculate the byte offset of the PCMD struct associated with an enclave
 * page. PCMDs follow right after the EPC data in the backing storage. In
 * addition to the visible enclave pages, there's one extra page slot for
 * SECS, before the PCMD structs.
 */
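/*
 * For example, for a 16-page (64 KiB) enclave the PCMD area starts at byte
 * offset 64 KiB + 4 KiB (the SECS slot) = 68 KiB, and the PCMD of page index
 * 3 lives at 68 KiB + 3 * sizeof(struct sgx_pcmd).
 */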
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							     unsigned long page_index)
{
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}

/*
 * Free a page from the backing storage at the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

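	/*
	 * A NULL secs_page means that the page being loaded is the SECS
	 * itself. Its backing slot is the one right after the last enclave
	 * page, i.e. page index encl->size >> PAGE_SHIFT.
	 */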
	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of the enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
	pcmd_page = kmap_atomic(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		ret = -EFAULT;
	}

	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above. Check if the
	 * whole page is now empty, meaning that all PCMDs have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_atomic(pcmd_page);
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

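	/*
	 * Hold an extra reference on the PCMD page so that it stays pinned
	 * while the backing is released and, if it turned out to be empty,
	 * can be re-checked after it has been truncated.
	 */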
	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		pcmd_page = kmap_atomic(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_atomic(pcmd_page);
	}

	put_page(b.pcmd);

	return ret;
}

static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

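	/*
	 * The version stored in the VA slot is no longer needed once ELDU
	 * has succeeded, so release the slot and clear the VA offset from
	 * the page descriptor.
	 */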
	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}

static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
						unsigned long addr,
						unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_epc_page *epc_page;
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the faulted page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXEC} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

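	/*
	 * If the SECS page has itself been swapped out, it must be loaded
	 * back first: ELDU of a regular page needs the resident SECS as
	 * its parent.
	 */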
	if (!(encl->secs.epc_page)) {
		epc_page = sgx_encl_eldu(&encl->secs, NULL);
		if (IS_ERR(epc_page))
			return ERR_CAST(epc_page);
	}

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}

static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

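		/*
		 * The page is currently being reclaimed. Returning
		 * VM_FAULT_NOPAGE without installing a PTE makes the access
		 * fault again until the page becomes available.
		 */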
		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process, when sgx_encl_mm_add() failed
	 * to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}

/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:	an enclave pointer
 * @start:	lower bound of the address range, inclusive
 * @end:	upper bound of the address range, exclusive
 * @vm_flags:	VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by @vm_flags (its subset of {VM_READ,
 * VM_WRITE, VM_EXEC}) do not exceed the build time permissions of any of the
 * enclave pages within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have identical
 * or weaker permissions than the earlier declared permissions.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	struct sgx_encl_page *page;
	unsigned long count = 0;
	int ret = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

	/*
	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
	 * conflict with the enclave page permissions.
	 */
	if (current->personality & READ_IMPLIES_EXEC)
		return -EACCES;

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
		if (~page->vm_max_prot_bits & vm_prot_bits) {
			ret = -EACCES;
			break;
		}

		/* Reschedule on every XA_CHECK_SCHED iteration. */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			cond_resched();

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);

	return ret;
}

static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long newflags)
{
	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
			       unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
				unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
						   unsigned long addr,
						   unsigned long vm_flags)
{
	struct sgx_encl_page *entry;

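	/*
	 * -EBUSY means the page is being reclaimed. Drop the enclave lock so
	 * the reclaimer can make progress, then retry until the page can be
	 * loaded.
	 */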
	for ( ; ; ) {
		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page(encl, addr, vm_flags);
		if (PTR_ERR(entry) != -EBUSY)
			break;

		mutex_unlock(&encl->lock);
	}

	if (IS_ERR(entry))
		mutex_unlock(&encl->lock);

	return entry;
}

static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
			  void *buf, int len, int write)
{
	struct sgx_encl *encl = vma->vm_private_data;
	struct sgx_encl_page *entry = NULL;
	char data[sizeof(unsigned long)];
	unsigned long align;
	int offset;
	int cnt;
	int ret = 0;
	int i;

	/*
	 * If the process was forked, the VMA is still there but
	 * vm_private_data is set to NULL.
	 */
	if (!encl)
		return -EFAULT;

	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
		return -EFAULT;

	for (i = 0; i < len; i += cnt) {
		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
					      vma->vm_flags);
		if (IS_ERR(entry)) {
			ret = PTR_ERR(entry);
			break;
		}

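		/*
		 * EDBGRD and EDBGWR operate on naturally aligned word-sized
		 * (sizeof(unsigned long)) quantities, so copy one aligned
		 * word at a time and do a read-modify-write for partial
		 * writes.
		 */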
		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
		offset = (addr + i) & (sizeof(unsigned long) - 1);
		cnt = sizeof(unsigned long) - offset;
		cnt = min(cnt, len - i);

		ret = sgx_encl_debug_read(encl, entry, align, data);
		if (ret)
			goto out;

		if (write) {
			memcpy(data + offset, buf + i, cnt);
			ret = sgx_encl_debug_write(encl, entry, align, data);
			if (ret)
				goto out;
		} else {
			memcpy(buf + i, data + offset, cnt);
		}

out:
		mutex_unlock(&encl->lock);

		if (ret)
			break;
	}

	return ret < 0 ? ret : i;
}

const struct vm_operations_struct sgx_vm_ops = {
	.fault = sgx_vma_fault,
	.mprotect = sgx_vma_mprotect,
	.open = sgx_vma_open,
	.access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref:	address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
	unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
	struct sgx_va_page *va_page;
	struct sgx_encl_page *entry;
	unsigned long count = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));

	xas_lock(&xas);
	xas_for_each(&xas, entry, max_page_index) {
		if (entry->epc_page) {
			/*
			 * The page and its radix tree entry cannot be freed
			 * if the page is being held by the reclaimer.
			 */
			if (sgx_unmark_page_reclaimable(entry->epc_page))
				continue;

			sgx_encl_free_epc_page(entry->epc_page);
			encl->secs_child_cnt--;
			entry->epc_page = NULL;
		}

		kfree(entry);
		/*
		 * Invoke scheduler on every XA_CHECK_SCHED iteration
		 * to prevent soft lockups.
		 */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);

			cond_resched();

			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);

	xa_destroy(&encl->page_array);

	if (!encl->secs_child_cnt && encl->secs.epc_page) {
		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;
	}

	while (!list_empty(&encl->va_pages)) {
		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
					   list);
		list_del(&va_page->list);
		sgx_encl_free_epc_page(va_page->epc_page);
		kfree(va_page);
	}

	if (encl->backing)
		fput(encl->backing);

	cleanup_srcu_struct(&encl->srcu);

	WARN_ON_ONCE(!list_empty(&encl->mm_list));

	/* Detect EPC page leaks. */
	WARN_ON_ONCE(encl->secs_child_cnt);
	WARN_ON_ONCE(encl->secs.epc_page);

	kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
	struct sgx_encl_mm *tmp = NULL;

	/*
	 * The enclave itself can remove encl_mm. Note, objects can't be moved
	 * off an RCU-protected list, but deletion is ok.
	 */
	spin_lock(&encl_mm->encl->mm_lock);
	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
		if (tmp == encl_mm) {
			list_del_rcu(&encl_mm->list);
			break;
		}
	}
	spin_unlock(&encl_mm->encl->mm_lock);

	if (tmp == encl_mm) {
		synchronize_srcu(&encl_mm->encl->srcu);
		mmu_notifier_put(mn);
	}
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

	/* 'encl_mm' is going away, put the encl_mm->encl reference: */
	kref_put(&encl_mm->encl->refcount, sgx_encl_release);

	kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
	.release = sgx_mmu_notifier_release,
	.free_notifier = sgx_mmu_notifier_free,
};

static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
					    struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = NULL;
	struct sgx_encl_mm *tmp;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
		if (tmp->mm == mm) {
			encl_mm = tmp;
			break;
		}
	}

	srcu_read_unlock(&encl->srcu, idx);

	return encl_mm;
}

int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm;
	int ret;

	/*
	 * Even though a single enclave may be mapped into an mm more than once,
	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
	 * holding the mm's mmap lock for write before an mm can be added to or
	 * removed from an encl->mm_list.
	 */
	mmap_assert_write_locked(mm);

	/*
	 * It's possible that an entry already exists in the mm_list, because it
	 * is removed only on VFS release or process exit.
	 */
	if (sgx_encl_find_mm(encl, mm))
		return 0;

	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
	if (!encl_mm)
		return -ENOMEM;

	/* Grab a refcount for the encl_mm->encl reference: */
	kref_get(&encl->refcount);
	encl_mm->encl = encl;
	encl_mm->mm = mm;
	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
	if (ret) {
		kfree(encl_mm);
		return ret;
	}

	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	/* Pairs with smp_rmb() in sgx_reclaimer_block(). */
	smp_wmb();
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	return 0;
}

static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
					      pgoff_t index)
{
	struct inode *inode = encl->backing->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfpmask = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * sgx_encl_get_backing() - Pin the backing storage
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				struct sgx_backing *backing)
{
	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
	struct page *contents;
	struct page *pcmd;

	contents = sgx_encl_get_backing_page(encl, page_index);
	if (IS_ERR(contents))
		return PTR_ERR(contents);

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
	if (IS_ERR(pcmd)) {
		put_page(contents);
		return PTR_ERR(pcmd);
	}

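	/*
	 * pcmd_offset is the byte offset of this page's struct sgx_pcmd
	 * within the pinned PCMD page.
	 */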
	backing->page_index = page_index;
	backing->contents = contents;
	backing->pcmd = pcmd;
	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

	return 0;
}

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
	struct mem_cgroup *memcg = NULL;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * If called from normal task context, return the mem_cgroup
	 * of the current task's mm. The remainder of the handling is for
	 * ksgxd.
	 */
	if (!current_is_ksgxd())
		return get_mem_cgroup_from_mm(current->mm);

	/*
	 * Search the enclave's mm_list to find an mm associated with
	 * this enclave to charge the allocation to.
	 */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		memcg = get_mem_cgroup_from_mm(encl_mm->mm);

		mmput_async(encl_mm->mm);

		break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	/*
	 * In the rare case that there isn't an mm associated with
	 * the enclave, set memcg to the current active mem_cgroup.
	 * This will be the root mem_cgroup if there is no active
	 * mem_cgroup.
	 */
	if (!memcg)
		return get_mem_cgroup_from_mm(NULL);

	return memcg;
}

/**
 * sgx_encl_alloc_backing() - allocate a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
	int ret;

	ret = sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(memcg);
	mem_cgroup_put(encl_memcg);

	return ret;
}

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
			    struct sgx_backing *backing)
{
	return sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing:	data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
	put_page(backing->pcmd);
	put_page(backing->contents);
}

static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	pte_t pte;
	int ret;

	ret = pte_young(*ptep);
	if (ret) {
		pte = pte_mkold(*ptep);
		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
	}

	return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm:		mm_struct that is checked
 * @page:	enclave page to be tested for recent access
 *
 * Checks the Accessed (A) bit from the PTE corresponding to the enclave page
 * and clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
				  struct sgx_encl_page *page)
{
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	struct vm_area_struct *vma;
	int ret;

	ret = sgx_encl_find(mm, addr, &vma);
	if (ret)
		return 0;

	if (encl != vma->vm_private_data)
		return 0;

	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
	if (ret < 0)
		return 0;

	return ret;
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(void)
{
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(NULL, true);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	ret = __epa(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(-EFAULT);
	}

	return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page:	a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	if (slot < SGX_VA_SLOT_COUNT)
		set_bit(slot, va_page->slots);

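	/* Each VA slot holds an 8-byte version, hence the byte offset is slot * 8. */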
	return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page:	a &struct sgx_va_page instance
 * @offset:	offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
	clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page:	a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	return slot == SGX_VA_SLOT_COUNT;
}

/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page:	EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success puts the page back on the free page list. Otherwise, it
 * emits a WARNING to indicate that the page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int ret;

	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	ret = __eremove(sgx_get_epc_virt_addr(page));
	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
		return;

	sgx_free_epc_page(page);
}