1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
81
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of 4KiB and
 * that the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * every power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
105
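/*
 * AGAW (adjusted guest address width) helpers. Each page-table level
 * resolves LEVEL_STRIDE (9) bits, so an AGAW of N corresponds to an
 * (N + 2)-level table covering roughly 30 + 9 * N address bits; e.g.
 * AGAW 2 means a 4-level table and a 48-bit address width.
 */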
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209 }
210
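/*
 * Bit-level accessors for context entries. The "copied" flag (bit 3 of
 * the high qword) is used as a software marker for context entries
 * inherited from a previous kernel (e.g. across a kexec/kdump handover);
 * context_present() uses it so that such stale entries are not treated
 * as live translations when PASID support is not enabled.
 */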
static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 context->lo &= ~(1ULL << 11);
214 }
215
static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 return !!(context->lo & (1ULL << 11));
219 }
220
static inline void context_set_copied(struct context_entry *context)
222 {
223 context->hi |= (1ull << 3);
224 }
225
static inline bool context_copied(struct context_entry *context)
227 {
228 return !!(context->hi & (1ULL << 3));
229 }
230
static inline bool __context_present(struct context_entry *context)
232 {
233 return (context->lo & 1);
234 }
235
bool context_present(struct context_entry *context)
237 {
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241 }
242
static inline void context_set_present(struct context_entry *context)
244 {
245 context->lo |= 1;
246 }
247
static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 context->lo &= (((u64)-1) << 2) | 1;
251 }
252
static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
255 {
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258 }
259
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
262 {
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265 }
266
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
269 {
270 context->hi |= value & 7;
271 }
272
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
275 {
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
static inline int context_domain_id(struct context_entry *c)
280 {
281 return((c->hi >> 8) & 0xffff);
282 }
283
static inline void context_clear_entry(struct context_entry *context)
285 {
286 context->lo = 0;
287 context->hi = 0;
288 }
289
/*
 * This domain is a static identity mapping domain.
 * 1. This domain creates a static 1:1 mapping of all usable memory.
 * 2. It maps to each iommu if successful.
 * 3. Each iommu maps to this domain if successful.
 */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX 2
362 #define IDENTMAP_AZALIA 4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
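/*
 * Sentinel stored as the device's IOMMU private data while attachment of
 * the device to its domain is deferred (typically until the device is
 * actually used for DMA); get_domain_info() hides it from callers by
 * returning NULL.
 */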
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 struct device_domain_info *info;
371
372 if (!dev)
373 return NULL;
374
375 info = dev_iommu_priv_get(dev);
376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 return NULL;
378
379 return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
386 to_pci_dev(d)->untrusted)
387
388 /*
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
391 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
394 {
395 int ret = 0;
396 unsigned long flags;
397 struct device_domain_info *info;
398
399 spin_lock_irqsave(&device_domain_lock, flags);
400 list_for_each_entry(info, &device_domain_list, global) {
401 ret = fn(info, data);
402 if (ret) {
403 spin_unlock_irqrestore(&device_domain_lock, flags);
404 return ret;
405 }
406 }
407 spin_unlock_irqrestore(&device_domain_lock, flags);
408
409 return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
static void init_translation_status(struct intel_iommu *iommu)
425 {
426 u32 gsts;
427
428 gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 if (gsts & DMA_GSTS_TES)
430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
static int __init intel_iommu_setup(char *str)
434 {
435 if (!str)
436 return -EINVAL;
437 while (*str) {
438 if (!strncmp(str, "on", 2)) {
439 dmar_disabled = 0;
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
442 dmar_disabled = 1;
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
446 dmar_map_gfx = 0;
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
450 dmar_forcedac = 1;
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
459 intel_iommu_sm = 1;
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 intel_iommu_tboot_noforce = 1;
463 } else if (!strncmp(str, "nobounce", 8)) {
464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 intel_no_bounce = 1;
466 }
467
468 str += strcspn(str, ",");
469 while (*str == ',')
470 str++;
471 }
472 return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
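/*
 * Per-IOMMU domain lookup. iommu->domains is a two-level array indexed
 * by domain ID: the high byte selects a lazily allocated chunk of 256
 * dmar_domain pointers and the low byte selects the entry within it.
 */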
static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481 struct dmar_domain **domains;
482 int idx = did >> 8;
483
484 domains = iommu->domains[idx];
485 if (!domains)
486 return NULL;
487
488 return domains[did & 0xff];
489 }
490
static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
493 {
494 struct dmar_domain **domains;
495 int idx = did >> 8;
496
497 if (!iommu->domains[idx]) {
498 size_t size = 256 * sizeof(struct dmar_domain *);
499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 }
501
502 domains = iommu->domains[idx];
503 if (WARN_ON(!domains))
504 return;
505 else
506 domains[did & 0xff] = domain;
507 }
508
void *alloc_pgtable_page(int node)
510 {
511 struct page *page;
512 void *vaddr = NULL;
513
514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 if (page)
516 vaddr = page_address(page);
517 return vaddr;
518 }
519
void free_pgtable_page(void *vaddr)
521 {
522 free_page((unsigned long)vaddr);
523 }
524
static inline void *alloc_domain_mem(void)
526 {
527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
static void free_domain_mem(void *vaddr)
531 {
532 kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
static inline void *alloc_devinfo_mem(void)
536 {
537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
static inline void free_devinfo_mem(void *vaddr)
541 {
542 kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
557 {
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 /*
564 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
565 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
566 * the returned SAGAW.
567 */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
569 {
570 unsigned long fl_sagaw, sl_sagaw;
571
572 fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
573 sl_sagaw = cap_sagaw(iommu->cap);
574
575 /* Second level only. */
576 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
577 return sl_sagaw;
578
579 /* First level only. */
580 if (!ecap_slts(iommu->ecap))
581 return fl_sagaw;
582
583 return fl_sagaw & sl_sagaw;
584 }
585
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588 unsigned long sagaw;
589 int agaw = -1;
590
591 sagaw = __iommu_calculate_sagaw(iommu);
592 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
593 if (test_bit(agaw, &sagaw))
594 break;
595 }
596
597 return agaw;
598 }
599
600 /*
601 * Calculate max SAGAW for each iommu.
602 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
604 {
605 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
606 }
607
/*
 * Calculate the AGAW for each IOMMU.
 * "SAGAW" may differ across IOMMUs, so start from a default AGAW and
 * fall back to a smaller supported AGAW for IOMMUs that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
617
/* This function only returns a single IOMMU in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
620 {
621 int iommu_id;
622
623 /* si_domain and vm domain should not get here. */
624 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
625 return NULL;
626
627 for_each_domain_iommu(iommu_id, domain)
628 break;
629
630 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
631 return NULL;
632
633 return g_iommus[iommu_id];
634 }
635
static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
637 {
638 return sm_supported(iommu) ?
639 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
640 }
641
static void domain_update_iommu_coherency(struct dmar_domain *domain)
643 {
644 struct dmar_drhd_unit *drhd;
645 struct intel_iommu *iommu;
646 bool found = false;
647 int i;
648
649 domain->iommu_coherency = 1;
650
651 for_each_domain_iommu(i, domain) {
652 found = true;
653 if (!iommu_paging_structure_coherency(g_iommus[i])) {
654 domain->iommu_coherency = 0;
655 break;
656 }
657 }
658 if (found)
659 return;
660
661 /* No hardware attached; use lowest common denominator */
662 rcu_read_lock();
663 for_each_active_iommu(iommu, drhd) {
664 if (!iommu_paging_structure_coherency(iommu)) {
665 domain->iommu_coherency = 0;
666 break;
667 }
668 }
669 rcu_read_unlock();
670 }
671
static int domain_update_iommu_snooping(struct intel_iommu *skip)
673 {
674 struct dmar_drhd_unit *drhd;
675 struct intel_iommu *iommu;
676 int ret = 1;
677
678 rcu_read_lock();
679 for_each_active_iommu(iommu, drhd) {
680 if (iommu != skip) {
681 /*
682 * If the hardware is operating in the scalable mode,
683 * the snooping control is always supported since we
684 * always set PASID-table-entry.PGSNP bit if the domain
685 * is managed outside (UNMANAGED).
686 */
687 if (!sm_supported(iommu) &&
688 !ecap_sc_support(iommu->ecap)) {
689 ret = 0;
690 break;
691 }
692 }
693 }
694 rcu_read_unlock();
695
696 return ret;
697 }
698
static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
701 {
702 struct dmar_drhd_unit *drhd;
703 struct intel_iommu *iommu;
704 int mask = 0x3;
705
706 if (!intel_iommu_superpage) {
707 return 0;
708 }
709
710 /* set iommu_superpage to the smallest common denominator */
711 rcu_read_lock();
712 for_each_active_iommu(iommu, drhd) {
713 if (iommu != skip) {
714 if (domain && domain_use_first_level(domain)) {
715 if (!cap_fl1gp_support(iommu->cap))
716 mask = 0x1;
717 } else {
718 mask &= cap_super_page_val(iommu->cap);
719 }
720
721 if (!mask)
722 break;
723 }
724 }
725 rcu_read_unlock();
726
727 return fls(mask);
728 }
729
static int domain_update_device_node(struct dmar_domain *domain)
731 {
732 struct device_domain_info *info;
733 int nid = NUMA_NO_NODE;
734
735 assert_spin_locked(&device_domain_lock);
736
737 if (list_empty(&domain->devices))
738 return NUMA_NO_NODE;
739
740 list_for_each_entry(info, &domain->devices, link) {
741 if (!info->dev)
742 continue;
743
		/*
		 * There could be multiple device NUMA nodes, as devices within
		 * the same domain may sit behind different IOMMUs. There is no
		 * perfect answer in such a situation, so we use a first-come,
		 * first-served policy.
		 */
750 nid = dev_to_node(info->dev);
751 if (nid != NUMA_NO_NODE)
752 break;
753 }
754
755 return nid;
756 }
757
758 /* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
760 {
761 domain_update_iommu_coherency(domain);
762 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
764
765 /*
766 * If RHSA is missing, we should default to the device numa domain
767 * as fall back.
768 */
769 if (domain->nid == NUMA_NO_NODE)
770 domain->nid = domain_update_device_node(domain);
771
772 /*
773 * First-level translation restricts the input-address to a
774 * canonical address (i.e., address bits 63:N have the same
775 * value as address bit [N-1], where N is 48-bits with 4-level
776 * paging and 57-bits with 5-level paging). Hence, skip bit
777 * [N-1].
778 */
779 if (domain_use_first_level(domain))
780 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
781 else
782 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
783 }
784
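/*
 * Return the context entry for (@bus, @devfn), allocating the context
 * table on demand when @alloc is non-zero. In scalable mode each root
 * entry is split into a lower and an upper half covering 128 devfns
 * each, and every device consumes a pair of context entries.
 */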
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
787 {
788 struct root_entry *root = &iommu->root_entry[bus];
789 struct context_entry *context;
790 u64 *entry;
791
792 entry = &root->lo;
793 if (sm_supported(iommu)) {
794 if (devfn >= 0x80) {
795 devfn -= 0x80;
796 entry = &root->hi;
797 }
798 devfn *= 2;
799 }
800 if (*entry & 1)
801 context = phys_to_virt(*entry & VTD_PAGE_MASK);
802 else {
803 unsigned long phy_addr;
804 if (!alloc)
805 return NULL;
806
807 context = alloc_pgtable_page(iommu->node);
808 if (!context)
809 return NULL;
810
811 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
812 phy_addr = virt_to_phys((void *)context);
813 *entry = phy_addr | 1;
814 __iommu_flush_cache(iommu, entry, sizeof(*entry));
815 }
816 return &context[devfn];
817 }
818
static bool attach_deferred(struct device *dev)
820 {
821 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
822 }
823
824 /**
825 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
826 * sub-hierarchy of a candidate PCI-PCI bridge
827 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
828 * @bridge: the candidate PCI-PCI bridge
829 *
830 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
831 */
832 static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
834 {
835 struct pci_dev *pdev, *pbridge;
836
837 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
838 return false;
839
840 pdev = to_pci_dev(dev);
841 pbridge = to_pci_dev(bridge);
842
843 if (pbridge->subordinate &&
844 pbridge->subordinate->number <= pdev->bus->number &&
845 pbridge->subordinate->busn_res.end >= pdev->bus->number)
846 return true;
847
848 return false;
849 }
850
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
852 {
853 struct dmar_drhd_unit *drhd;
854 u32 vtbar;
855 int rc;
856
857 /* We know that this device on this chipset has its own IOMMU.
858 * If we find it under a different IOMMU, then the BIOS is lying
859 * to us. Hope that the IOMMU for this device is actually
860 * disabled, and it needs no translation...
861 */
862 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
863 if (rc) {
864 /* "can't" happen */
865 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
866 return false;
867 }
868 vtbar &= 0xffff0000;
869
	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
871 drhd = dmar_find_matched_drhd_unit(pdev);
872 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
873 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
874 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
875 return true;
876 }
877
878 return false;
879 }
880
static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
882 {
883 if (!iommu || iommu->drhd->ignored)
884 return true;
885
886 if (dev_is_pci(dev)) {
887 struct pci_dev *pdev = to_pci_dev(dev);
888
889 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
890 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
891 quirk_ioat_snb_local_iommu(pdev))
892 return true;
893 }
894
895 return false;
896 }
897
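/*
 * Find the IOMMU that translates @dev by walking the DRHD device scope
 * tables, and report the bus/devfn to use when programming its context
 * entry. VFs are looked up through their PF, and ACPI devices through
 * their ACPI companion.
 */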
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
899 {
900 struct dmar_drhd_unit *drhd = NULL;
901 struct pci_dev *pdev = NULL;
902 struct intel_iommu *iommu;
903 struct device *tmp;
904 u16 segment = 0;
905 int i;
906
907 if (!dev)
908 return NULL;
909
910 if (dev_is_pci(dev)) {
911 struct pci_dev *pf_pdev;
912
913 pdev = pci_real_dma_dev(to_pci_dev(dev));
914
915 /* VFs aren't listed in scope tables; we need to look up
916 * the PF instead to find the IOMMU. */
917 pf_pdev = pci_physfn(pdev);
918 dev = &pf_pdev->dev;
919 segment = pci_domain_nr(pdev->bus);
920 } else if (has_acpi_companion(dev))
921 dev = &ACPI_COMPANION(dev)->dev;
922
923 rcu_read_lock();
924 for_each_iommu(iommu, drhd) {
925 if (pdev && segment != drhd->segment)
926 continue;
927
928 for_each_active_dev_scope(drhd->devices,
929 drhd->devices_cnt, i, tmp) {
930 if (tmp == dev) {
931 /* For a VF use its original BDF# not that of the PF
932 * which we used for the IOMMU lookup. Strictly speaking
933 * we could do this for all PCI devices; we only need to
934 * get the BDF# from the scope table for ACPI matches. */
935 if (pdev && pdev->is_virtfn)
936 goto got_pdev;
937
938 if (bus && devfn) {
939 *bus = drhd->devices[i].bus;
940 *devfn = drhd->devices[i].devfn;
941 }
942 goto out;
943 }
944
945 if (is_downstream_to_pci_bridge(dev, tmp))
946 goto got_pdev;
947 }
948
949 if (pdev && drhd->include_all) {
950 got_pdev:
951 if (bus && devfn) {
952 *bus = pdev->bus->number;
953 *devfn = pdev->devfn;
954 }
955 goto out;
956 }
957 }
958 iommu = NULL;
959 out:
960 if (iommu_is_dummy(iommu, dev))
961 iommu = NULL;
962
963 rcu_read_unlock();
964
965 return iommu;
966 }
967
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
970 {
971 if (!domain->iommu_coherency)
972 clflush_cache_range(addr, size);
973 }
974
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
976 {
977 struct context_entry *context;
978 int ret = 0;
979 unsigned long flags;
980
981 spin_lock_irqsave(&iommu->lock, flags);
982 context = iommu_context_addr(iommu, bus, devfn, 0);
983 if (context)
984 ret = context_present(context);
985 spin_unlock_irqrestore(&iommu->lock, flags);
986 return ret;
987 }
988
static void free_context_table(struct intel_iommu *iommu)
990 {
991 int i;
992 unsigned long flags;
993 struct context_entry *context;
994
995 spin_lock_irqsave(&iommu->lock, flags);
996 if (!iommu->root_entry) {
997 goto out;
998 }
999 for (i = 0; i < ROOT_ENTRY_NR; i++) {
1000 context = iommu_context_addr(iommu, i, 0, 0);
1001 if (context)
1002 free_pgtable_page(context);
1003
1004 if (!sm_supported(iommu))
1005 continue;
1006
1007 context = iommu_context_addr(iommu, i, 0x80, 0);
1008 if (context)
1009 free_pgtable_page(context);
1010
1011 }
1012 free_pgtable_page(iommu->root_entry);
1013 iommu->root_entry = NULL;
1014 out:
1015 spin_unlock_irqrestore(&iommu->lock, flags);
1016 }
1017
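/*
 * Walk (and, when needed, allocate) the domain's page table down to @pfn.
 * On entry, *target_level selects the level at which to stop (0 means
 * stop at the first superpage or non-present entry); on return it holds
 * the level that was actually reached.
 */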
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
1020 {
1021 struct dma_pte *parent, *pte;
1022 int level = agaw_to_level(domain->agaw);
1023 int offset;
1024
1025 BUG_ON(!domain->pgd);
1026
1027 if (!domain_pfn_supported(domain, pfn))
1028 /* Address beyond IOMMU's addressing capabilities. */
1029 return NULL;
1030
1031 parent = domain->pgd;
1032
1033 while (1) {
1034 void *tmp_page;
1035
1036 offset = pfn_level_offset(pfn, level);
1037 pte = &parent[offset];
1038 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1039 break;
1040 if (level == *target_level)
1041 break;
1042
1043 if (!dma_pte_present(pte)) {
1044 uint64_t pteval;
1045
1046 tmp_page = alloc_pgtable_page(domain->nid);
1047
1048 if (!tmp_page)
1049 return NULL;
1050
1051 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1052 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1053 if (domain_use_first_level(domain)) {
1054 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1055 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1056 pteval |= DMA_FL_PTE_ACCESS;
1057 }
1058 if (cmpxchg64(&pte->val, 0ULL, pteval))
1059 /* Someone else set it while we were thinking; use theirs. */
1060 free_pgtable_page(tmp_page);
1061 else
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1063 }
1064 if (level == 1)
1065 break;
1066
1067 parent = phys_to_virt(dma_pte_addr(pte));
1068 level--;
1069 }
1070
1071 if (!*target_level)
1072 *target_level = level;
1073
1074 return pte;
1075 }
1076
1077 /* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
1081 {
1082 struct dma_pte *parent, *pte;
1083 int total = agaw_to_level(domain->agaw);
1084 int offset;
1085
1086 parent = domain->pgd;
1087 while (level <= total) {
1088 offset = pfn_level_offset(pfn, total);
1089 pte = &parent[offset];
1090 if (level == total)
1091 return pte;
1092
1093 if (!dma_pte_present(pte)) {
1094 *large_page = total;
1095 break;
1096 }
1097
1098 if (dma_pte_superpage(pte)) {
1099 *large_page = total;
1100 return pte;
1101 }
1102
1103 parent = phys_to_virt(dma_pte_addr(pte));
1104 total--;
1105 }
1106 return NULL;
1107 }
1108
1109 /* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
1113 {
1114 unsigned int large_page;
1115 struct dma_pte *first_pte, *pte;
1116
1117 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1118 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1119 BUG_ON(start_pfn > last_pfn);
1120
1121 /* we don't need lock here; nobody else touches the iova range */
1122 do {
1123 large_page = 1;
1124 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1125 if (!pte) {
1126 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1127 continue;
1128 }
1129 do {
1130 dma_clear_pte(pte);
1131 start_pfn += lvl_to_nr_pages(large_page);
1132 pte++;
1133 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1134
1135 domain_flush_cache(domain, first_pte,
1136 (void *)pte - (void *)first_pte);
1137
1138 } while (start_pfn && start_pfn <= last_pfn);
1139 }
1140
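/*
 * Recursively walk the page table for [@start_pfn, @last_pfn] and, below
 * @retain_level, free any page-table page whose range is entirely covered,
 * clearing its parent PTE.
 */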
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
1145 {
1146 pfn = max(start_pfn, pfn);
1147 pte = &pte[pfn_level_offset(pfn, level)];
1148
1149 do {
1150 unsigned long level_pfn;
1151 struct dma_pte *level_pte;
1152
1153 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1154 goto next;
1155
1156 level_pfn = pfn & level_mask(level);
1157 level_pte = phys_to_virt(dma_pte_addr(pte));
1158
1159 if (level > 2) {
1160 dma_pte_free_level(domain, level - 1, retain_level,
1161 level_pte, level_pfn, start_pfn,
1162 last_pfn);
1163 }
1164
1165 /*
1166 * Free the page table if we're below the level we want to
1167 * retain and the range covers the entire table.
1168 */
1169 if (level < retain_level && !(start_pfn > level_pfn ||
1170 last_pfn < level_pfn + level_size(level) - 1)) {
1171 dma_clear_pte(pte);
1172 domain_flush_cache(domain, pte, sizeof(*pte));
1173 free_pgtable_page(level_pte);
1174 }
1175 next:
1176 pfn += level_size(level);
1177 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1178 }
1179
1180 /*
1181 * clear last level (leaf) ptes and free page table pages below the
1182 * level we wish to keep intact.
1183 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
1188 {
1189 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191 BUG_ON(start_pfn > last_pfn);
1192
1193 dma_pte_clear_range(domain, start_pfn, last_pfn);
1194
1195 /* We don't need lock here; nobody else touches the iova range */
1196 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1197 domain->pgd, 0, start_pfn, last_pfn);
1198
1199 /* free pgd */
1200 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1201 free_pgtable_page(domain->pgd);
1202 domain->pgd = NULL;
1203 }
1204 }
1205
1206 /* When a page at a given level is being unlinked from its parent, we don't
1207 need to *modify* it at all. All we need to do is make a list of all the
1208 pages which can be freed just as soon as we've flushed the IOTLB and we
1209 know the hardware page-walk will no longer touch them.
1210 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1211 be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
1215 {
1216 struct page *pg;
1217
1218 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1219 pg->freelist = freelist;
1220 freelist = pg;
1221
1222 if (level == 1)
1223 return freelist;
1224
1225 pte = page_address(pg);
1226 do {
1227 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1228 freelist = dma_pte_list_pagetables(domain, level - 1,
1229 pte, freelist);
1230 pte++;
1231 } while (!first_pte_in_page(pte));
1232
1233 return freelist;
1234 }
1235
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
1241 {
1242 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1243
1244 pfn = max(start_pfn, pfn);
1245 pte = &pte[pfn_level_offset(pfn, level)];
1246
1247 do {
1248 unsigned long level_pfn;
1249
1250 if (!dma_pte_present(pte))
1251 goto next;
1252
1253 level_pfn = pfn & level_mask(level);
1254
1255 /* If range covers entire pagetable, free it */
1256 if (start_pfn <= level_pfn &&
1257 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1260 if (level > 1 && !dma_pte_superpage(pte))
1261 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1262
1263 dma_clear_pte(pte);
1264 if (!first_pte)
1265 first_pte = pte;
1266 last_pte = pte;
1267 } else if (level > 1) {
1268 /* Recurse down into a level that isn't *entirely* obsolete */
1269 freelist = dma_pte_clear_level(domain, level - 1,
1270 phys_to_virt(dma_pte_addr(pte)),
1271 level_pfn, start_pfn, last_pfn,
1272 freelist);
1273 }
1274 next:
1275 pfn += level_size(level);
1276 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1277
1278 if (first_pte)
1279 domain_flush_cache(domain, first_pte,
1280 (void *)++last_pte - (void *)first_pte);
1281
1282 return freelist;
1283 }
1284
1285 /* We can't just free the pages because the IOMMU may still be walking
1286 the page tables, and may have cached the intermediate levels. The
1287 pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
1291 {
1292 struct page *freelist;
1293
1294 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1295 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1296 BUG_ON(start_pfn > last_pfn);
1297
1298 /* we don't need lock here; nobody else touches the iova range */
1299 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1300 domain->pgd, 0, start_pfn, last_pfn, NULL);
1301
1302 /* free pgd */
1303 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1304 struct page *pgd_page = virt_to_page(domain->pgd);
1305 pgd_page->freelist = freelist;
1306 freelist = pgd_page;
1307
1308 domain->pgd = NULL;
1309 }
1310
1311 return freelist;
1312 }
1313
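/* Free the chain of page-table pages collected by domain_unmap(). */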
static void dma_free_pagelist(struct page *freelist)
1315 {
1316 struct page *pg;
1317
1318 while ((pg = freelist)) {
1319 freelist = pg->freelist;
1320 free_pgtable_page(page_address(pg));
1321 }
1322 }
1323
static void iova_entry_free(unsigned long data)
1325 {
1326 struct page *freelist = (struct page *)data;
1327
1328 dma_free_pagelist(freelist);
1329 }
1330
1331 /* iommu handling */
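/*
 * Allocate a zeroed root table for @iommu and flush it to memory so the
 * hardware sees consistent contents once the table is installed via
 * iommu_set_root_entry().
 */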
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1333 {
1334 struct root_entry *root;
1335 unsigned long flags;
1336
1337 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1338 if (!root) {
1339 pr_err("Allocating root entry for %s failed\n",
1340 iommu->name);
1341 return -ENOMEM;
1342 }
1343
1344 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1345
1346 spin_lock_irqsave(&iommu->lock, flags);
1347 iommu->root_entry = root;
1348 spin_unlock_irqrestore(&iommu->lock, flags);
1349
1350 return 0;
1351 }
1352
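/*
 * Program the root-table address (tagged as a scalable-mode table when
 * supported), issue a Set Root Table Pointer command, and then globally
 * invalidate the context cache, the PASID cache (in scalable mode) and
 * the IOTLB.
 */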
static void iommu_set_root_entry(struct intel_iommu *iommu)
1354 {
1355 u64 addr;
1356 u32 sts;
1357 unsigned long flag;
1358
1359 addr = virt_to_phys(iommu->root_entry);
1360 if (sm_supported(iommu))
1361 addr |= DMA_RTADDR_SMT;
1362
1363 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1364 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1365
1366 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1367
1368 /* Make sure hardware complete it */
1369 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1370 readl, (sts & DMA_GSTS_RTPS), sts);
1371
1372 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1373
1374 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1375 if (sm_supported(iommu))
1376 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1377 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1378 }
1379
void iommu_flush_write_buffer(struct intel_iommu *iommu)
1381 {
1382 u32 val;
1383 unsigned long flag;
1384
1385 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1386 return;
1387
1388 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1389 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1390
1391 /* Make sure hardware complete it */
1392 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1393 readl, (!(val & DMA_GSTS_WBFS)), val);
1394
1395 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396 }
1397
/* return value determines whether we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
1402 {
1403 u64 val = 0;
1404 unsigned long flag;
1405
1406 switch (type) {
1407 case DMA_CCMD_GLOBAL_INVL:
1408 val = DMA_CCMD_GLOBAL_INVL;
1409 break;
1410 case DMA_CCMD_DOMAIN_INVL:
1411 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1412 break;
1413 case DMA_CCMD_DEVICE_INVL:
1414 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1415 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1416 break;
1417 default:
1418 BUG();
1419 }
1420 val |= DMA_CCMD_ICC;
1421
1422 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1424
1425 /* Make sure hardware complete it */
1426 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1427 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1428
1429 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430 }
1431
/* return value determines whether we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
1435 {
1436 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1437 u64 val = 0, val_iva = 0;
1438 unsigned long flag;
1439
1440 switch (type) {
1441 case DMA_TLB_GLOBAL_FLUSH:
		/* a global flush doesn't need to set IVA_REG */
1443 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1444 break;
1445 case DMA_TLB_DSI_FLUSH:
1446 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1447 break;
1448 case DMA_TLB_PSI_FLUSH:
1449 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1450 /* IH bit is passed in as part of address */
1451 val_iva = size_order | addr;
1452 break;
1453 default:
1454 BUG();
1455 }
1456 /* Note: set drain read/write */
1457 #if 0
	/*
	 * This is probably meant to be extra safe. It looks like we can
	 * ignore it without any impact.
	 */
1462 if (cap_read_drain(iommu->cap))
1463 val |= DMA_TLB_READ_DRAIN;
1464 #endif
1465 if (cap_write_drain(iommu->cap))
1466 val |= DMA_TLB_WRITE_DRAIN;
1467
1468 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1469 /* Note: Only uses first TLB reg currently */
1470 if (val_iva)
1471 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1472 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1473
1474 /* Make sure hardware complete it */
1475 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1476 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1477
1478 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1479
1480 /* check IOTLB invalidation granularity */
1481 if (DMA_TLB_IAIG(val) == 0)
1482 pr_err("Flush IOTLB failed\n");
1483 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1484 pr_debug("TLB flush request %Lx, actual %Lx\n",
1485 (unsigned long long)DMA_TLB_IIRG(type),
1486 (unsigned long long)DMA_TLB_IAIG(val));
1487 }
1488
1489 static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1490 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1491 u8 bus, u8 devfn)
1492 {
1493 struct device_domain_info *info;
1494
1495 assert_spin_locked(&device_domain_lock);
1496
1497 if (!iommu->qi)
1498 return NULL;
1499
1500 list_for_each_entry(info, &domain->devices, link)
1501 if (info->iommu == iommu && info->bus == bus &&
1502 info->devfn == devfn) {
1503 if (info->ats_supported && info->dev)
1504 return info;
1505 break;
1506 }
1507
1508 return NULL;
1509 }
1510
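/*
 * Recompute domain->has_iotlb_device: it is true when at least one device
 * in the domain has ATS enabled, which tells iommu_flush_dev_iotlb()
 * whether device-IOTLB invalidations are needed.
 */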
static void domain_update_iotlb(struct dmar_domain *domain)
1512 {
1513 struct device_domain_info *info;
1514 bool has_iotlb_device = false;
1515
1516 assert_spin_locked(&device_domain_lock);
1517
1518 list_for_each_entry(info, &domain->devices, link) {
1519 struct pci_dev *pdev;
1520
1521 if (!info->dev || !dev_is_pci(info->dev))
1522 continue;
1523
1524 pdev = to_pci_dev(info->dev);
1525 if (pdev->ats_enabled) {
1526 has_iotlb_device = true;
1527 break;
1528 }
1529 }
1530
1531 domain->has_iotlb_device = has_iotlb_device;
1532 }
1533
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1535 {
1536 struct pci_dev *pdev;
1537
1538 assert_spin_locked(&device_domain_lock);
1539
1540 if (!info || !dev_is_pci(info->dev))
1541 return;
1542
1543 pdev = to_pci_dev(info->dev);
1544 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1545 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1546 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1547 * reserved, which should be set to 0.
1548 */
1549 if (!ecap_dit(info->iommu->ecap))
1550 info->pfsid = 0;
1551 else {
1552 struct pci_dev *pf_pdev;
1553
1554 /* pdev will be returned if device is not a vf */
1555 pf_pdev = pci_physfn(pdev);
1556 info->pfsid = pci_dev_id(pf_pdev);
1557 }
1558
1559 #ifdef CONFIG_INTEL_IOMMU_SVM
1560 /* The PCIe spec, in its wisdom, declares that the behaviour of
1561 the device if you enable PASID support after ATS support is
1562 undefined. So always enable PASID support on devices which
1563 have it, even if we can't yet know if we're ever going to
1564 use it. */
1565 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1566 info->pasid_enabled = 1;
1567
1568 if (info->pri_supported &&
1569 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1570 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1571 info->pri_enabled = 1;
1572 #endif
1573 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1574 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1575 info->ats_enabled = 1;
1576 domain_update_iotlb(info->domain);
1577 info->ats_qdep = pci_ats_queue_depth(pdev);
1578 }
1579 }
1580
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1582 {
1583 struct pci_dev *pdev;
1584
1585 assert_spin_locked(&device_domain_lock);
1586
1587 if (!dev_is_pci(info->dev))
1588 return;
1589
1590 pdev = to_pci_dev(info->dev);
1591
1592 if (info->ats_enabled) {
1593 pci_disable_ats(pdev);
1594 info->ats_enabled = 0;
1595 domain_update_iotlb(info->domain);
1596 }
1597 #ifdef CONFIG_INTEL_IOMMU_SVM
1598 if (info->pri_enabled) {
1599 pci_disable_pri(pdev);
1600 info->pri_enabled = 0;
1601 }
1602 if (info->pasid_enabled) {
1603 pci_disable_pasid(pdev);
1604 info->pasid_enabled = 0;
1605 }
1606 #endif
1607 }
1608
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
1611 {
1612 u16 sid, qdep;
1613 unsigned long flags;
1614 struct device_domain_info *info;
1615
1616 if (!domain->has_iotlb_device)
1617 return;
1618
1619 spin_lock_irqsave(&device_domain_lock, flags);
1620 list_for_each_entry(info, &domain->devices, link) {
1621 if (!info->ats_enabled)
1622 continue;
1623
1624 sid = info->bus << 8 | info->devfn;
1625 qdep = info->ats_qdep;
1626 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1627 qdep, addr, mask);
1628 }
1629 spin_unlock_irqrestore(&device_domain_lock, flags);
1630 }
1631
static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
1635 {
1636 u16 did = domain->iommu_did[iommu->seq_id];
1637
1638 if (domain->default_pasid)
1639 qi_flush_piotlb(iommu, did, domain->default_pasid,
1640 addr, npages, ih);
1641
1642 if (!list_empty(&domain->devices))
1643 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1644 }
1645
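/*
 * Page-selective-within-domain IOTLB invalidation for @pages pages
 * starting at @pfn, falling back to a domain-selective flush when PSI is
 * not supported or the range is too large. @ih is the invalidation hint;
 * @map indicates a non-present-to-present update, in which case the
 * device IOTLB does not need to be flushed.
 */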
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
1650 {
1651 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1652 unsigned int mask = ilog2(aligned_pages);
1653 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1654 u16 did = domain->iommu_did[iommu->seq_id];
1655
1656 BUG_ON(pages == 0);
1657
1658 if (ih)
1659 ih = 1 << 6;
1660
1661 if (domain_use_first_level(domain)) {
1662 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1663 } else {
1664 unsigned long bitmask = aligned_pages - 1;
1665
1666 /*
1667 * PSI masks the low order bits of the base address. If the
1668 * address isn't aligned to the mask, then compute a mask value
1669 * needed to ensure the target range is flushed.
1670 */
1671 if (unlikely(bitmask & pfn)) {
1672 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1673
1674 /*
1675 * Since end_pfn <= pfn + bitmask, the only way bits
1676 * higher than bitmask can differ in pfn and end_pfn is
1677 * by carrying. This means after masking out bitmask,
1678 * high bits starting with the first set bit in
1679 * shared_bits are all equal in both pfn and end_pfn.
1680 */
1681 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1682 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1683 }
1684
1685 /*
1686 * Fallback to domain selective flush if no PSI support or
1687 * the size is too big.
1688 */
1689 if (!cap_pgsel_inv(iommu->cap) ||
1690 mask > cap_max_amask_val(iommu->cap))
1691 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1692 DMA_TLB_DSI_FLUSH);
1693 else
1694 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1695 DMA_TLB_PSI_FLUSH);
1696 }
1697
1698 /*
1699 * In caching mode, changes of pages from non-present to present require
1700 * flush. However, device IOTLB doesn't need to be flushed in this case.
1701 */
1702 if (!cap_caching_mode(iommu->cap) || !map)
1703 iommu_flush_dev_iotlb(domain, addr, mask);
1704 }
1705
1706 /* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
1710 {
1711 /*
1712 * It's a non-present to present mapping. Only flush if caching mode
1713 * and second level.
1714 */
1715 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1716 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1717 else
1718 iommu_flush_write_buffer(iommu);
1719 }
1720
static void iommu_flush_iova(struct iova_domain *iovad)
1722 {
1723 struct dmar_domain *domain;
1724 int idx;
1725
1726 domain = container_of(iovad, struct dmar_domain, iovad);
1727
1728 for_each_domain_iommu(idx, domain) {
1729 struct intel_iommu *iommu = g_iommus[idx];
1730 u16 did = domain->iommu_did[iommu->seq_id];
1731
1732 if (domain_use_first_level(domain))
1733 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1734 else
1735 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1736 DMA_TLB_DSI_FLUSH);
1737
1738 if (!cap_caching_mode(iommu->cap))
1739 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1740 0, MAX_AGAW_PFN_WIDTH);
1741 }
1742 }
1743
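/*
 * Clear the Enable Protected Memory bit so that DMA to the protected
 * low/high memory regions is no longer blocked, and wait for the
 * protected region status bit to clear.
 */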
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1745 {
1746 u32 pmen;
1747 unsigned long flags;
1748
1749 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1750 return;
1751
1752 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1753 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1754 pmen &= ~DMA_PMEN_EPM;
1755 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1756
1757 /* wait for the protected region status bit to clear */
1758 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1759 readl, !(pmen & DMA_PMEN_PRS), pmen);
1760
1761 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1762 }
1763
static void iommu_enable_translation(struct intel_iommu *iommu)
1765 {
1766 u32 sts;
1767 unsigned long flags;
1768
1769 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1770 iommu->gcmd |= DMA_GCMD_TE;
1771 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1772
1773 /* Make sure hardware complete it */
1774 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1775 readl, (sts & DMA_GSTS_TES), sts);
1776
1777 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1778 }
1779
static void iommu_disable_translation(struct intel_iommu *iommu)
1781 {
1782 u32 sts;
1783 unsigned long flag;
1784
1785 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1786 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1787 return;
1788
1789 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1790 iommu->gcmd &= ~DMA_GCMD_TE;
1791 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1792
1793 /* Make sure hardware completes it */
1794 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1795 readl, (!(sts & DMA_GSTS_TES)), sts);
1796
1797 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1798 }
1799
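/*
* Allocate the per-IOMMU domain-id bitmap and the two-level array of
* domain pointers (256 entries per second-level chunk), then reserve
* the domain ids that must never be handed out to real domains.
*/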
1800 static int iommu_init_domains(struct intel_iommu *iommu)
1801 {
1802 u32 ndomains, nlongs;
1803 size_t size;
1804
1805 ndomains = cap_ndoms(iommu->cap);
1806 pr_debug("%s: Number of Domains supported <%d>\n",
1807 iommu->name, ndomains);
1808 nlongs = BITS_TO_LONGS(ndomains);
1809
1810 spin_lock_init(&iommu->lock);
1811
1812 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1813 if (!iommu->domain_ids) {
1814 pr_err("%s: Allocating domain id array failed\n",
1815 iommu->name);
1816 return -ENOMEM;
1817 }
1818
1819 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1820 iommu->domains = kzalloc(size, GFP_KERNEL);
1821
1822 if (iommu->domains) {
1823 size = 256 * sizeof(struct dmar_domain *);
1824 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1825 }
1826
1827 if (!iommu->domains || !iommu->domains[0]) {
1828 pr_err("%s: Allocating domain array failed\n",
1829 iommu->name);
1830 kfree(iommu->domain_ids);
1831 kfree(iommu->domains);
1832 iommu->domain_ids = NULL;
1833 iommu->domains = NULL;
1834 return -ENOMEM;
1835 }
1836
1837 /*
1838 * If Caching mode is set, then invalid translations are tagged
1839 * with domain-id 0, hence we need to pre-allocate it. We also
1840 * use domain-id 0 as a marker for non-allocated domain-id, so
1841 * make sure it is not used for a real domain.
1842 */
1843 set_bit(0, iommu->domain_ids);
1844
1845 /*
1846 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1847 * entry for first-level or pass-through translation modes be
1848 * programmed with a domain id different from those used for
1849 * second-level or nested translation. We reserve a domain id for
1850 * this purpose.
1851 */
1852 if (sm_supported(iommu))
1853 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1854
1855 return 0;
1856 }
1857
1858 static void disable_dmar_iommu(struct intel_iommu *iommu)
1859 {
1860 struct device_domain_info *info, *tmp;
1861 unsigned long flags;
1862
1863 if (!iommu->domains || !iommu->domain_ids)
1864 return;
1865
1866 spin_lock_irqsave(&device_domain_lock, flags);
1867 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1868 if (info->iommu != iommu)
1869 continue;
1870
1871 if (!info->dev || !info->domain)
1872 continue;
1873
1874 __dmar_remove_one_dev_info(info);
1875 }
1876 spin_unlock_irqrestore(&device_domain_lock, flags);
1877
1878 if (iommu->gcmd & DMA_GCMD_TE)
1879 iommu_disable_translation(iommu);
1880 }
1881
1882 static void free_dmar_iommu(struct intel_iommu *iommu)
1883 {
1884 if ((iommu->domains) && (iommu->domain_ids)) {
1885 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1886 int i;
1887
1888 for (i = 0; i < elems; i++)
1889 kfree(iommu->domains[i]);
1890 kfree(iommu->domains);
1891 kfree(iommu->domain_ids);
1892 iommu->domains = NULL;
1893 iommu->domain_ids = NULL;
1894 }
1895
1896 g_iommus[iommu->seq_id] = NULL;
1897
1898 /* free context mapping */
1899 free_context_table(iommu);
1900
1901 #ifdef CONFIG_INTEL_IOMMU_SVM
1902 if (pasid_supported(iommu)) {
1903 if (ecap_prs(iommu->ecap))
1904 intel_svm_finish_prq(iommu);
1905 }
1906 if (vccap_pasid(iommu->vccap))
1907 ioasid_unregister_allocator(&iommu->pasid_allocator);
1908
1909 #endif
1910 }
1911
1912 /*
1913 * Check and return whether first level is used by default for
1914 * DMA translation.
1915 */
1916 static bool first_level_by_default(void)
1917 {
1918 struct dmar_drhd_unit *drhd;
1919 struct intel_iommu *iommu;
1920 static int first_level_support = -1;
1921
1922 if (likely(first_level_support != -1))
1923 return first_level_support;
1924
1925 first_level_support = 1;
1926
1927 rcu_read_lock();
1928 for_each_active_iommu(iommu, drhd) {
1929 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1930 first_level_support = 0;
1931 break;
1932 }
1933 }
1934 rcu_read_unlock();
1935
1936 return first_level_support;
1937 }
1938
1939 static struct dmar_domain *alloc_domain(int flags)
1940 {
1941 struct dmar_domain *domain;
1942
1943 domain = alloc_domain_mem();
1944 if (!domain)
1945 return NULL;
1946
1947 memset(domain, 0, sizeof(*domain));
1948 domain->nid = NUMA_NO_NODE;
1949 domain->flags = flags;
1950 if (first_level_by_default())
1951 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1952 domain->has_iotlb_device = false;
1953 INIT_LIST_HEAD(&domain->devices);
1954
1955 return domain;
1956 }
1957
1958 /* Must be called with iommu->lock */
1959 static int domain_attach_iommu(struct dmar_domain *domain,
1960 struct intel_iommu *iommu)
1961 {
1962 unsigned long ndomains;
1963 int num;
1964
1965 assert_spin_locked(&device_domain_lock);
1966 assert_spin_locked(&iommu->lock);
1967
1968 domain->iommu_refcnt[iommu->seq_id] += 1;
1969 domain->iommu_count += 1;
1970 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1971 ndomains = cap_ndoms(iommu->cap);
1972 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1973
1974 if (num >= ndomains) {
1975 pr_err("%s: No free domain ids\n", iommu->name);
1976 domain->iommu_refcnt[iommu->seq_id] -= 1;
1977 domain->iommu_count -= 1;
1978 return -ENOSPC;
1979 }
1980
1981 set_bit(num, iommu->domain_ids);
1982 set_iommu_domain(iommu, num, domain);
1983
1984 domain->iommu_did[iommu->seq_id] = num;
1985 domain->nid = iommu->node;
1986
1987 domain_update_iommu_cap(domain);
1988 }
1989
1990 return 0;
1991 }
1992
1993 static int domain_detach_iommu(struct dmar_domain *domain,
1994 struct intel_iommu *iommu)
1995 {
1996 int num, count;
1997
1998 assert_spin_locked(&device_domain_lock);
1999 assert_spin_locked(&iommu->lock);
2000
2001 domain->iommu_refcnt[iommu->seq_id] -= 1;
2002 count = --domain->iommu_count;
2003 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2004 num = domain->iommu_did[iommu->seq_id];
2005 clear_bit(num, iommu->domain_ids);
2006 set_iommu_domain(iommu, num, NULL);
2007
2008 domain_update_iommu_cap(domain);
2009 domain->iommu_did[iommu->seq_id] = 0;
2010 }
2011
2012 return count;
2013 }
2014
2015 static struct iova_domain reserved_iova_list;
2016 static struct lock_class_key reserved_rbtree_key;
2017
2018 static int dmar_init_reserved_ranges(void)
2019 {
2020 struct pci_dev *pdev = NULL;
2021 struct iova *iova;
2022 int i;
2023
2024 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
2025
2026 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
2027 &reserved_rbtree_key);
2028
2029 /* IOAPIC ranges shouldn't be accessed by DMA */
2030 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
2031 IOVA_PFN(IOAPIC_RANGE_END));
2032 if (!iova) {
2033 pr_err("Reserve IOAPIC range failed\n");
2034 return -ENODEV;
2035 }
2036
2037 /* Reserve all PCI MMIO to avoid peer-to-peer access */
2038 for_each_pci_dev(pdev) {
2039 struct resource *r;
2040
2041 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
2042 r = &pdev->resource[i];
2043 if (!r->flags || !(r->flags & IORESOURCE_MEM))
2044 continue;
2045 iova = reserve_iova(&reserved_iova_list,
2046 IOVA_PFN(r->start),
2047 IOVA_PFN(r->end));
2048 if (!iova) {
2049 pci_err(pdev, "Reserve iova for %pR failed\n", r);
2050 return -ENODEV;
2051 }
2052 }
2053 }
2054 return 0;
2055 }
2056
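/*
* Round a guest address width up to the next adjusted guest address width
* supported by the page-table layout: 12 bits of page offset plus a
* multiple of 9-bit levels, capped at 64. For example, gaw 39 and 48 are
* returned unchanged, while gaw 40 rounds up to 48.
*/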
2057 static inline int guestwidth_to_adjustwidth(int gaw)
2058 {
2059 int agaw;
2060 int r = (gaw - 12) % 9;
2061
2062 if (r == 0)
2063 agaw = gaw;
2064 else
2065 agaw = gaw + 9 - r;
2066 if (agaw > 64)
2067 agaw = 64;
2068 return agaw;
2069 }
2070
2071 static void domain_exit(struct dmar_domain *domain)
2072 {
2073
2074 /* Remove associated devices and clear attached or cached domains */
2075 domain_remove_dev_info(domain);
2076
2077 /* destroy iovas */
2078 if (domain->domain.type == IOMMU_DOMAIN_DMA)
2079 put_iova_domain(&domain->iovad);
2080
2081 if (domain->pgd) {
2082 struct page *freelist;
2083
2084 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2085 dma_free_pagelist(freelist);
2086 }
2087
2088 free_domain_mem(domain);
2089 }
2090
2091 /*
2092 * Get the PASID directory size for scalable mode context entry.
2093 * Value of X in the PDTS field of a scalable mode context entry
2094 * indicates PASID directory with 2^(X + 7) entries.
2095 */
2096 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2097 {
2098 int pds, max_pde;
2099
2100 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2101 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2102 if (pds < 7)
2103 return 0;
2104
2105 return pds - 7;
2106 }
2107
2108 /*
2109 * Set the RID_PASID field of a scalable mode context entry. The
2110 * IOMMU hardware will use the PASID value set in this field for
2111 * DMA translations of DMA requests without PASID.
2112 */
2113 static inline void
2114 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2115 {
2116 context->hi |= pasid & ((1 << 20) - 1);
2117 }
2118
2119 /*
2120 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2121 * entry.
2122 */
2123 static inline void context_set_sm_dte(struct context_entry *context)
2124 {
2125 context->lo |= (1 << 2);
2126 }
2127
2128 /*
2129 * Set the PRE(Page Request Enable) field of a scalable mode context
2130 * entry.
2131 */
2132 static inline void context_set_sm_pre(struct context_entry *context)
2133 {
2134 context->lo |= (1 << 4);
2135 }
2136
2137 /* Convert value to context PASID directory size field coding. */
2138 #define context_pdts(pds) (((pds) & 0x7) << 9)
2139
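/*
* Program the context entry for (bus, devfn) on this IOMMU. In scalable
* mode the entry points to the PASID directory; in legacy mode it points
* to the second-level page table (or is set up for pass-through). Copied
* kdump entries and caching-mode hardware get the extra cache flushes
* they require.
*/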
2140 static int domain_context_mapping_one(struct dmar_domain *domain,
2141 struct intel_iommu *iommu,
2142 struct pasid_table *table,
2143 u8 bus, u8 devfn)
2144 {
2145 u16 did = domain->iommu_did[iommu->seq_id];
2146 int translation = CONTEXT_TT_MULTI_LEVEL;
2147 struct device_domain_info *info = NULL;
2148 struct context_entry *context;
2149 unsigned long flags;
2150 int ret;
2151
2152 WARN_ON(did == 0);
2153
2154 if (hw_pass_through && domain_type_is_si(domain))
2155 translation = CONTEXT_TT_PASS_THROUGH;
2156
2157 pr_debug("Set context mapping for %02x:%02x.%d\n",
2158 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2159
2160 BUG_ON(!domain->pgd);
2161
2162 spin_lock_irqsave(&device_domain_lock, flags);
2163 spin_lock(&iommu->lock);
2164
2165 ret = -ENOMEM;
2166 context = iommu_context_addr(iommu, bus, devfn, 1);
2167 if (!context)
2168 goto out_unlock;
2169
2170 ret = 0;
2171 if (context_present(context))
2172 goto out_unlock;
2173
2174 /*
2175 * For kdump cases, old valid entries may be cached due to the
2176 * in-flight DMA and copied pgtable, but there is no unmapping
2177 * behaviour for them, thus we need an explicit cache flush for
2178 * the newly-mapped device. For kdump, at this point, the device
2179 * is supposed to finish reset at its driver probe stage, so no
2180 * in-flight DMA will exist, and we don't need to worry about it
2181 * hereafter.
2182 */
2183 if (context_copied(context)) {
2184 u16 did_old = context_domain_id(context);
2185
2186 if (did_old < cap_ndoms(iommu->cap)) {
2187 iommu->flush.flush_context(iommu, did_old,
2188 (((u16)bus) << 8) | devfn,
2189 DMA_CCMD_MASK_NOBIT,
2190 DMA_CCMD_DEVICE_INVL);
2191 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2192 DMA_TLB_DSI_FLUSH);
2193 }
2194 }
2195
2196 context_clear_entry(context);
2197
2198 if (sm_supported(iommu)) {
2199 unsigned long pds;
2200
2201 WARN_ON(!table);
2202
2203 /* Setup the PASID DIR pointer: */
2204 pds = context_get_sm_pds(table);
2205 context->lo = (u64)virt_to_phys(table->table) |
2206 context_pdts(pds);
2207
2208 /* Setup the RID_PASID field: */
2209 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2210
2211 /*
2212 * Setup the Device-TLB enable bit and Page request
2213 * Enable bit:
2214 */
2215 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2216 if (info && info->ats_supported)
2217 context_set_sm_dte(context);
2218 if (info && info->pri_supported)
2219 context_set_sm_pre(context);
2220 } else {
2221 struct dma_pte *pgd = domain->pgd;
2222 int agaw;
2223
2224 context_set_domain_id(context, did);
2225
2226 if (translation != CONTEXT_TT_PASS_THROUGH) {
2227 /*
2228 * Skip top levels of page tables for an IOMMU which has
2229 * a smaller agaw than the default. Unnecessary for PT mode.
2230 */
2231 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2232 ret = -ENOMEM;
2233 pgd = phys_to_virt(dma_pte_addr(pgd));
2234 if (!dma_pte_present(pgd))
2235 goto out_unlock;
2236 }
2237
2238 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2239 if (info && info->ats_supported)
2240 translation = CONTEXT_TT_DEV_IOTLB;
2241 else
2242 translation = CONTEXT_TT_MULTI_LEVEL;
2243
2244 context_set_address_root(context, virt_to_phys(pgd));
2245 context_set_address_width(context, agaw);
2246 } else {
2247 /*
2248 * In pass through mode, AW must be programmed to
2249 * indicate the largest AGAW value supported by
2250 * hardware. And ASR is ignored by hardware.
2251 */
2252 context_set_address_width(context, iommu->msagaw);
2253 }
2254
2255 context_set_translation_type(context, translation);
2256 }
2257
2258 context_set_fault_enable(context);
2259 context_set_present(context);
2260 if (!ecap_coherent(iommu->ecap))
2261 clflush_cache_range(context, sizeof(*context));
2262
2263 /*
2264 * It's a non-present to present mapping. If hardware doesn't cache
2265 * non-present entries, we only need to flush the write-buffer. If it
2266 * _does_ cache non-present entries, then it does so in the special
2267 * domain #0, which we have to flush:
2268 */
2269 if (cap_caching_mode(iommu->cap)) {
2270 iommu->flush.flush_context(iommu, 0,
2271 (((u16)bus) << 8) | devfn,
2272 DMA_CCMD_MASK_NOBIT,
2273 DMA_CCMD_DEVICE_INVL);
2274 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2275 } else {
2276 iommu_flush_write_buffer(iommu);
2277 }
2278 iommu_enable_dev_iotlb(info);
2279
2280 ret = 0;
2281
2282 out_unlock:
2283 spin_unlock(&iommu->lock);
2284 spin_unlock_irqrestore(&device_domain_lock, flags);
2285
2286 return ret;
2287 }
2288
2289 struct domain_context_mapping_data {
2290 struct dmar_domain *domain;
2291 struct intel_iommu *iommu;
2292 struct pasid_table *table;
2293 };
2294
2295 static int domain_context_mapping_cb(struct pci_dev *pdev,
2296 u16 alias, void *opaque)
2297 {
2298 struct domain_context_mapping_data *data = opaque;
2299
2300 return domain_context_mapping_one(data->domain, data->iommu,
2301 data->table, PCI_BUS_NUM(alias),
2302 alias & 0xff);
2303 }
2304
2305 static int
2306 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2307 {
2308 struct domain_context_mapping_data data;
2309 struct pasid_table *table;
2310 struct intel_iommu *iommu;
2311 u8 bus, devfn;
2312
2313 iommu = device_to_iommu(dev, &bus, &devfn);
2314 if (!iommu)
2315 return -ENODEV;
2316
2317 table = intel_pasid_get_table(dev);
2318
2319 if (!dev_is_pci(dev))
2320 return domain_context_mapping_one(domain, iommu, table,
2321 bus, devfn);
2322
2323 data.domain = domain;
2324 data.iommu = iommu;
2325 data.table = table;
2326
2327 return pci_for_each_dma_alias(to_pci_dev(dev),
2328 &domain_context_mapping_cb, &data);
2329 }
2330
2331 static int domain_context_mapped_cb(struct pci_dev *pdev,
2332 u16 alias, void *opaque)
2333 {
2334 struct intel_iommu *iommu = opaque;
2335
2336 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2337 }
2338
2339 static int domain_context_mapped(struct device *dev)
2340 {
2341 struct intel_iommu *iommu;
2342 u8 bus, devfn;
2343
2344 iommu = device_to_iommu(dev, &bus, &devfn);
2345 if (!iommu)
2346 return -ENODEV;
2347
2348 if (!dev_is_pci(dev))
2349 return device_context_mapped(iommu, bus, devfn);
2350
2351 return !pci_for_each_dma_alias(to_pci_dev(dev),
2352 domain_context_mapped_cb, iommu);
2353 }
2354
2355 /* Returns a number of VTD pages, but aligned to MM page size */
2356 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2357 size_t size)
2358 {
2359 host_addr &= ~PAGE_MASK;
2360 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2361 }
2362
2363 /* Return largest possible superpage level for a given mapping */
2364 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2365 unsigned long iov_pfn,
2366 unsigned long phy_pfn,
2367 unsigned long pages)
2368 {
2369 int support, level = 1;
2370 unsigned long pfnmerge;
2371
2372 support = domain->iommu_superpage;
2373
2374 /* To use a large page, the virtual *and* physical addresses
2375 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2376 of them will mean we have to use smaller pages. So just
2377 merge them and check both at once. */
2378 pfnmerge = iov_pfn | phy_pfn;
2379
2380 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2381 pages >>= VTD_STRIDE_SHIFT;
2382 if (!pages)
2383 break;
2384 pfnmerge >>= VTD_STRIDE_SHIFT;
2385 level++;
2386 support--;
2387 }
2388 return level;
2389 }
2390
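/*
* Core mapping loop: walk nr_pages of the IOVA range (fed from either a
* scatterlist or a contiguous phys_pfn), pick the largest superpage level
* the hardware and the alignment allow, install the PTEs with a local
* cmpxchg, and flush the CPU cache for each completed PTE page when the
* IOMMU is not coherent.
*/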
2391 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2392 struct scatterlist *sg, unsigned long phys_pfn,
2393 unsigned long nr_pages, int prot)
2394 {
2395 struct dma_pte *first_pte = NULL, *pte = NULL;
2396 phys_addr_t pteval;
2397 unsigned long sg_res = 0;
2398 unsigned int largepage_lvl = 0;
2399 unsigned long lvl_pages = 0;
2400 u64 attr;
2401
2402 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2403
2404 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2405 return -EINVAL;
2406
2407 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2408 attr |= DMA_FL_PTE_PRESENT;
2409 if (domain_use_first_level(domain)) {
2410 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2411
2412 if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2413 attr |= DMA_FL_PTE_ACCESS;
2414 if (prot & DMA_PTE_WRITE)
2415 attr |= DMA_FL_PTE_DIRTY;
2416 }
2417 }
2418
2419 if (!sg) {
2420 sg_res = nr_pages;
2421 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2422 }
2423
2424 while (nr_pages > 0) {
2425 uint64_t tmp;
2426
2427 if (!sg_res) {
2428 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2429
2430 sg_res = aligned_nrpages(sg->offset, sg->length);
2431 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2432 sg->dma_length = sg->length;
2433 pteval = (sg_phys(sg) - pgoff) | attr;
2434 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2435 }
2436
2437 if (!pte) {
2438 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2439
2440 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2441 if (!pte)
2442 return -ENOMEM;
2443 /* It is a large page */
2444 if (largepage_lvl > 1) {
2445 unsigned long nr_superpages, end_pfn;
2446
2447 pteval |= DMA_PTE_LARGE_PAGE;
2448 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2449
2450 nr_superpages = sg_res / lvl_pages;
2451 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2452
2453 /*
2454 * Ensure that old small page tables are
2455 * removed to make room for superpage(s).
2456 * We're adding new large pages, so make sure
2457 * we don't remove their parent tables.
2458 */
2459 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2460 largepage_lvl + 1);
2461 } else {
2462 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2463 }
2464
2465 }
2466 /* We don't need a lock here; nobody else
2467 * touches the iova range
2468 */
2469 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2470 if (tmp) {
2471 static int dumps = 5;
2472 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2473 iov_pfn, tmp, (unsigned long long)pteval);
2474 if (dumps) {
2475 dumps--;
2476 debug_dma_dump_mappings(NULL);
2477 }
2478 WARN_ON(1);
2479 }
2480
2481 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2482
2483 BUG_ON(nr_pages < lvl_pages);
2484 BUG_ON(sg_res < lvl_pages);
2485
2486 nr_pages -= lvl_pages;
2487 iov_pfn += lvl_pages;
2488 phys_pfn += lvl_pages;
2489 pteval += lvl_pages * VTD_PAGE_SIZE;
2490 sg_res -= lvl_pages;
2491
2492 /* If the next PTE would be the first in a new page, then we
2493 need to flush the cache on the entries we've just written.
2494 And then we'll need to recalculate 'pte', so clear it and
2495 let it get set again in the if (!pte) block above.
2496
2497 If we're done (!nr_pages) we need to flush the cache too.
2498
2499 Also if we've been setting superpages, we may need to
2500 recalculate 'pte' and switch back to smaller pages for the
2501 end of the mapping, if the trailing size is not enough to
2502 use another superpage (i.e. sg_res < lvl_pages). */
2503 pte++;
2504 if (!nr_pages || first_pte_in_page(pte) ||
2505 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2506 domain_flush_cache(domain, first_pte,
2507 (void *)pte - (void *)first_pte);
2508 pte = NULL;
2509 }
2510
2511 if (!sg_res && nr_pages)
2512 sg = sg_next(sg);
2513 }
2514 return 0;
2515 }
2516
2517 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2518 struct scatterlist *sg, unsigned long phys_pfn,
2519 unsigned long nr_pages, int prot)
2520 {
2521 int iommu_id, ret;
2522 struct intel_iommu *iommu;
2523
2524 /* Do the real mapping first */
2525 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2526 if (ret)
2527 return ret;
2528
2529 for_each_domain_iommu(iommu_id, domain) {
2530 iommu = g_iommus[iommu_id];
2531 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2532 }
2533
2534 return 0;
2535 }
2536
2537 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2538 struct scatterlist *sg, unsigned long nr_pages,
2539 int prot)
2540 {
2541 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2542 }
2543
2544 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2545 unsigned long phys_pfn, unsigned long nr_pages,
2546 int prot)
2547 {
2548 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2549 }
2550
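/*
* Tear down the context entry for (bus, devfn): clear the entry, then
* invalidate the context cache, the PASID cache (in scalable mode) and
* the IOTLB for the domain id that was programmed in it.
*/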
2551 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2552 {
2553 unsigned long flags;
2554 struct context_entry *context;
2555 u16 did_old;
2556
2557 if (!iommu)
2558 return;
2559
2560 spin_lock_irqsave(&iommu->lock, flags);
2561 context = iommu_context_addr(iommu, bus, devfn, 0);
2562 if (!context) {
2563 spin_unlock_irqrestore(&iommu->lock, flags);
2564 return;
2565 }
2566 did_old = context_domain_id(context);
2567 context_clear_entry(context);
2568 __iommu_flush_cache(iommu, context, sizeof(*context));
2569 spin_unlock_irqrestore(&iommu->lock, flags);
2570 iommu->flush.flush_context(iommu,
2571 did_old,
2572 (((u16)bus) << 8) | devfn,
2573 DMA_CCMD_MASK_NOBIT,
2574 DMA_CCMD_DEVICE_INVL);
2575
2576 if (sm_supported(iommu))
2577 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2578
2579 iommu->flush.flush_iotlb(iommu,
2580 did_old,
2581 0,
2582 0,
2583 DMA_TLB_DSI_FLUSH);
2584 }
2585
2586 static inline void unlink_domain_info(struct device_domain_info *info)
2587 {
2588 assert_spin_locked(&device_domain_lock);
2589 list_del(&info->link);
2590 list_del(&info->global);
2591 if (info->dev)
2592 dev_iommu_priv_set(info->dev, NULL);
2593 }
2594
2595 static void domain_remove_dev_info(struct dmar_domain *domain)
2596 {
2597 struct device_domain_info *info, *tmp;
2598 unsigned long flags;
2599
2600 spin_lock_irqsave(&device_domain_lock, flags);
2601 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2602 __dmar_remove_one_dev_info(info);
2603 spin_unlock_irqrestore(&device_domain_lock, flags);
2604 }
2605
2606 struct dmar_domain *find_domain(struct device *dev)
2607 {
2608 struct device_domain_info *info;
2609
2610 if (unlikely(!dev || !dev->iommu))
2611 return NULL;
2612
2613 if (unlikely(attach_deferred(dev)))
2614 return NULL;
2615
2616 /* No lock here, assumes no domain exit in normal case */
2617 info = get_domain_info(dev);
2618 if (likely(info))
2619 return info->domain;
2620
2621 return NULL;
2622 }
2623
2624 static void do_deferred_attach(struct device *dev)
2625 {
2626 struct iommu_domain *domain;
2627
2628 dev_iommu_priv_set(dev, NULL);
2629 domain = iommu_get_domain_for_dev(dev);
2630 if (domain)
2631 intel_iommu_attach_device(domain, dev);
2632 }
2633
2634 static inline struct device_domain_info *
2635 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2636 {
2637 struct device_domain_info *info;
2638
2639 list_for_each_entry(info, &device_domain_list, global)
2640 if (info->segment == segment && info->bus == bus &&
2641 info->devfn == devfn)
2642 return info;
2643
2644 return NULL;
2645 }
2646
2647 static int domain_setup_first_level(struct intel_iommu *iommu,
2648 struct dmar_domain *domain,
2649 struct device *dev,
2650 u32 pasid)
2651 {
2652 struct dma_pte *pgd = domain->pgd;
2653 int agaw, level;
2654 int flags = 0;
2655
2656 /*
2657 * Skip top levels of page tables for an IOMMU which has
2658 * a smaller agaw than the default. Unnecessary for PT mode.
2659 */
2660 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2661 pgd = phys_to_virt(dma_pte_addr(pgd));
2662 if (!dma_pte_present(pgd))
2663 return -ENOMEM;
2664 }
2665
2666 level = agaw_to_level(agaw);
2667 if (level != 4 && level != 5)
2668 return -EINVAL;
2669
2670 if (pasid != PASID_RID2PASID)
2671 flags |= PASID_FLAG_SUPERVISOR_MODE;
2672 if (level == 5)
2673 flags |= PASID_FLAG_FL5LP;
2674
2675 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2676 flags |= PASID_FLAG_PAGE_SNOOP;
2677
2678 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2679 domain->iommu_did[iommu->seq_id],
2680 flags);
2681 }
2682
2683 static bool dev_is_real_dma_subdevice(struct device *dev)
2684 {
2685 return dev && dev_is_pci(dev) &&
2686 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2687 }
2688
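/*
* Allocate and link a device_domain_info for (bus, devfn), attach the
* domain to the IOMMU (allocating a domain id on first use), set up the
* PASID table and RID2PASID entry in scalable mode, and install the
* context mapping. If another path already attached a domain for this
* device or alias, that existing domain is returned instead.
*/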
2689 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2690 int bus, int devfn,
2691 struct device *dev,
2692 struct dmar_domain *domain)
2693 {
2694 struct dmar_domain *found = NULL;
2695 struct device_domain_info *info;
2696 unsigned long flags;
2697 int ret;
2698
2699 info = alloc_devinfo_mem();
2700 if (!info)
2701 return NULL;
2702
2703 if (!dev_is_real_dma_subdevice(dev)) {
2704 info->bus = bus;
2705 info->devfn = devfn;
2706 info->segment = iommu->segment;
2707 } else {
2708 struct pci_dev *pdev = to_pci_dev(dev);
2709
2710 info->bus = pdev->bus->number;
2711 info->devfn = pdev->devfn;
2712 info->segment = pci_domain_nr(pdev->bus);
2713 }
2714
2715 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2716 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2717 info->ats_qdep = 0;
2718 info->dev = dev;
2719 info->domain = domain;
2720 info->iommu = iommu;
2721 info->pasid_table = NULL;
2722 info->auxd_enabled = 0;
2723 INIT_LIST_HEAD(&info->auxiliary_domains);
2724
2725 if (dev && dev_is_pci(dev)) {
2726 struct pci_dev *pdev = to_pci_dev(info->dev);
2727
2728 if (ecap_dev_iotlb_support(iommu->ecap) &&
2729 pci_ats_supported(pdev) &&
2730 dmar_find_matched_atsr_unit(pdev))
2731 info->ats_supported = 1;
2732
2733 if (sm_supported(iommu)) {
2734 if (pasid_supported(iommu)) {
2735 int features = pci_pasid_features(pdev);
2736 if (features >= 0)
2737 info->pasid_supported = features | 1;
2738 }
2739
2740 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2741 pci_pri_supported(pdev))
2742 info->pri_supported = 1;
2743 }
2744 }
2745
2746 spin_lock_irqsave(&device_domain_lock, flags);
2747 if (dev)
2748 found = find_domain(dev);
2749
2750 if (!found) {
2751 struct device_domain_info *info2;
2752 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2753 info->devfn);
2754 if (info2) {
2755 found = info2->domain;
2756 info2->dev = dev;
2757 }
2758 }
2759
2760 if (found) {
2761 spin_unlock_irqrestore(&device_domain_lock, flags);
2762 free_devinfo_mem(info);
2763 /* Caller must free the original domain */
2764 return found;
2765 }
2766
2767 spin_lock(&iommu->lock);
2768 ret = domain_attach_iommu(domain, iommu);
2769 spin_unlock(&iommu->lock);
2770
2771 if (ret) {
2772 spin_unlock_irqrestore(&device_domain_lock, flags);
2773 free_devinfo_mem(info);
2774 return NULL;
2775 }
2776
2777 list_add(&info->link, &domain->devices);
2778 list_add(&info->global, &device_domain_list);
2779 if (dev)
2780 dev_iommu_priv_set(dev, info);
2781 spin_unlock_irqrestore(&device_domain_lock, flags);
2782
2783 /* PASID table is mandatory for a PCI device in scalable mode. */
2784 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2785 ret = intel_pasid_alloc_table(dev);
2786 if (ret) {
2787 dev_err(dev, "PASID table allocation failed\n");
2788 dmar_remove_one_dev_info(dev);
2789 return NULL;
2790 }
2791
2792 /* Setup the PASID entry for requests without PASID: */
2793 spin_lock_irqsave(&iommu->lock, flags);
2794 if (hw_pass_through && domain_type_is_si(domain))
2795 ret = intel_pasid_setup_pass_through(iommu, domain,
2796 dev, PASID_RID2PASID);
2797 else if (domain_use_first_level(domain))
2798 ret = domain_setup_first_level(iommu, domain, dev,
2799 PASID_RID2PASID);
2800 else
2801 ret = intel_pasid_setup_second_level(iommu, domain,
2802 dev, PASID_RID2PASID);
2803 spin_unlock_irqrestore(&iommu->lock, flags);
2804 if (ret) {
2805 dev_err(dev, "Setup RID2PASID failed\n");
2806 dmar_remove_one_dev_info(dev);
2807 return NULL;
2808 }
2809 }
2810
2811 if (dev && domain_context_mapping(domain, dev)) {
2812 dev_err(dev, "Domain context map failed\n");
2813 dmar_remove_one_dev_info(dev);
2814 return NULL;
2815 }
2816
2817 return domain;
2818 }
2819
2820 static int iommu_domain_identity_map(struct dmar_domain *domain,
2821 unsigned long first_vpfn,
2822 unsigned long last_vpfn)
2823 {
2824 /*
2825 * The RMRR range might overlap the physical memory range,
2826 * so clear it first.
2827 */
2828 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2829
2830 return __domain_mapping(domain, first_vpfn, NULL,
2831 first_vpfn, last_vpfn - first_vpfn + 1,
2832 DMA_PTE_READ|DMA_PTE_WRITE);
2833 }
2834
2835 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2836
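/*
* Build the static identity (si) domain: identity-map every usable
* physical memory range and all RMRR regions so that devices using the
* si_domain keep working. With hardware pass-through the page table is
* not needed and the mapping step is skipped.
*/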
2837 static int __init si_domain_init(int hw)
2838 {
2839 struct dmar_rmrr_unit *rmrr;
2840 struct device *dev;
2841 int i, nid, ret;
2842
2843 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2844 if (!si_domain)
2845 return -EFAULT;
2846
2847 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2848 domain_exit(si_domain);
2849 si_domain = NULL;
2850 return -EFAULT;
2851 }
2852
2853 if (hw)
2854 return 0;
2855
2856 for_each_online_node(nid) {
2857 unsigned long start_pfn, end_pfn;
2858 int i;
2859
2860 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2861 ret = iommu_domain_identity_map(si_domain,
2862 mm_to_dma_pfn(start_pfn),
2863 mm_to_dma_pfn(end_pfn));
2864 if (ret)
2865 return ret;
2866 }
2867 }
2868
2869 /*
2870 * Identity map the RMRRs so that devices with RMRRs could also use
2871 * the si_domain.
2872 */
2873 for_each_rmrr_units(rmrr) {
2874 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2875 i, dev) {
2876 unsigned long long start = rmrr->base_address;
2877 unsigned long long end = rmrr->end_address;
2878
2879 if (WARN_ON(end < start ||
2880 end >> agaw_to_width(si_domain->agaw)))
2881 continue;
2882
2883 ret = iommu_domain_identity_map(si_domain,
2884 mm_to_dma_pfn(start >> PAGE_SHIFT),
2885 mm_to_dma_pfn(end >> PAGE_SHIFT));
2886 if (ret)
2887 return ret;
2888 }
2889 }
2890
2891 return 0;
2892 }
2893
2894 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2895 {
2896 struct dmar_domain *ndomain;
2897 struct intel_iommu *iommu;
2898 u8 bus, devfn;
2899
2900 iommu = device_to_iommu(dev, &bus, &devfn);
2901 if (!iommu)
2902 return -ENODEV;
2903
2904 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2905 if (ndomain != domain)
2906 return -EBUSY;
2907
2908 return 0;
2909 }
2910
2911 static bool device_has_rmrr(struct device *dev)
2912 {
2913 struct dmar_rmrr_unit *rmrr;
2914 struct device *tmp;
2915 int i;
2916
2917 rcu_read_lock();
2918 for_each_rmrr_units(rmrr) {
2919 /*
2920 * Return TRUE if this RMRR contains the device that
2921 * is passed in.
2922 */
2923 for_each_active_dev_scope(rmrr->devices,
2924 rmrr->devices_cnt, i, tmp)
2925 if (tmp == dev ||
2926 is_downstream_to_pci_bridge(dev, tmp)) {
2927 rcu_read_unlock();
2928 return true;
2929 }
2930 }
2931 rcu_read_unlock();
2932 return false;
2933 }
2934
2935 /**
2936 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2937 * is relaxable (i.e., it is allowed to be not enforced under some conditions)
2938 * @dev: device handle
2939 *
2940 * We assume that PCI USB devices with RMRRs have them largely
2941 * for historical reasons and that the RMRR space is not actively used post
2942 * boot. This exclusion may change if vendors begin to abuse it.
2943 *
2944 * The same exception is made for graphics devices, with the requirement that
2945 * any use of the RMRR regions will be torn down before assigning the device
2946 * to a guest.
2947 *
2948 * Return: true if the RMRR is relaxable, false otherwise
2949 */
2950 static bool device_rmrr_is_relaxable(struct device *dev)
2951 {
2952 struct pci_dev *pdev;
2953
2954 if (!dev_is_pci(dev))
2955 return false;
2956
2957 pdev = to_pci_dev(dev);
2958 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2959 return true;
2960 else
2961 return false;
2962 }
2963
2964 /*
2965 * There are a couple cases where we need to restrict the functionality of
2966 * devices associated with RMRRs. The first is when evaluating a device for
2967 * identity mapping because problems exist when devices are moved in and out
2968 * of domains and their respective RMRR information is lost. This means that
2969 * a device with associated RMRRs will never be in a "passthrough" domain.
2970 * The second is use of the device through the IOMMU API. This interface
2971 * expects to have full control of the IOVA space for the device. We cannot
2972 * satisfy both the requirement that RMRR access is maintained and have an
2973 * unencumbered IOVA space. We also have no ability to quiesce the device's
2974 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2975 * We therefore prevent devices associated with an RMRR from participating in
2976 * the IOMMU API, which eliminates them from device assignment.
2977 *
2978 * In both cases, devices which have relaxable RMRRs are not concerned by this
2979 * restriction. See device_rmrr_is_relaxable comment.
2980 */
2981 static bool device_is_rmrr_locked(struct device *dev)
2982 {
2983 if (!device_has_rmrr(dev))
2984 return false;
2985
2986 if (device_rmrr_is_relaxable(dev))
2987 return false;
2988
2989 return true;
2990 }
2991
2992 /*
2993 * Return the required default domain type for a specific device.
2994 *
2995 * @dev: the device in query
2997 *
2998 * Returns:
2999 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3000 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3001 * - 0: both identity and dynamic domains work for this device
3002 */
3003 static int device_def_domain_type(struct device *dev)
3004 {
3005 if (dev_is_pci(dev)) {
3006 struct pci_dev *pdev = to_pci_dev(dev);
3007
3008 /*
3009 * Prevent any device marked as untrusted from getting
3010 * placed into the statically identity mapping domain.
3011 */
3012 if (pdev->untrusted)
3013 return IOMMU_DOMAIN_DMA;
3014
3015 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3016 return IOMMU_DOMAIN_IDENTITY;
3017
3018 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3019 return IOMMU_DOMAIN_IDENTITY;
3020 }
3021
3022 return 0;
3023 }
3024
3025 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3026 {
3027 /*
3028 * Start from the sane iommu hardware state.
3029 * If the queued invalidation is already initialized by us
3030 * (for example, while enabling interrupt-remapping) then
3031 * we got the things already rolling from a sane state.
3032 */
3033 if (!iommu->qi) {
3034 /*
3035 * Clear any previous faults.
3036 */
3037 dmar_fault(-1, iommu);
3038 /*
3039 * Disable queued invalidation if supported and already enabled
3040 * before OS handover.
3041 */
3042 dmar_disable_qi(iommu);
3043 }
3044
3045 if (dmar_enable_qi(iommu)) {
3046 /*
3047 * Queued Invalidate not enabled, use Register Based Invalidate
3048 */
3049 iommu->flush.flush_context = __iommu_flush_context;
3050 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051 pr_info("%s: Using Register based invalidation\n",
3052 iommu->name);
3053 } else {
3054 iommu->flush.flush_context = qi_flush_context;
3055 iommu->flush.flush_iotlb = qi_flush_iotlb;
3056 pr_info("%s: Using Queued invalidation\n", iommu->name);
3057 }
3058 }
3059
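/*
* kdump support: copy the context table(s) referenced by one old root
* entry from the previous kernel, reserve the domain ids found in the
* present entries, and mark each copied entry (with PASIDs disabled) so
* it can be recognized and flushed when it is re-programmed later.
*/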
3060 static int copy_context_table(struct intel_iommu *iommu,
3061 struct root_entry *old_re,
3062 struct context_entry **tbl,
3063 int bus, bool ext)
3064 {
3065 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066 struct context_entry *new_ce = NULL, ce;
3067 struct context_entry *old_ce = NULL;
3068 struct root_entry re;
3069 phys_addr_t old_ce_phys;
3070
3071 tbl_idx = ext ? bus * 2 : bus;
3072 memcpy(&re, old_re, sizeof(re));
3073
3074 for (devfn = 0; devfn < 256; devfn++) {
3075 /* First calculate the correct index */
3076 idx = (ext ? devfn * 2 : devfn) % 256;
3077
3078 if (idx == 0) {
3079 /* First save what we may have and clean up */
3080 if (new_ce) {
3081 tbl[tbl_idx] = new_ce;
3082 __iommu_flush_cache(iommu, new_ce,
3083 VTD_PAGE_SIZE);
3084 pos = 1;
3085 }
3086
3087 if (old_ce)
3088 memunmap(old_ce);
3089
3090 ret = 0;
3091 if (devfn < 0x80)
3092 old_ce_phys = root_entry_lctp(&re);
3093 else
3094 old_ce_phys = root_entry_uctp(&re);
3095
3096 if (!old_ce_phys) {
3097 if (ext && devfn == 0) {
3098 /* No LCTP, try UCTP */
3099 devfn = 0x7f;
3100 continue;
3101 } else {
3102 goto out;
3103 }
3104 }
3105
3106 ret = -ENOMEM;
3107 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3108 MEMREMAP_WB);
3109 if (!old_ce)
3110 goto out;
3111
3112 new_ce = alloc_pgtable_page(iommu->node);
3113 if (!new_ce)
3114 goto out_unmap;
3115
3116 ret = 0;
3117 }
3118
3119 /* Now copy the context entry */
3120 memcpy(&ce, old_ce + idx, sizeof(ce));
3121
3122 if (!__context_present(&ce))
3123 continue;
3124
3125 did = context_domain_id(&ce);
3126 if (did >= 0 && did < cap_ndoms(iommu->cap))
3127 set_bit(did, iommu->domain_ids);
3128
3129 /*
3130 * We need a marker for copied context entries. This
3131 * marker needs to work for the old format as well as
3132 * for extended context entries.
3133 *
3134 * Bit 67 of the context entry is used. In the old
3135 * format this bit is available to software, in the
3136 * extended format it is the PGE bit, but PGE is ignored
3137 * by HW if PASIDs are disabled (and thus still
3138 * available).
3139 *
3140 * So disable PASIDs first and then mark the entry
3141 * copied. This means that we don't copy PASID
3142 * translations from the old kernel, but this is fine as
3143 * faults there are not fatal.
3144 */
3145 context_clear_pasid_enable(&ce);
3146 context_set_copied(&ce);
3147
3148 new_ce[idx] = ce;
3149 }
3150
3151 tbl[tbl_idx + pos] = new_ce;
3152
3153 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3154
3155 out_unmap:
3156 memunmap(old_ce);
3157
3158 out:
3159 return ret;
3160 }
3161
3162 static int copy_translation_tables(struct intel_iommu *iommu)
3163 {
3164 struct context_entry **ctxt_tbls;
3165 struct root_entry *old_rt;
3166 phys_addr_t old_rt_phys;
3167 int ctxt_table_entries;
3168 unsigned long flags;
3169 u64 rtaddr_reg;
3170 int bus, ret;
3171 bool new_ext, ext;
3172
3173 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175 new_ext = !!ecap_ecs(iommu->ecap);
3176
3177 /*
3178 * The RTT bit can only be changed when translation is disabled,
3179 * but disabling translation means to open a window for data
3180 * corruption. So bail out and don't copy anything if we would
3181 * have to change the bit.
3182 */
3183 if (new_ext != ext)
3184 return -EINVAL;
3185
3186 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3187 if (!old_rt_phys)
3188 return -EINVAL;
3189
3190 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3191 if (!old_rt)
3192 return -ENOMEM;
3193
3194 /* This is too big for the stack - allocate it from slab */
3195 ctxt_table_entries = ext ? 512 : 256;
3196 ret = -ENOMEM;
3197 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3198 if (!ctxt_tbls)
3199 goto out_unmap;
3200
3201 for (bus = 0; bus < 256; bus++) {
3202 ret = copy_context_table(iommu, &old_rt[bus],
3203 ctxt_tbls, bus, ext);
3204 if (ret) {
3205 pr_err("%s: Failed to copy context table for bus %d\n",
3206 iommu->name, bus);
3207 continue;
3208 }
3209 }
3210
3211 spin_lock_irqsave(&iommu->lock, flags);
3212
3213 /* Context tables are copied, now write them to the root_entry table */
3214 for (bus = 0; bus < 256; bus++) {
3215 int idx = ext ? bus * 2 : bus;
3216 u64 val;
3217
3218 if (ctxt_tbls[idx]) {
3219 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220 iommu->root_entry[bus].lo = val;
3221 }
3222
3223 if (!ext || !ctxt_tbls[idx + 1])
3224 continue;
3225
3226 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227 iommu->root_entry[bus].hi = val;
3228 }
3229
3230 spin_unlock_irqrestore(&iommu->lock, flags);
3231
3232 kfree(ctxt_tbls);
3233
3234 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3235
3236 ret = 0;
3237
3238 out_unmap:
3239 memunmap(old_rt);
3240
3241 return ret;
3242 }
3243
3244 #ifdef CONFIG_INTEL_IOMMU_SVM
3245 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3246 {
3247 struct intel_iommu *iommu = data;
3248 ioasid_t ioasid;
3249
3250 if (!iommu)
3251 return INVALID_IOASID;
3252 /*
3253 * VT-d virtual command interface always uses the full 20 bit
3254 * PASID range. Host can partition guest PASID range based on
3255 * policies but it is out of guest's control.
3256 */
3257 if (min < PASID_MIN || max > intel_pasid_max_id)
3258 return INVALID_IOASID;
3259
3260 if (vcmd_alloc_pasid(iommu, &ioasid))
3261 return INVALID_IOASID;
3262
3263 return ioasid;
3264 }
3265
3266 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3267 {
3268 struct intel_iommu *iommu = data;
3269
3270 if (!iommu)
3271 return;
3272 /*
3273 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3274 * We can only free the PASID when all the devices are unbound.
3275 */
3276 if (ioasid_find(NULL, ioasid, NULL)) {
3277 pr_alert("Cannot free active IOASID %d\n", ioasid);
3278 return;
3279 }
3280 vcmd_free_pasid(iommu, ioasid);
3281 }
3282
3283 static void register_pasid_allocator(struct intel_iommu *iommu)
3284 {
3285 /*
3286 * If we are running in the host, there is no need for a custom
3287 * allocator because PASIDs are allocated from the host system-wide.
3288 */
3289 if (!cap_caching_mode(iommu->cap))
3290 return;
3291
3292 if (!sm_supported(iommu)) {
3293 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3294 return;
3295 }
3296
3297 /*
3298 * Register a custom PASID allocator if we are running in a guest,
3299 * where guest PASIDs must be obtained via the virtual command interface.
3300 * There can be multiple vIOMMUs in each guest but only one allocator
3301 * is active. All vIOMMU allocators will eventually be calling the same
3302 * host allocator.
3303 */
3304 if (!vccap_pasid(iommu->vccap))
3305 return;
3306
3307 pr_info("Register custom PASID allocator\n");
3308 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3309 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3310 iommu->pasid_allocator.pdata = (void *)iommu;
3311 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3312 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3313 /*
3314 * Disable scalable mode on this IOMMU if there
3315 * is no custom allocator. Mixing SM capable vIOMMU
3316 * and non-SM vIOMMU are not supported.
3317 */
3318 intel_iommu_sm = 0;
3319 }
3320 }
3321 #endif
3322
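/*
* Boot-time initialization of all DMAR units: size and allocate the
* global iommu array, set up invalidation queues, domain-id bookkeeping
* and root entries (copying translation tables from a previous kernel
* where possible), create the si_domain, and finally enable the page
* request queue and fault interrupts on each IOMMU.
*/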
3323 static int __init init_dmars(void)
3324 {
3325 struct dmar_drhd_unit *drhd;
3326 struct intel_iommu *iommu;
3327 int ret;
3328
3329 /*
3330 * for each drhd
3331 * allocate root
3332 * initialize and program root entry to not present
3333 * endfor
3334 */
3335 for_each_drhd_unit(drhd) {
3336 /*
3337 * Lock not needed: this is only incremented in the single-
3338 * threaded kernel __init code path; all other accesses are
3339 * read-only.
3340 */
3341 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3342 g_num_of_iommus++;
3343 continue;
3344 }
3345 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3346 }
3347
3348 /* Preallocate enough resources for IOMMU hot-addition */
3349 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3350 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3351
3352 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3353 GFP_KERNEL);
3354 if (!g_iommus) {
3355 pr_err("Allocating global iommu array failed\n");
3356 ret = -ENOMEM;
3357 goto error;
3358 }
3359
3360 for_each_iommu(iommu, drhd) {
3361 if (drhd->ignored) {
3362 iommu_disable_translation(iommu);
3363 continue;
3364 }
3365
3366 /*
3367 * Find the max PASID size of all IOMMUs in the system.
3368 * We need to ensure the system pasid table is no bigger
3369 * than the smallest supported.
3370 */
3371 if (pasid_supported(iommu)) {
3372 u32 temp = 2 << ecap_pss(iommu->ecap);
3373
3374 intel_pasid_max_id = min_t(u32, temp,
3375 intel_pasid_max_id);
3376 }
3377
3378 g_iommus[iommu->seq_id] = iommu;
3379
3380 intel_iommu_init_qi(iommu);
3381
3382 ret = iommu_init_domains(iommu);
3383 if (ret)
3384 goto free_iommu;
3385
3386 init_translation_status(iommu);
3387
3388 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3389 iommu_disable_translation(iommu);
3390 clear_translation_pre_enabled(iommu);
3391 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3392 iommu->name);
3393 }
3394
3395 /*
3396 * TBD:
3397 * we could share the same root & context tables
3398 * among all IOMMU's. Need to Split it later.
3399 */
3400 ret = iommu_alloc_root_entry(iommu);
3401 if (ret)
3402 goto free_iommu;
3403
3404 if (translation_pre_enabled(iommu)) {
3405 pr_info("Translation already enabled - trying to copy translation structures\n");
3406
3407 ret = copy_translation_tables(iommu);
3408 if (ret) {
3409 /*
3410 * We found the IOMMU with translation
3411 * enabled - but failed to copy over the
3412 * old root-entry table. Try to proceed
3413 * by disabling translation now and
3414 * allocating a clean root-entry table.
3415 * This might cause DMAR faults, but
3416 * probably the dump will still succeed.
3417 */
3418 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3419 iommu->name);
3420 iommu_disable_translation(iommu);
3421 clear_translation_pre_enabled(iommu);
3422 } else {
3423 pr_info("Copied translation tables from previous kernel for %s\n",
3424 iommu->name);
3425 }
3426 }
3427
3428 if (!ecap_pass_through(iommu->ecap))
3429 hw_pass_through = 0;
3430
3431 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3432 pr_warn("Disable batched IOTLB flush due to virtualization");
3433 intel_iommu_strict = 1;
3434 }
3435 intel_svm_check(iommu);
3436 }
3437
3438 /*
3439 * Now that qi is enabled on all iommus, set the root entry and flush
3440 * caches. This is required on some Intel X58 chipsets, otherwise the
3441 * flush_context function will loop forever and the boot hangs.
3442 */
3443 for_each_active_iommu(iommu, drhd) {
3444 iommu_flush_write_buffer(iommu);
3445 #ifdef CONFIG_INTEL_IOMMU_SVM
3446 register_pasid_allocator(iommu);
3447 #endif
3448 iommu_set_root_entry(iommu);
3449 }
3450
3451 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3452 dmar_map_gfx = 0;
3453 #endif
3454
3455 if (!dmar_map_gfx)
3456 iommu_identity_mapping |= IDENTMAP_GFX;
3457
3458 check_tylersburg_isoch();
3459
3460 ret = si_domain_init(hw_pass_through);
3461 if (ret)
3462 goto free_iommu;
3463
3464 /*
3465 * for each drhd
3466 * enable fault log
3467 * global invalidate context cache
3468 * global invalidate iotlb
3469 * enable translation
3470 */
3471 for_each_iommu(iommu, drhd) {
3472 if (drhd->ignored) {
3473 /*
3474 * we always have to disable PMRs or DMA may fail on
3475 * this device
3476 */
3477 if (force_on)
3478 iommu_disable_protect_mem_regions(iommu);
3479 continue;
3480 }
3481
3482 iommu_flush_write_buffer(iommu);
3483
3484 #ifdef CONFIG_INTEL_IOMMU_SVM
3485 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3486 /*
3487 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3488 * could cause a lock race condition.
3489 */
3490 up_write(&dmar_global_lock);
3491 ret = intel_svm_enable_prq(iommu);
3492 down_write(&dmar_global_lock);
3493 if (ret)
3494 goto free_iommu;
3495 }
3496 #endif
3497 ret = dmar_set_interrupt(iommu);
3498 if (ret)
3499 goto free_iommu;
3500 }
3501
3502 return 0;
3503
3504 free_iommu:
3505 for_each_active_iommu(iommu, drhd) {
3506 disable_dmar_iommu(iommu);
3507 free_dmar_iommu(iommu);
3508 }
3509 if (si_domain) {
3510 domain_exit(si_domain);
3511 si_domain = NULL;
3512 }
3513
3514 kfree(g_iommus);
3515
3516 error:
3517 return ret;
3518 }
3519
3520 /* This takes a number of _MM_ pages, not VTD pages */
3521 static unsigned long intel_alloc_iova(struct device *dev,
3522 struct dmar_domain *domain,
3523 unsigned long nrpages, uint64_t dma_mask)
3524 {
3525 unsigned long iova_pfn;
3526
3527 /*
3528 * Restrict dma_mask to the width that the iommu can handle.
3529 * First-level translation restricts the input-address to a
3530 * canonical address (i.e., address bits 63:N have the same
3531 * value as address bit [N-1], where N is 48-bits with 4-level
3532 * paging and 57-bits with 5-level paging). Hence, skip bit
3533 * [N-1].
3534 */
3535 if (domain_use_first_level(domain))
3536 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3537 dma_mask);
3538 else
3539 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3540 dma_mask);
3541
3542 /* Ensure we reserve the whole size-aligned region */
3543 nrpages = __roundup_pow_of_two(nrpages);
3544
3545 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3546 /*
3547 * First try to allocate an io virtual address in
3548 * DMA_BIT_MASK(32) and if that fails then try allocating
3549 * from higher range
3550 */
3551 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3552 IOVA_PFN(DMA_BIT_MASK(32)), false);
3553 if (iova_pfn)
3554 return iova_pfn;
3555 }
3556 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3557 IOVA_PFN(dma_mask), true);
3558 if (unlikely(!iova_pfn)) {
3559 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3560 nrpages);
3561 return 0;
3562 }
3563
3564 return iova_pfn;
3565 }
3566
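/*
* Map a physically contiguous buffer for streaming DMA: allocate an IOVA
* range large enough for the (page-aligned) buffer, derive read/write
* permissions from the DMA direction, install the mapping, and return
* the resulting bus address, or DMA_MAPPING_ERROR on failure.
*/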
3567 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3568 size_t size, int dir, u64 dma_mask)
3569 {
3570 struct dmar_domain *domain;
3571 phys_addr_t start_paddr;
3572 unsigned long iova_pfn;
3573 int prot = 0;
3574 int ret;
3575 struct intel_iommu *iommu;
3576 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3577
3578 BUG_ON(dir == DMA_NONE);
3579
3580 if (unlikely(attach_deferred(dev)))
3581 do_deferred_attach(dev);
3582
3583 domain = find_domain(dev);
3584 if (!domain)
3585 return DMA_MAPPING_ERROR;
3586
3587 iommu = domain_get_iommu(domain);
3588 size = aligned_nrpages(paddr, size);
3589
3590 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3591 if (!iova_pfn)
3592 goto error;
3593
3594 /*
3595 * Check if DMAR supports zero-length reads on write-only
3596 * mappings.
3597 */
3598 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3599 !cap_zlr(iommu->cap))
3600 prot |= DMA_PTE_READ;
3601 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3602 prot |= DMA_PTE_WRITE;
3603 /*
3604 * paddr ~ (paddr + size) might span a partial page, so map the whole
3605 * page. Note: if two parts of one page are mapped separately, we
3606 * might have two guest addresses mapping to the same host paddr, but
3607 * this is not a big problem.
3608 */
3609 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3610 mm_to_dma_pfn(paddr_pfn), size, prot);
3611 if (ret)
3612 goto error;
3613
3614 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3615 start_paddr += paddr & ~PAGE_MASK;
3616
3617 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3618
3619 return start_paddr;
3620
3621 error:
3622 if (iova_pfn)
3623 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3624 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3625 size, (unsigned long long)paddr, dir);
3626 return DMA_MAPPING_ERROR;
3627 }
3628
3629 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3630 unsigned long offset, size_t size,
3631 enum dma_data_direction dir,
3632 unsigned long attrs)
3633 {
3634 return __intel_map_single(dev, page_to_phys(page) + offset,
3635 size, dir, *dev->dma_mask);
3636 }
3637
3638 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3639 size_t size, enum dma_data_direction dir,
3640 unsigned long attrs)
3641 {
3642 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3643 }
3644
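/*
* Unmap a streaming DMA range: clear the page-table entries and either
* flush the IOTLB and free the IOVA immediately (strict mode, untrusted
* devices, or no flush queue) or defer the IOVA and page freeing to the
* flush queue.
*/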
3645 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3646 {
3647 struct dmar_domain *domain;
3648 unsigned long start_pfn, last_pfn;
3649 unsigned long nrpages;
3650 unsigned long iova_pfn;
3651 struct intel_iommu *iommu;
3652 struct page *freelist;
3653 struct pci_dev *pdev = NULL;
3654
3655 domain = find_domain(dev);
3656 BUG_ON(!domain);
3657
3658 iommu = domain_get_iommu(domain);
3659
3660 iova_pfn = IOVA_PFN(dev_addr);
3661
3662 nrpages = aligned_nrpages(dev_addr, size);
3663 start_pfn = mm_to_dma_pfn(iova_pfn);
3664 last_pfn = start_pfn + nrpages - 1;
3665
3666 if (dev_is_pci(dev))
3667 pdev = to_pci_dev(dev);
3668
3669 freelist = domain_unmap(domain, start_pfn, last_pfn);
3670 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3671 !has_iova_flush_queue(&domain->iovad)) {
3672 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3673 nrpages, !freelist, 0);
3674 /* free iova */
3675 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3676 dma_free_pagelist(freelist);
3677 } else {
3678 queue_iova(&domain->iovad, iova_pfn, nrpages,
3679 (unsigned long)freelist);
3680 /*
3681 * Queue up the release of this unmap to save roughly 1/6th of the
3682 * CPU time otherwise spent on the IOTLB flush operation.
3683 */
3684 }
3685
3686 trace_unmap_single(dev, dev_addr, size);
3687 }
3688
3689 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3690 size_t size, enum dma_data_direction dir,
3691 unsigned long attrs)
3692 {
3693 intel_unmap(dev, dev_addr, size);
3694 }
3695
3696 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3697 size_t size, enum dma_data_direction dir, unsigned long attrs)
3698 {
3699 intel_unmap(dev, dev_addr, size);
3700 }
3701
3702 static void *intel_alloc_coherent(struct device *dev, size_t size,
3703 dma_addr_t *dma_handle, gfp_t flags,
3704 unsigned long attrs)
3705 {
3706 struct page *page = NULL;
3707 int order;
3708
3709 if (unlikely(attach_deferred(dev)))
3710 do_deferred_attach(dev);
3711
3712 size = PAGE_ALIGN(size);
3713 order = get_order(size);
3714
3715 if (gfpflags_allow_blocking(flags)) {
3716 unsigned int count = size >> PAGE_SHIFT;
3717
3718 page = dma_alloc_from_contiguous(dev, count, order,
3719 flags & __GFP_NOWARN);
3720 }
3721
3722 if (!page)
3723 page = alloc_pages(flags, order);
3724 if (!page)
3725 return NULL;
3726 memset(page_address(page), 0, size);
3727
3728 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3729 DMA_BIDIRECTIONAL,
3730 dev->coherent_dma_mask);
3731 if (*dma_handle != DMA_MAPPING_ERROR)
3732 return page_address(page);
3733 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3734 __free_pages(page, order);
3735
3736 return NULL;
3737 }
3738
3739 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3740 dma_addr_t dma_handle, unsigned long attrs)
3741 {
3742 int order;
3743 struct page *page = virt_to_page(vaddr);
3744
3745 size = PAGE_ALIGN(size);
3746 order = get_order(size);
3747
3748 intel_unmap(dev, dma_handle, size);
3749 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3750 __free_pages(page, order);
3751 }
3752
3753 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3754 int nelems, enum dma_data_direction dir,
3755 unsigned long attrs)
3756 {
3757 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3758 unsigned long nrpages = 0;
3759 struct scatterlist *sg;
3760 int i;
3761
3762 for_each_sg(sglist, sg, nelems, i) {
3763 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3764 }
3765
3766 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3767
3768 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3769 }
3770
3771 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3772 enum dma_data_direction dir, unsigned long attrs)
3773 {
3774 int i;
3775 struct dmar_domain *domain;
3776 size_t size = 0;
3777 int prot = 0;
3778 unsigned long iova_pfn;
3779 int ret;
3780 struct scatterlist *sg;
3781 unsigned long start_vpfn;
3782 struct intel_iommu *iommu;
3783
3784 BUG_ON(dir == DMA_NONE);
3785
3786 if (unlikely(attach_deferred(dev)))
3787 do_deferred_attach(dev);
3788
3789 domain = find_domain(dev);
3790 if (!domain)
3791 return 0;
3792
3793 iommu = domain_get_iommu(domain);
3794
3795 for_each_sg(sglist, sg, nelems, i)
3796 size += aligned_nrpages(sg->offset, sg->length);
3797
3798 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3799 *dev->dma_mask);
3800 if (!iova_pfn) {
3801 sglist->dma_length = 0;
3802 return 0;
3803 }
3804
3805 /*
3806 * Check if DMAR supports zero-length reads on write-only
3807 * mappings.
3808 */
3809 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3810 !cap_zlr(iommu->cap))
3811 prot |= DMA_PTE_READ;
3812 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813 prot |= DMA_PTE_WRITE;
3814
3815 start_vpfn = mm_to_dma_pfn(iova_pfn);
3816
3817 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818 if (unlikely(ret)) {
3819 dma_pte_free_pagetable(domain, start_vpfn,
3820 start_vpfn + size - 1,
3821 agaw_to_level(domain->agaw) + 1);
3822 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3823 return 0;
3824 }
3825
3826 for_each_sg(sglist, sg, nelems, i)
3827 trace_map_sg(dev, i + 1, nelems, sg);
3828
3829 return nelems;
3830 }
3831
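/*
 * Added note: with DMA remapping, IOVAs handed to a device can be kept
 * within 32 bits (intel_alloc_iova() tries the low 4GiB range first), so
 * report DMA_BIT_MASK(32) as the mask the device is required to support
 * via dma_get_required_mask().
 */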
3832 static u64 intel_get_required_mask(struct device *dev)
3833 {
3834 return DMA_BIT_MASK(32);
3835 }
3836
3837 static const struct dma_map_ops intel_dma_ops = {
3838 .alloc = intel_alloc_coherent,
3839 .free = intel_free_coherent,
3840 .map_sg = intel_map_sg,
3841 .unmap_sg = intel_unmap_sg,
3842 .map_page = intel_map_page,
3843 .unmap_page = intel_unmap_page,
3844 .map_resource = intel_map_resource,
3845 .unmap_resource = intel_unmap_resource,
3846 .dma_supported = dma_direct_supported,
3847 .mmap = dma_common_mmap,
3848 .get_sgtable = dma_common_get_sgtable,
3849 .alloc_pages = dma_common_alloc_pages,
3850 .free_pages = dma_common_free_pages,
3851 .get_required_mask = intel_get_required_mask,
3852 };
3853
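/*
 * Added note: the bounce_* DMA ops below are used for untrusted devices.
 * Buffers that are not VTD_PAGE_SIZE aligned are bounced through swiotlb
 * so that an IOMMU mapping never exposes memory adjacent to the DMA
 * buffer within the same VT-d page.
 */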
3854 static void
3855 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3856 enum dma_data_direction dir, enum dma_sync_target target)
3857 {
3858 struct dmar_domain *domain;
3859 phys_addr_t tlb_addr;
3860
3861 domain = find_domain(dev);
3862 if (WARN_ON(!domain))
3863 return;
3864
3865 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3866 if (is_swiotlb_buffer(tlb_addr))
3867 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3868 }
3869
3870 static dma_addr_t
3871 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3872 enum dma_data_direction dir, unsigned long attrs,
3873 u64 dma_mask)
3874 {
3875 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3876 struct dmar_domain *domain;
3877 struct intel_iommu *iommu;
3878 unsigned long iova_pfn;
3879 unsigned long nrpages;
3880 phys_addr_t tlb_addr;
3881 int prot = 0;
3882 int ret;
3883
3884 if (unlikely(attach_deferred(dev)))
3885 do_deferred_attach(dev);
3886
3887 domain = find_domain(dev);
3888
3889 if (WARN_ON(dir == DMA_NONE || !domain))
3890 return DMA_MAPPING_ERROR;
3891
3892 iommu = domain_get_iommu(domain);
3893 if (WARN_ON(!iommu))
3894 return DMA_MAPPING_ERROR;
3895
3896 nrpages = aligned_nrpages(0, size);
3897 iova_pfn = intel_alloc_iova(dev, domain,
3898 dma_to_mm_pfn(nrpages), dma_mask);
3899 if (!iova_pfn)
3900 return DMA_MAPPING_ERROR;
3901
3902 /*
3903 * Check if DMAR supports zero-length reads on write-only
3904 * mappings.
3905 */
3906 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3907 !cap_zlr(iommu->cap))
3908 prot |= DMA_PTE_READ;
3909 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3910 prot |= DMA_PTE_WRITE;
3911
3912 /*
3913 * If both the physical buffer start address and size are
3914 * page aligned, we don't need to use a bounce page.
3915 */
3916 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3917 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3918 aligned_size, dir, attrs);
3919 if (tlb_addr == DMA_MAPPING_ERROR) {
3920 goto swiotlb_error;
3921 } else {
3922 /* Cleanup the padding area. */
3923 void *padding_start = phys_to_virt(tlb_addr);
3924 size_t padding_size = aligned_size;
3925
3926 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3927 (dir == DMA_TO_DEVICE ||
3928 dir == DMA_BIDIRECTIONAL)) {
3929 padding_start += size;
3930 padding_size -= size;
3931 }
3932
3933 memset(padding_start, 0, padding_size);
3934 }
3935 } else {
3936 tlb_addr = paddr;
3937 }
3938
3939 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3940 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3941 if (ret)
3942 goto mapping_error;
3943
3944 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3945
3946 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3947
3948 mapping_error:
3949 if (is_swiotlb_buffer(tlb_addr))
3950 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3951 aligned_size, dir, attrs);
3952 swiotlb_error:
3953 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3954 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3955 size, (unsigned long long)paddr, dir);
3956
3957 return DMA_MAPPING_ERROR;
3958 }
3959
3960 static void
3961 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3962 enum dma_data_direction dir, unsigned long attrs)
3963 {
3964 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3965 struct dmar_domain *domain;
3966 phys_addr_t tlb_addr;
3967
3968 domain = find_domain(dev);
3969 if (WARN_ON(!domain))
3970 return;
3971
3972 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3973 if (WARN_ON(!tlb_addr))
3974 return;
3975
3976 intel_unmap(dev, dev_addr, size);
3977 if (is_swiotlb_buffer(tlb_addr))
3978 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3979 aligned_size, dir, attrs);
3980
3981 trace_bounce_unmap_single(dev, dev_addr, size);
3982 }
3983
3984 static dma_addr_t
3985 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3986 size_t size, enum dma_data_direction dir, unsigned long attrs)
3987 {
3988 return bounce_map_single(dev, page_to_phys(page) + offset,
3989 size, dir, attrs, *dev->dma_mask);
3990 }
3991
3992 static dma_addr_t
3993 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3994 enum dma_data_direction dir, unsigned long attrs)
3995 {
3996 return bounce_map_single(dev, phys_addr, size,
3997 dir, attrs, *dev->dma_mask);
3998 }
3999
4000 static void
4001 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4002 enum dma_data_direction dir, unsigned long attrs)
4003 {
4004 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4005 }
4006
4007 static void
4008 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4009 enum dma_data_direction dir, unsigned long attrs)
4010 {
4011 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4012 }
4013
4014 static void
4015 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4016 enum dma_data_direction dir, unsigned long attrs)
4017 {
4018 struct scatterlist *sg;
4019 int i;
4020
4021 for_each_sg(sglist, sg, nelems, i)
4022 bounce_unmap_page(dev, sg->dma_address,
4023 sg_dma_len(sg), dir, attrs);
4024 }
4025
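/*
 * Added note: map each scatterlist element through the bounce path; on
 * failure, unwind the elements that were already mapped.
 */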
4026 static int
4027 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4028 enum dma_data_direction dir, unsigned long attrs)
4029 {
4030 int i;
4031 struct scatterlist *sg;
4032
4033 for_each_sg(sglist, sg, nelems, i) {
4034 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4035 sg->offset, sg->length,
4036 dir, attrs);
4037 if (sg->dma_address == DMA_MAPPING_ERROR)
4038 goto out_unmap;
4039 sg_dma_len(sg) = sg->length;
4040 }
4041
4042 for_each_sg(sglist, sg, nelems, i)
4043 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4044
4045 return nelems;
4046
4047 out_unmap:
4048 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4049 return 0;
4050 }
4051
4052 static void
4053 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4054 size_t size, enum dma_data_direction dir)
4055 {
4056 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4057 }
4058
4059 static void
4060 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4061 size_t size, enum dma_data_direction dir)
4062 {
4063 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4064 }
4065
4066 static void
4067 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4068 int nelems, enum dma_data_direction dir)
4069 {
4070 struct scatterlist *sg;
4071 int i;
4072
4073 for_each_sg(sglist, sg, nelems, i)
4074 bounce_sync_single(dev, sg_dma_address(sg),
4075 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4076 }
4077
4078 static void
4079 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4080 int nelems, enum dma_data_direction dir)
4081 {
4082 struct scatterlist *sg;
4083 int i;
4084
4085 for_each_sg(sglist, sg, nelems, i)
4086 bounce_sync_single(dev, sg_dma_address(sg),
4087 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4088 }
4089
4090 static const struct dma_map_ops bounce_dma_ops = {
4091 .alloc = intel_alloc_coherent,
4092 .free = intel_free_coherent,
4093 .map_sg = bounce_map_sg,
4094 .unmap_sg = bounce_unmap_sg,
4095 .map_page = bounce_map_page,
4096 .unmap_page = bounce_unmap_page,
4097 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4098 .sync_single_for_device = bounce_sync_single_for_device,
4099 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4100 .sync_sg_for_device = bounce_sync_sg_for_device,
4101 .map_resource = bounce_map_resource,
4102 .unmap_resource = bounce_unmap_resource,
4103 .alloc_pages = dma_common_alloc_pages,
4104 .free_pages = dma_common_free_pages,
4105 .dma_supported = dma_direct_supported,
4106 };
4107
4108 static inline int iommu_domain_cache_init(void)
4109 {
4110 int ret = 0;
4111
4112 iommu_domain_cache = kmem_cache_create("iommu_domain",
4113 sizeof(struct dmar_domain),
4114 0,
4115 SLAB_HWCACHE_ALIGN,
4116
4117 NULL);
4118 if (!iommu_domain_cache) {
4119 pr_err("Couldn't create iommu_domain cache\n");
4120 ret = -ENOMEM;
4121 }
4122
4123 return ret;
4124 }
4125
4126 static inline int iommu_devinfo_cache_init(void)
4127 {
4128 int ret = 0;
4129
4130 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4131 sizeof(struct device_domain_info),
4132 0,
4133 SLAB_HWCACHE_ALIGN,
4134 NULL);
4135 if (!iommu_devinfo_cache) {
4136 pr_err("Couldn't create devinfo cache\n");
4137 ret = -ENOMEM;
4138 }
4139
4140 return ret;
4141 }
4142
4143 static int __init iommu_init_mempool(void)
4144 {
4145 int ret;
4146 ret = iova_cache_get();
4147 if (ret)
4148 return ret;
4149
4150 ret = iommu_domain_cache_init();
4151 if (ret)
4152 goto domain_error;
4153
4154 ret = iommu_devinfo_cache_init();
4155 if (!ret)
4156 return ret;
4157
4158 kmem_cache_destroy(iommu_domain_cache);
4159 domain_error:
4160 iova_cache_put();
4161
4162 return -ENOMEM;
4163 }
4164
4165 static void __init iommu_exit_mempool(void)
4166 {
4167 kmem_cache_destroy(iommu_devinfo_cache);
4168 kmem_cache_destroy(iommu_domain_cache);
4169 iova_cache_put();
4170 }
4171
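/*
 * Added note: mark DMAR units that have no devices in their scope, or
 * that cover only graphics devices when dmar_map_gfx is disabled, as
 * ignored so they are skipped during DMA remapping setup.
 */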
4172 static void __init init_no_remapping_devices(void)
4173 {
4174 struct dmar_drhd_unit *drhd;
4175 struct device *dev;
4176 int i;
4177
4178 for_each_drhd_unit(drhd) {
4179 if (!drhd->include_all) {
4180 for_each_active_dev_scope(drhd->devices,
4181 drhd->devices_cnt, i, dev)
4182 break;
4183 /* ignore DMAR unit if no devices exist */
4184 if (i == drhd->devices_cnt)
4185 drhd->ignored = 1;
4186 }
4187 }
4188
4189 for_each_active_drhd_unit(drhd) {
4190 if (drhd->include_all)
4191 continue;
4192
4193 for_each_active_dev_scope(drhd->devices,
4194 drhd->devices_cnt, i, dev)
4195 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4196 break;
4197 if (i < drhd->devices_cnt)
4198 continue;
4199
4200 /* This IOMMU has *only* gfx devices. Either bypass it or
4201 set the gfx_dedicated flag, as appropriate */
4202 drhd->gfx_dedicated = 1;
4203 if (!dmar_map_gfx)
4204 drhd->ignored = 1;
4205 }
4206 }
4207
4208 #ifdef CONFIG_SUSPEND
4209 static int init_iommu_hw(void)
4210 {
4211 struct dmar_drhd_unit *drhd;
4212 struct intel_iommu *iommu = NULL;
4213
4214 for_each_active_iommu(iommu, drhd)
4215 if (iommu->qi)
4216 dmar_reenable_qi(iommu);
4217
4218 for_each_iommu(iommu, drhd) {
4219 if (drhd->ignored) {
4220 /*
4221 * we always have to disable PMRs or DMA may fail on
4222 * this device
4223 */
4224 if (force_on)
4225 iommu_disable_protect_mem_regions(iommu);
4226 continue;
4227 }
4228
4229 iommu_flush_write_buffer(iommu);
4230 iommu_set_root_entry(iommu);
4231 iommu_enable_translation(iommu);
4232 iommu_disable_protect_mem_regions(iommu);
4233 }
4234
4235 return 0;
4236 }
4237
4238 static void iommu_flush_all(void)
4239 {
4240 struct dmar_drhd_unit *drhd;
4241 struct intel_iommu *iommu;
4242
4243 for_each_active_iommu(iommu, drhd) {
4244 iommu->flush.flush_context(iommu, 0, 0, 0,
4245 DMA_CCMD_GLOBAL_INVL);
4246 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4247 DMA_TLB_GLOBAL_FLUSH);
4248 }
4249 }
4250
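/*
 * Added note: save each IOMMU's fault-event registers and disable
 * translation before entering suspend; iommu_resume() restores them.
 */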
4251 static int iommu_suspend(void)
4252 {
4253 struct dmar_drhd_unit *drhd;
4254 struct intel_iommu *iommu = NULL;
4255 unsigned long flag;
4256
4257 for_each_active_iommu(iommu, drhd) {
4258 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4259 GFP_ATOMIC);
4260 if (!iommu->iommu_state)
4261 goto nomem;
4262 }
4263
4264 iommu_flush_all();
4265
4266 for_each_active_iommu(iommu, drhd) {
4267 iommu_disable_translation(iommu);
4268
4269 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4270
4271 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4272 readl(iommu->reg + DMAR_FECTL_REG);
4273 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4274 readl(iommu->reg + DMAR_FEDATA_REG);
4275 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4276 readl(iommu->reg + DMAR_FEADDR_REG);
4277 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4278 readl(iommu->reg + DMAR_FEUADDR_REG);
4279
4280 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4281 }
4282 return 0;
4283
4284 nomem:
4285 for_each_active_iommu(iommu, drhd)
4286 kfree(iommu->iommu_state);
4287
4288 return -ENOMEM;
4289 }
4290
4291 static void iommu_resume(void)
4292 {
4293 struct dmar_drhd_unit *drhd;
4294 struct intel_iommu *iommu = NULL;
4295 unsigned long flag;
4296
4297 if (init_iommu_hw()) {
4298 if (force_on)
4299 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4300 else
4301 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4302 return;
4303 }
4304
4305 for_each_active_iommu(iommu, drhd) {
4306
4307 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4308
4309 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4310 iommu->reg + DMAR_FECTL_REG);
4311 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4312 iommu->reg + DMAR_FEDATA_REG);
4313 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4314 iommu->reg + DMAR_FEADDR_REG);
4315 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4316 iommu->reg + DMAR_FEUADDR_REG);
4317
4318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4319 }
4320
4321 for_each_active_iommu(iommu, drhd)
4322 kfree(iommu->iommu_state);
4323 }
4324
4325 static struct syscore_ops iommu_syscore_ops = {
4326 .resume = iommu_resume,
4327 .suspend = iommu_suspend,
4328 };
4329
4330 static void __init init_iommu_pm_ops(void)
4331 {
4332 register_syscore_ops(&iommu_syscore_ops);
4333 }
4334
4335 #else
4336 static inline void init_iommu_pm_ops(void) {}
4337 #endif /* CONFIG_SUSPEND */
4338
4339 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4340 {
4341 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4342 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4343 rmrr->end_address <= rmrr->base_address ||
4344 arch_rmrr_sanity_check(rmrr))
4345 return -EINVAL;
4346
4347 return 0;
4348 }
4349
4350 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4351 {
4352 struct acpi_dmar_reserved_memory *rmrr;
4353 struct dmar_rmrr_unit *rmrru;
4354
4355 rmrr = (struct acpi_dmar_reserved_memory *)header;
4356 if (rmrr_sanity_check(rmrr)) {
4357 pr_warn(FW_BUG
4358 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4359 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4360 rmrr->base_address, rmrr->end_address,
4361 dmi_get_system_info(DMI_BIOS_VENDOR),
4362 dmi_get_system_info(DMI_BIOS_VERSION),
4363 dmi_get_system_info(DMI_PRODUCT_VERSION));
4364 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4365 }
4366
4367 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4368 if (!rmrru)
4369 goto out;
4370
4371 rmrru->hdr = header;
4372
4373 rmrru->base_address = rmrr->base_address;
4374 rmrru->end_address = rmrr->end_address;
4375
4376 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4377 ((void *)rmrr) + rmrr->header.length,
4378 &rmrru->devices_cnt);
4379 if (rmrru->devices_cnt && rmrru->devices == NULL)
4380 goto free_rmrru;
4381
4382 list_add(&rmrru->list, &dmar_rmrr_units);
4383
4384 return 0;
4385 free_rmrru:
4386 kfree(rmrru);
4387 out:
4388 return -ENOMEM;
4389 }
4390
4391 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4392 {
4393 struct dmar_atsr_unit *atsru;
4394 struct acpi_dmar_atsr *tmp;
4395
4396 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4397 dmar_rcu_check()) {
4398 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4399 if (atsr->segment != tmp->segment)
4400 continue;
4401 if (atsr->header.length != tmp->header.length)
4402 continue;
4403 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4404 return atsru;
4405 }
4406
4407 return NULL;
4408 }
4409
4410 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4411 {
4412 struct acpi_dmar_atsr *atsr;
4413 struct dmar_atsr_unit *atsru;
4414
4415 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4416 return 0;
4417
4418 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4419 atsru = dmar_find_atsr(atsr);
4420 if (atsru)
4421 return 0;
4422
4423 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4424 if (!atsru)
4425 return -ENOMEM;
4426
4427 /*
4428 * If memory is allocated from slab by ACPI _DSM method, we need to
4429 * copy the memory content because the memory buffer will be freed
4430 * on return.
4431 */
4432 atsru->hdr = (void *)(atsru + 1);
4433 memcpy(atsru->hdr, hdr, hdr->length);
4434 atsru->include_all = atsr->flags & 0x1;
4435 if (!atsru->include_all) {
4436 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4437 (void *)atsr + atsr->header.length,
4438 &atsru->devices_cnt);
4439 if (atsru->devices_cnt && atsru->devices == NULL) {
4440 kfree(atsru);
4441 return -ENOMEM;
4442 }
4443 }
4444
4445 list_add_rcu(&atsru->list, &dmar_atsr_units);
4446
4447 return 0;
4448 }
4449
4450 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4451 {
4452 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4453 kfree(atsru);
4454 }
4455
4456 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4457 {
4458 struct acpi_dmar_atsr *atsr;
4459 struct dmar_atsr_unit *atsru;
4460
4461 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4462 atsru = dmar_find_atsr(atsr);
4463 if (atsru) {
4464 list_del_rcu(&atsru->list);
4465 synchronize_rcu();
4466 intel_iommu_free_atsr(atsru);
4467 }
4468
4469 return 0;
4470 }
4471
4472 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4473 {
4474 int i;
4475 struct device *dev;
4476 struct acpi_dmar_atsr *atsr;
4477 struct dmar_atsr_unit *atsru;
4478
4479 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4480 atsru = dmar_find_atsr(atsr);
4481 if (!atsru)
4482 return 0;
4483
4484 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4485 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4486 i, dev)
4487 return -EBUSY;
4488 }
4489
4490 return 0;
4491 }
4492
4493 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4494 {
4495 int sp, ret;
4496 struct intel_iommu *iommu = dmaru->iommu;
4497
4498 if (g_iommus[iommu->seq_id])
4499 return 0;
4500
4501 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4502 pr_warn("%s: Doesn't support hardware pass through.\n",
4503 iommu->name);
4504 return -ENXIO;
4505 }
4506 if (!ecap_sc_support(iommu->ecap) &&
4507 domain_update_iommu_snooping(iommu)) {
4508 pr_warn("%s: Doesn't support snooping.\n",
4509 iommu->name);
4510 return -ENXIO;
4511 }
4512 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4513 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4514 pr_warn("%s: Doesn't support large page.\n",
4515 iommu->name);
4516 return -ENXIO;
4517 }
4518
4519 /*
4520 * Disable translation if already enabled prior to OS handover.
4521 */
4522 if (iommu->gcmd & DMA_GCMD_TE)
4523 iommu_disable_translation(iommu);
4524
4525 g_iommus[iommu->seq_id] = iommu;
4526 ret = iommu_init_domains(iommu);
4527 if (ret == 0)
4528 ret = iommu_alloc_root_entry(iommu);
4529 if (ret)
4530 goto out;
4531
4532 intel_svm_check(iommu);
4533
4534 if (dmaru->ignored) {
4535 /*
4536 * we always have to disable PMRs or DMA may fail on this device
4537 */
4538 if (force_on)
4539 iommu_disable_protect_mem_regions(iommu);
4540 return 0;
4541 }
4542
4543 intel_iommu_init_qi(iommu);
4544 iommu_flush_write_buffer(iommu);
4545
4546 #ifdef CONFIG_INTEL_IOMMU_SVM
4547 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4548 ret = intel_svm_enable_prq(iommu);
4549 if (ret)
4550 goto disable_iommu;
4551 }
4552 #endif
4553 ret = dmar_set_interrupt(iommu);
4554 if (ret)
4555 goto disable_iommu;
4556
4557 iommu_set_root_entry(iommu);
4558 iommu_enable_translation(iommu);
4559
4560 iommu_disable_protect_mem_regions(iommu);
4561 return 0;
4562
4563 disable_iommu:
4564 disable_dmar_iommu(iommu);
4565 out:
4566 free_dmar_iommu(iommu);
4567 return ret;
4568 }
4569
4570 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4571 {
4572 int ret = 0;
4573 struct intel_iommu *iommu = dmaru->iommu;
4574
4575 if (!intel_iommu_enabled)
4576 return 0;
4577 if (iommu == NULL)
4578 return -EINVAL;
4579
4580 if (insert) {
4581 ret = intel_iommu_add(dmaru);
4582 } else {
4583 disable_dmar_iommu(iommu);
4584 free_dmar_iommu(iommu);
4585 }
4586
4587 return ret;
4588 }
4589
4590 static void intel_iommu_free_dmars(void)
4591 {
4592 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4593 struct dmar_atsr_unit *atsru, *atsr_n;
4594
4595 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4596 list_del(&rmrru->list);
4597 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4598 kfree(rmrru);
4599 }
4600
4601 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4602 list_del(&atsru->list);
4603 intel_iommu_free_atsr(atsru);
4604 }
4605 }
4606
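/*
 * Added note: returns non-zero if ATS is allowed for @dev, i.e. the device
 * is integrated (no parent bridge) or its PCIe root port is listed in an
 * ATSR unit for the device's PCI segment.
 */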
4607 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4608 {
4609 int i, ret = 1;
4610 struct pci_bus *bus;
4611 struct pci_dev *bridge = NULL;
4612 struct device *tmp;
4613 struct acpi_dmar_atsr *atsr;
4614 struct dmar_atsr_unit *atsru;
4615
4616 dev = pci_physfn(dev);
4617 for (bus = dev->bus; bus; bus = bus->parent) {
4618 bridge = bus->self;
4619 /* If it's an integrated device, allow ATS */
4620 if (!bridge)
4621 return 1;
4622 /* Connected via non-PCIe: no ATS */
4623 if (!pci_is_pcie(bridge) ||
4624 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4625 return 0;
4626 /* If we found the root port, look it up in the ATSR */
4627 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4628 break;
4629 }
4630
4631 rcu_read_lock();
4632 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4633 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4634 if (atsr->segment != pci_domain_nr(dev->bus))
4635 continue;
4636
4637 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4638 if (tmp == &bridge->dev)
4639 goto out;
4640
4641 if (atsru->include_all)
4642 goto out;
4643 }
4644 ret = 0;
4645 out:
4646 rcu_read_unlock();
4647
4648 return ret;
4649 }
4650
4651 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4652 {
4653 int ret;
4654 struct dmar_rmrr_unit *rmrru;
4655 struct dmar_atsr_unit *atsru;
4656 struct acpi_dmar_atsr *atsr;
4657 struct acpi_dmar_reserved_memory *rmrr;
4658
4659 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4660 return 0;
4661
4662 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4663 rmrr = container_of(rmrru->hdr,
4664 struct acpi_dmar_reserved_memory, header);
4665 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4666 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4667 ((void *)rmrr) + rmrr->header.length,
4668 rmrr->segment, rmrru->devices,
4669 rmrru->devices_cnt);
4670 if (ret < 0)
4671 return ret;
4672 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4673 dmar_remove_dev_scope(info, rmrr->segment,
4674 rmrru->devices, rmrru->devices_cnt);
4675 }
4676 }
4677
4678 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4679 if (atsru->include_all)
4680 continue;
4681
4682 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4683 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4684 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4685 (void *)atsr + atsr->header.length,
4686 atsr->segment, atsru->devices,
4687 atsru->devices_cnt);
4688 if (ret > 0)
4689 break;
4690 else if (ret < 0)
4691 return ret;
4692 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4693 if (dmar_remove_dev_scope(info, atsr->segment,
4694 atsru->devices, atsru->devices_cnt))
4695 break;
4696 }
4697 }
4698
4699 return 0;
4700 }
4701
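/*
 * Added note: keep the static identity (si_domain) mappings in sync with
 * memory hotplug; map new ranges when memory goes online and unmap/flush
 * them when memory is taken offline again.
 */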
4702 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4703 unsigned long val, void *v)
4704 {
4705 struct memory_notify *mhp = v;
4706 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4707 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4708 mhp->nr_pages - 1);
4709
4710 switch (val) {
4711 case MEM_GOING_ONLINE:
4712 if (iommu_domain_identity_map(si_domain,
4713 start_vpfn, last_vpfn)) {
4714 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4715 start_vpfn, last_vpfn);
4716 return NOTIFY_BAD;
4717 }
4718 break;
4719
4720 case MEM_OFFLINE:
4721 case MEM_CANCEL_ONLINE:
4722 {
4723 struct dmar_drhd_unit *drhd;
4724 struct intel_iommu *iommu;
4725 struct page *freelist;
4726
4727 freelist = domain_unmap(si_domain,
4728 start_vpfn, last_vpfn);
4729
4730 rcu_read_lock();
4731 for_each_active_iommu(iommu, drhd)
4732 iommu_flush_iotlb_psi(iommu, si_domain,
4733 start_vpfn, mhp->nr_pages,
4734 !freelist, 0);
4735 rcu_read_unlock();
4736 dma_free_pagelist(freelist);
4737 }
4738 break;
4739 }
4740
4741 return NOTIFY_OK;
4742 }
4743
4744 static struct notifier_block intel_iommu_memory_nb = {
4745 .notifier_call = intel_iommu_memory_notifier,
4746 .priority = 0
4747 };
4748
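/*
 * Added note: drop the per-CPU cached IOVA ranges of every DMA-API domain
 * when a CPU goes offline, so those ranges become allocatable again.
 */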
4749 static void free_all_cpu_cached_iovas(unsigned int cpu)
4750 {
4751 int i;
4752
4753 for (i = 0; i < g_num_of_iommus; i++) {
4754 struct intel_iommu *iommu = g_iommus[i];
4755 struct dmar_domain *domain;
4756 int did;
4757
4758 if (!iommu)
4759 continue;
4760
4761 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4762 domain = get_iommu_domain(iommu, (u16)did);
4763
4764 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4765 continue;
4766
4767 free_cpu_cached_iovas(cpu, &domain->iovad);
4768 }
4769 }
4770 }
4771
4772 static int intel_iommu_cpu_dead(unsigned int cpu)
4773 {
4774 free_all_cpu_cached_iovas(cpu);
4775 return 0;
4776 }
4777
4778 static void intel_disable_iommus(void)
4779 {
4780 struct intel_iommu *iommu = NULL;
4781 struct dmar_drhd_unit *drhd;
4782
4783 for_each_iommu(iommu, drhd)
4784 iommu_disable_translation(iommu);
4785 }
4786
4787 void intel_iommu_shutdown(void)
4788 {
4789 struct dmar_drhd_unit *drhd;
4790 struct intel_iommu *iommu = NULL;
4791
4792 if (no_iommu || dmar_disabled)
4793 return;
4794
4795 down_write(&dmar_global_lock);
4796
4797 /* Disable PMRs explicitly here. */
4798 for_each_iommu(iommu, drhd)
4799 iommu_disable_protect_mem_regions(iommu);
4800
4801 /* Make sure the IOMMUs are switched off */
4802 intel_disable_iommus();
4803
4804 up_write(&dmar_global_lock);
4805 }
4806
4807 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4808 {
4809 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4810
4811 return container_of(iommu_dev, struct intel_iommu, iommu);
4812 }
4813
4814 static ssize_t intel_iommu_show_version(struct device *dev,
4815 struct device_attribute *attr,
4816 char *buf)
4817 {
4818 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4819 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4820 return sprintf(buf, "%d:%d\n",
4821 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4822 }
4823 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4824
4825 static ssize_t intel_iommu_show_address(struct device *dev,
4826 struct device_attribute *attr,
4827 char *buf)
4828 {
4829 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4830 return sprintf(buf, "%llx\n", iommu->reg_phys);
4831 }
4832 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4833
4834 static ssize_t intel_iommu_show_cap(struct device *dev,
4835 struct device_attribute *attr,
4836 char *buf)
4837 {
4838 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4839 return sprintf(buf, "%llx\n", iommu->cap);
4840 }
4841 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4842
4843 static ssize_t intel_iommu_show_ecap(struct device *dev,
4844 struct device_attribute *attr,
4845 char *buf)
4846 {
4847 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4848 return sprintf(buf, "%llx\n", iommu->ecap);
4849 }
4850 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4851
4852 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4853 struct device_attribute *attr,
4854 char *buf)
4855 {
4856 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4857 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4858 }
4859 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4860
4861 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4862 struct device_attribute *attr,
4863 char *buf)
4864 {
4865 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4866 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4867 cap_ndoms(iommu->cap)));
4868 }
4869 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4870
4871 static struct attribute *intel_iommu_attrs[] = {
4872 &dev_attr_version.attr,
4873 &dev_attr_address.attr,
4874 &dev_attr_cap.attr,
4875 &dev_attr_ecap.attr,
4876 &dev_attr_domains_supported.attr,
4877 &dev_attr_domains_used.attr,
4878 NULL,
4879 };
4880
4881 static struct attribute_group intel_iommu_group = {
4882 .name = "intel-iommu",
4883 .attrs = intel_iommu_attrs,
4884 };
4885
4886 const struct attribute_group *intel_iommu_groups[] = {
4887 &intel_iommu_group,
4888 NULL,
4889 };
4890
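/* Added note: returns true if any PCI device is marked external-facing. */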
4891 static inline bool has_external_pci(void)
4892 {
4893 struct pci_dev *pdev = NULL;
4894
4895 for_each_pci_dev(pdev)
4896 if (pdev->external_facing) {
4897 pci_dev_put(pdev);
4898 return true;
4899 }
4900
4901 return false;
4902 }
4903
4904 static int __init platform_optin_force_iommu(void)
4905 {
4906 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4907 return 0;
4908
4909 if (no_iommu || dmar_disabled)
4910 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4911
4912 /*
4913 * If Intel-IOMMU is disabled by default, we will apply identity
4914 * map for all devices except those marked as being untrusted.
4915 */
4916 if (dmar_disabled)
4917 iommu_set_default_passthrough(false);
4918
4919 dmar_disabled = 0;
4920 no_iommu = 0;
4921
4922 return 1;
4923 }
4924
4925 static int __init probe_acpi_namespace_devices(void)
4926 {
4927 struct dmar_drhd_unit *drhd;
4928 /* To avoid a -Wunused-but-set-variable warning. */
4929 struct intel_iommu *iommu __maybe_unused;
4930 struct device *dev;
4931 int i, ret = 0;
4932
4933 for_each_active_iommu(iommu, drhd) {
4934 for_each_active_dev_scope(drhd->devices,
4935 drhd->devices_cnt, i, dev) {
4936 struct acpi_device_physical_node *pn;
4937 struct iommu_group *group;
4938 struct acpi_device *adev;
4939
4940 if (dev->bus != &acpi_bus_type)
4941 continue;
4942
4943 adev = to_acpi_device(dev);
4944 mutex_lock(&adev->physical_node_lock);
4945 list_for_each_entry(pn,
4946 &adev->physical_node_list, node) {
4947 group = iommu_group_get(pn->dev);
4948 if (group) {
4949 iommu_group_put(group);
4950 continue;
4951 }
4952
4953 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4954 ret = iommu_probe_device(pn->dev);
4955 if (ret)
4956 break;
4957 }
4958 mutex_unlock(&adev->physical_node_lock);
4959
4960 if (ret)
4961 return ret;
4962 }
4963 }
4964
4965 return 0;
4966 }
4967
4968 int __init intel_iommu_init(void)
4969 {
4970 int ret = -ENODEV;
4971 struct dmar_drhd_unit *drhd;
4972 struct intel_iommu *iommu;
4973
4974 /*
4975 * Intel IOMMU is required for a TXT/tboot launch or platform
4976 * opt in, so enforce that.
4977 */
4978 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4979 platform_optin_force_iommu();
4980
4981 if (iommu_init_mempool()) {
4982 if (force_on)
4983 panic("tboot: Failed to initialize iommu memory\n");
4984 return -ENOMEM;
4985 }
4986
4987 down_write(&dmar_global_lock);
4988 if (dmar_table_init()) {
4989 if (force_on)
4990 panic("tboot: Failed to initialize DMAR table\n");
4991 goto out_free_dmar;
4992 }
4993
4994 if (dmar_dev_scope_init() < 0) {
4995 if (force_on)
4996 panic("tboot: Failed to initialize DMAR device scope\n");
4997 goto out_free_dmar;
4998 }
4999
5000 up_write(&dmar_global_lock);
5001
5002 /*
5003 * The bus notifier takes the dmar_global_lock, so lockdep will
5004 * complain later when we register it under the lock.
5005 */
5006 dmar_register_bus_notifier();
5007
5008 down_write(&dmar_global_lock);
5009
5010 if (!no_iommu)
5011 intel_iommu_debugfs_init();
5012
5013 if (no_iommu || dmar_disabled) {
5014 /*
5015 * We exit the function here to ensure IOMMU's remapping and
5016 * mempool aren't setup, which means that the IOMMU's PMRs
5017 * won't be disabled via the call to init_dmars(). So disable
5018 * it explicitly here. The PMRs were setup by tboot prior to
5019 * calling SENTER, but the kernel is expected to reset/tear
5020 * down the PMRs.
5021 */
5022 if (intel_iommu_tboot_noforce) {
5023 for_each_iommu(iommu, drhd)
5024 iommu_disable_protect_mem_regions(iommu);
5025 }
5026
5027 /*
5028 * Make sure the IOMMUs are switched off, even when we
5029 * boot into a kexec kernel and the previous kernel left
5030 * them enabled
5031 */
5032 intel_disable_iommus();
5033 goto out_free_dmar;
5034 }
5035
5036 if (list_empty(&dmar_rmrr_units))
5037 pr_info("No RMRR found\n");
5038
5039 if (list_empty(&dmar_atsr_units))
5040 pr_info("No ATSR found\n");
5041
5042 if (dmar_init_reserved_ranges()) {
5043 if (force_on)
5044 panic("tboot: Failed to reserve iommu ranges\n");
5045 goto out_free_reserved_range;
5046 }
5047
5048 if (dmar_map_gfx)
5049 intel_iommu_gfx_mapped = 1;
5050
5051 init_no_remapping_devices();
5052
5053 ret = init_dmars();
5054 if (ret) {
5055 if (force_on)
5056 panic("tboot: Failed to initialize DMARs\n");
5057 pr_err("Initialization failed\n");
5058 goto out_free_reserved_range;
5059 }
5060 up_write(&dmar_global_lock);
5061
5062 init_iommu_pm_ops();
5063
5064 down_read(&dmar_global_lock);
5065 for_each_active_iommu(iommu, drhd) {
5066 iommu_device_sysfs_add(&iommu->iommu, NULL,
5067 intel_iommu_groups,
5068 "%s", iommu->name);
5069 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5070 iommu_device_register(&iommu->iommu);
5071 }
5072 up_read(&dmar_global_lock);
5073
5074 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5075 if (si_domain && !hw_pass_through)
5076 register_memory_notifier(&intel_iommu_memory_nb);
5077 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5078 intel_iommu_cpu_dead);
5079
5080 down_read(&dmar_global_lock);
5081 if (probe_acpi_namespace_devices())
5082 pr_warn("ACPI name space devices didn't probe correctly\n");
5083
5084 /* Finally, we enable the DMA remapping hardware. */
5085 for_each_iommu(iommu, drhd) {
5086 if (!drhd->ignored && !translation_pre_enabled(iommu))
5087 iommu_enable_translation(iommu);
5088
5089 iommu_disable_protect_mem_regions(iommu);
5090 }
5091 up_read(&dmar_global_lock);
5092
5093 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5094
5095 intel_iommu_enabled = 1;
5096
5097 return 0;
5098
5099 out_free_reserved_range:
5100 put_iova_domain(&reserved_iova_list);
5101 out_free_dmar:
5102 intel_iommu_free_dmars();
5103 up_write(&dmar_global_lock);
5104 iommu_exit_mempool();
5105 return ret;
5106 }
5107
5108 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5109 {
5110 struct intel_iommu *iommu = opaque;
5111
5112 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5113 return 0;
5114 }
5115
5116 /*
5117 * NB - intel-iommu lacks any sort of reference counting for the users of
5118 * dependent devices. If multiple endpoints have intersecting dependent
5119 * devices, unbinding the driver from any one of them will possibly leave
5120 * the others unable to operate.
5121 */
5122 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5123 {
5124 if (!iommu || !dev || !dev_is_pci(dev))
5125 return;
5126
5127 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5128 }
5129
5130 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5131 {
5132 struct dmar_domain *domain;
5133 struct intel_iommu *iommu;
5134 unsigned long flags;
5135
5136 assert_spin_locked(&device_domain_lock);
5137
5138 if (WARN_ON(!info))
5139 return;
5140
5141 iommu = info->iommu;
5142 domain = info->domain;
5143
5144 if (info->dev) {
5145 if (dev_is_pci(info->dev) && sm_supported(iommu))
5146 intel_pasid_tear_down_entry(iommu, info->dev,
5147 PASID_RID2PASID, false);
5148
5149 iommu_disable_dev_iotlb(info);
5150 if (!dev_is_real_dma_subdevice(info->dev))
5151 domain_context_clear(iommu, info->dev);
5152 intel_pasid_free_table(info->dev);
5153 }
5154
5155 unlink_domain_info(info);
5156
5157 spin_lock_irqsave(&iommu->lock, flags);
5158 domain_detach_iommu(domain, iommu);
5159 spin_unlock_irqrestore(&iommu->lock, flags);
5160
5161 free_devinfo_mem(info);
5162 }
5163
5164 static void dmar_remove_one_dev_info(struct device *dev)
5165 {
5166 struct device_domain_info *info;
5167 unsigned long flags;
5168
5169 spin_lock_irqsave(&device_domain_lock, flags);
5170 info = get_domain_info(dev);
5171 if (info)
5172 __dmar_remove_one_dev_info(info);
5173 spin_unlock_irqrestore(&device_domain_lock, flags);
5174 }
5175
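/*
 * Added note: initialize a domain allocated through the IOMMU API; compute
 * the adjusted guest address width (AGAW) and allocate the top-level page
 * table for it.
 */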
5176 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5177 {
5178 int adjust_width;
5179
5180 /* calculate AGAW */
5181 domain->gaw = guest_width;
5182 adjust_width = guestwidth_to_adjustwidth(guest_width);
5183 domain->agaw = width_to_agaw(adjust_width);
5184
5185 domain->iommu_coherency = 0;
5186 domain->iommu_snooping = 0;
5187 domain->iommu_superpage = 0;
5188 domain->max_addr = 0;
5189
5190 /* always allocate the top pgd */
5191 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5192 if (!domain->pgd)
5193 return -ENOMEM;
5194 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5195 return 0;
5196 }
5197
5198 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5199 {
5200 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5201 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5202
5203 if (!intel_iommu_strict &&
5204 init_iova_flush_queue(&dmar_domain->iovad,
5205 iommu_flush_iova, iova_entry_free))
5206 pr_info("iova flush queue initialization failed\n");
5207 }
5208
5209 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5210 {
5211 struct dmar_domain *dmar_domain;
5212 struct iommu_domain *domain;
5213
5214 switch (type) {
5215 case IOMMU_DOMAIN_DMA:
5216 case IOMMU_DOMAIN_UNMANAGED:
5217 dmar_domain = alloc_domain(0);
5218 if (!dmar_domain) {
5219 pr_err("Can't allocate dmar_domain\n");
5220 return NULL;
5221 }
5222 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5223 pr_err("Domain initialization failed\n");
5224 domain_exit(dmar_domain);
5225 return NULL;
5226 }
5227
5228 if (type == IOMMU_DOMAIN_DMA)
5229 intel_init_iova_domain(dmar_domain);
5230
5231 domain = &dmar_domain->domain;
5232 domain->geometry.aperture_start = 0;
5233 domain->geometry.aperture_end =
5234 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5235 domain->geometry.force_aperture = true;
5236
5237 return domain;
5238 case IOMMU_DOMAIN_IDENTITY:
5239 return &si_domain->domain;
5240 default:
5241 return NULL;
5242 }
5243
5244 return NULL;
5245 }
5246
5247 static void intel_iommu_domain_free(struct iommu_domain *domain)
5248 {
5249 if (domain != &si_domain->domain)
5250 domain_exit(to_dmar_domain(domain));
5251 }
5252
5253 /*
5254 * Check whether a @domain could be attached to the @dev through the
5255 * aux-domain attach/detach APIs.
5256 */
5257 static inline bool
5258 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5259 {
5260 struct device_domain_info *info = get_domain_info(dev);
5261
5262 return info && info->auxd_enabled &&
5263 domain->type == IOMMU_DOMAIN_UNMANAGED;
5264 }
5265
5266 static void auxiliary_link_device(struct dmar_domain *domain,
5267 struct device *dev)
5268 {
5269 struct device_domain_info *info = get_domain_info(dev);
5270
5271 assert_spin_locked(&device_domain_lock);
5272 if (WARN_ON(!info))
5273 return;
5274
5275 domain->auxd_refcnt++;
5276 list_add(&domain->auxd, &info->auxiliary_domains);
5277 }
5278
5279 static void auxiliary_unlink_device(struct dmar_domain *domain,
5280 struct device *dev)
5281 {
5282 struct device_domain_info *info = get_domain_info(dev);
5283
5284 assert_spin_locked(&device_domain_lock);
5285 if (WARN_ON(!info))
5286 return;
5287
5288 list_del(&domain->auxd);
5289 domain->auxd_refcnt--;
5290
5291 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5292 ioasid_free(domain->default_pasid);
5293 }
5294
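/*
 * Added note: attach @domain to @dev as an auxiliary domain; allocate a
 * default PASID for the domain if needed and program a PASID-table entry
 * so the domain's page table is used for that PASID only.
 */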
5295 static int aux_domain_add_dev(struct dmar_domain *domain,
5296 struct device *dev)
5297 {
5298 int ret;
5299 unsigned long flags;
5300 struct intel_iommu *iommu;
5301
5302 iommu = device_to_iommu(dev, NULL, NULL);
5303 if (!iommu)
5304 return -ENODEV;
5305
5306 if (domain->default_pasid <= 0) {
5307 u32 pasid;
5308
5309 /* No private data needed for the default pasid */
5310 pasid = ioasid_alloc(NULL, PASID_MIN,
5311 pci_max_pasids(to_pci_dev(dev)) - 1,
5312 NULL);
5313 if (pasid == INVALID_IOASID) {
5314 pr_err("Can't allocate default pasid\n");
5315 return -ENODEV;
5316 }
5317 domain->default_pasid = pasid;
5318 }
5319
5320 spin_lock_irqsave(&device_domain_lock, flags);
5321 /*
5322 * iommu->lock must be held to attach domain to iommu and setup the
5323 * pasid entry for second level translation.
5324 */
5325 spin_lock(&iommu->lock);
5326 ret = domain_attach_iommu(domain, iommu);
5327 if (ret)
5328 goto attach_failed;
5329
5330 /* Setup the PASID entry for mediated devices: */
5331 if (domain_use_first_level(domain))
5332 ret = domain_setup_first_level(iommu, domain, dev,
5333 domain->default_pasid);
5334 else
5335 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5336 domain->default_pasid);
5337 if (ret)
5338 goto table_failed;
5339 spin_unlock(&iommu->lock);
5340
5341 auxiliary_link_device(domain, dev);
5342
5343 spin_unlock_irqrestore(&device_domain_lock, flags);
5344
5345 return 0;
5346
5347 table_failed:
5348 domain_detach_iommu(domain, iommu);
5349 attach_failed:
5350 spin_unlock(&iommu->lock);
5351 spin_unlock_irqrestore(&device_domain_lock, flags);
5352 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5353 ioasid_free(domain->default_pasid);
5354
5355 return ret;
5356 }
5357
5358 static void aux_domain_remove_dev(struct dmar_domain *domain,
5359 struct device *dev)
5360 {
5361 struct device_domain_info *info;
5362 struct intel_iommu *iommu;
5363 unsigned long flags;
5364
5365 if (!is_aux_domain(dev, &domain->domain))
5366 return;
5367
5368 spin_lock_irqsave(&device_domain_lock, flags);
5369 info = get_domain_info(dev);
5370 iommu = info->iommu;
5371
5372 auxiliary_unlink_device(domain, dev);
5373
5374 spin_lock(&iommu->lock);
5375 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5376 domain_detach_iommu(domain, iommu);
5377 spin_unlock(&iommu->lock);
5378
5379 spin_unlock_irqrestore(&device_domain_lock, flags);
5380 }
5381
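/*
 * Added note: verify that the IOMMU's address width can cover the domain's
 * highest mapped address, and drop unused upper page-table levels if the
 * domain was built with a wider AGAW than this IOMMU supports.
 */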
5382 static int prepare_domain_attach_device(struct iommu_domain *domain,
5383 struct device *dev)
5384 {
5385 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5386 struct intel_iommu *iommu;
5387 int addr_width;
5388
5389 iommu = device_to_iommu(dev, NULL, NULL);
5390 if (!iommu)
5391 return -ENODEV;
5392
5393 /* check if this iommu agaw is sufficient for max mapped address */
5394 addr_width = agaw_to_width(iommu->agaw);
5395 if (addr_width > cap_mgaw(iommu->cap))
5396 addr_width = cap_mgaw(iommu->cap);
5397
5398 if (dmar_domain->max_addr > (1LL << addr_width)) {
5399 dev_err(dev, "%s: iommu width (%d) is not "
5400 "sufficient for the mapped address (%llx)\n",
5401 __func__, addr_width, dmar_domain->max_addr);
5402 return -EFAULT;
5403 }
5404 dmar_domain->gaw = addr_width;
5405
5406 /*
5407 * Knock out extra levels of page tables if necessary
5408 */
5409 while (iommu->agaw < dmar_domain->agaw) {
5410 struct dma_pte *pte;
5411
5412 pte = dmar_domain->pgd;
5413 if (dma_pte_present(pte)) {
5414 dmar_domain->pgd = (struct dma_pte *)
5415 phys_to_virt(dma_pte_addr(pte));
5416 free_pgtable_page(pte);
5417 }
5418 dmar_domain->agaw--;
5419 }
5420
5421 return 0;
5422 }
5423
5424 static int intel_iommu_attach_device(struct iommu_domain *domain,
5425 struct device *dev)
5426 {
5427 int ret;
5428
5429 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5430 device_is_rmrr_locked(dev)) {
5431 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5432 return -EPERM;
5433 }
5434
5435 if (is_aux_domain(dev, domain))
5436 return -EPERM;
5437
5438 /* normally dev is not mapped */
5439 if (unlikely(domain_context_mapped(dev))) {
5440 struct dmar_domain *old_domain;
5441
5442 old_domain = find_domain(dev);
5443 if (old_domain)
5444 dmar_remove_one_dev_info(dev);
5445 }
5446
5447 ret = prepare_domain_attach_device(domain, dev);
5448 if (ret)
5449 return ret;
5450
5451 return domain_add_dev_info(to_dmar_domain(domain), dev);
5452 }
5453
5454 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5455 struct device *dev)
5456 {
5457 int ret;
5458
5459 if (!is_aux_domain(dev, domain))
5460 return -EPERM;
5461
5462 ret = prepare_domain_attach_device(domain, dev);
5463 if (ret)
5464 return ret;
5465
5466 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5467 }
5468
5469 static void intel_iommu_detach_device(struct iommu_domain *domain,
5470 struct device *dev)
5471 {
5472 dmar_remove_one_dev_info(dev);
5473 }
5474
5475 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5476 struct device *dev)
5477 {
5478 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5479 }
5480
5481 #ifdef CONFIG_INTEL_IOMMU_SVM
5482 /*
5483 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5484 * VT-d granularity. Invalidation is typically included in the unmap operation
5485 * as a result of DMA or VFIO unmap. However, for assigned devices guest
5486 * owns the first level page tables. Invalidations of translation caches in the
5487 * guest are trapped and passed down to the host.
5488 *
5489 * vIOMMU in the guest will only expose first level page tables, therefore
5490 * we do not support IOTLB granularity for requests without PASID (second level).
5491 *
5492 * For example, to find the VT-d granularity encoding for IOTLB
5493 * type and page selective granularity within PASID:
5494 * X: indexed by iommu cache type
5495 * Y: indexed by enum iommu_inv_granularity
5496 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5497 */
5498
5499 static const int
5500 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5501 /*
5502 * PASID based IOTLB invalidation: PASID selective (per PASID),
5503 * page selective (address granularity)
5504 */
5505 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5506 /* PASID based dev TLBs */
5507 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5508 /* PASID cache */
5509 {-EINVAL, -EINVAL, -EINVAL}
5510 };
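/*
 * For example, per the row comment above, the
 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] lookup resolves to
 * QI_GRAN_PSI_PASID, i.e. a page-selective flush within a PASID.
 */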
5511
5512 static inline int to_vtd_granularity(int type, int granu)
5513 {
5514 return inv_type_granu_table[type][granu];
5515 }
5516
5517 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5518 {
5519 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5520
5521 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5522 * IOMMU cache invalidate API passes granu_size in bytes, and the number
5523 * of granules of that size that are contiguous in memory.
5524 */
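	/* e.g. granu_size = 4KB with nr_granules = 512 covers 2MB, so
	 * nr_pages = 512 and the returned order is 9, per the encoding above.
	 */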
5525 return order_base_2(nr_pages);
5526 }
5527
5528 static int
5529 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5530 struct iommu_cache_invalidate_info *inv_info)
5531 {
5532 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5533 struct device_domain_info *info;
5534 struct intel_iommu *iommu;
5535 unsigned long flags;
5536 int cache_type;
5537 u8 bus, devfn;
5538 u16 did, sid;
5539 int ret = 0;
5540 u64 size = 0;
5541
5542 if (!inv_info || !dmar_domain)
5543 return -EINVAL;
5544
5545 if (!dev || !dev_is_pci(dev))
5546 return -ENODEV;
5547
5548 iommu = device_to_iommu(dev, &bus, &devfn);
5549 if (!iommu)
5550 return -ENODEV;
5551
5552 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5553 return -EINVAL;
5554
5555 spin_lock_irqsave(&device_domain_lock, flags);
5556 spin_lock(&iommu->lock);
5557 info = get_domain_info(dev);
5558 if (!info) {
5559 ret = -EINVAL;
5560 goto out_unlock;
5561 }
5562 did = dmar_domain->iommu_did[iommu->seq_id];
5563 sid = PCI_DEVID(bus, devfn);
5564
5565 /* Size is only valid in address selective invalidation */
5566 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5567 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5568 inv_info->granu.addr_info.nb_granules);
5569
5570 for_each_set_bit(cache_type,
5571 (unsigned long *)&inv_info->cache,
5572 IOMMU_CACHE_INV_TYPE_NR) {
5573 int granu = 0;
5574 u64 pasid = 0;
5575 u64 addr = 0;
5576
5577 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5578 if (granu == -EINVAL) {
5579 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5580 cache_type, inv_info->granularity);
5581 break;
5582 }
5583
5584 /*
5585 * PASID is stored in different locations based on the
5586 * granularity.
5587 */
5588 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5589 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5590 pasid = inv_info->granu.pasid_info.pasid;
5591 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5592 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5593 pasid = inv_info->granu.addr_info.pasid;
5594
5595 switch (BIT(cache_type)) {
5596 case IOMMU_CACHE_INV_TYPE_IOTLB:
5597 /* HW will ignore LSB bits based on address mask */
5598 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5599 size &&
5600 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5601 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5602 inv_info->granu.addr_info.addr, size);
5603 }
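			/*
			 * e.g. a size order of 9 (2MB) requires a 2MB-aligned
			 * address; hardware ignores the low bits anyway.
			 */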
5604
5605 /*
5606 * If granu is PASID-selective, address is ignored.
5607 * We use npages = -1 to indicate that.
5608 */
5609 qi_flush_piotlb(iommu, did, pasid,
5610 mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5611 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5612 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5613
5614 if (!info->ats_enabled)
5615 break;
5616 /*
5617 * Always flush device IOTLB if ATS is enabled. vIOMMU
5618 * in the guest may assume IOTLB flush is inclusive,
5619 * which is more efficient.
5620 */
5621 fallthrough;
5622 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5623 /*
5624 * PASID based device TLB invalidation does not support
5625 * IOMMU_INV_GRANU_PASID granularity but only supports
5626 * IOMMU_INV_GRANU_ADDR.
5627 * The equivalent is to set the size to cover the entire
5628 * 64-bit address range; the user only provides PASID info
5629 * without address info, so we set addr to 0.
5630 */
5631 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5632 size = 64 - VTD_PAGE_SHIFT;
5633 addr = 0;
5634 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5635 addr = inv_info->granu.addr_info.addr;
5636 }
5637
5638 if (info->ats_enabled)
5639 qi_flush_dev_iotlb_pasid(iommu, sid,
5640 info->pfsid, pasid,
5641 info->ats_qdep, addr,
5642 size);
5643 else
5644 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5645 break;
5646 default:
5647 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5648 cache_type);
5649 ret = -EINVAL;
5650 }
5651 }
5652 out_unlock:
5653 spin_unlock(&iommu->lock);
5654 spin_unlock_irqrestore(&device_domain_lock, flags);
5655
5656 return ret;
5657 }
5658 #endif
5659
5660 static int intel_iommu_map(struct iommu_domain *domain,
5661 unsigned long iova, phys_addr_t hpa,
5662 size_t size, int iommu_prot, gfp_t gfp)
5663 {
5664 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5665 u64 max_addr;
5666 int prot = 0;
5667 int ret;
5668
5669 if (iommu_prot & IOMMU_READ)
5670 prot |= DMA_PTE_READ;
5671 if (iommu_prot & IOMMU_WRITE)
5672 prot |= DMA_PTE_WRITE;
5673 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5674 prot |= DMA_PTE_SNP;
5675
5676 max_addr = iova + size;
5677 if (dmar_domain->max_addr < max_addr) {
5678 u64 end;
5679
5680 /* check if minimum agaw is sufficient for mapped address */
5681 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5682 if (end < max_addr) {
5683 pr_err("%s: iommu width (%d) is not "
5684 "sufficient for the mapped address (%llx)\n",
5685 __func__, dmar_domain->gaw, max_addr);
5686 return -EFAULT;
5687 }
5688 dmar_domain->max_addr = max_addr;
5689 }
5690 /* Round up size to next multiple of PAGE_SIZE, if it and
5691 the low bits of hpa would take us onto the next page */
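	/*
	 * e.g. with 4KiB pages, hpa = 0x1080 and size = 0x2000 straddle
	 * three pages, so this yields a 3-page mapping.
	 */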
5692 size = aligned_nrpages(hpa, size);
5693 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5694 hpa >> VTD_PAGE_SHIFT, size, prot);
5695 return ret;
5696 }
5697
5698 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5699 unsigned long iova, size_t size,
5700 struct iommu_iotlb_gather *gather)
5701 {
5702 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5703 struct page *freelist = NULL;
5704 unsigned long start_pfn, last_pfn;
5705 unsigned int npages;
5706 int iommu_id, level = 0;
5707
5708 /* Cope with horrid API which requires us to unmap more than the
5709 size argument if it happens to be a large-page mapping. */
5710 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5711
5712 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5713 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
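	/*
	 * e.g. a 4KiB unmap request that lands inside a 2MiB superpage is
	 * widened here to the full 2MiB, and that size is what we return.
	 */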
5714
5715 start_pfn = iova >> VTD_PAGE_SHIFT;
5716 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5717
5718 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5719
5720 npages = last_pfn - start_pfn + 1;
5721
5722 for_each_domain_iommu(iommu_id, dmar_domain)
5723 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5724 start_pfn, npages, !freelist, 0);
5725
5726 dma_free_pagelist(freelist);
5727
5728 if (dmar_domain->max_addr == iova + size)
5729 dmar_domain->max_addr = iova;
5730
5731 return size;
5732 }
5733
5734 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5735 dma_addr_t iova)
5736 {
5737 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5738 struct dma_pte *pte;
5739 int level = 0;
5740 u64 phys = 0;
5741
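	/*
	 * Walk to the PTE covering this IOVA and add the offset within the
	 * (possibly large) page: for a 2MiB superpage, for instance, the low
	 * 21 bits of the IOVA are kept as the page offset.
	 */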
5742 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5743 if (pte && dma_pte_present(pte))
5744 phys = dma_pte_addr(pte) +
5745 (iova & (BIT_MASK(level_to_offset_bits(level) +
5746 VTD_PAGE_SHIFT) - 1));
5747
5748 return phys;
5749 }
5750
5751 static inline bool scalable_mode_support(void)
5752 {
5753 struct dmar_drhd_unit *drhd;
5754 struct intel_iommu *iommu;
5755 bool ret = true;
5756
5757 rcu_read_lock();
5758 for_each_active_iommu(iommu, drhd) {
5759 if (!sm_supported(iommu)) {
5760 ret = false;
5761 break;
5762 }
5763 }
5764 rcu_read_unlock();
5765
5766 return ret;
5767 }
5768
5769 static inline bool iommu_pasid_support(void)
5770 {
5771 struct dmar_drhd_unit *drhd;
5772 struct intel_iommu *iommu;
5773 bool ret = true;
5774
5775 rcu_read_lock();
5776 for_each_active_iommu(iommu, drhd) {
5777 if (!pasid_supported(iommu)) {
5778 ret = false;
5779 break;
5780 }
5781 }
5782 rcu_read_unlock();
5783
5784 return ret;
5785 }
5786
5787 static inline bool nested_mode_support(void)
5788 {
5789 struct dmar_drhd_unit *drhd;
5790 struct intel_iommu *iommu;
5791 bool ret = true;
5792
5793 rcu_read_lock();
5794 for_each_active_iommu(iommu, drhd) {
5795 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5796 ret = false;
5797 break;
5798 }
5799 }
5800 rcu_read_unlock();
5801
5802 return ret;
5803 }
5804
5805 static bool intel_iommu_capable(enum iommu_cap cap)
5806 {
5807 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5808 return domain_update_iommu_snooping(NULL) == 1;
5809 if (cap == IOMMU_CAP_INTR_REMAP)
5810 return irq_remapping_enabled == 1;
5811
5812 return false;
5813 }
5814
5815 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5816 {
5817 struct intel_iommu *iommu;
5818
5819 iommu = device_to_iommu(dev, NULL, NULL);
5820 if (!iommu)
5821 return ERR_PTR(-ENODEV);
5822
5823 if (translation_pre_enabled(iommu))
5824 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5825
5826 return &iommu->iommu;
5827 }
5828
5829 static void intel_iommu_release_device(struct device *dev)
5830 {
5831 struct intel_iommu *iommu;
5832
5833 iommu = device_to_iommu(dev, NULL, NULL);
5834 if (!iommu)
5835 return;
5836
5837 dmar_remove_one_dev_info(dev);
5838
5839 set_dma_ops(dev, NULL);
5840 }
5841
5842 static void intel_iommu_probe_finalize(struct device *dev)
5843 {
5844 struct iommu_domain *domain;
5845
5846 domain = iommu_get_domain_for_dev(dev);
5847 if (device_needs_bounce(dev))
5848 set_dma_ops(dev, &bounce_dma_ops);
5849 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5850 set_dma_ops(dev, &intel_dma_ops);
5851 else
5852 set_dma_ops(dev, NULL);
5853 }
5854
5855 static void intel_iommu_get_resv_regions(struct device *device,
5856 struct list_head *head)
5857 {
5858 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5859 struct iommu_resv_region *reg;
5860 struct dmar_rmrr_unit *rmrr;
5861 struct device *i_dev;
5862 int i;
5863
5864 down_read(&dmar_global_lock);
5865 for_each_rmrr_units(rmrr) {
5866 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5867 i, i_dev) {
5868 struct iommu_resv_region *resv;
5869 enum iommu_resv_type type;
5870 size_t length;
5871
5872 if (i_dev != device &&
5873 !is_downstream_to_pci_bridge(device, i_dev))
5874 continue;
5875
5876 length = rmrr->end_address - rmrr->base_address + 1;
5877
5878 type = device_rmrr_is_relaxable(device) ?
5879 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5880
5881 resv = iommu_alloc_resv_region(rmrr->base_address,
5882 length, prot, type);
5883 if (!resv)
5884 break;
5885
5886 list_add_tail(&resv->list, head);
5887 }
5888 }
5889 up_read(&dmar_global_lock);
5890
5891 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5892 if (dev_is_pci(device)) {
5893 struct pci_dev *pdev = to_pci_dev(device);
5894
5895 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5896 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5897 IOMMU_RESV_DIRECT_RELAXABLE);
5898 if (reg)
5899 list_add_tail(&reg->list, head);
5900 }
5901 }
5902 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5903
5904 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5905 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5906 0, IOMMU_RESV_MSI);
5907 if (!reg)
5908 return;
5909 list_add_tail(&reg->list, head);
5910 }
5911
5912 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5913 {
5914 struct device_domain_info *info;
5915 struct context_entry *context;
5916 struct dmar_domain *domain;
5917 unsigned long flags;
5918 u64 ctx_lo;
5919 int ret;
5920
5921 domain = find_domain(dev);
5922 if (!domain)
5923 return -EINVAL;
5924
5925 spin_lock_irqsave(&device_domain_lock, flags);
5926 spin_lock(&iommu->lock);
5927
5928 ret = -EINVAL;
5929 info = get_domain_info(dev);
5930 if (!info || !info->pasid_supported)
5931 goto out;
5932
5933 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5934 if (WARN_ON(!context))
5935 goto out;
5936
5937 ctx_lo = context[0].lo;
5938
5939 if (!(ctx_lo & CONTEXT_PASIDE)) {
5940 ctx_lo |= CONTEXT_PASIDE;
5941 context[0].lo = ctx_lo;
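		/* Make the updated context entry globally visible before
		 * issuing the context-cache flush below.
		 */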
5942 wmb();
5943 iommu->flush.flush_context(iommu,
5944 domain->iommu_did[iommu->seq_id],
5945 PCI_DEVID(info->bus, info->devfn),
5946 DMA_CCMD_MASK_NOBIT,
5947 DMA_CCMD_DEVICE_INVL);
5948 }
5949
5950 /* Enable PASID support in the device, if it wasn't already */
5951 if (!info->pasid_enabled)
5952 iommu_enable_dev_iotlb(info);
5953
5954 ret = 0;
5955
5956 out:
5957 spin_unlock(&iommu->lock);
5958 spin_unlock_irqrestore(&device_domain_lock, flags);
5959
5960 return ret;
5961 }
5962
5963 static void intel_iommu_apply_resv_region(struct device *dev,
5964 struct iommu_domain *domain,
5965 struct iommu_resv_region *region)
5966 {
5967 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5968 unsigned long start, end;
5969
5970 start = IOVA_PFN(region->start);
5971 end = IOVA_PFN(region->start + region->length - 1);
5972
5973 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5974 }
5975
5976 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5977 {
5978 if (dev_is_pci(dev))
5979 return pci_device_group(dev);
5980 return generic_device_group(dev);
5981 }
5982
5983 static int intel_iommu_enable_auxd(struct device *dev)
5984 {
5985 struct device_domain_info *info;
5986 struct intel_iommu *iommu;
5987 unsigned long flags;
5988 int ret;
5989
5990 iommu = device_to_iommu(dev, NULL, NULL);
5991 if (!iommu || dmar_disabled)
5992 return -EINVAL;
5993
5994 if (!sm_supported(iommu) || !pasid_supported(iommu))
5995 return -EINVAL;
5996
5997 ret = intel_iommu_enable_pasid(iommu, dev);
5998 if (ret)
5999 return -ENODEV;
6000
6001 spin_lock_irqsave(&device_domain_lock, flags);
6002 info = get_domain_info(dev);
6003 info->auxd_enabled = 1;
6004 spin_unlock_irqrestore(&device_domain_lock, flags);
6005
6006 return 0;
6007 }
6008
6009 static int intel_iommu_disable_auxd(struct device *dev)
6010 {
6011 struct device_domain_info *info;
6012 unsigned long flags;
6013
6014 spin_lock_irqsave(&device_domain_lock, flags);
6015 info = get_domain_info(dev);
6016 if (!WARN_ON(!info))
6017 info->auxd_enabled = 0;
6018 spin_unlock_irqrestore(&device_domain_lock, flags);
6019
6020 return 0;
6021 }
6022
6023 /*
6024 * A PCI Express Designated Vendor Specific Extended Capability is defined
6025 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
6026 * for system software and tools to detect endpoint devices supporting
6027 * Intel Scalable I/O Virtualization without host driver dependency.
6028 *
6029 * Returns the address of the matching extended capability structure within
6030 * the device's PCI configuration space or 0 if the device does not support
6031 * it.
6032 */
6033 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6034 {
6035 int pos;
6036 u16 vendor, id;
6037
6038 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6039 while (pos) {
6040 pci_read_config_word(pdev, pos + 4, &vendor);
6041 pci_read_config_word(pdev, pos + 8, &id);
6042 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6043 return pos;
6044
6045 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6046 }
6047
6048 return 0;
6049 }
6050
6051 static bool
6052 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6053 {
6054 if (feat == IOMMU_DEV_FEAT_AUX) {
6055 int ret;
6056
6057 if (!dev_is_pci(dev) || dmar_disabled ||
6058 !scalable_mode_support() || !iommu_pasid_support())
6059 return false;
6060
6061 ret = pci_pasid_features(to_pci_dev(dev));
6062 if (ret < 0)
6063 return false;
6064
6065 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6066 }
6067
6068 if (feat == IOMMU_DEV_FEAT_SVA) {
6069 struct device_domain_info *info = get_domain_info(dev);
6070
6071 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
6072 info->pasid_supported && info->pri_supported &&
6073 info->ats_supported;
6074 }
6075
6076 return false;
6077 }
6078
6079 static int
6080 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6081 {
6082 if (feat == IOMMU_DEV_FEAT_AUX)
6083 return intel_iommu_enable_auxd(dev);
6084
6085 if (feat == IOMMU_DEV_FEAT_SVA) {
6086 struct device_domain_info *info = get_domain_info(dev);
6087
6088 if (!info)
6089 return -EINVAL;
6090
6091 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6092 return 0;
6093 }
6094
6095 return -ENODEV;
6096 }
6097
6098 static int
6099 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6100 {
6101 if (feat == IOMMU_DEV_FEAT_AUX)
6102 return intel_iommu_disable_auxd(dev);
6103
6104 return -ENODEV;
6105 }
6106
6107 static bool
6108 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6109 {
6110 struct device_domain_info *info = get_domain_info(dev);
6111
6112 if (feat == IOMMU_DEV_FEAT_AUX)
6113 return scalable_mode_support() && info && info->auxd_enabled;
6114
6115 return false;
6116 }
6117
6118 static int
6119 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6120 {
6121 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6122
6123 return dmar_domain->default_pasid > 0 ?
6124 dmar_domain->default_pasid : -EINVAL;
6125 }
6126
6127 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6128 struct device *dev)
6129 {
6130 return attach_deferred(dev);
6131 }
6132
6133 static int
6134 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6135 enum iommu_attr attr, void *data)
6136 {
6137 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6138 unsigned long flags;
6139 int ret = 0;
6140
6141 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6142 return -EINVAL;
6143
6144 switch (attr) {
6145 case DOMAIN_ATTR_NESTING:
6146 spin_lock_irqsave(&device_domain_lock, flags);
6147 if (nested_mode_support() &&
6148 list_empty(&dmar_domain->devices)) {
6149 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6150 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6151 } else {
6152 ret = -ENODEV;
6153 }
6154 spin_unlock_irqrestore(&device_domain_lock, flags);
6155 break;
6156 default:
6157 ret = -EINVAL;
6158 break;
6159 }
6160
6161 return ret;
6162 }
6163
6164 /*
6165 * Check that the device does not live on an external-facing PCI port that is
6166 * marked as untrusted. Such devices should not be able to apply quirks and
6167 * thus not be able to bypass the IOMMU restrictions.
6168 */
6169 static bool risky_device(struct pci_dev *pdev)
6170 {
6171 if (pdev->untrusted) {
6172 pci_info(pdev,
6173 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6174 pdev->vendor, pdev->device);
6175 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6176 return true;
6177 }
6178 return false;
6179 }
6180
6181 const struct iommu_ops intel_iommu_ops = {
6182 .capable = intel_iommu_capable,
6183 .domain_alloc = intel_iommu_domain_alloc,
6184 .domain_free = intel_iommu_domain_free,
6185 .domain_set_attr = intel_iommu_domain_set_attr,
6186 .attach_dev = intel_iommu_attach_device,
6187 .detach_dev = intel_iommu_detach_device,
6188 .aux_attach_dev = intel_iommu_aux_attach_device,
6189 .aux_detach_dev = intel_iommu_aux_detach_device,
6190 .aux_get_pasid = intel_iommu_aux_get_pasid,
6191 .map = intel_iommu_map,
6192 .unmap = intel_iommu_unmap,
6193 .iova_to_phys = intel_iommu_iova_to_phys,
6194 .probe_device = intel_iommu_probe_device,
6195 .probe_finalize = intel_iommu_probe_finalize,
6196 .release_device = intel_iommu_release_device,
6197 .get_resv_regions = intel_iommu_get_resv_regions,
6198 .put_resv_regions = generic_iommu_put_resv_regions,
6199 .apply_resv_region = intel_iommu_apply_resv_region,
6200 .device_group = intel_iommu_device_group,
6201 .dev_has_feat = intel_iommu_dev_has_feat,
6202 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6203 .dev_enable_feat = intel_iommu_dev_enable_feat,
6204 .dev_disable_feat = intel_iommu_dev_disable_feat,
6205 .is_attach_deferred = intel_iommu_is_attach_deferred,
6206 .def_domain_type = device_def_domain_type,
6207 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6208 #ifdef CONFIG_INTEL_IOMMU_SVM
6209 .cache_invalidate = intel_iommu_sva_invalidate,
6210 .sva_bind_gpasid = intel_svm_bind_gpasid,
6211 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6212 .sva_bind = intel_svm_bind,
6213 .sva_unbind = intel_svm_unbind,
6214 .sva_get_pasid = intel_svm_get_pasid,
6215 .page_response = intel_svm_page_response,
6216 #endif
6217 };
6218
6219 static void quirk_iommu_igfx(struct pci_dev *dev)
6220 {
6221 if (risky_device(dev))
6222 return;
6223
6224 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6225 dmar_map_gfx = 0;
6226 }
6227
6228 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6236
6237 /* Broadwell igfx malfunctions with dmar */
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6260 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6261 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6262
6263 static void quirk_iommu_rwbf(struct pci_dev *dev)
6264 {
6265 if (risky_device(dev))
6266 return;
6267
6268 /*
6269 * Mobile 4 Series Chipset neglects to set RWBF capability,
6270 * but needs it. Same seems to hold for the desktop versions.
6271 */
6272 pci_info(dev, "Forcing write-buffer flush capability\n");
6273 rwbf_quirk = 1;
6274 }
6275
6276 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6278 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6279 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6280 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6281 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6282 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6283
6284 #define GGC 0x52
6285 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6286 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6287 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6288 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6289 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6290 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6291 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6292 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6293
6294 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6295 {
6296 unsigned short ggc;
6297
6298 if (risky_device(dev))
6299 return;
6300
6301 if (pci_read_config_word(dev, GGC, &ggc))
6302 return;
6303
6304 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6305 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6306 dmar_map_gfx = 0;
6307 } else if (dmar_map_gfx) {
6308 /* we have to ensure the gfx device is idle before we flush */
6309 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6310 intel_iommu_strict = 1;
6311 }
6312 }
6313 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6314 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6315 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6316 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6317
6318 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6319 {
6320 unsigned short ver;
6321
6322 if (!IS_GFX_DEVICE(dev))
6323 return;
6324
6325 ver = (dev->device >> 8) & 0xff;
6326 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6327 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6328 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
6329 return;
6330
6331 if (risky_device(dev))
6332 return;
6333
6334 pci_info(dev, "Skip IOMMU disabling for graphics\n");
6335 iommu_skip_te_disable = 1;
6336 }
6337 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6338
6339 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6340 ISOCH DMAR unit for the Azalia sound device, but not give it any
6341 TLB entries, which causes it to deadlock. Check for that. We do
6342 this in a function called from init_dmars(), instead of in a PCI
6343 quirk, because we don't want to print the obnoxious "BIOS broken"
6344 message if VT-d is actually disabled.
6345 */
6346 static void __init check_tylersburg_isoch(void)
6347 {
6348 struct pci_dev *pdev;
6349 uint32_t vtisochctrl;
6350
6351 /* If there's no Azalia in the system anyway, forget it. */
6352 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6353 if (!pdev)
6354 return;
6355
6356 if (risky_device(pdev)) {
6357 pci_dev_put(pdev);
6358 return;
6359 }
6360
6361 pci_dev_put(pdev);
6362
6363 /* System Management Registers. Might be hidden, in which case
6364 we can't do the sanity check. But that's OK, because the
6365 known-broken BIOSes _don't_ actually hide it, so far. */
6366 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6367 if (!pdev)
6368 return;
6369
6370 if (risky_device(pdev)) {
6371 pci_dev_put(pdev);
6372 return;
6373 }
6374
6375 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6376 pci_dev_put(pdev);
6377 return;
6378 }
6379
6380 pci_dev_put(pdev);
6381
6382 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6383 if (vtisochctrl & 1)
6384 return;
6385
6386 /* Drop all bits other than the number of TLB entries */
6387 vtisochctrl &= 0x1c;
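	/* The masked value is the entry count itself (0x10 == 16 entries),
	 * as also reported by the warning below. */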
6388
6389 /* If we have the recommended number of TLB entries (16), fine. */
6390 if (vtisochctrl == 0x10)
6391 return;
6392
6393 /* Zero TLB entries? You get to ride the short bus to school. */
6394 if (!vtisochctrl) {
6395 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6396 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6397 dmi_get_system_info(DMI_BIOS_VENDOR),
6398 dmi_get_system_info(DMI_BIOS_VERSION),
6399 dmi_get_system_info(DMI_PRODUCT_VERSION));
6400 iommu_identity_mapping |= IDENTMAP_AZALIA;
6401 return;
6402 }
6403
6404 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6405 vtisochctrl);
6406 }
6407