1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
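/* e.g. for gaw == 48 and 4KiB pages, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 */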
57
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60
61 /*
62 * set to 1 to panic kernel if can't successfully enable VT-d
63 * (used when kernel is launched w/ TXT)
64 */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70
71 /*
72 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73 * if marked present.
74 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 if (!(re->lo & 1))
78 return 0;
79
80 return re->lo & VTD_PAGE_MASK;
81 }
82
83 /*
84 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85 * if marked present.
86 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 if (!(re->hi & 1))
90 return 0;
91
92 return re->hi & VTD_PAGE_MASK;
93 }
94
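/*
 * rbtree helpers: device_domain_info nodes are ordered by PCI
 * requester ID, i.e. bus and devfn combined via PCI_DEVID().
 */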
static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 struct device_domain_info *info =
98 rb_entry(node, struct device_domain_info, node);
99 const u16 *rid_lhs = key;
100
101 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 return -1;
103
104 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 return 1;
106
107 return 0;
108 }
109
static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 struct device_domain_info *info =
113 rb_entry(lhs, struct device_domain_info, node);
114 u16 key = PCI_DEVID(info->bus, info->devfn);
115
116 return device_rid_cmp_key(&key, rhs);
117 }
118
119 /*
120 * Looks up an IOMMU-probed device using its source ID.
121 *
122 * Returns the pointer to the device if there is a match. Otherwise,
123 * returns NULL.
124 *
125 * Note that this helper doesn't guarantee that the device won't be
126 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released during its use, if that is a possibility.
129 */
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 struct device_domain_info *info = NULL;
133 struct rb_node *node;
134 unsigned long flags;
135
136 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 if (node)
139 info = rb_entry(node, struct device_domain_info, node);
140 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141
142 return info ? info->dev : NULL;
143 }
144
static int device_rbtree_insert(struct intel_iommu *iommu,
				struct device_domain_info *info)
147 {
148 struct rb_node *curr;
149 unsigned long flags;
150
151 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 if (WARN_ON(curr))
155 return -EEXIST;
156
157 return 0;
158 }
159
static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 struct intel_iommu *iommu = info->iommu;
163 unsigned long flags;
164
165 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 rb_erase(&info->node, &iommu->device_rbtree);
167 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169
170 struct dmar_rmrr_unit {
171 struct list_head list; /* list of rmrr units */
172 struct acpi_dmar_header *hdr; /* ACPI header */
173 u64 base_address; /* reserved base address*/
174 u64 end_address; /* reserved end address */
175 struct dmar_dev_scope *devices; /* target devices */
176 int devices_cnt; /* target device count */
177 };
178
179 struct dmar_atsr_unit {
180 struct list_head list; /* list of ATSR units */
181 struct acpi_dmar_header *hdr; /* ACPI header */
182 struct dmar_dev_scope *devices; /* target devices */
183 int devices_cnt; /* target device count */
184 u8 include_all:1; /* include all ports */
185 };
186
187 struct dmar_satc_unit {
188 struct list_head list; /* list of SATC units */
189 struct acpi_dmar_header *hdr; /* ACPI header */
190 struct dmar_dev_scope *devices; /* target devices */
191 struct intel_iommu *iommu; /* the corresponding iommu */
192 int devices_cnt; /* target device count */
193 u8 atc_required:1; /* ATS is required */
194 };
195
196 static LIST_HEAD(dmar_atsr_units);
197 static LIST_HEAD(dmar_rmrr_units);
198 static LIST_HEAD(dmar_satc_units);
199
200 #define for_each_rmrr_units(rmrr) \
201 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
204
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
215
216 #define IDENTMAP_AZALIA 4
217
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
220
static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230
static void init_translation_status(struct intel_iommu *iommu)
232 {
233 u32 gsts;
234
235 gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 if (gsts & DMA_GSTS_TES)
237 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239
static int __init intel_iommu_setup(char *str)
241 {
242 if (!str)
243 return -EINVAL;
244
245 while (*str) {
246 if (!strncmp(str, "on", 2)) {
247 dmar_disabled = 0;
248 pr_info("IOMMU enabled\n");
249 } else if (!strncmp(str, "off", 3)) {
250 dmar_disabled = 1;
251 no_platform_optin = 1;
252 pr_info("IOMMU disabled\n");
253 } else if (!strncmp(str, "igfx_off", 8)) {
254 disable_igfx_iommu = 1;
255 pr_info("Disable GFX device mapping\n");
256 } else if (!strncmp(str, "forcedac", 8)) {
257 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 iommu_dma_forcedac = true;
259 } else if (!strncmp(str, "strict", 6)) {
260 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 iommu_set_dma_strict();
262 } else if (!strncmp(str, "sp_off", 6)) {
263 pr_info("Disable supported super page\n");
264 intel_iommu_superpage = 0;
265 } else if (!strncmp(str, "sm_on", 5)) {
266 pr_info("Enable scalable mode if hardware supports\n");
267 intel_iommu_sm = 1;
268 } else if (!strncmp(str, "sm_off", 6)) {
269 pr_info("Scalable mode is disallowed\n");
270 intel_iommu_sm = 0;
271 } else if (!strncmp(str, "tboot_noforce", 13)) {
272 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 intel_iommu_tboot_noforce = 1;
274 } else {
275 pr_notice("Unknown option - '%s'\n", str);
276 }
277
278 str += strcspn(str, ",");
279 while (*str == ',')
280 str++;
281 }
282
283 return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
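/* Example: booting with "intel_iommu=on,sm_on" enables the IOMMU and, if the hardware supports it, scalable mode. */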
286
static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290
291 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293
294 /*
295 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297 * the returned SAGAW.
298 */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 unsigned long fl_sagaw, sl_sagaw;
302
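	/*
	 * First-level translation always supports 4-level tables (SAGAW
	 * bit 2, a 48-bit AGAW); bit 3 (5-level, 57-bit AGAW) is added
	 * only when the hardware reports first-level 5-level paging
	 * support.
	 */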
303 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 sl_sagaw = cap_sagaw(iommu->cap);
305
306 /* Second level only. */
307 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 return sl_sagaw;
309
310 /* First level only. */
311 if (!ecap_slts(iommu->ecap))
312 return fl_sagaw;
313
314 return fl_sagaw & sl_sagaw;
315 }
316
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
318 {
319 unsigned long sagaw;
320 int agaw;
321
322 sagaw = __iommu_calculate_sagaw(iommu);
323 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
324 if (test_bit(agaw, &sagaw))
325 break;
326 }
327
328 return agaw;
329 }
330
331 /*
332 * Calculate max SAGAW for each iommu.
333 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338
339 /*
340 * calculate agaw for each iommu.
341 * "SAGAW" may be different across iommus, use a default agaw, and
342 * get a supported less agaw for iommus that don't support the default agaw.
343 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
348
static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 return sm_supported(iommu) ?
352 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354
static void domain_update_iommu_coherency(struct dmar_domain *domain)
356 {
357 struct iommu_domain_info *info;
358 struct dmar_drhd_unit *drhd;
359 struct intel_iommu *iommu;
360 bool found = false;
361 unsigned long i;
362
363 domain->iommu_coherency = true;
364 xa_for_each(&domain->iommu_array, i, info) {
365 found = true;
366 if (!iommu_paging_structure_coherency(info->iommu)) {
367 domain->iommu_coherency = false;
368 break;
369 }
370 }
371 if (found)
372 return;
373
374 /* No hardware attached; use lowest common denominator */
375 rcu_read_lock();
376 for_each_active_iommu(iommu, drhd) {
377 if (!iommu_paging_structure_coherency(iommu)) {
378 domain->iommu_coherency = false;
379 break;
380 }
381 }
382 rcu_read_unlock();
383 }
384
static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
387 {
388 struct dmar_drhd_unit *drhd;
389 struct intel_iommu *iommu;
390 int mask = 0x3;
391
392 if (!intel_iommu_superpage)
393 return 0;
394
395 /* set iommu_superpage to the smallest common denominator */
396 rcu_read_lock();
397 for_each_active_iommu(iommu, drhd) {
398 if (iommu != skip) {
399 if (domain && domain->use_first_level) {
400 if (!cap_fl1gp_support(iommu->cap))
401 mask = 0x1;
402 } else {
403 mask &= cap_super_page_val(iommu->cap);
404 }
405
406 if (!mask)
407 break;
408 }
409 }
410 rcu_read_unlock();
411
412 return fls(mask);
413 }
414
static int domain_update_device_node(struct dmar_domain *domain)
416 {
417 struct device_domain_info *info;
418 int nid = NUMA_NO_NODE;
419 unsigned long flags;
420
421 spin_lock_irqsave(&domain->lock, flags);
422 list_for_each_entry(info, &domain->devices, link) {
423 /*
424 * There could possibly be multiple device numa nodes as devices
425 * within the same domain may sit behind different IOMMUs. There
		 * isn't a perfect answer in such a situation, so we use a
		 * first-come, first-served policy.
428 */
429 nid = dev_to_node(info->dev);
430 if (nid != NUMA_NO_NODE)
431 break;
432 }
433 spin_unlock_irqrestore(&domain->lock, flags);
434
435 return nid;
436 }
437
438 /* Return the super pagesize bitmap if supported. */
static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
440 {
441 unsigned long bitmap = 0;
442
443 /*
444 * 1-level super page supports page size of 2MiB, 2-level super page
445 * supports page size of both 2MiB and 1GiB.
446 */
447 if (domain->iommu_superpage == 1)
448 bitmap |= SZ_2M;
449 else if (domain->iommu_superpage == 2)
450 bitmap |= SZ_2M | SZ_1G;
451
452 return bitmap;
453 }
454
455 /* Some capabilities may be different across iommus */
void domain_update_iommu_cap(struct dmar_domain *domain)
457 {
458 domain_update_iommu_coherency(domain);
459 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
460
461 /*
462 * If RHSA is missing, we should default to the device numa domain
	 * as a fallback.
464 */
465 if (domain->nid == NUMA_NO_NODE)
466 domain->nid = domain_update_device_node(domain);
467
468 /*
469 * First-level translation restricts the input-address to a
470 * canonical address (i.e., address bits 63:N have the same
471 * value as address bit [N-1], where N is 48-bits with 4-level
472 * paging and 57-bits with 5-level paging). Hence, skip bit
473 * [N-1].
474 */
475 if (domain->use_first_level)
476 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
477 else
478 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
479
480 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
481 }
482
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
485 {
486 struct root_entry *root = &iommu->root_entry[bus];
487 struct context_entry *context;
488 u64 *entry;
489
490 /*
	 * Unless the caller requested to allocate a new entry,
492 * returning a copied context entry makes no sense.
493 */
494 if (!alloc && context_copied(iommu, bus, devfn))
495 return NULL;
496
497 entry = &root->lo;
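	/*
	 * In scalable mode the root entry carries two context-table
	 * pointers: the lower half serves devfn 0-127 and the upper
	 * half serves devfn 128-255. A scalable-mode context entry is
	 * 256 bits wide (two struct context_entry slots), hence devfn
	 * is doubled when indexing the table.
	 */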
498 if (sm_supported(iommu)) {
499 if (devfn >= 0x80) {
500 devfn -= 0x80;
501 entry = &root->hi;
502 }
503 devfn *= 2;
504 }
505 if (*entry & 1)
506 context = phys_to_virt(*entry & VTD_PAGE_MASK);
507 else {
508 unsigned long phy_addr;
509 if (!alloc)
510 return NULL;
511
512 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
513 if (!context)
514 return NULL;
515
516 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517 phy_addr = virt_to_phys((void *)context);
518 *entry = phy_addr | 1;
519 __iommu_flush_cache(iommu, entry, sizeof(*entry));
520 }
521 return &context[devfn];
522 }
523
524 /**
525 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
526 * sub-hierarchy of a candidate PCI-PCI bridge
527 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
528 * @bridge: the candidate PCI-PCI bridge
529 *
530 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
531 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
534 {
535 struct pci_dev *pdev, *pbridge;
536
537 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
538 return false;
539
540 pdev = to_pci_dev(dev);
541 pbridge = to_pci_dev(bridge);
542
543 if (pbridge->subordinate &&
544 pbridge->subordinate->number <= pdev->bus->number &&
545 pbridge->subordinate->busn_res.end >= pdev->bus->number)
546 return true;
547
548 return false;
549 }
550
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
552 {
553 struct dmar_drhd_unit *drhd;
554 u32 vtbar;
555 int rc;
556
557 /* We know that this device on this chipset has its own IOMMU.
558 * If we find it under a different IOMMU, then the BIOS is lying
559 * to us. Hope that the IOMMU for this device is actually
560 * disabled, and it needs no translation...
561 */
562 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
563 if (rc) {
564 /* "can't" happen */
565 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
566 return false;
567 }
568 vtbar &= 0xffff0000;
569
	/* we know that this iommu should be at offset 0xa000 from vtbar */
571 drhd = dmar_find_matched_drhd_unit(pdev);
572 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
575 return true;
576 }
577
578 return false;
579 }
580
static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
582 {
583 if (!iommu || iommu->drhd->ignored)
584 return true;
585
586 if (dev_is_pci(dev)) {
587 struct pci_dev *pdev = to_pci_dev(dev);
588
589 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591 quirk_ioat_snb_local_iommu(pdev))
592 return true;
593 }
594
595 return false;
596 }
597
static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
599 {
600 struct dmar_drhd_unit *drhd = NULL;
601 struct pci_dev *pdev = NULL;
602 struct intel_iommu *iommu;
603 struct device *tmp;
604 u16 segment = 0;
605 int i;
606
607 if (!dev)
608 return NULL;
609
610 if (dev_is_pci(dev)) {
611 struct pci_dev *pf_pdev;
612
613 pdev = pci_real_dma_dev(to_pci_dev(dev));
614
615 /* VFs aren't listed in scope tables; we need to look up
616 * the PF instead to find the IOMMU. */
617 pf_pdev = pci_physfn(pdev);
618 dev = &pf_pdev->dev;
619 segment = pci_domain_nr(pdev->bus);
620 } else if (has_acpi_companion(dev))
621 dev = &ACPI_COMPANION(dev)->dev;
622
623 rcu_read_lock();
624 for_each_iommu(iommu, drhd) {
625 if (pdev && segment != drhd->segment)
626 continue;
627
628 for_each_active_dev_scope(drhd->devices,
629 drhd->devices_cnt, i, tmp) {
630 if (tmp == dev) {
631 /* For a VF use its original BDF# not that of the PF
632 * which we used for the IOMMU lookup. Strictly speaking
633 * we could do this for all PCI devices; we only need to
634 * get the BDF# from the scope table for ACPI matches. */
635 if (pdev && pdev->is_virtfn)
636 goto got_pdev;
637
638 if (bus && devfn) {
639 *bus = drhd->devices[i].bus;
640 *devfn = drhd->devices[i].devfn;
641 }
642 goto out;
643 }
644
645 if (is_downstream_to_pci_bridge(dev, tmp))
646 goto got_pdev;
647 }
648
649 if (pdev && drhd->include_all) {
650 got_pdev:
651 if (bus && devfn) {
652 *bus = pdev->bus->number;
653 *devfn = pdev->devfn;
654 }
655 goto out;
656 }
657 }
658 iommu = NULL;
659 out:
660 if (iommu_is_dummy(iommu, dev))
661 iommu = NULL;
662
663 rcu_read_unlock();
664
665 return iommu;
666 }
667
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
670 {
671 if (!domain->iommu_coherency)
672 clflush_cache_range(addr, size);
673 }
674
static void free_context_table(struct intel_iommu *iommu)
676 {
677 struct context_entry *context;
678 int i;
679
680 if (!iommu->root_entry)
681 return;
682
683 for (i = 0; i < ROOT_ENTRY_NR; i++) {
684 context = iommu_context_addr(iommu, i, 0, 0);
685 if (context)
686 iommu_free_page(context);
687
688 if (!sm_supported(iommu))
689 continue;
690
691 context = iommu_context_addr(iommu, i, 0x80, 0);
692 if (context)
693 iommu_free_page(context);
694 }
695
696 iommu_free_page(iommu->root_entry);
697 iommu->root_entry = NULL;
698 }
699
700 #ifdef CONFIG_DMAR_DEBUG
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
703 {
704 struct dma_pte *pte;
705 int offset;
706
707 while (1) {
708 offset = pfn_level_offset(pfn, level);
709 pte = &parent[offset];
710
711 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
712
713 if (!dma_pte_present(pte)) {
714 pr_info("page table not present at level %d\n", level - 1);
715 break;
716 }
717
718 if (level == 1 || dma_pte_superpage(pte))
719 break;
720
721 parent = phys_to_virt(dma_pte_addr(pte));
722 level--;
723 }
724 }
725
void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
			  unsigned long long addr, u32 pasid)
728 {
729 struct pasid_dir_entry *dir, *pde;
730 struct pasid_entry *entries, *pte;
731 struct context_entry *ctx_entry;
732 struct root_entry *rt_entry;
733 int i, dir_index, index, level;
734 u8 devfn = source_id & 0xff;
735 u8 bus = source_id >> 8;
736 struct dma_pte *pgtable;
737
738 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
739
740 /* root entry dump */
741 if (!iommu->root_entry) {
742 pr_info("root table is not present\n");
743 return;
744 }
745 rt_entry = &iommu->root_entry[bus];
746
747 if (sm_supported(iommu))
748 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
749 rt_entry->hi, rt_entry->lo);
750 else
751 pr_info("root entry: 0x%016llx", rt_entry->lo);
752
753 /* context entry dump */
754 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
755 if (!ctx_entry) {
756 pr_info("context table is not present\n");
757 return;
758 }
759
760 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
761 ctx_entry->hi, ctx_entry->lo);
762
763 /* legacy mode does not require PASID entries */
764 if (!sm_supported(iommu)) {
765 if (!context_present(ctx_entry)) {
766 pr_info("legacy mode page table is not present\n");
767 return;
768 }
769 level = agaw_to_level(ctx_entry->hi & 7);
770 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
771 goto pgtable_walk;
772 }
773
774 if (!context_present(ctx_entry)) {
775 pr_info("pasid directory table is not present\n");
776 return;
777 }
778
779 /* get the pointer to pasid directory entry */
780 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
781
782 /* For request-without-pasid, get the pasid from context entry */
783 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
784 pasid = IOMMU_NO_PASID;
785
786 dir_index = pasid >> PASID_PDE_SHIFT;
787 pde = &dir[dir_index];
788 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
789
790 /* get the pointer to the pasid table entry */
791 entries = get_pasid_table_from_pde(pde);
792 if (!entries) {
793 pr_info("pasid table is not present\n");
794 return;
795 }
796 index = pasid & PASID_PTE_MASK;
797 pte = &entries[index];
798 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
799 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
800
801 if (!pasid_pte_is_present(pte)) {
802 pr_info("scalable mode page table is not present\n");
803 return;
804 }
805
806 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
807 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
808 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
809 } else {
810 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
811 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
812 }
813
814 pgtable_walk:
815 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
816 }
817 #endif
818
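/*
 * Walk the page table for @pfn, allocating any missing intermediate
 * levels down to *target_level (0 means "whatever leaf level exists").
 * Returns the PTE at the level reached, or NULL if @pfn is beyond the
 * domain's address width or a table allocation fails.
 */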
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
822 {
823 struct dma_pte *parent, *pte;
824 int level = agaw_to_level(domain->agaw);
825 int offset;
826
827 if (!domain_pfn_supported(domain, pfn))
828 /* Address beyond IOMMU's addressing capabilities. */
829 return NULL;
830
831 parent = domain->pgd;
832
833 while (1) {
834 void *tmp_page;
835
836 offset = pfn_level_offset(pfn, level);
837 pte = &parent[offset];
838 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
839 break;
840 if (level == *target_level)
841 break;
842
843 if (!dma_pte_present(pte)) {
844 uint64_t pteval, tmp;
845
846 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
847
848 if (!tmp_page)
849 return NULL;
850
851 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
852 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
853 if (domain->use_first_level)
854 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
855
856 tmp = 0ULL;
857 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
858 /* Someone else set it while we were thinking; use theirs. */
859 iommu_free_page(tmp_page);
860 else
861 domain_flush_cache(domain, pte, sizeof(*pte));
862 }
863 if (level == 1)
864 break;
865
866 parent = phys_to_virt(dma_pte_addr(pte));
867 level--;
868 }
869
870 if (!*target_level)
871 *target_level = level;
872
873 return pte;
874 }
875
876 /* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
880 {
881 struct dma_pte *parent, *pte;
882 int total = agaw_to_level(domain->agaw);
883 int offset;
884
885 parent = domain->pgd;
886 while (level <= total) {
887 offset = pfn_level_offset(pfn, total);
888 pte = &parent[offset];
889 if (level == total)
890 return pte;
891
892 if (!dma_pte_present(pte)) {
893 *large_page = total;
894 break;
895 }
896
897 if (dma_pte_superpage(pte)) {
898 *large_page = total;
899 return pte;
900 }
901
902 parent = phys_to_virt(dma_pte_addr(pte));
903 total--;
904 }
905 return NULL;
906 }
907
908 /* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
912 {
913 unsigned int large_page;
914 struct dma_pte *first_pte, *pte;
915
916 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
917 WARN_ON(start_pfn > last_pfn))
918 return;
919
920 /* we don't need lock here; nobody else touches the iova range */
921 do {
922 large_page = 1;
923 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
924 if (!pte) {
925 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
926 continue;
927 }
928 do {
929 dma_clear_pte(pte);
930 start_pfn += lvl_to_nr_pages(large_page);
931 pte++;
932 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
933
934 domain_flush_cache(domain, first_pte,
935 (void *)pte - (void *)first_pte);
936
937 } while (start_pfn && start_pfn <= last_pfn);
938 }
939
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
944 {
945 pfn = max(start_pfn, pfn);
946 pte = &pte[pfn_level_offset(pfn, level)];
947
948 do {
949 unsigned long level_pfn;
950 struct dma_pte *level_pte;
951
952 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
953 goto next;
954
955 level_pfn = pfn & level_mask(level);
956 level_pte = phys_to_virt(dma_pte_addr(pte));
957
958 if (level > 2) {
959 dma_pte_free_level(domain, level - 1, retain_level,
960 level_pte, level_pfn, start_pfn,
961 last_pfn);
962 }
963
964 /*
965 * Free the page table if we're below the level we want to
966 * retain and the range covers the entire table.
967 */
968 if (level < retain_level && !(start_pfn > level_pfn ||
969 last_pfn < level_pfn + level_size(level) - 1)) {
970 dma_clear_pte(pte);
971 domain_flush_cache(domain, pte, sizeof(*pte));
972 iommu_free_page(level_pte);
973 }
974 next:
975 pfn += level_size(level);
976 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
977 }
978
979 /*
980 * clear last level (leaf) ptes and free page table pages below the
981 * level we wish to keep intact.
982 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
987 {
988 dma_pte_clear_range(domain, start_pfn, last_pfn);
989
990 /* We don't need lock here; nobody else touches the iova range */
991 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
992 domain->pgd, 0, start_pfn, last_pfn);
993
994 /* free pgd */
995 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
996 iommu_free_page(domain->pgd);
997 domain->pgd = NULL;
998 }
999 }
1000
1001 /* When a page at a given level is being unlinked from its parent, we don't
1002 need to *modify* it at all. All we need to do is make a list of all the
1003 pages which can be freed just as soon as we've flushed the IOTLB and we
1004 know the hardware page-walk will no longer touch them.
1005 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1006 be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
1010 {
1011 struct page *pg;
1012
1013 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1014 list_add_tail(&pg->lru, freelist);
1015
1016 if (level == 1)
1017 return;
1018
1019 pte = page_address(pg);
1020 do {
1021 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1022 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1023 pte++;
1024 } while (!first_pte_in_page(pte));
1025 }
1026
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
1031 {
1032 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1033
1034 pfn = max(start_pfn, pfn);
1035 pte = &pte[pfn_level_offset(pfn, level)];
1036
1037 do {
1038 unsigned long level_pfn = pfn & level_mask(level);
1039
1040 if (!dma_pte_present(pte))
1041 goto next;
1042
1043 /* If range covers entire pagetable, free it */
1044 if (start_pfn <= level_pfn &&
1045 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
1047 bother to clear them; we're just going to *free* them. */
1048 if (level > 1 && !dma_pte_superpage(pte))
1049 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1050
1051 dma_clear_pte(pte);
1052 if (!first_pte)
1053 first_pte = pte;
1054 last_pte = pte;
1055 } else if (level > 1) {
1056 /* Recurse down into a level that isn't *entirely* obsolete */
1057 dma_pte_clear_level(domain, level - 1,
1058 phys_to_virt(dma_pte_addr(pte)),
1059 level_pfn, start_pfn, last_pfn,
1060 freelist);
1061 }
1062 next:
1063 pfn = level_pfn + level_size(level);
1064 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1065
1066 if (first_pte)
1067 domain_flush_cache(domain, first_pte,
1068 (void *)++last_pte - (void *)first_pte);
1069 }
1070
1071 /* We can't just free the pages because the IOMMU may still be walking
1072 the page tables, and may have cached the intermediate levels. The
1073 pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
1076 {
1077 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1078 WARN_ON(start_pfn > last_pfn))
1079 return;
1080
1081 /* we don't need lock here; nobody else touches the iova range */
1082 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1083 domain->pgd, 0, start_pfn, last_pfn, freelist);
1084
1085 /* free pgd */
1086 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 struct page *pgd_page = virt_to_page(domain->pgd);
1088 list_add_tail(&pgd_page->lru, freelist);
1089 domain->pgd = NULL;
1090 }
1091 }
1092
1093 /* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1095 {
1096 struct root_entry *root;
1097
1098 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1099 if (!root) {
1100 pr_err("Allocating root entry for %s failed\n",
1101 iommu->name);
1102 return -ENOMEM;
1103 }
1104
1105 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1106 iommu->root_entry = root;
1107
1108 return 0;
1109 }
1110
static void iommu_set_root_entry(struct intel_iommu *iommu)
1112 {
1113 u64 addr;
1114 u32 sts;
1115 unsigned long flag;
1116
1117 addr = virt_to_phys(iommu->root_entry);
1118 if (sm_supported(iommu))
1119 addr |= DMA_RTADDR_SMT;
1120
1121 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1122 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1123
1124 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1125
1126 /* Make sure hardware complete it */
1127 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1128 readl, (sts & DMA_GSTS_RTPS), sts);
1129
1130 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1131
1132 /*
1133 * Hardware invalidates all DMA remapping hardware translation
1134 * caches as part of SRTP flow.
1135 */
1136 if (cap_esrtps(iommu->cap))
1137 return;
1138
1139 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1140 if (sm_supported(iommu))
1141 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1142 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1143 }
1144
void iommu_flush_write_buffer(struct intel_iommu *iommu)
1146 {
1147 u32 val;
1148 unsigned long flag;
1149
1150 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1151 return;
1152
1153 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1154 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1155
1156 /* Make sure hardware complete it */
1157 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1158 readl, (!(val & DMA_GSTS_WBFS)), val);
1159
1160 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1161 }
1162
1163 /* return value determine if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
1167 {
1168 u64 val = 0;
1169 unsigned long flag;
1170
1171 switch (type) {
1172 case DMA_CCMD_GLOBAL_INVL:
1173 val = DMA_CCMD_GLOBAL_INVL;
1174 break;
1175 case DMA_CCMD_DOMAIN_INVL:
1176 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1177 break;
1178 case DMA_CCMD_DEVICE_INVL:
1179 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1180 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1181 break;
1182 default:
1183 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1184 iommu->name, type);
1185 return;
1186 }
1187 val |= DMA_CCMD_ICC;
1188
1189 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1190 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1191
1192 /* Make sure hardware complete it */
1193 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1194 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1195
1196 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1197 }
1198
void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
			 unsigned int size_order, u64 type)
1201 {
1202 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1203 u64 val = 0, val_iva = 0;
1204 unsigned long flag;
1205
1206 switch (type) {
1207 case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1209 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1210 break;
1211 case DMA_TLB_DSI_FLUSH:
1212 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1213 break;
1214 case DMA_TLB_PSI_FLUSH:
1215 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1216 /* IH bit is passed in as part of address */
1217 val_iva = size_order | addr;
1218 break;
1219 default:
1220 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1221 iommu->name, type);
1222 return;
1223 }
1224
1225 if (cap_write_drain(iommu->cap))
1226 val |= DMA_TLB_WRITE_DRAIN;
1227
1228 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 /* Note: Only uses first TLB reg currently */
1230 if (val_iva)
1231 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1232 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1233
1234 /* Make sure hardware complete it */
1235 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1236 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1237
1238 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239
1240 /* check IOTLB invalidation granularity */
1241 if (DMA_TLB_IAIG(val) == 0)
1242 pr_err("Flush IOTLB failed\n");
1243 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1244 pr_debug("TLB flush request %Lx, actual %Lx\n",
1245 (unsigned long long)DMA_TLB_IIRG(type),
1246 (unsigned long long)DMA_TLB_IAIG(val));
1247 }
1248
static struct device_domain_info *
domain_lookup_dev_info(struct dmar_domain *domain,
		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1252 {
1253 struct device_domain_info *info;
1254 unsigned long flags;
1255
1256 spin_lock_irqsave(&domain->lock, flags);
1257 list_for_each_entry(info, &domain->devices, link) {
1258 if (info->iommu == iommu && info->bus == bus &&
1259 info->devfn == devfn) {
1260 spin_unlock_irqrestore(&domain->lock, flags);
1261 return info;
1262 }
1263 }
1264 spin_unlock_irqrestore(&domain->lock, flags);
1265
1266 return NULL;
1267 }
1268
1269 /*
1270 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1271 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1272 * check because it applies only to the built-in QAT devices and it doesn't
1273 * grant additional privileges.
1274 */
1275 #define BUGGY_QAT_DEVID_MASK 0x4940
static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1277 {
1278 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1279 return false;
1280
1281 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1282 return false;
1283
1284 return true;
1285 }
1286
static void iommu_enable_pci_caps(struct device_domain_info *info)
1288 {
1289 struct pci_dev *pdev;
1290
1291 if (!dev_is_pci(info->dev))
1292 return;
1293
1294 pdev = to_pci_dev(info->dev);
1295 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1296 !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1297 info->ats_enabled = 1;
1298 }
1299
static void iommu_disable_pci_caps(struct device_domain_info *info)
1301 {
1302 struct pci_dev *pdev;
1303
1304 if (!dev_is_pci(info->dev))
1305 return;
1306
1307 pdev = to_pci_dev(info->dev);
1308
1309 if (info->ats_enabled) {
1310 pci_disable_ats(pdev);
1311 info->ats_enabled = 0;
1312 }
1313 }
1314
static void intel_flush_iotlb_all(struct iommu_domain *domain)
1316 {
1317 cache_tag_flush_all(to_dmar_domain(domain));
1318 }
1319
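/*
 * Clear the Enable Protected Memory bit (when PLMR/PHMR are supported)
 * so that DMA to the protected low/high memory regions is no longer
 * blocked, and wait for the protected region status to report cleared.
 */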
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1321 {
1322 u32 pmen;
1323 unsigned long flags;
1324
1325 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1326 return;
1327
1328 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1329 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1330 pmen &= ~DMA_PMEN_EPM;
1331 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1332
1333 /* wait for the protected region status bit to clear */
1334 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1335 readl, !(pmen & DMA_PMEN_PRS), pmen);
1336
1337 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1338 }
1339
static void iommu_enable_translation(struct intel_iommu *iommu)
1341 {
1342 u32 sts;
1343 unsigned long flags;
1344
1345 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1346 iommu->gcmd |= DMA_GCMD_TE;
1347 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1348
1349 /* Make sure hardware complete it */
1350 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1351 readl, (sts & DMA_GSTS_TES), sts);
1352
1353 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1354 }
1355
static void iommu_disable_translation(struct intel_iommu *iommu)
1357 {
1358 u32 sts;
1359 unsigned long flag;
1360
1361 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1362 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1363 return;
1364
1365 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1366 iommu->gcmd &= ~DMA_GCMD_TE;
1367 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1368
1369 /* Make sure hardware complete it */
1370 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1371 readl, (!(sts & DMA_GSTS_TES)), sts);
1372
1373 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1374 }
1375
static int iommu_init_domains(struct intel_iommu *iommu)
1377 {
1378 u32 ndomains;
1379
1380 ndomains = cap_ndoms(iommu->cap);
1381 pr_debug("%s: Number of Domains supported <%d>\n",
1382 iommu->name, ndomains);
1383
1384 spin_lock_init(&iommu->lock);
1385
1386 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1387 if (!iommu->domain_ids)
1388 return -ENOMEM;
1389
1390 /*
1391 * If Caching mode is set, then invalid translations are tagged
1392 * with domain-id 0, hence we need to pre-allocate it. We also
1393 * use domain-id 0 as a marker for non-allocated domain-id, so
1394 * make sure it is not used for a real domain.
1395 */
1396 set_bit(0, iommu->domain_ids);
1397
1398 /*
1399 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1400 * entry for first-level or pass-through translation modes should
1401 * be programmed with a domain id different from those used for
1402 * second-level or nested translation. We reserve a domain id for
1403 * this purpose. This domain id is also used for identity domain
1404 * in legacy mode.
1405 */
1406 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1407
1408 return 0;
1409 }
1410
static void disable_dmar_iommu(struct intel_iommu *iommu)
1412 {
1413 if (!iommu->domain_ids)
1414 return;
1415
1416 /*
1417 * All iommu domains must have been detached from the devices,
1418 * hence there should be no domain IDs in use.
1419 */
1420 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1421 > NUM_RESERVED_DID))
1422 return;
1423
1424 if (iommu->gcmd & DMA_GCMD_TE)
1425 iommu_disable_translation(iommu);
1426 }
1427
static void free_dmar_iommu(struct intel_iommu *iommu)
1429 {
1430 if (iommu->domain_ids) {
1431 bitmap_free(iommu->domain_ids);
1432 iommu->domain_ids = NULL;
1433 }
1434
1435 if (iommu->copied_tables) {
1436 bitmap_free(iommu->copied_tables);
1437 iommu->copied_tables = NULL;
1438 }
1439
1440 /* free context mapping */
1441 free_context_table(iommu);
1442
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 if (pasid_supported(iommu)) {
1445 if (ecap_prs(iommu->ecap))
1446 intel_svm_finish_prq(iommu);
1447 }
1448 #endif
1449 }
1450
1451 /*
1452 * Check and return whether first level is used by default for
1453 * DMA translation.
1454 */
static bool first_level_by_default(unsigned int type)
1456 {
1457 /* Only SL is available in legacy mode */
1458 if (!scalable_mode_support())
1459 return false;
1460
	/* Only one level (either FL or SL) is available, just use it */
1462 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1463 return intel_cap_flts_sanity();
1464
1465 /* Both levels are available, decide it based on domain type */
1466 return type != IOMMU_DOMAIN_UNMANAGED;
1467 }
1468
static struct dmar_domain *alloc_domain(unsigned int type)
1470 {
1471 struct dmar_domain *domain;
1472
1473 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1474 if (!domain)
1475 return NULL;
1476
1477 domain->nid = NUMA_NO_NODE;
1478 if (first_level_by_default(type))
1479 domain->use_first_level = true;
1480 INIT_LIST_HEAD(&domain->devices);
1481 INIT_LIST_HEAD(&domain->dev_pasids);
1482 INIT_LIST_HEAD(&domain->cache_tags);
1483 spin_lock_init(&domain->lock);
1484 spin_lock_init(&domain->cache_lock);
1485 xa_init(&domain->iommu_array);
1486
1487 return domain;
1488 }
1489
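/*
 * Attach @domain to @iommu: on first attach allocate a domain ID on
 * that unit and record it in domain->iommu_array; subsequent attaches
 * only bump the per-IOMMU reference count.
 */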
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1491 {
1492 struct iommu_domain_info *info, *curr;
1493 unsigned long ndomains;
1494 int num, ret = -ENOSPC;
1495
1496 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1497 return 0;
1498
1499 info = kzalloc(sizeof(*info), GFP_KERNEL);
1500 if (!info)
1501 return -ENOMEM;
1502
1503 spin_lock(&iommu->lock);
1504 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1505 if (curr) {
1506 curr->refcnt++;
1507 spin_unlock(&iommu->lock);
1508 kfree(info);
1509 return 0;
1510 }
1511
1512 ndomains = cap_ndoms(iommu->cap);
1513 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1514 if (num >= ndomains) {
1515 pr_err("%s: No free domain ids\n", iommu->name);
1516 goto err_unlock;
1517 }
1518
1519 set_bit(num, iommu->domain_ids);
1520 info->refcnt = 1;
1521 info->did = num;
1522 info->iommu = iommu;
1523 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1524 NULL, info, GFP_ATOMIC);
1525 if (curr) {
1526 ret = xa_err(curr) ? : -EBUSY;
1527 goto err_clear;
1528 }
1529 domain_update_iommu_cap(domain);
1530
1531 spin_unlock(&iommu->lock);
1532 return 0;
1533
1534 err_clear:
1535 clear_bit(info->did, iommu->domain_ids);
1536 err_unlock:
1537 spin_unlock(&iommu->lock);
1538 kfree(info);
1539 return ret;
1540 }
1541
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1543 {
1544 struct iommu_domain_info *info;
1545
1546 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1547 return;
1548
1549 spin_lock(&iommu->lock);
1550 info = xa_load(&domain->iommu_array, iommu->seq_id);
1551 if (--info->refcnt == 0) {
1552 clear_bit(info->did, iommu->domain_ids);
1553 xa_erase(&domain->iommu_array, iommu->seq_id);
1554 domain->nid = NUMA_NO_NODE;
1555 domain_update_iommu_cap(domain);
1556 kfree(info);
1557 }
1558 spin_unlock(&iommu->lock);
1559 }
1560
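/*
 * Round a guest address width up to the nearest adjusted guest address
 * width the page tables can express: 12 bits of page offset plus a
 * whole number of 9-bit levels, capped at 64 bits.
 */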
static int guestwidth_to_adjustwidth(int gaw)
1562 {
1563 int agaw;
1564 int r = (gaw - 12) % 9;
1565
1566 if (r == 0)
1567 agaw = gaw;
1568 else
1569 agaw = gaw + 9 - r;
1570 if (agaw > 64)
1571 agaw = 64;
1572 return agaw;
1573 }
1574
static void domain_exit(struct dmar_domain *domain)
1576 {
1577 if (domain->pgd) {
1578 LIST_HEAD(freelist);
1579
1580 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1581 iommu_put_pages_list(&freelist);
1582 }
1583
1584 if (WARN_ON(!list_empty(&domain->devices)))
1585 return;
1586
1587 kfree(domain->qi_batch);
1588 kfree(domain);
1589 }
1590
1591 /*
1592 * For kdump cases, old valid entries may be cached due to the
1593 * in-flight DMA and copied pgtable, but there is no unmapping
1594 * behaviour for them, thus we need an explicit cache flush for
1595 * the newly-mapped device. For kdump, at this point, the device
1596 * is supposed to finish reset at its driver probe stage, so no
 * in-flight DMA will exist, and we don't need to worry about it
 * hereafter.
1599 */
static void copied_context_tear_down(struct intel_iommu *iommu,
				     struct context_entry *context,
				     u8 bus, u8 devfn)
1603 {
1604 u16 did_old;
1605
1606 if (!context_copied(iommu, bus, devfn))
1607 return;
1608
1609 assert_spin_locked(&iommu->lock);
1610
1611 did_old = context_domain_id(context);
1612 context_clear_entry(context);
1613
1614 if (did_old < cap_ndoms(iommu->cap)) {
1615 iommu->flush.flush_context(iommu, did_old,
1616 (((u16)bus) << 8) | devfn,
1617 DMA_CCMD_MASK_NOBIT,
1618 DMA_CCMD_DEVICE_INVL);
1619 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1620 DMA_TLB_DSI_FLUSH);
1621 }
1622
1623 clear_context_copied(iommu, bus, devfn);
1624 }
1625
1626 /*
1627 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries, we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
1630 * domain #0, which we have to flush:
1631 */
static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
					u8 bus, u8 devfn)
1634 {
1635 if (cap_caching_mode(iommu->cap)) {
1636 iommu->flush.flush_context(iommu, 0,
1637 (((u16)bus) << 8) | devfn,
1638 DMA_CCMD_MASK_NOBIT,
1639 DMA_CCMD_DEVICE_INVL);
1640 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1641 } else {
1642 iommu_flush_write_buffer(iommu);
1643 }
1644 }
1645
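/*
 * Program the context entry for @bus/@devfn on @iommu to point at the
 * domain's page table, tearing down any entry that was copied from a
 * previous kernel first.
 */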
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn)
1649 {
1650 struct device_domain_info *info =
1651 domain_lookup_dev_info(domain, iommu, bus, devfn);
1652 u16 did = domain_id_iommu(domain, iommu);
1653 int translation = CONTEXT_TT_MULTI_LEVEL;
1654 struct dma_pte *pgd = domain->pgd;
1655 struct context_entry *context;
1656 int agaw, ret;
1657
1658 pr_debug("Set context mapping for %02x:%02x.%d\n",
1659 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1660
1661 spin_lock(&iommu->lock);
1662 ret = -ENOMEM;
1663 context = iommu_context_addr(iommu, bus, devfn, 1);
1664 if (!context)
1665 goto out_unlock;
1666
1667 ret = 0;
1668 if (context_present(context) && !context_copied(iommu, bus, devfn))
1669 goto out_unlock;
1670
1671 copied_context_tear_down(iommu, context, bus, devfn);
1672 context_clear_entry(context);
1673
1674 context_set_domain_id(context, did);
1675
1676 /*
	 * Skip top levels of page tables for an iommu which has
	 * a smaller agaw than the default. Unnecessary for PT mode.
1679 */
1680 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1681 ret = -ENOMEM;
1682 pgd = phys_to_virt(dma_pte_addr(pgd));
1683 if (!dma_pte_present(pgd))
1684 goto out_unlock;
1685 }
1686
1687 if (info && info->ats_supported)
1688 translation = CONTEXT_TT_DEV_IOTLB;
1689 else
1690 translation = CONTEXT_TT_MULTI_LEVEL;
1691
1692 context_set_address_root(context, virt_to_phys(pgd));
1693 context_set_address_width(context, agaw);
1694 context_set_translation_type(context, translation);
1695 context_set_fault_enable(context);
1696 context_set_present(context);
1697 if (!ecap_coherent(iommu->ecap))
1698 clflush_cache_range(context, sizeof(*context));
1699 context_present_cache_flush(iommu, did, bus, devfn);
1700 ret = 0;
1701
1702 out_unlock:
1703 spin_unlock(&iommu->lock);
1704
1705 return ret;
1706 }
1707
static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
1710 {
1711 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1712 struct intel_iommu *iommu = info->iommu;
1713 struct dmar_domain *domain = opaque;
1714
1715 return domain_context_mapping_one(domain, iommu,
1716 PCI_BUS_NUM(alias), alias & 0xff);
1717 }
1718
static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1721 {
1722 struct device_domain_info *info = dev_iommu_priv_get(dev);
1723 struct intel_iommu *iommu = info->iommu;
1724 u8 bus = info->bus, devfn = info->devfn;
1725
1726 if (!dev_is_pci(dev))
1727 return domain_context_mapping_one(domain, iommu, bus, devfn);
1728
1729 return pci_for_each_dma_alias(to_pci_dev(dev),
1730 domain_context_mapping_cb, domain);
1731 }
1732
1733 /* Return largest possible superpage level for a given mapping */
static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
				   unsigned long phy_pfn, unsigned long pages)
1736 {
1737 int support, level = 1;
1738 unsigned long pfnmerge;
1739
1740 support = domain->iommu_superpage;
1741
1742 /* To use a large page, the virtual *and* physical addresses
1743 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744 of them will mean we have to use smaller pages. So just
1745 merge them and check both at once. */
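	/*
	 * Example: with 2MiB superpage support, mapping 512 4KiB pages
	 * at iov_pfn 0x200 and phy_pfn 0x400 (both 2MiB aligned) yields
	 * level 2, i.e. a single 2MiB superpage.
	 */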
1746 pfnmerge = iov_pfn | phy_pfn;
1747
1748 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749 pages >>= VTD_STRIDE_SHIFT;
1750 if (!pages)
1751 break;
1752 pfnmerge >>= VTD_STRIDE_SHIFT;
1753 level++;
1754 support--;
1755 }
1756 return level;
1757 }
1758
1759 /*
1760 * Ensure that old small page tables are removed to make room for superpage(s).
1761 * We're going to add new large pages, so make sure we don't remove their parent
1762 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1763 */
static void switch_to_super_page(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long end_pfn, int level)
1767 {
1768 unsigned long lvl_pages = lvl_to_nr_pages(level);
1769 struct dma_pte *pte = NULL;
1770
1771 if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
1772 !IS_ALIGNED(end_pfn + 1, lvl_pages)))
1773 return;
1774
1775 while (start_pfn <= end_pfn) {
1776 if (!pte)
1777 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1778 GFP_ATOMIC);
1779
1780 if (dma_pte_present(pte)) {
1781 dma_pte_free_pagetable(domain, start_pfn,
1782 start_pfn + lvl_pages - 1,
1783 level + 1);
1784
1785 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1786 end_pfn << VTD_PAGE_SHIFT, 0);
1787 }
1788
1789 pte++;
1790 start_pfn += lvl_pages;
1791 if (first_pte_in_page(pte))
1792 pte = NULL;
1793 }
1794 }
1795
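/*
 * Map a physically contiguous range of pages into the domain's page
 * table, using the largest page sizes that hardware support and
 * alignment allow. Caches are flushed as each block of PTEs is written.
 */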
1796 static int
1797 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1798 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1799 gfp_t gfp)
1800 {
1801 struct dma_pte *first_pte = NULL, *pte = NULL;
1802 unsigned int largepage_lvl = 0;
1803 unsigned long lvl_pages = 0;
1804 phys_addr_t pteval;
1805 u64 attr;
1806
1807 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1808 return -EINVAL;
1809
1810 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1811 return -EINVAL;
1812
1813 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1814 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1815 return -EINVAL;
1816 }
1817
1818 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1819 attr |= DMA_FL_PTE_PRESENT;
1820 if (domain->use_first_level) {
1821 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1822 if (prot & DMA_PTE_WRITE)
1823 attr |= DMA_FL_PTE_DIRTY;
1824 }
1825
1826 domain->has_mappings = true;
1827
1828 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1829
1830 while (nr_pages > 0) {
1831 uint64_t tmp;
1832
1833 if (!pte) {
1834 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1835 phys_pfn, nr_pages);
1836
1837 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1838 gfp);
1839 if (!pte)
1840 return -ENOMEM;
1841 first_pte = pte;
1842
1843 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1844
1845 /* It is a large page */
1846 if (largepage_lvl > 1) {
1847 unsigned long end_pfn;
1848 unsigned long pages_to_remove;
1849
1850 pteval |= DMA_PTE_LARGE_PAGE;
1851 pages_to_remove = min_t(unsigned long,
1852 round_down(nr_pages, lvl_pages),
1853 nr_pte_to_next_page(pte) * lvl_pages);
1854 end_pfn = iov_pfn + pages_to_remove - 1;
1855 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1856 } else {
1857 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1858 }
1859
1860 }
1861 /* We don't need a lock here; nobody else
1862 * touches this IOVA range.
1863 */
1864 tmp = 0ULL;
1865 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1866 static int dumps = 5;
1867 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1868 iov_pfn, tmp, (unsigned long long)pteval);
1869 if (dumps) {
1870 dumps--;
1871 debug_dma_dump_mappings(NULL);
1872 }
1873 WARN_ON(1);
1874 }
1875
1876 nr_pages -= lvl_pages;
1877 iov_pfn += lvl_pages;
1878 phys_pfn += lvl_pages;
1879 pteval += lvl_pages * VTD_PAGE_SIZE;
1880
1881 /* If the next PTE would be the first in a new page, then we
1882 * need to flush the cache on the entries we've just written.
1883 * And then we'll need to recalculate 'pte', so clear it and
1884 * let it get set again in the if (!pte) block above.
1885 *
1886 * If we're done (!nr_pages) we need to flush the cache too.
1887 *
1888 * Also if we've been setting superpages, we may need to
1889 * recalculate 'pte' and switch back to smaller pages for the
1890 * end of the mapping, if the trailing size is not enough to
1891 * use another superpage (i.e. nr_pages < lvl_pages).
1892 */
1893 pte++;
1894 if (!nr_pages || first_pte_in_page(pte) ||
1895 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1896 domain_flush_cache(domain, first_pte,
1897 (void *)pte - (void *)first_pte);
1898 pte = NULL;
1899 }
1900 }
1901
1902 return 0;
1903 }
1904
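/*
 * Clear the context entry for one bus/devfn on @info->iommu and flush
 * the context and IOTLB/devTLB caches so the device can no longer use it.
 */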
1905 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1906 {
1907 struct intel_iommu *iommu = info->iommu;
1908 struct context_entry *context;
1909 u16 did;
1910
1911 spin_lock(&iommu->lock);
1912 context = iommu_context_addr(iommu, bus, devfn, 0);
1913 if (!context) {
1914 spin_unlock(&iommu->lock);
1915 return;
1916 }
1917
1918 did = context_domain_id(context);
1919 context_clear_entry(context);
1920 __iommu_flush_cache(iommu, context, sizeof(*context));
1921 spin_unlock(&iommu->lock);
1922 intel_context_flush_present(info, context, did, true);
1923 }
1924
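/*
 * Set up first-stage translation for @dev/@pasid using the domain's
 * page table, skipping top page-table levels if the IOMMU supports a
 * smaller address width than the domain.
 */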
1925 static int domain_setup_first_level(struct intel_iommu *iommu,
1926 struct dmar_domain *domain,
1927 struct device *dev,
1928 u32 pasid)
1929 {
1930 struct dma_pte *pgd = domain->pgd;
1931 int agaw, level;
1932 int flags = 0;
1933
1934 /*
1935 * Skip top levels of page tables for iommu which has
1936 * less agaw than default. Unnecessary for PT mode.
1937 */
1938 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1939 pgd = phys_to_virt(dma_pte_addr(pgd));
1940 if (!dma_pte_present(pgd))
1941 return -ENOMEM;
1942 }
1943
1944 level = agaw_to_level(agaw);
1945 if (level != 4 && level != 5)
1946 return -EINVAL;
1947
1948 if (level == 5)
1949 flags |= PASID_FLAG_FL5LP;
1950
1951 if (domain->force_snooping)
1952 flags |= PASID_FLAG_PAGE_SNOOP;
1953
1954 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1955 domain_id_iommu(domain, iommu),
1956 flags);
1957 }
1958
1959 static bool dev_is_real_dma_subdevice(struct device *dev)
1960 {
1961 return dev && dev_is_pci(dev) &&
1962 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1963 }
1964
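/*
 * Map operations need an explicit IOTLB sync when the IOMMU is in
 * caching mode with second-stage translation, or when write-buffer
 * flushing is required.
 */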
1965 static bool domain_need_iotlb_sync_map(struct dmar_domain *domain,
1966 struct intel_iommu *iommu)
1967 {
1968 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1969 return true;
1970
1971 if (rwbf_quirk || cap_rwbf(iommu->cap))
1972 return true;
1973
1974 return false;
1975 }
1976
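/*
 * Attach @dev to @domain: set up the context or PASID table entry,
 * enable the relevant PCI capabilities and assign a cache tag. On
 * failure the device is put back into the blocking state.
 */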
1977 static int dmar_domain_attach_device(struct dmar_domain *domain,
1978 struct device *dev)
1979 {
1980 struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 struct intel_iommu *iommu = info->iommu;
1982 unsigned long flags;
1983 int ret;
1984
1985 ret = domain_attach_iommu(domain, iommu);
1986 if (ret)
1987 return ret;
1988
1989 info->domain = domain;
1990 info->domain_attached = true;
1991 spin_lock_irqsave(&domain->lock, flags);
1992 list_add(&info->link, &domain->devices);
1993 spin_unlock_irqrestore(&domain->lock, flags);
1994
1995 if (dev_is_real_dma_subdevice(dev))
1996 return 0;
1997
1998 if (!sm_supported(iommu))
1999 ret = domain_context_mapping(domain, dev);
2000 else if (domain->use_first_level)
2001 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2002 else
2003 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2004
2005 if (ret)
2006 goto out_block_translation;
2007
2008 iommu_enable_pci_caps(info);
2009
2010 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
2011 if (ret)
2012 goto out_block_translation;
2013
2014 domain->iotlb_sync_map |= domain_need_iotlb_sync_map(domain, iommu);
2015
2016 return 0;
2017
2018 out_block_translation:
2019 device_block_translation(dev);
2020 return ret;
2021 }
2022
2023 /**
2024 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2025 * is relaxable (i.e. is allowed to be not enforced under some conditions)
2026 * @dev: device handle
2027 *
2028 * We assume that PCI USB devices with RMRRs have them largely
2029 * for historical reasons and that the RMRR space is not actively used post
2030 * boot. This exclusion may change if vendors begin to abuse it.
2031 *
2032 * The same exception is made for graphics devices, with the requirement that
2033 * any use of the RMRR regions will be torn down before assigning the device
2034 * to a guest.
2035 *
2036 * Return: true if the RMRR is relaxable, false otherwise
2037 */
2038 static bool device_rmrr_is_relaxable(struct device *dev)
2039 {
2040 struct pci_dev *pdev;
2041
2042 if (!dev_is_pci(dev))
2043 return false;
2044
2045 pdev = to_pci_dev(dev);
2046 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2047 return true;
2048 else
2049 return false;
2050 }
2051
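/*
 * Return the default domain type required for @dev, or 0 if the core's
 * default choice is acceptable.
 */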
2052 static int device_def_domain_type(struct device *dev)
2053 {
2054 struct device_domain_info *info = dev_iommu_priv_get(dev);
2055 struct intel_iommu *iommu = info->iommu;
2056
2057 /*
2058 * Hardware does not support the passthrough translation mode.
2059 * Always use a dynamic mapping domain.
2060 */
2061 if (!ecap_pass_through(iommu->ecap))
2062 return IOMMU_DOMAIN_DMA;
2063
2064 if (dev_is_pci(dev)) {
2065 struct pci_dev *pdev = to_pci_dev(dev);
2066
2067 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2068 return IOMMU_DOMAIN_IDENTITY;
2069 }
2070
2071 return 0;
2072 }
2073
2074 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2075 {
2076 /*
2077 * Start from a sane IOMMU hardware state.
2078 * If queued invalidation was already initialized by us
2079 * (for example, while enabling interrupt-remapping) then
2080 * things are already rolling from a sane state.
2081 */
2082 if (!iommu->qi) {
2083 /*
2084 * Clear any previous faults.
2085 */
2086 dmar_fault(-1, iommu);
2087 /*
2088 * Disable queued invalidation if supported and already enabled
2089 * before OS handover.
2090 */
2091 dmar_disable_qi(iommu);
2092 }
2093
2094 if (dmar_enable_qi(iommu)) {
2095 /*
2096 * Queued Invalidate not enabled, use Register Based Invalidate
2097 */
2098 iommu->flush.flush_context = __iommu_flush_context;
2099 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2100 pr_info("%s: Using Register based invalidation\n",
2101 iommu->name);
2102 } else {
2103 iommu->flush.flush_context = qi_flush_context;
2104 iommu->flush.flush_iotlb = qi_flush_iotlb;
2105 pr_info("%s: Using Queued invalidation\n", iommu->name);
2106 }
2107 }
2108
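/*
 * Copy one bus worth of context entries from the previous kernel's
 * tables (kexec/kdump handover), marking the copied entries and
 * reserving their domain IDs.
 */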
2109 static int copy_context_table(struct intel_iommu *iommu,
2110 struct root_entry *old_re,
2111 struct context_entry **tbl,
2112 int bus, bool ext)
2113 {
2114 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2115 struct context_entry *new_ce = NULL, ce;
2116 struct context_entry *old_ce = NULL;
2117 struct root_entry re;
2118 phys_addr_t old_ce_phys;
2119
2120 tbl_idx = ext ? bus * 2 : bus;
2121 memcpy(&re, old_re, sizeof(re));
2122
2123 for (devfn = 0; devfn < 256; devfn++) {
2124 /* First calculate the correct index */
2125 idx = (ext ? devfn * 2 : devfn) % 256;
2126
2127 if (idx == 0) {
2128 /* First save what we may have and clean up */
2129 if (new_ce) {
2130 tbl[tbl_idx] = new_ce;
2131 __iommu_flush_cache(iommu, new_ce,
2132 VTD_PAGE_SIZE);
2133 pos = 1;
2134 }
2135
2136 if (old_ce)
2137 memunmap(old_ce);
2138
2139 ret = 0;
2140 if (devfn < 0x80)
2141 old_ce_phys = root_entry_lctp(&re);
2142 else
2143 old_ce_phys = root_entry_uctp(&re);
2144
2145 if (!old_ce_phys) {
2146 if (ext && devfn == 0) {
2147 /* No LCTP, try UCTP */
2148 devfn = 0x7f;
2149 continue;
2150 } else {
2151 goto out;
2152 }
2153 }
2154
2155 ret = -ENOMEM;
2156 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2157 MEMREMAP_WB);
2158 if (!old_ce)
2159 goto out;
2160
2161 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2162 if (!new_ce)
2163 goto out_unmap;
2164
2165 ret = 0;
2166 }
2167
2168 /* Now copy the context entry */
2169 memcpy(&ce, old_ce + idx, sizeof(ce));
2170
2171 if (!context_present(&ce))
2172 continue;
2173
2174 did = context_domain_id(&ce);
2175 if (did >= 0 && did < cap_ndoms(iommu->cap))
2176 set_bit(did, iommu->domain_ids);
2177
2178 set_context_copied(iommu, bus, devfn);
2179 new_ce[idx] = ce;
2180 }
2181
2182 tbl[tbl_idx + pos] = new_ce;
2183
2184 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2185
2186 out_unmap:
2187 memunmap(old_ce);
2188
2189 out:
2190 return ret;
2191 }
2192
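/*
 * Copy the root and context tables left programmed by the previous
 * kernel so that ongoing DMA keeps working until new tables take over.
 */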
2193 static int copy_translation_tables(struct intel_iommu *iommu)
2194 {
2195 struct context_entry **ctxt_tbls;
2196 struct root_entry *old_rt;
2197 phys_addr_t old_rt_phys;
2198 int ctxt_table_entries;
2199 u64 rtaddr_reg;
2200 int bus, ret;
2201 bool new_ext, ext;
2202
2203 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2204 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2205 new_ext = !!sm_supported(iommu);
2206
2207 /*
2208 * The RTT bit can only be changed when translation is disabled,
2209 * but disabling translation means to open a window for data
2210 * corruption. So bail out and don't copy anything if we would
2211 * have to change the bit.
2212 */
2213 if (new_ext != ext)
2214 return -EINVAL;
2215
2216 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2217 if (!iommu->copied_tables)
2218 return -ENOMEM;
2219
2220 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2221 if (!old_rt_phys)
2222 return -EINVAL;
2223
2224 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2225 if (!old_rt)
2226 return -ENOMEM;
2227
2228 /* This is too big for the stack - allocate it from slab */
2229 ctxt_table_entries = ext ? 512 : 256;
2230 ret = -ENOMEM;
2231 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2232 if (!ctxt_tbls)
2233 goto out_unmap;
2234
2235 for (bus = 0; bus < 256; bus++) {
2236 ret = copy_context_table(iommu, &old_rt[bus],
2237 ctxt_tbls, bus, ext);
2238 if (ret) {
2239 pr_err("%s: Failed to copy context table for bus %d\n",
2240 iommu->name, bus);
2241 continue;
2242 }
2243 }
2244
2245 spin_lock(&iommu->lock);
2246
2247 /* Context tables are copied, now write them to the root_entry table */
2248 for (bus = 0; bus < 256; bus++) {
2249 int idx = ext ? bus * 2 : bus;
2250 u64 val;
2251
2252 if (ctxt_tbls[idx]) {
2253 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2254 iommu->root_entry[bus].lo = val;
2255 }
2256
2257 if (!ext || !ctxt_tbls[idx + 1])
2258 continue;
2259
2260 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2261 iommu->root_entry[bus].hi = val;
2262 }
2263
2264 spin_unlock(&iommu->lock);
2265
2266 kfree(ctxt_tbls);
2267
2268 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2269
2270 ret = 0;
2271
2272 out_unmap:
2273 memunmap(old_rt);
2274
2275 return ret;
2276 }
2277
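/*
 * Initialize all DMAR units at boot: set up invalidation queues,
 * domain ID bitmaps and root entries, copy translation tables from a
 * previous kernel if translation is already enabled, then enable
 * fault reporting.
 */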
2278 static int __init init_dmars(void)
2279 {
2280 struct dmar_drhd_unit *drhd;
2281 struct intel_iommu *iommu;
2282 int ret;
2283
2284 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2285 if (ret)
2286 goto free_iommu;
2287
2288 for_each_iommu(iommu, drhd) {
2289 if (drhd->ignored) {
2290 iommu_disable_translation(iommu);
2291 continue;
2292 }
2293
2294 /*
2295 * Find the max pasid size of all IOMMUs in the system.
2296 * We need to ensure the system pasid table is no bigger
2297 * than the smallest supported.
2298 */
2299 if (pasid_supported(iommu)) {
2300 u32 temp = 2 << ecap_pss(iommu->ecap);
2301
2302 intel_pasid_max_id = min_t(u32, temp,
2303 intel_pasid_max_id);
2304 }
2305
2306 intel_iommu_init_qi(iommu);
2307
2308 ret = iommu_init_domains(iommu);
2309 if (ret)
2310 goto free_iommu;
2311
2312 init_translation_status(iommu);
2313
2314 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2315 iommu_disable_translation(iommu);
2316 clear_translation_pre_enabled(iommu);
2317 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2318 iommu->name);
2319 }
2320
2321 /*
2322 * TBD:
2323 * we could share the same root & context tables
2324 * among all IOMMUs. This needs to be split out later.
2325 */
2326 ret = iommu_alloc_root_entry(iommu);
2327 if (ret)
2328 goto free_iommu;
2329
2330 if (translation_pre_enabled(iommu)) {
2331 pr_info("Translation already enabled - trying to copy translation structures\n");
2332
2333 ret = copy_translation_tables(iommu);
2334 if (ret) {
2335 /*
2336 * We found the IOMMU with translation
2337 * enabled - but failed to copy over the
2338 * old root-entry table. Try to proceed
2339 * by disabling translation now and
2340 * allocating a clean root-entry table.
2341 * This might cause DMAR faults, but
2342 * probably the dump will still succeed.
2343 */
2344 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2345 iommu->name);
2346 iommu_disable_translation(iommu);
2347 clear_translation_pre_enabled(iommu);
2348 } else {
2349 pr_info("Copied translation tables from previous kernel for %s\n",
2350 iommu->name);
2351 }
2352 }
2353
2354 intel_svm_check(iommu);
2355 }
2356
2357 /*
2358 * Now that qi is enabled on all iommus, set the root entry and flush
2359 * caches. This is required on some Intel X58 chipsets, otherwise the
2360 * flush_context function will loop forever and the boot hangs.
2361 */
2362 for_each_active_iommu(iommu, drhd) {
2363 iommu_flush_write_buffer(iommu);
2364 iommu_set_root_entry(iommu);
2365 }
2366
2367 check_tylersburg_isoch();
2368
2369 /*
2370 * for each drhd
2371 * enable fault log
2372 * global invalidate context cache
2373 * global invalidate iotlb
2374 * enable translation
2375 */
2376 for_each_iommu(iommu, drhd) {
2377 if (drhd->ignored) {
2378 /*
2379 * we always have to disable PMRs or DMA may fail on
2380 * this device
2381 */
2382 if (force_on)
2383 iommu_disable_protect_mem_regions(iommu);
2384 continue;
2385 }
2386
2387 iommu_flush_write_buffer(iommu);
2388
2389 #ifdef CONFIG_INTEL_IOMMU_SVM
2390 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2391 /*
2392 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2393 * could cause a lock race condition, so drop the lock here.
2394 */
2395 up_write(&dmar_global_lock);
2396 ret = intel_svm_enable_prq(iommu);
2397 down_write(&dmar_global_lock);
2398 if (ret)
2399 goto free_iommu;
2400 }
2401 #endif
2402 ret = dmar_set_interrupt(iommu);
2403 if (ret)
2404 goto free_iommu;
2405 }
2406
2407 return 0;
2408
2409 free_iommu:
2410 for_each_active_iommu(iommu, drhd) {
2411 disable_dmar_iommu(iommu);
2412 free_dmar_iommu(iommu);
2413 }
2414
2415 return ret;
2416 }
2417
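/*
 * Mark DMAR units that have no devices, and graphics-only units when
 * the integrated graphics IOMMU is disabled, as ignored.
 */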
2418 static void __init init_no_remapping_devices(void)
2419 {
2420 struct dmar_drhd_unit *drhd;
2421 struct device *dev;
2422 int i;
2423
2424 for_each_drhd_unit(drhd) {
2425 if (!drhd->include_all) {
2426 for_each_active_dev_scope(drhd->devices,
2427 drhd->devices_cnt, i, dev)
2428 break;
2429 /* ignore DMAR unit if no devices exist */
2430 if (i == drhd->devices_cnt)
2431 drhd->ignored = 1;
2432 }
2433 }
2434
2435 for_each_active_drhd_unit(drhd) {
2436 if (drhd->include_all)
2437 continue;
2438
2439 for_each_active_dev_scope(drhd->devices,
2440 drhd->devices_cnt, i, dev)
2441 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2442 break;
2443 if (i < drhd->devices_cnt)
2444 continue;
2445
2446 /* This IOMMU has *only* gfx devices. Either bypass it or
2447 set the gfx_dedicated flag, as appropriate */
2448 drhd->gfx_dedicated = 1;
2449 if (disable_igfx_iommu)
2450 drhd->ignored = 1;
2451 }
2452 }
2453
2454 #ifdef CONFIG_SUSPEND
2455 static int init_iommu_hw(void)
2456 {
2457 struct dmar_drhd_unit *drhd;
2458 struct intel_iommu *iommu = NULL;
2459 int ret;
2460
2461 for_each_active_iommu(iommu, drhd) {
2462 if (iommu->qi) {
2463 ret = dmar_reenable_qi(iommu);
2464 if (ret)
2465 return ret;
2466 }
2467 }
2468
2469 for_each_iommu(iommu, drhd) {
2470 if (drhd->ignored) {
2471 /*
2472 * we always have to disable PMRs or DMA may fail on
2473 * this device
2474 */
2475 if (force_on)
2476 iommu_disable_protect_mem_regions(iommu);
2477 continue;
2478 }
2479
2480 iommu_flush_write_buffer(iommu);
2481 iommu_set_root_entry(iommu);
2482 iommu_enable_translation(iommu);
2483 iommu_disable_protect_mem_regions(iommu);
2484 }
2485
2486 return 0;
2487 }
2488
2489 static void iommu_flush_all(void)
2490 {
2491 struct dmar_drhd_unit *drhd;
2492 struct intel_iommu *iommu;
2493
2494 for_each_active_iommu(iommu, drhd) {
2495 iommu->flush.flush_context(iommu, 0, 0, 0,
2496 DMA_CCMD_GLOBAL_INVL);
2497 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2498 DMA_TLB_GLOBAL_FLUSH);
2499 }
2500 }
2501
2502 static int iommu_suspend(void)
2503 {
2504 struct dmar_drhd_unit *drhd;
2505 struct intel_iommu *iommu = NULL;
2506 unsigned long flag;
2507
2508 iommu_flush_all();
2509
2510 for_each_active_iommu(iommu, drhd) {
2511 iommu_disable_translation(iommu);
2512
2513 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2514
2515 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2516 readl(iommu->reg + DMAR_FECTL_REG);
2517 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2518 readl(iommu->reg + DMAR_FEDATA_REG);
2519 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2520 readl(iommu->reg + DMAR_FEADDR_REG);
2521 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2522 readl(iommu->reg + DMAR_FEUADDR_REG);
2523
2524 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2525 }
2526 return 0;
2527 }
2528
2529 static void iommu_resume(void)
2530 {
2531 struct dmar_drhd_unit *drhd;
2532 struct intel_iommu *iommu = NULL;
2533 unsigned long flag;
2534
2535 if (init_iommu_hw()) {
2536 if (force_on)
2537 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2538 else
2539 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2540 return;
2541 }
2542
2543 for_each_active_iommu(iommu, drhd) {
2544
2545 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2546
2547 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2548 iommu->reg + DMAR_FECTL_REG);
2549 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2550 iommu->reg + DMAR_FEDATA_REG);
2551 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2552 iommu->reg + DMAR_FEADDR_REG);
2553 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2554 iommu->reg + DMAR_FEUADDR_REG);
2555
2556 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2557 }
2558 }
2559
2560 static struct syscore_ops iommu_syscore_ops = {
2561 .resume = iommu_resume,
2562 .suspend = iommu_suspend,
2563 };
2564
2565 static void __init init_iommu_pm_ops(void)
2566 {
2567 register_syscore_ops(&iommu_syscore_ops);
2568 }
2569
2570 #else
2571 static inline void init_iommu_pm_ops(void) {}
2572 #endif /* CONFIG_SUSPEND */
2573
2574 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2575 {
2576 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2577 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2578 rmrr->end_address <= rmrr->base_address ||
2579 arch_rmrr_sanity_check(rmrr))
2580 return -EINVAL;
2581
2582 return 0;
2583 }
2584
2585 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2586 {
2587 struct acpi_dmar_reserved_memory *rmrr;
2588 struct dmar_rmrr_unit *rmrru;
2589
2590 rmrr = (struct acpi_dmar_reserved_memory *)header;
2591 if (rmrr_sanity_check(rmrr)) {
2592 pr_warn(FW_BUG
2593 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2594 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2595 rmrr->base_address, rmrr->end_address,
2596 dmi_get_system_info(DMI_BIOS_VENDOR),
2597 dmi_get_system_info(DMI_BIOS_VERSION),
2598 dmi_get_system_info(DMI_PRODUCT_VERSION));
2599 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2600 }
2601
2602 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2603 if (!rmrru)
2604 goto out;
2605
2606 rmrru->hdr = header;
2607
2608 rmrru->base_address = rmrr->base_address;
2609 rmrru->end_address = rmrr->end_address;
2610
2611 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2612 ((void *)rmrr) + rmrr->header.length,
2613 &rmrru->devices_cnt);
2614 if (rmrru->devices_cnt && rmrru->devices == NULL)
2615 goto free_rmrru;
2616
2617 list_add(&rmrru->list, &dmar_rmrr_units);
2618
2619 return 0;
2620 free_rmrru:
2621 kfree(rmrru);
2622 out:
2623 return -ENOMEM;
2624 }
2625
2626 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2627 {
2628 struct dmar_atsr_unit *atsru;
2629 struct acpi_dmar_atsr *tmp;
2630
2631 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2632 dmar_rcu_check()) {
2633 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2634 if (atsr->segment != tmp->segment)
2635 continue;
2636 if (atsr->header.length != tmp->header.length)
2637 continue;
2638 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2639 return atsru;
2640 }
2641
2642 return NULL;
2643 }
2644
2645 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2646 {
2647 struct acpi_dmar_atsr *atsr;
2648 struct dmar_atsr_unit *atsru;
2649
2650 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2651 return 0;
2652
2653 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2654 atsru = dmar_find_atsr(atsr);
2655 if (atsru)
2656 return 0;
2657
2658 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2659 if (!atsru)
2660 return -ENOMEM;
2661
2662 /*
2663 * If memory is allocated from slab by ACPI _DSM method, we need to
2664 * copy the memory content because the memory buffer will be freed
2665 * on return.
2666 */
2667 atsru->hdr = (void *)(atsru + 1);
2668 memcpy(atsru->hdr, hdr, hdr->length);
2669 atsru->include_all = atsr->flags & 0x1;
2670 if (!atsru->include_all) {
2671 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2672 (void *)atsr + atsr->header.length,
2673 &atsru->devices_cnt);
2674 if (atsru->devices_cnt && atsru->devices == NULL) {
2675 kfree(atsru);
2676 return -ENOMEM;
2677 }
2678 }
2679
2680 list_add_rcu(&atsru->list, &dmar_atsr_units);
2681
2682 return 0;
2683 }
2684
2685 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2686 {
2687 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2688 kfree(atsru);
2689 }
2690
2691 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2692 {
2693 struct acpi_dmar_atsr *atsr;
2694 struct dmar_atsr_unit *atsru;
2695
2696 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2697 atsru = dmar_find_atsr(atsr);
2698 if (atsru) {
2699 list_del_rcu(&atsru->list);
2700 synchronize_rcu();
2701 intel_iommu_free_atsr(atsru);
2702 }
2703
2704 return 0;
2705 }
2706
2707 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2708 {
2709 int i;
2710 struct device *dev;
2711 struct acpi_dmar_atsr *atsr;
2712 struct dmar_atsr_unit *atsru;
2713
2714 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2715 atsru = dmar_find_atsr(atsr);
2716 if (!atsru)
2717 return 0;
2718
2719 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2720 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2721 i, dev)
2722 return -EBUSY;
2723 }
2724
2725 return 0;
2726 }
2727
2728 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2729 {
2730 struct dmar_satc_unit *satcu;
2731 struct acpi_dmar_satc *tmp;
2732
2733 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2734 dmar_rcu_check()) {
2735 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2736 if (satc->segment != tmp->segment)
2737 continue;
2738 if (satc->header.length != tmp->header.length)
2739 continue;
2740 if (memcmp(satc, tmp, satc->header.length) == 0)
2741 return satcu;
2742 }
2743
2744 return NULL;
2745 }
2746
2747 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2748 {
2749 struct acpi_dmar_satc *satc;
2750 struct dmar_satc_unit *satcu;
2751
2752 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2753 return 0;
2754
2755 satc = container_of(hdr, struct acpi_dmar_satc, header);
2756 satcu = dmar_find_satc(satc);
2757 if (satcu)
2758 return 0;
2759
2760 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2761 if (!satcu)
2762 return -ENOMEM;
2763
2764 satcu->hdr = (void *)(satcu + 1);
2765 memcpy(satcu->hdr, hdr, hdr->length);
2766 satcu->atc_required = satc->flags & 0x1;
2767 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2768 (void *)satc + satc->header.length,
2769 &satcu->devices_cnt);
2770 if (satcu->devices_cnt && !satcu->devices) {
2771 kfree(satcu);
2772 return -ENOMEM;
2773 }
2774 list_add_rcu(&satcu->list, &dmar_satc_units);
2775
2776 return 0;
2777 }
2778
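/*
 * Bring up a hot-added DMAR unit: audit capabilities, allocate domain
 * and root-entry structures, enable queued invalidation and fault
 * reporting, and finally enable translation.
 */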
2779 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2780 {
2781 int sp, ret;
2782 struct intel_iommu *iommu = dmaru->iommu;
2783
2784 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2785 if (ret)
2786 goto out;
2787
2788 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2789 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2790 pr_warn("%s: Doesn't support large page.\n",
2791 iommu->name);
2792 return -ENXIO;
2793 }
2794
2795 /*
2796 * Disable translation if already enabled prior to OS handover.
2797 */
2798 if (iommu->gcmd & DMA_GCMD_TE)
2799 iommu_disable_translation(iommu);
2800
2801 ret = iommu_init_domains(iommu);
2802 if (ret == 0)
2803 ret = iommu_alloc_root_entry(iommu);
2804 if (ret)
2805 goto out;
2806
2807 intel_svm_check(iommu);
2808
2809 if (dmaru->ignored) {
2810 /*
2811 * we always have to disable PMRs or DMA may fail on this device
2812 */
2813 if (force_on)
2814 iommu_disable_protect_mem_regions(iommu);
2815 return 0;
2816 }
2817
2818 intel_iommu_init_qi(iommu);
2819 iommu_flush_write_buffer(iommu);
2820
2821 #ifdef CONFIG_INTEL_IOMMU_SVM
2822 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2823 ret = intel_svm_enable_prq(iommu);
2824 if (ret)
2825 goto disable_iommu;
2826 }
2827 #endif
2828 ret = dmar_set_interrupt(iommu);
2829 if (ret)
2830 goto disable_iommu;
2831
2832 iommu_set_root_entry(iommu);
2833 iommu_enable_translation(iommu);
2834
2835 iommu_disable_protect_mem_regions(iommu);
2836 return 0;
2837
2838 disable_iommu:
2839 disable_dmar_iommu(iommu);
2840 out:
2841 free_dmar_iommu(iommu);
2842 return ret;
2843 }
2844
2845 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2846 {
2847 int ret = 0;
2848 struct intel_iommu *iommu = dmaru->iommu;
2849
2850 if (!intel_iommu_enabled)
2851 return 0;
2852 if (iommu == NULL)
2853 return -EINVAL;
2854
2855 if (insert) {
2856 ret = intel_iommu_add(dmaru);
2857 } else {
2858 disable_dmar_iommu(iommu);
2859 free_dmar_iommu(iommu);
2860 }
2861
2862 return ret;
2863 }
2864
2865 static void intel_iommu_free_dmars(void)
2866 {
2867 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2868 struct dmar_atsr_unit *atsru, *atsr_n;
2869 struct dmar_satc_unit *satcu, *satc_n;
2870
2871 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2872 list_del(&rmrru->list);
2873 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2874 kfree(rmrru);
2875 }
2876
2877 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2878 list_del(&atsru->list);
2879 intel_iommu_free_atsr(atsru);
2880 }
2881 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2882 list_del(&satcu->list);
2883 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2884 kfree(satcu);
2885 }
2886 }
2887
2888 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2889 {
2890 struct dmar_satc_unit *satcu;
2891 struct acpi_dmar_satc *satc;
2892 struct device *tmp;
2893 int i;
2894
2895 dev = pci_physfn(dev);
2896 rcu_read_lock();
2897
2898 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2899 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2900 if (satc->segment != pci_domain_nr(dev->bus))
2901 continue;
2902 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2903 if (to_pci_dev(tmp) == dev)
2904 goto out;
2905 }
2906 satcu = NULL;
2907 out:
2908 rcu_read_unlock();
2909 return satcu;
2910 }
2911
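/*
 * Return 1 if the OS may enable ATS for @dev, based on the SATC and
 * ATSR tables and the device's position in the PCI hierarchy.
 */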
2912 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2913 {
2914 int i, ret = 1;
2915 struct pci_bus *bus;
2916 struct pci_dev *bridge = NULL;
2917 struct device *tmp;
2918 struct acpi_dmar_atsr *atsr;
2919 struct dmar_atsr_unit *atsru;
2920 struct dmar_satc_unit *satcu;
2921
2922 dev = pci_physfn(dev);
2923 satcu = dmar_find_matched_satc_unit(dev);
2924 if (satcu)
2925 /*
2926 * This device supports ATS as it is in the SATC table.
2927 * When the IOMMU is in legacy mode, enabling ATS is done
2928 * automatically by HW for the device that requires
2929 * ATS, hence the OS should not enable ATS on this device,
2930 * to avoid duplicated TLB invalidation.
2931 */
2932 return !(satcu->atc_required && !sm_supported(iommu));
2933
2934 for (bus = dev->bus; bus; bus = bus->parent) {
2935 bridge = bus->self;
2936 /* If it's an integrated device, allow ATS */
2937 if (!bridge)
2938 return 1;
2939 /* Connected via non-PCIe: no ATS */
2940 if (!pci_is_pcie(bridge) ||
2941 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2942 return 0;
2943 /* If we found the root port, look it up in the ATSR */
2944 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2945 break;
2946 }
2947
2948 rcu_read_lock();
2949 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2950 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2951 if (atsr->segment != pci_domain_nr(dev->bus))
2952 continue;
2953
2954 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2955 if (tmp == &bridge->dev)
2956 goto out;
2957
2958 if (atsru->include_all)
2959 goto out;
2960 }
2961 ret = 0;
2962 out:
2963 rcu_read_unlock();
2964
2965 return ret;
2966 }
2967
2968 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2969 {
2970 int ret;
2971 struct dmar_rmrr_unit *rmrru;
2972 struct dmar_atsr_unit *atsru;
2973 struct dmar_satc_unit *satcu;
2974 struct acpi_dmar_atsr *atsr;
2975 struct acpi_dmar_reserved_memory *rmrr;
2976 struct acpi_dmar_satc *satc;
2977
2978 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2979 return 0;
2980
2981 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2982 rmrr = container_of(rmrru->hdr,
2983 struct acpi_dmar_reserved_memory, header);
2984 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2985 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2986 ((void *)rmrr) + rmrr->header.length,
2987 rmrr->segment, rmrru->devices,
2988 rmrru->devices_cnt);
2989 if (ret < 0)
2990 return ret;
2991 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2992 dmar_remove_dev_scope(info, rmrr->segment,
2993 rmrru->devices, rmrru->devices_cnt);
2994 }
2995 }
2996
2997 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2998 if (atsru->include_all)
2999 continue;
3000
3001 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3002 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3003 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3004 (void *)atsr + atsr->header.length,
3005 atsr->segment, atsru->devices,
3006 atsru->devices_cnt);
3007 if (ret > 0)
3008 break;
3009 else if (ret < 0)
3010 return ret;
3011 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3012 if (dmar_remove_dev_scope(info, atsr->segment,
3013 atsru->devices, atsru->devices_cnt))
3014 break;
3015 }
3016 }
3017 list_for_each_entry(satcu, &dmar_satc_units, list) {
3018 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3019 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3020 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3021 (void *)satc + satc->header.length,
3022 satc->segment, satcu->devices,
3023 satcu->devices_cnt);
3024 if (ret > 0)
3025 break;
3026 else if (ret < 0)
3027 return ret;
3028 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3029 if (dmar_remove_dev_scope(info, satc->segment,
3030 satcu->devices, satcu->devices_cnt))
3031 break;
3032 }
3033 }
3034
3035 return 0;
3036 }
3037
3038 static void intel_disable_iommus(void)
3039 {
3040 struct intel_iommu *iommu = NULL;
3041 struct dmar_drhd_unit *drhd;
3042
3043 for_each_iommu(iommu, drhd)
3044 iommu_disable_translation(iommu);
3045 }
3046
3047 void intel_iommu_shutdown(void)
3048 {
3049 struct dmar_drhd_unit *drhd;
3050 struct intel_iommu *iommu = NULL;
3051
3052 if (no_iommu || dmar_disabled)
3053 return;
3054
3055 down_write(&dmar_global_lock);
3056
3057 /* Disable PMRs explicitly here. */
3058 for_each_iommu(iommu, drhd)
3059 iommu_disable_protect_mem_regions(iommu);
3060
3061 /* Make sure the IOMMUs are switched off */
3062 intel_disable_iommus();
3063
3064 up_write(&dmar_global_lock);
3065 }
3066
3067 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3068 {
3069 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3070
3071 return container_of(iommu_dev, struct intel_iommu, iommu);
3072 }
3073
3074 static ssize_t version_show(struct device *dev,
3075 struct device_attribute *attr, char *buf)
3076 {
3077 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3078 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3079 return sysfs_emit(buf, "%d:%d\n",
3080 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3081 }
3082 static DEVICE_ATTR_RO(version);
3083
3084 static ssize_t address_show(struct device *dev,
3085 struct device_attribute *attr, char *buf)
3086 {
3087 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3089 }
3090 static DEVICE_ATTR_RO(address);
3091
3092 static ssize_t cap_show(struct device *dev,
3093 struct device_attribute *attr, char *buf)
3094 {
3095 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3096 return sysfs_emit(buf, "%llx\n", iommu->cap);
3097 }
3098 static DEVICE_ATTR_RO(cap);
3099
3100 static ssize_t ecap_show(struct device *dev,
3101 struct device_attribute *attr, char *buf)
3102 {
3103 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3104 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3105 }
3106 static DEVICE_ATTR_RO(ecap);
3107
3108 static ssize_t domains_supported_show(struct device *dev,
3109 struct device_attribute *attr, char *buf)
3110 {
3111 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3112 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3113 }
3114 static DEVICE_ATTR_RO(domains_supported);
3115
3116 static ssize_t domains_used_show(struct device *dev,
3117 struct device_attribute *attr, char *buf)
3118 {
3119 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3120 return sysfs_emit(buf, "%d\n",
3121 bitmap_weight(iommu->domain_ids,
3122 cap_ndoms(iommu->cap)));
3123 }
3124 static DEVICE_ATTR_RO(domains_used);
3125
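/*
 * Per-IOMMU attributes, exposed under the IOMMU's sysfs directory
 * (e.g. /sys/class/iommu/dmar0/intel-iommu/).
 */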
3126 static struct attribute *intel_iommu_attrs[] = {
3127 &dev_attr_version.attr,
3128 &dev_attr_address.attr,
3129 &dev_attr_cap.attr,
3130 &dev_attr_ecap.attr,
3131 &dev_attr_domains_supported.attr,
3132 &dev_attr_domains_used.attr,
3133 NULL,
3134 };
3135
3136 static struct attribute_group intel_iommu_group = {
3137 .name = "intel-iommu",
3138 .attrs = intel_iommu_attrs,
3139 };
3140
3141 const struct attribute_group *intel_iommu_groups[] = {
3142 &intel_iommu_group,
3143 NULL,
3144 };
3145
3146 static bool has_external_pci(void)
3147 {
3148 struct pci_dev *pdev = NULL;
3149
3150 for_each_pci_dev(pdev)
3151 if (pdev->external_facing) {
3152 pci_dev_put(pdev);
3153 return true;
3154 }
3155
3156 return false;
3157 }
3158
3159 static int __init platform_optin_force_iommu(void)
3160 {
3161 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3162 return 0;
3163
3164 if (no_iommu || dmar_disabled)
3165 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3166
3167 /*
3168 * If Intel-IOMMU is disabled by default, we will apply identity
3169 * map for all devices except those marked as requiring DMA protection.
3170 */
3171 if (dmar_disabled)
3172 iommu_set_default_passthrough(false);
3173
3174 dmar_disabled = 0;
3175 no_iommu = 0;
3176
3177 return 1;
3178 }
3179
3180 static int __init probe_acpi_namespace_devices(void)
3181 {
3182 struct dmar_drhd_unit *drhd;
3183 /* To avoid a -Wunused-but-set-variable warning. */
3184 struct intel_iommu *iommu __maybe_unused;
3185 struct device *dev;
3186 int i, ret = 0;
3187
3188 for_each_active_iommu(iommu, drhd) {
3189 for_each_active_dev_scope(drhd->devices,
3190 drhd->devices_cnt, i, dev) {
3191 struct acpi_device_physical_node *pn;
3192 struct acpi_device *adev;
3193
3194 if (dev->bus != &acpi_bus_type)
3195 continue;
3196
3197 up_read(&dmar_global_lock);
3198 adev = to_acpi_device(dev);
3199 mutex_lock(&adev->physical_node_lock);
3200 list_for_each_entry(pn,
3201 &adev->physical_node_list, node) {
3202 ret = iommu_probe_device(pn->dev);
3203 if (ret)
3204 break;
3205 }
3206 mutex_unlock(&adev->physical_node_lock);
3207 down_read(&dmar_global_lock);
3208
3209 if (ret)
3210 return ret;
3211 }
3212 }
3213
3214 return 0;
3215 }
3216
3217 static __init int tboot_force_iommu(void)
3218 {
3219 if (!tboot_enabled())
3220 return 0;
3221
3222 if (no_iommu || dmar_disabled)
3223 pr_warn("Forcing Intel-IOMMU to enabled\n");
3224
3225 dmar_disabled = 0;
3226 no_iommu = 0;
3227
3228 return 1;
3229 }
3230
3231 int __init intel_iommu_init(void)
3232 {
3233 int ret = -ENODEV;
3234 struct dmar_drhd_unit *drhd;
3235 struct intel_iommu *iommu;
3236
3237 /*
3238 * Intel IOMMU is required for a TXT/tboot launch or platform
3239 * opt in, so enforce that.
3240 */
3241 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3242 platform_optin_force_iommu();
3243
3244 down_write(&dmar_global_lock);
3245 if (dmar_table_init()) {
3246 if (force_on)
3247 panic("tboot: Failed to initialize DMAR table\n");
3248 goto out_free_dmar;
3249 }
3250
3251 if (dmar_dev_scope_init() < 0) {
3252 if (force_on)
3253 panic("tboot: Failed to initialize DMAR device scope\n");
3254 goto out_free_dmar;
3255 }
3256
3257 up_write(&dmar_global_lock);
3258
3259 /*
3260 * The bus notifier takes the dmar_global_lock, so lockdep will
3261 * complain later when we register it under the lock.
3262 */
3263 dmar_register_bus_notifier();
3264
3265 down_write(&dmar_global_lock);
3266
3267 if (!no_iommu)
3268 intel_iommu_debugfs_init();
3269
3270 if (no_iommu || dmar_disabled) {
3271 /*
3272 * We exit the function here to ensure IOMMU's remapping and
3273 * mempool aren't setup, which means that the IOMMU's PMRs
3274 * won't be disabled via the call to init_dmars(). So disable
3275 * it explicitly here. The PMRs were setup by tboot prior to
3276 * calling SENTER, but the kernel is expected to reset/tear
3277 * down the PMRs.
3278 */
3279 if (intel_iommu_tboot_noforce) {
3280 for_each_iommu(iommu, drhd)
3281 iommu_disable_protect_mem_regions(iommu);
3282 }
3283
3284 /*
3285 * Make sure the IOMMUs are switched off, even when we
3286 * boot into a kexec kernel and the previous kernel left
3287 * them enabled
3288 */
3289 intel_disable_iommus();
3290 goto out_free_dmar;
3291 }
3292
3293 if (list_empty(&dmar_rmrr_units))
3294 pr_info("No RMRR found\n");
3295
3296 if (list_empty(&dmar_atsr_units))
3297 pr_info("No ATSR found\n");
3298
3299 if (list_empty(&dmar_satc_units))
3300 pr_info("No SATC found\n");
3301
3302 init_no_remapping_devices();
3303
3304 ret = init_dmars();
3305 if (ret) {
3306 if (force_on)
3307 panic("tboot: Failed to initialize DMARs\n");
3308 pr_err("Initialization failed\n");
3309 goto out_free_dmar;
3310 }
3311 up_write(&dmar_global_lock);
3312
3313 init_iommu_pm_ops();
3314
3315 down_read(&dmar_global_lock);
3316 for_each_active_iommu(iommu, drhd) {
3317 /*
3318 * The flush queue implementation does not perform
3319 * page-selective invalidations that are required for efficient
3320 * TLB flushes in virtual environments. The benefit of batching
3321 * is likely to be much lower than the overhead of synchronizing
3322 * the virtual and physical IOMMU page-tables.
3323 */
3324 if (cap_caching_mode(iommu->cap) &&
3325 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3326 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3327 iommu_set_dma_strict();
3328 }
3329 iommu_device_sysfs_add(&iommu->iommu, NULL,
3330 intel_iommu_groups,
3331 "%s", iommu->name);
3332 /*
3333 * The iommu device probe is protected by the iommu_probe_device_lock.
3334 * Release the dmar_global_lock before entering the device probe path
3335 * to avoid unnecessary lock order splat.
3336 */
3337 up_read(&dmar_global_lock);
3338 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3339 down_read(&dmar_global_lock);
3340
3341 iommu_pmu_register(iommu);
3342 }
3343
3344 if (probe_acpi_namespace_devices())
3345 pr_warn("ACPI name space devices didn't probe correctly\n");
3346
3347 /* Finally, we enable the DMA remapping hardware. */
3348 for_each_iommu(iommu, drhd) {
3349 if (!drhd->ignored && !translation_pre_enabled(iommu))
3350 iommu_enable_translation(iommu);
3351
3352 iommu_disable_protect_mem_regions(iommu);
3353 }
3354 up_read(&dmar_global_lock);
3355
3356 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3357
3358 intel_iommu_enabled = 1;
3359
3360 return 0;
3361
3362 out_free_dmar:
3363 intel_iommu_free_dmars();
3364 up_write(&dmar_global_lock);
3365 return ret;
3366 }
3367
3368 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3369 {
3370 struct device_domain_info *info = opaque;
3371
3372 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3373 return 0;
3374 }
3375
3376 /*
3377 * NB - intel-iommu lacks any sort of reference counting for the users of
3378 * dependent devices. If multiple endpoints have intersecting dependent
3379 * devices, unbinding the driver from any one of them will possibly leave
3380 * the others unable to operate.
3381 */
3382 static void domain_context_clear(struct device_domain_info *info)
3383 {
3384 if (!dev_is_pci(info->dev)) {
3385 domain_context_clear_one(info, info->bus, info->devfn);
3386 return;
3387 }
3388
3389 pci_for_each_dma_alias(to_pci_dev(info->dev),
3390 &domain_context_clear_one_cb, info);
3391 }
3392
3393 /*
3394 * Clear the page table pointer in context or pasid table entries so that
3395 * all DMA requests without PASID from the device are blocked. If the page
3396 * table has been set, clean up the data structures.
3397 */
3398 void device_block_translation(struct device *dev)
3399 {
3400 struct device_domain_info *info = dev_iommu_priv_get(dev);
3401 struct intel_iommu *iommu = info->iommu;
3402 unsigned long flags;
3403
3404 /* Device already in DMA blocking state. Nothing to do. */
3405 if (!info->domain_attached)
3406 return;
3407
3408 if (info->domain)
3409 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3410
3411 iommu_disable_pci_caps(info);
3412 if (!dev_is_real_dma_subdevice(dev)) {
3413 if (sm_supported(iommu))
3414 intel_pasid_tear_down_entry(iommu, dev,
3415 IOMMU_NO_PASID, false);
3416 else
3417 domain_context_clear(info);
3418 }
3419
3420 /* Device now in DMA blocking state. */
3421 info->domain_attached = false;
3422
3423 if (!info->domain)
3424 return;
3425
3426 spin_lock_irqsave(&info->domain->lock, flags);
3427 list_del(&info->link);
3428 spin_unlock_irqrestore(&info->domain->lock, flags);
3429
3430 domain_detach_iommu(info->domain, iommu);
3431 info->domain = NULL;
3432 }
3433
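/*
 * Minimal domain initialization for domains allocated via
 * intel_iommu_domain_alloc(): compute the AGAW for the requested guest
 * width and allocate the top-level page directory.
 */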
3434 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3435 {
3436 int adjust_width;
3437
3438 /* calculate AGAW */
3439 domain->gaw = guest_width;
3440 adjust_width = guestwidth_to_adjustwidth(guest_width);
3441 domain->agaw = width_to_agaw(adjust_width);
3442
3443 domain->iommu_coherency = false;
3444 domain->iommu_superpage = 0;
3445 domain->max_addr = 0;
3446
3447 /* always allocate the top pgd */
3448 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3449 if (!domain->pgd)
3450 return -ENOMEM;
3451 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3452 return 0;
3453 }
3454
3455 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3456 struct device *dev)
3457 {
3458 device_block_translation(dev);
3459 return 0;
3460 }
3461
3462 static struct iommu_domain blocking_domain = {
3463 .type = IOMMU_DOMAIN_BLOCKED,
3464 .ops = &(const struct iommu_domain_ops) {
3465 .attach_dev = blocking_domain_attach_dev,
3466 }
3467 };
3468
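/*
 * Return the number of superpage levels the IOMMU supports for the
 * given translation stage: 0 = none, 1 = up to 2MiB, 2 = up to 1GiB.
 */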
3469 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3470 {
3471 if (!intel_iommu_superpage)
3472 return 0;
3473
3474 if (first_stage)
3475 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3476
3477 return fls(cap_super_page_val(iommu->cap));
3478 }
3479
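/*
 * Allocate and initialize a paging domain for @dev, sizing the address
 * width, superpage support and IOVA aperture from the IOMMU's
 * capabilities.
 */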
3480 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3481 {
3482 struct device_domain_info *info = dev_iommu_priv_get(dev);
3483 struct intel_iommu *iommu = info->iommu;
3484 struct dmar_domain *domain;
3485 int addr_width;
3486
3487 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3488 if (!domain)
3489 return ERR_PTR(-ENOMEM);
3490
3491 INIT_LIST_HEAD(&domain->devices);
3492 INIT_LIST_HEAD(&domain->dev_pasids);
3493 INIT_LIST_HEAD(&domain->cache_tags);
3494 spin_lock_init(&domain->lock);
3495 spin_lock_init(&domain->cache_lock);
3496 xa_init(&domain->iommu_array);
3497
3498 domain->nid = dev_to_node(dev);
3499 domain->use_first_level = first_stage;
3500
3501 /* calculate the address width */
3502 addr_width = agaw_to_width(iommu->agaw);
3503 if (addr_width > cap_mgaw(iommu->cap))
3504 addr_width = cap_mgaw(iommu->cap);
3505 domain->gaw = addr_width;
3506 domain->agaw = iommu->agaw;
3507 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3508
3509 /* iommu memory access coherency */
3510 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3511
3512 /* pagesize bitmap */
3513 domain->domain.pgsize_bitmap = SZ_4K;
3514 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3515 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3516
3517 /*
3518 * IOVA aperture: First-level translation restricts the input-address
3519 * to a canonical address (i.e., address bits 63:N have the same value
3520 * as address bit [N-1], where N is 48-bits with 4-level paging and
3521 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3522 */
3523 domain->domain.geometry.force_aperture = true;
3524 domain->domain.geometry.aperture_start = 0;
3525 if (first_stage)
3526 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3527 else
3528 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3529
3530 /* always allocate the top pgd */
3531 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3532 if (!domain->pgd) {
3533 kfree(domain);
3534 return ERR_PTR(-ENOMEM);
3535 }
3536 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3537
3538 return domain;
3539 }
3540
3541 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3542 {
3543 struct dmar_domain *dmar_domain;
3544 struct iommu_domain *domain;
3545
3546 switch (type) {
3547 case IOMMU_DOMAIN_DMA:
3548 case IOMMU_DOMAIN_UNMANAGED:
3549 dmar_domain = alloc_domain(type);
3550 if (!dmar_domain) {
3551 pr_err("Can't allocate dmar_domain\n");
3552 return NULL;
3553 }
3554 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3555 pr_err("Domain initialization failed\n");
3556 domain_exit(dmar_domain);
3557 return NULL;
3558 }
3559
3560 domain = &dmar_domain->domain;
3561 domain->geometry.aperture_start = 0;
3562 domain->geometry.aperture_end =
3563 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3564 domain->geometry.force_aperture = true;
3565
3566 return domain;
3567 default:
3568 return NULL;
3569 }
3570
3571 return NULL;
3572 }
3573
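/*
 * Allocate a user-managed (iommufd) domain. Nested domains are handed
 * off to intel_nested_domain_alloc(); otherwise a second-stage paging
 * domain is created, optionally as a nesting parent or with dirty
 * tracking enabled.
 */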
3574 static struct iommu_domain *
3575 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3576 struct iommu_domain *parent,
3577 const struct iommu_user_data *user_data)
3578 {
3579 struct device_domain_info *info = dev_iommu_priv_get(dev);
3580 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3581 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3582 struct intel_iommu *iommu = info->iommu;
3583 struct dmar_domain *dmar_domain;
3584 struct iommu_domain *domain;
3585
3586 /* Must be NESTING domain */
3587 if (parent) {
3588 if (!nested_supported(iommu) || flags)
3589 return ERR_PTR(-EOPNOTSUPP);
3590 return intel_nested_domain_alloc(parent, user_data);
3591 }
3592
3593 if (flags &
3594 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3595 return ERR_PTR(-EOPNOTSUPP);
3596 if (nested_parent && !nested_supported(iommu))
3597 return ERR_PTR(-EOPNOTSUPP);
3598 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3599 return ERR_PTR(-EOPNOTSUPP);
3600
3601 /* Do not use first stage for user domain translation. */
3602 dmar_domain = paging_domain_alloc(dev, false);
3603 if (IS_ERR(dmar_domain))
3604 return ERR_CAST(dmar_domain);
3605 domain = &dmar_domain->domain;
3606 domain->type = IOMMU_DOMAIN_UNMANAGED;
3607 domain->owner = &intel_iommu_ops;
3608 domain->ops = intel_iommu_ops.default_domain_ops;
3609
3610 if (nested_parent) {
3611 dmar_domain->nested_parent = true;
3612 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3613 spin_lock_init(&dmar_domain->s1_lock);
3614 }
3615
3616 if (dirty_tracking) {
3617 if (dmar_domain->use_first_level) {
3618 iommu_domain_free(domain);
3619 return ERR_PTR(-EOPNOTSUPP);
3620 }
3621 domain->dirty_ops = &intel_dirty_ops;
3622 }
3623
3624 return domain;
3625 }
3626
3627 static void intel_iommu_domain_free(struct iommu_domain *domain)
3628 {
3629 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3630
3631 WARN_ON(dmar_domain->nested_parent &&
3632 !list_empty(&dmar_domain->s1_domains));
3633 domain_exit(dmar_domain);
3634 }
3635
3636 int prepare_domain_attach_device(struct iommu_domain *domain,
3637 struct device *dev)
3638 {
3639 struct device_domain_info *info = dev_iommu_priv_get(dev);
3640 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3641 struct intel_iommu *iommu = info->iommu;
3642 int addr_width;
3643
3644 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3645 return -EINVAL;
3646
3647 if (domain->dirty_ops && !ssads_supported(iommu))
3648 return -EINVAL;
3649
3650 /* check if this iommu agaw is sufficient for max mapped address */
3651 addr_width = agaw_to_width(iommu->agaw);
3652 if (addr_width > cap_mgaw(iommu->cap))
3653 addr_width = cap_mgaw(iommu->cap);
3654
3655 if (dmar_domain->max_addr > (1LL << addr_width))
3656 return -EINVAL;
3657 dmar_domain->gaw = addr_width;
3658
3659 /*
3660 * Knock out extra levels of page tables if necessary
3661 */
3662 while (iommu->agaw < dmar_domain->agaw) {
3663 struct dma_pte *pte;
3664
3665 pte = dmar_domain->pgd;
3666 if (dma_pte_present(pte)) {
3667 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3668 iommu_free_page(pte);
3669 }
3670 dmar_domain->agaw--;
3671 }
3672
3673 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3674 context_copied(iommu, info->bus, info->devfn))
3675 return intel_pasid_setup_sm_context(dev);
3676
3677 return 0;
3678 }
3679
3680 static int intel_iommu_attach_device(struct iommu_domain *domain,
3681 struct device *dev)
3682 {
3683 int ret;
3684
3685 device_block_translation(dev);
3686
3687 ret = prepare_domain_attach_device(domain, dev);
3688 if (ret)
3689 return ret;
3690
3691 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3692 }
3693
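/*
 * Single-range map helper: translate IOMMU_READ/IOMMU_WRITE into DMA PTE
 * bits (plus DMA_PTE_SNP when the domain enforces snooping), verify that
 * the end of the mapping still fits within the domain's address width,
 * and hand the page-frame range to __domain_mapping().
 */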
3694 static int intel_iommu_map(struct iommu_domain *domain,
3695 unsigned long iova, phys_addr_t hpa,
3696 size_t size, int iommu_prot, gfp_t gfp)
3697 {
3698 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3699 u64 max_addr;
3700 int prot = 0;
3701
3702 if (iommu_prot & IOMMU_READ)
3703 prot |= DMA_PTE_READ;
3704 if (iommu_prot & IOMMU_WRITE)
3705 prot |= DMA_PTE_WRITE;
3706 if (dmar_domain->set_pte_snp)
3707 prot |= DMA_PTE_SNP;
3708
3709 max_addr = iova + size;
3710 if (dmar_domain->max_addr < max_addr) {
3711 u64 end;
3712
3713 /* check if minimum agaw is sufficient for mapped address */
3714 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3715 if (end < max_addr) {
3716 pr_err("%s: iommu width (%d) is not "
3717 "sufficient for the mapped address (%llx)\n",
3718 __func__, dmar_domain->gaw, max_addr);
3719 return -EFAULT;
3720 }
3721 dmar_domain->max_addr = max_addr;
3722 }
3723 /* Round up size to next multiple of PAGE_SIZE, if it and
3724 the low bits of hpa would take us onto the next page */
3725 size = aligned_nrpages(hpa, size);
3726 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3727 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3728 }
3729
3730 static int intel_iommu_map_pages(struct iommu_domain *domain,
3731 unsigned long iova, phys_addr_t paddr,
3732 size_t pgsize, size_t pgcount,
3733 int prot, gfp_t gfp, size_t *mapped)
3734 {
3735 unsigned long pgshift = __ffs(pgsize);
3736 size_t size = pgcount << pgshift;
3737 int ret;
3738
3739 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3740 return -EINVAL;
3741
3742 if (!IS_ALIGNED(iova | paddr, pgsize))
3743 return -EINVAL;
3744
3745 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3746 if (!ret && mapped)
3747 *mapped = size;
3748
3749 return ret;
3750 }
3751
3752 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3753 unsigned long iova, size_t size,
3754 struct iommu_iotlb_gather *gather)
3755 {
3756 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3757 unsigned long start_pfn, last_pfn;
3758 int level = 0;
3759
3760 /* Cope with horrid API which requires us to unmap more than the
3761 size argument if it happens to be a large-page mapping. */
3762 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3763 &level, GFP_ATOMIC)))
3764 return 0;
3765
3766 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3767 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3768
3769 start_pfn = iova >> VTD_PAGE_SHIFT;
3770 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3771
3772 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3773
3774 if (dmar_domain->max_addr == iova + size)
3775 dmar_domain->max_addr = iova;
3776
3777 /*
3778 * We do not use page-selective IOTLB invalidation in the flush queue,
3779 * so there is no need to track pages and sync the IOTLB here.
3780 */
3781 if (!iommu_iotlb_gather_queued(gather))
3782 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3783
3784 return size;
3785 }
3786
3787 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3788 unsigned long iova,
3789 size_t pgsize, size_t pgcount,
3790 struct iommu_iotlb_gather *gather)
3791 {
3792 unsigned long pgshift = __ffs(pgsize);
3793 size_t size = pgcount << pgshift;
3794
3795 return intel_iommu_unmap(domain, iova, size, gather);
3796 }
3797
3798 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3799 struct iommu_iotlb_gather *gather)
3800 {
3801 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3802 gather->end, list_empty(&gather->freelist));
3803 iommu_put_pages_list(&gather->freelist);
3804 }
3805
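/*
 * Walk the domain page table for @iova and rebuild the physical address
 * from the PTE plus the untranslated low bits. The offset mask depends on
 * the level the walk stopped at; e.g. for a 2MiB (level-2) superpage the
 * low 21 bits of the IOVA are kept as the page offset.
 */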
3806 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3807 dma_addr_t iova)
3808 {
3809 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3810 struct dma_pte *pte;
3811 int level = 0;
3812 u64 phys = 0;
3813
3814 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3815 GFP_ATOMIC);
3816 if (pte && dma_pte_present(pte))
3817 phys = dma_pte_addr(pte) +
3818 (iova & (BIT_MASK(level_to_offset_bits(level) +
3819 VTD_PAGE_SHIFT) - 1));
3820
3821 return phys;
3822 }
3823
3824 static bool domain_support_force_snooping(struct dmar_domain *domain)
3825 {
3826 struct device_domain_info *info;
3827 bool support = true;
3828
3829 assert_spin_locked(&domain->lock);
3830 list_for_each_entry(info, &domain->devices, link) {
3831 if (!ecap_sc_support(info->iommu->ecap)) {
3832 support = false;
3833 break;
3834 }
3835 }
3836
3837 return support;
3838 }
3839
3840 static void domain_set_force_snooping(struct dmar_domain *domain)
3841 {
3842 struct device_domain_info *info;
3843
3844 assert_spin_locked(&domain->lock);
3845 /*
3846 * Second level page table supports per-PTE snoop control. The
3847 * iommu_map() interface will handle this by setting SNP bit.
3848 */
3849 if (!domain->use_first_level) {
3850 domain->set_pte_snp = true;
3851 return;
3852 }
3853
3854 list_for_each_entry(info, &domain->devices, link)
3855 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3856 IOMMU_NO_PASID);
3857 }
3858
3859 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3860 {
3861 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3862 unsigned long flags;
3863
3864 if (dmar_domain->force_snooping)
3865 return true;
3866
3867 spin_lock_irqsave(&dmar_domain->lock, flags);
3868 if (!domain_support_force_snooping(dmar_domain) ||
3869 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3870 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3871 return false;
3872 }
3873
3874 domain_set_force_snooping(dmar_domain);
3875 dmar_domain->force_snooping = true;
3876 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3877
3878 return true;
3879 }
3880
3881 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3882 {
3883 struct device_domain_info *info = dev_iommu_priv_get(dev);
3884
3885 switch (cap) {
3886 case IOMMU_CAP_CACHE_COHERENCY:
3887 case IOMMU_CAP_DEFERRED_FLUSH:
3888 return true;
3889 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3890 return dmar_platform_optin();
3891 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3892 return ecap_sc_support(info->iommu->ecap);
3893 case IOMMU_CAP_DIRTY_TRACKING:
3894 return ssads_supported(info->iommu);
3895 default:
3896 return false;
3897 }
3898 }
3899
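/*
 * Per-device probe: look up the DMAR unit that covers @dev, record its
 * requester ID, and detect the optional capabilities (ATS, PASID, PRI)
 * that later attach paths rely on. ATS-capable PCI devices are also
 * inserted into the per-IOMMU RID rbtree, and scalable-mode IOMMUs get a
 * PASID table allocated up front.
 */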
3900 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3901 {
3902 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3903 struct device_domain_info *info;
3904 struct intel_iommu *iommu;
3905 u8 bus, devfn;
3906 int ret;
3907
3908 iommu = device_lookup_iommu(dev, &bus, &devfn);
3909 if (!iommu || !iommu->iommu.ops)
3910 return ERR_PTR(-ENODEV);
3911
3912 info = kzalloc(sizeof(*info), GFP_KERNEL);
3913 if (!info)
3914 return ERR_PTR(-ENOMEM);
3915
3916 if (dev_is_real_dma_subdevice(dev)) {
3917 info->bus = pdev->bus->number;
3918 info->devfn = pdev->devfn;
3919 info->segment = pci_domain_nr(pdev->bus);
3920 } else {
3921 info->bus = bus;
3922 info->devfn = devfn;
3923 info->segment = iommu->segment;
3924 }
3925
3926 info->dev = dev;
3927 info->iommu = iommu;
3928 if (dev_is_pci(dev)) {
3929 if (ecap_dev_iotlb_support(iommu->ecap) &&
3930 pci_ats_supported(pdev) &&
3931 dmar_ats_supported(pdev, iommu)) {
3932 info->ats_supported = 1;
3933 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3934
3935 /*
3936 * For IOMMU that supports device IOTLB throttling
3937 * (DIT), we assign PFSID to the invalidation desc
3938 * of a VF such that IOMMU HW can gauge queue depth
3939 * at PF level. If DIT is not set, PFSID will be
3940 * treated as reserved, which should be set to 0.
3941 */
3942 if (ecap_dit(iommu->ecap))
3943 info->pfsid = pci_dev_id(pci_physfn(pdev));
3944 info->ats_qdep = pci_ats_queue_depth(pdev);
3945 }
3946 if (sm_supported(iommu)) {
3947 if (pasid_supported(iommu)) {
3948 int features = pci_pasid_features(pdev);
3949
3950 if (features >= 0)
3951 info->pasid_supported = features | 1;
3952 }
3953
3954 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3955 pci_pri_supported(pdev))
3956 info->pri_supported = 1;
3957 }
3958 }
3959
3960 dev_iommu_priv_set(dev, info);
3961 if (pdev && pci_ats_supported(pdev)) {
3962 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3963 ret = device_rbtree_insert(iommu, info);
3964 if (ret)
3965 goto free;
3966 }
3967
3968 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3969 ret = intel_pasid_alloc_table(dev);
3970 if (ret) {
3971 dev_err(dev, "PASID table allocation failed\n");
3972 goto clear_rbtree;
3973 }
3974
3975 if (!context_copied(iommu, info->bus, info->devfn)) {
3976 ret = intel_pasid_setup_sm_context(dev);
3977 if (ret)
3978 goto free_table;
3979 }
3980 }
3981
3982 intel_iommu_debugfs_create_dev(info);
3983
3984 /*
3985 * The PCIe spec, in its wisdom, declares that the behaviour of the
3986 * device is undefined if you enable PASID support after ATS support.
3987 * So always enable PASID support on devices which have it, even if
3988 * we can't yet know if we're ever going to use it.
3989 */
3990 if (info->pasid_supported &&
3991 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3992 info->pasid_enabled = 1;
3993
3994 return &iommu->iommu;
3995 free_table:
3996 intel_pasid_free_table(dev);
3997 clear_rbtree:
3998 device_rbtree_remove(info);
3999 free:
4000 kfree(info);
4001
4002 return ERR_PTR(ret);
4003 }
4004
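/*
 * Undo intel_iommu_probe_device(): disable PASID if we enabled it, drop
 * the device from the RID rbtree, tear down the scalable-mode context and
 * PASID table, and free the per-device tracking structure.
 */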
4005 static void intel_iommu_release_device(struct device *dev)
4006 {
4007 struct device_domain_info *info = dev_iommu_priv_get(dev);
4008 struct intel_iommu *iommu = info->iommu;
4009
4010 if (info->pasid_enabled) {
4011 pci_disable_pasid(to_pci_dev(dev));
4012 info->pasid_enabled = 0;
4013 }
4014
4015 mutex_lock(&iommu->iopf_lock);
4016 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4017 device_rbtree_remove(info);
4018 mutex_unlock(&iommu->iopf_lock);
4019
4020 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4021 !context_copied(iommu, info->bus, info->devfn))
4022 intel_pasid_teardown_sm_context(dev);
4023
4024 intel_pasid_free_table(dev);
4025 intel_iommu_debugfs_remove_dev(info);
4026 kfree(info);
4027 set_dma_ops(dev, NULL);
4028 }
4029
4030 static void intel_iommu_get_resv_regions(struct device *device,
4031 struct list_head *head)
4032 {
4033 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4034 struct iommu_resv_region *reg;
4035 struct dmar_rmrr_unit *rmrr;
4036 struct device *i_dev;
4037 int i;
4038
4039 rcu_read_lock();
4040 for_each_rmrr_units(rmrr) {
4041 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4042 i, i_dev) {
4043 struct iommu_resv_region *resv;
4044 enum iommu_resv_type type;
4045 size_t length;
4046
4047 if (i_dev != device &&
4048 !is_downstream_to_pci_bridge(device, i_dev))
4049 continue;
4050
4051 length = rmrr->end_address - rmrr->base_address + 1;
4052
4053 type = device_rmrr_is_relaxable(device) ?
4054 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4055
4056 resv = iommu_alloc_resv_region(rmrr->base_address,
4057 length, prot, type,
4058 GFP_ATOMIC);
4059 if (!resv)
4060 break;
4061
4062 list_add_tail(&resv->list, head);
4063 }
4064 }
4065 rcu_read_unlock();
4066
4067 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4068 if (dev_is_pci(device)) {
4069 struct pci_dev *pdev = to_pci_dev(device);
4070
4071 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4072 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4073 IOMMU_RESV_DIRECT_RELAXABLE,
4074 GFP_KERNEL);
4075 if (reg)
4076 list_add_tail(&reg->list, head);
4077 }
4078 }
4079 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4080
4081 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4082 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4083 0, IOMMU_RESV_MSI, GFP_KERNEL);
4084 if (!reg)
4085 return;
4086 list_add_tail(&reg->list, head);
4087 }
4088
4089 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4090 {
4091 if (dev_is_pci(dev))
4092 return pci_device_group(dev);
4093 return generic_device_group(dev);
4094 }
4095
4096 static int intel_iommu_enable_sva(struct device *dev)
4097 {
4098 struct device_domain_info *info = dev_iommu_priv_get(dev);
4099 struct intel_iommu *iommu;
4100
4101 if (!info || dmar_disabled)
4102 return -EINVAL;
4103
4104 iommu = info->iommu;
4105 if (!iommu)
4106 return -EINVAL;
4107
4108 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4109 return -ENODEV;
4110
4111 if (!info->pasid_enabled || !info->ats_enabled)
4112 return -EINVAL;
4113
4114 /*
4115 * Devices having device-specific I/O fault handling should not
4116 * support PCI/PRI. The IOMMU side has no means to check the
4117 * capability of device-specific IOPF. Therefore, IOMMU can only
4118 * default that if the device driver enables SVA on a non-PRI
4119 * device, it will handle IOPF in its own way.
4120 */
4121 if (!info->pri_supported)
4122 return 0;
4123
4124 /* Devices supporting PRI should have it enabled. */
4125 if (!info->pri_enabled)
4126 return -EINVAL;
4127
4128 return 0;
4129 }
4130
4131 static int context_flip_pri(struct device_domain_info *info, bool enable)
4132 {
4133 struct intel_iommu *iommu = info->iommu;
4134 u8 bus = info->bus, devfn = info->devfn;
4135 struct context_entry *context;
4136 u16 did;
4137
4138 spin_lock(&iommu->lock);
4139 if (context_copied(iommu, bus, devfn)) {
4140 spin_unlock(&iommu->lock);
4141 return -EINVAL;
4142 }
4143
4144 context = iommu_context_addr(iommu, bus, devfn, false);
4145 if (!context || !context_present(context)) {
4146 spin_unlock(&iommu->lock);
4147 return -ENODEV;
4148 }
4149 did = context_domain_id(context);
4150
4151 if (enable)
4152 context_set_sm_pre(context);
4153 else
4154 context_clear_sm_pre(context);
4155
4156 if (!ecap_coherent(iommu->ecap))
4157 clflush_cache_range(context, sizeof(*context));
4158 intel_context_flush_present(info, context, did, true);
4159 spin_unlock(&iommu->lock);
4160
4161 return 0;
4162 }
4163
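/*
 * Enable I/O page faults for @dev. Ordering matters here: reset PRI on the
 * device, register it with the IOMMU's IOPF queue, set the PRE bit in its
 * context entry so the IOMMU forwards page requests, and only then enable
 * PRI on the device itself. The error path unwinds in the reverse order.
 */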
4164 static int intel_iommu_enable_iopf(struct device *dev)
4165 {
4166 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4167 struct device_domain_info *info = dev_iommu_priv_get(dev);
4168 struct intel_iommu *iommu;
4169 int ret;
4170
4171 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4172 return -ENODEV;
4173
4174 if (info->pri_enabled)
4175 return -EBUSY;
4176
4177 iommu = info->iommu;
4178 if (!iommu)
4179 return -EINVAL;
4180
4181 /* PASID is required in PRG Response Message. */
4182 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4183 return -EINVAL;
4184
4185 ret = pci_reset_pri(pdev);
4186 if (ret)
4187 return ret;
4188
4189 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4190 if (ret)
4191 return ret;
4192
4193 ret = context_flip_pri(info, true);
4194 if (ret)
4195 goto err_remove_device;
4196
4197 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4198 if (ret)
4199 goto err_clear_pri;
4200
4201 info->pri_enabled = 1;
4202
4203 return 0;
4204 err_clear_pri:
4205 context_flip_pri(info, false);
4206 err_remove_device:
4207 iopf_queue_remove_device(iommu->iopf_queue, dev);
4208
4209 return ret;
4210 }
4211
4212 static int intel_iommu_disable_iopf(struct device *dev)
4213 {
4214 struct device_domain_info *info = dev_iommu_priv_get(dev);
4215 struct intel_iommu *iommu = info->iommu;
4216
4217 if (!info->pri_enabled)
4218 return -EINVAL;
4219
4220 /* Disable new PRI reception: */
4221 context_flip_pri(info, false);
4222
4223 /*
4224 * Remove device from fault queue and acknowledge all outstanding
4225 * PRQs to the device:
4226 */
4227 iopf_queue_remove_device(iommu->iopf_queue, dev);
4228
4229 /*
4230 * PCIe spec states that by clearing PRI enable bit, the Page
4231 * Request Interface will not issue new page requests, but may still
4232 * have outstanding page requests that were already transmitted or are
4233 * queued for transmission. This is supposed to be called after
4234 * the device driver has stopped DMA, all PASIDs have been
4235 * unbound and the outstanding PRQs have been drained.
4236 */
4237 pci_disable_pri(to_pci_dev(dev));
4238 info->pri_enabled = 0;
4239
4240 return 0;
4241 }
4242
4243 static int
4244 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4245 {
4246 switch (feat) {
4247 case IOMMU_DEV_FEAT_IOPF:
4248 return intel_iommu_enable_iopf(dev);
4249
4250 case IOMMU_DEV_FEAT_SVA:
4251 return intel_iommu_enable_sva(dev);
4252
4253 default:
4254 return -ENODEV;
4255 }
4256 }
4257
4258 static int
4259 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4260 {
4261 switch (feat) {
4262 case IOMMU_DEV_FEAT_IOPF:
4263 return intel_iommu_disable_iopf(dev);
4264
4265 case IOMMU_DEV_FEAT_SVA:
4266 return 0;
4267
4268 default:
4269 return -ENODEV;
4270 }
4271 }
4272
4273 static bool intel_iommu_is_attach_deferred(struct device *dev)
4274 {
4275 struct device_domain_info *info = dev_iommu_priv_get(dev);
4276
4277 return translation_pre_enabled(info->iommu) && !info->domain;
4278 }
4279
4280 /*
4281 * Check that the device does not require DMA protection. Such devices should
4282 * not be able to apply quirks and thus not be able to bypass the IOMMU
4283 * restrictions.
4284 */
4285 static bool risky_device(struct pci_dev *pdev)
4286 {
4287 if (pdev->untrusted) {
4288 pci_info(pdev,
4289 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4290 pdev->vendor, pdev->device);
4291 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4292 return true;
4293 }
4294 return false;
4295 }
4296
4297 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4298 unsigned long iova, size_t size)
4299 {
4300 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4301
4302 if (dmar_domain->iotlb_sync_map)
4303 cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
4304
4305 return 0;
4306 }
4307
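/*
 * Detach @pasid of @dev from @domain. For paging domains the bookkeeping
 * is removed first (domain pasid list, cache tags, IOMMU refcount) and the
 * PASID table entry is torn down last, followed by a drain of any page
 * requests still in flight for this PASID.
 */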
4308 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4309 struct iommu_domain *domain)
4310 {
4311 struct device_domain_info *info = dev_iommu_priv_get(dev);
4312 struct dev_pasid_info *curr, *dev_pasid = NULL;
4313 struct intel_iommu *iommu = info->iommu;
4314 struct dmar_domain *dmar_domain;
4315 unsigned long flags;
4316
4317 if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4318 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4319 return;
4320 }
4321
4322 dmar_domain = to_dmar_domain(domain);
4323 spin_lock_irqsave(&dmar_domain->lock, flags);
4324 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4325 if (curr->dev == dev && curr->pasid == pasid) {
4326 list_del(&curr->link_domain);
4327 dev_pasid = curr;
4328 break;
4329 }
4330 }
4331 WARN_ON_ONCE(!dev_pasid);
4332 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4333
4334 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4335 domain_detach_iommu(dmar_domain, iommu);
4336 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4337 kfree(dev_pasid);
4338 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4339 intel_drain_pasid_prq(dev, pasid);
4340 }
4341
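/*
 * Attach @domain to @pasid of @dev. The domain is first validated and
 * bound to this IOMMU (domain ID allocation, cache tag assignment), then
 * the PASID table entry is programmed as either a first-level or a
 * second-level translation depending on how the domain was built.
 */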
4342 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4343 struct device *dev, ioasid_t pasid)
4344 {
4345 struct device_domain_info *info = dev_iommu_priv_get(dev);
4346 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4347 struct intel_iommu *iommu = info->iommu;
4348 struct dev_pasid_info *dev_pasid;
4349 unsigned long flags;
4350 int ret;
4351
4352 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4353 return -EOPNOTSUPP;
4354
4355 if (domain->dirty_ops)
4356 return -EINVAL;
4357
4358 if (context_copied(iommu, info->bus, info->devfn))
4359 return -EBUSY;
4360
4361 ret = prepare_domain_attach_device(domain, dev);
4362 if (ret)
4363 return ret;
4364
4365 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4366 if (!dev_pasid)
4367 return -ENOMEM;
4368
4369 ret = domain_attach_iommu(dmar_domain, iommu);
4370 if (ret)
4371 goto out_free;
4372
4373 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4374 if (ret)
4375 goto out_detach_iommu;
4376
4377 if (dmar_domain->use_first_level)
4378 ret = domain_setup_first_level(iommu, dmar_domain,
4379 dev, pasid);
4380 else
4381 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4382 dev, pasid);
4383 if (ret)
4384 goto out_unassign_tag;
4385
4386 dev_pasid->dev = dev;
4387 dev_pasid->pasid = pasid;
4388 spin_lock_irqsave(&dmar_domain->lock, flags);
4389 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4390 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4391
4392 if (domain->type & __IOMMU_DOMAIN_PAGING)
4393 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4394
4395 return 0;
4396 out_unassign_tag:
4397 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4398 out_detach_iommu:
4399 domain_detach_iommu(dmar_domain, iommu);
4400 out_free:
4401 kfree(dev_pasid);
4402 return ret;
4403 }
4404
4405 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4406 {
4407 struct device_domain_info *info = dev_iommu_priv_get(dev);
4408 struct intel_iommu *iommu = info->iommu;
4409 struct iommu_hw_info_vtd *vtd;
4410
4411 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4412 if (!vtd)
4413 return ERR_PTR(-ENOMEM);
4414
4415 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4416 vtd->cap_reg = iommu->cap;
4417 vtd->ecap_reg = iommu->ecap;
4418 *length = sizeof(*vtd);
4419 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4420 return vtd;
4421 }
4422
4423 /*
4424 * Set dirty tracking for the device list of a domain. The caller must
4425 * hold the domain->lock when calling it.
4426 */
4427 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4428 {
4429 struct device_domain_info *info;
4430 int ret = 0;
4431
4432 list_for_each_entry(info, devices, link) {
4433 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4434 IOMMU_NO_PASID, enable);
4435 if (ret)
4436 break;
4437 }
4438
4439 return ret;
4440 }
4441
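/*
 * A nested parent holds the second-stage tables for all of its first-stage
 * (s1) child domains, so toggling dirty tracking must also be propagated
 * to every device attached through those children. On failure the
 * already-updated children are rolled back to the parent's current state.
 */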
4442 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4443 bool enable)
4444 {
4445 struct dmar_domain *s1_domain;
4446 unsigned long flags;
4447 int ret;
4448
4449 spin_lock(&domain->s1_lock);
4450 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4451 spin_lock_irqsave(&s1_domain->lock, flags);
4452 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4453 spin_unlock_irqrestore(&s1_domain->lock, flags);
4454 if (ret)
4455 goto err_unwind;
4456 }
4457 spin_unlock(&domain->s1_lock);
4458 return 0;
4459
4460 err_unwind:
4461 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4462 spin_lock_irqsave(&s1_domain->lock, flags);
4463 device_set_dirty_tracking(&s1_domain->devices,
4464 domain->dirty_tracking);
4465 spin_unlock_irqrestore(&s1_domain->lock, flags);
4466 }
4467 spin_unlock(&domain->s1_lock);
4468 return ret;
4469 }
4470
4471 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4472 bool enable)
4473 {
4474 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4475 int ret;
4476
4477 spin_lock(&dmar_domain->lock);
4478 if (dmar_domain->dirty_tracking == enable)
4479 goto out_unlock;
4480
4481 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4482 if (ret)
4483 goto err_unwind;
4484
4485 if (dmar_domain->nested_parent) {
4486 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4487 if (ret)
4488 goto err_unwind;
4489 }
4490
4491 dmar_domain->dirty_tracking = enable;
4492 out_unlock:
4493 spin_unlock(&dmar_domain->lock);
4494
4495 return 0;
4496
4497 err_unwind:
4498 device_set_dirty_tracking(&dmar_domain->devices,
4499 dmar_domain->dirty_tracking);
4500 spin_unlock(&dmar_domain->lock);
4501 return ret;
4502 }
4503
4504 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4505 unsigned long iova, size_t size,
4506 unsigned long flags,
4507 struct iommu_dirty_bitmap *dirty)
4508 {
4509 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4510 unsigned long end = iova + size - 1;
4511 unsigned long pgsize;
4512
4513 /*
4514 * IOMMUFD core calls into a dirty tracking disabled domain without an
4515 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4516 * have occurred when we stopped dirty tracking. This ensures that we
4517 * never inherit dirtied bits from a previous cycle.
4518 */
4519 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4520 return -EINVAL;
4521
4522 do {
4523 struct dma_pte *pte;
4524 int lvl = 0;
4525
4526 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4527 GFP_ATOMIC);
4528 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4529 if (!pte || !dma_pte_present(pte)) {
4530 iova += pgsize;
4531 continue;
4532 }
4533
4534 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4535 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4536 iova += pgsize;
4537 } while (iova < end);
4538
4539 return 0;
4540 }
4541
4542 static const struct iommu_dirty_ops intel_dirty_ops = {
4543 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4544 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4545 };
4546
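/*
 * Legacy-mode (non-scalable) identity attach: program the context entry
 * for @bus/@devfn with CONTEXT_TT_PASS_THROUGH under the FLPT_DEFAULT_DID
 * domain ID, tearing down any context inherited from a previous kernel
 * first.
 */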
4547 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4548 {
4549 struct device_domain_info *info = dev_iommu_priv_get(dev);
4550 struct intel_iommu *iommu = info->iommu;
4551 struct context_entry *context;
4552
4553 spin_lock(&iommu->lock);
4554 context = iommu_context_addr(iommu, bus, devfn, 1);
4555 if (!context) {
4556 spin_unlock(&iommu->lock);
4557 return -ENOMEM;
4558 }
4559
4560 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4561 spin_unlock(&iommu->lock);
4562 return 0;
4563 }
4564
4565 copied_context_tear_down(iommu, context, bus, devfn);
4566 context_clear_entry(context);
4567 context_set_domain_id(context, FLPT_DEFAULT_DID);
4568
4569 /*
4570 * In pass through mode, AW must be programmed to indicate the largest
4571 * AGAW value supported by hardware. And ASR is ignored by hardware.
4572 */
4573 context_set_address_width(context, iommu->msagaw);
4574 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4575 context_set_fault_enable(context);
4576 context_set_present(context);
4577 if (!ecap_coherent(iommu->ecap))
4578 clflush_cache_range(context, sizeof(*context));
4579 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4580 spin_unlock(&iommu->lock);
4581
4582 return 0;
4583 }
4584
4585 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4586 {
4587 struct device *dev = data;
4588
4589 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4590 }
4591
4592 static int device_setup_pass_through(struct device *dev)
4593 {
4594 struct device_domain_info *info = dev_iommu_priv_get(dev);
4595
4596 if (!dev_is_pci(dev))
4597 return context_setup_pass_through(dev, info->bus, info->devfn);
4598
4599 return pci_for_each_dma_alias(to_pci_dev(dev),
4600 context_setup_pass_through_cb, dev);
4601 }
4602
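/*
 * Attach the global identity domain: on scalable-mode IOMMUs this is done
 * through a pass-through PASID entry for the RID (IOMMU_NO_PASID), on
 * legacy-mode IOMMUs through a pass-through context entry for every DMA
 * alias of the device.
 */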
4603 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4604 {
4605 struct device_domain_info *info = dev_iommu_priv_get(dev);
4606 struct intel_iommu *iommu = info->iommu;
4607 int ret;
4608
4609 device_block_translation(dev);
4610
4611 if (dev_is_real_dma_subdevice(dev))
4612 return 0;
4613
4614 if (sm_supported(iommu)) {
4615 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4616 if (!ret)
4617 iommu_enable_pci_caps(info);
4618 } else {
4619 ret = device_setup_pass_through(dev);
4620 }
4621
4622 if (!ret)
4623 info->domain_attached = true;
4624
4625 return ret;
4626 }
4627
4628 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4629 struct device *dev, ioasid_t pasid)
4630 {
4631 struct device_domain_info *info = dev_iommu_priv_get(dev);
4632 struct intel_iommu *iommu = info->iommu;
4633
4634 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4635 return -EOPNOTSUPP;
4636
4637 return intel_pasid_setup_pass_through(iommu, dev, pasid);
4638 }
4639
4640 static struct iommu_domain identity_domain = {
4641 .type = IOMMU_DOMAIN_IDENTITY,
4642 .ops = &(const struct iommu_domain_ops) {
4643 .attach_dev = identity_domain_attach_dev,
4644 .set_dev_pasid = identity_domain_set_dev_pasid,
4645 },
4646 };
4647
4648 const struct iommu_ops intel_iommu_ops = {
4649 .blocked_domain = &blocking_domain,
4650 .release_domain = &blocking_domain,
4651 .identity_domain = &identity_domain,
4652 .capable = intel_iommu_capable,
4653 .hw_info = intel_iommu_hw_info,
4654 .domain_alloc = intel_iommu_domain_alloc,
4655 .domain_alloc_user = intel_iommu_domain_alloc_user,
4656 .domain_alloc_sva = intel_svm_domain_alloc,
4657 .probe_device = intel_iommu_probe_device,
4658 .release_device = intel_iommu_release_device,
4659 .get_resv_regions = intel_iommu_get_resv_regions,
4660 .device_group = intel_iommu_device_group,
4661 .dev_enable_feat = intel_iommu_dev_enable_feat,
4662 .dev_disable_feat = intel_iommu_dev_disable_feat,
4663 .is_attach_deferred = intel_iommu_is_attach_deferred,
4664 .def_domain_type = device_def_domain_type,
4665 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4666 .pgsize_bitmap = SZ_4K,
4667 #ifdef CONFIG_INTEL_IOMMU_SVM
4668 .page_response = intel_svm_page_response,
4669 #endif
4670 .default_domain_ops = &(const struct iommu_domain_ops) {
4671 .attach_dev = intel_iommu_attach_device,
4672 .set_dev_pasid = intel_iommu_set_dev_pasid,
4673 .map_pages = intel_iommu_map_pages,
4674 .unmap_pages = intel_iommu_unmap_pages,
4675 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4676 .flush_iotlb_all = intel_flush_iotlb_all,
4677 .iotlb_sync = intel_iommu_tlb_sync,
4678 .iova_to_phys = intel_iommu_iova_to_phys,
4679 .free = intel_iommu_domain_free,
4680 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4681 }
4682 };
4683
4684 static void quirk_iommu_igfx(struct pci_dev *dev)
4685 {
4686 if (risky_device(dev))
4687 return;
4688
4689 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4690 disable_igfx_iommu = 1;
4691 }
4692
4693 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4701
4702 /* QM57/QS57 integrated gfx malfunctions with dmar */
4703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4704
4705 /* Broadwell igfx malfunctions with dmar */
4706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4730
4731 static void quirk_iommu_rwbf(struct pci_dev *dev)
4732 {
4733 if (risky_device(dev))
4734 return;
4735
4736 /*
4737 * Mobile 4 Series Chipset neglects to set RWBF capability,
4738 * but needs it. Same seems to hold for the desktop versions.
4739 */
4740 pci_info(dev, "Forcing write-buffer flush capability\n");
4741 rwbf_quirk = 1;
4742 }
4743
4744 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4745 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4746 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4747 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4748 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4749 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4750 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4751
4752 #define GGC 0x52
4753 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4754 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4755 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4756 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4757 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4758 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4759 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4760 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4761
4762 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4763 {
4764 unsigned short ggc;
4765
4766 if (risky_device(dev))
4767 return;
4768
4769 if (pci_read_config_word(dev, GGC, &ggc))
4770 return;
4771
4772 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4773 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4774 disable_igfx_iommu = 1;
4775 } else if (!disable_igfx_iommu) {
4776 /* we have to ensure the gfx device is idle before we flush */
4777 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4778 iommu_set_dma_strict();
4779 }
4780 }
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4784
4785 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4786 {
4787 unsigned short ver;
4788
4789 if (!IS_GFX_DEVICE(dev))
4790 return;
4791
4792 ver = (dev->device >> 8) & 0xff;
4793 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4794 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4795 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4796 return;
4797
4798 if (risky_device(dev))
4799 return;
4800
4801 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4802 iommu_skip_te_disable = 1;
4803 }
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4805
4806 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4807 ISOCH DMAR unit for the Azalia sound device, but not give it any
4808 TLB entries, which causes it to deadlock. Check for that. We do
4809 this in a function called from init_dmars(), instead of in a PCI
4810 quirk, because we don't want to print the obnoxious "BIOS broken"
4811 message if VT-d is actually disabled.
4812 */
4813 static void __init check_tylersburg_isoch(void)
4814 {
4815 struct pci_dev *pdev;
4816 uint32_t vtisochctrl;
4817
4818 /* If there's no Azalia in the system anyway, forget it. */
4819 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4820 if (!pdev)
4821 return;
4822
4823 if (risky_device(pdev)) {
4824 pci_dev_put(pdev);
4825 return;
4826 }
4827
4828 pci_dev_put(pdev);
4829
4830 /* System Management Registers. Might be hidden, in which case
4831 we can't do the sanity check. But that's OK, because the
4832 known-broken BIOSes _don't_ actually hide it, so far. */
4833 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4834 if (!pdev)
4835 return;
4836
4837 if (risky_device(pdev)) {
4838 pci_dev_put(pdev);
4839 return;
4840 }
4841
4842 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4843 pci_dev_put(pdev);
4844 return;
4845 }
4846
4847 pci_dev_put(pdev);
4848
4849 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4850 if (vtisochctrl & 1)
4851 return;
4852
4853 /* Drop all bits other than the number of TLB entries */
4854 vtisochctrl &= 0x1c;
4855
4856 /* If we have the recommended number of TLB entries (16), fine. */
4857 if (vtisochctrl == 0x10)
4858 return;
4859
4860 /* Zero TLB entries? You get to ride the short bus to school. */
4861 if (!vtisochctrl) {
4862 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4863 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4864 dmi_get_system_info(DMI_BIOS_VENDOR),
4865 dmi_get_system_info(DMI_BIOS_VERSION),
4866 dmi_get_system_info(DMI_PRODUCT_VERSION));
4867 iommu_identity_mapping |= IDENTMAP_AZALIA;
4868 return;
4869 }
4870
4871 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4872 vtisochctrl);
4873 }
4874
4875 /*
4876 * Here we deal with a device TLB defect where device may inadvertently issue ATS
4877 * invalidation completion before posted writes initiated with translated address
4878 * that utilized translations matching the invalidation address range, violating
4879 * the invalidation completion ordering.
4880 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4881 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4882 * under the control of the trusted/privileged host device driver must use this
4883 * quirk.
4884 * Device TLBs are invalidated under the following six conditions:
4885 * 1. Device driver does DMA API unmap IOVA
4886 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4887 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4888 * exit_mmap() due to crash
4889 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4890 * VM has to free pages that were unmapped
4891 * 5. Userspace driver unmaps a DMA buffer
4892 * 6. Cache invalidation in vSVA usage (upcoming)
4893 *
4894 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4895 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4896 * invalidate TLB the same way as normal user unmap which will use this quirk.
4897 * The dTLB invalidation after PASID cache flush does not need this quirk.
4898 *
4899 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4900 */
4901 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4902 unsigned long address, unsigned long mask,
4903 u32 pasid, u16 qdep)
4904 {
4905 u16 sid;
4906
4907 if (likely(!info->dtlb_extra_inval))
4908 return;
4909
4910 sid = PCI_DEVID(info->bus, info->devfn);
4911 if (pasid == IOMMU_NO_PASID) {
4912 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4913 qdep, address, mask);
4914 } else {
4915 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4916 pasid, qdep, address, mask);
4917 }
4918 }
4919
4920 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
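/*
 * The hardware reports the ecmd result in the ECRSP register: bit 0 is the
 * in-progress flag polled in ecmd_submit_sync() below, and this macro
 * extracts the status code from bits 7:1 of the low byte (e.g. a raw low
 * byte of 0x06 decodes to status 3).
 */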
4921
4922 /*
4923 * Function to submit a command to the enhanced command interface. The
4924 * valid enhanced command descriptions are defined in Table 47 of the
4925 * VT-d spec. The VT-d hardware implementation may support some but not
4926 * all commands, which can be determined by checking the Enhanced
4927 * Command Capability Register.
4928 *
4929 * Return values:
4930 * - 0: Command successful without any error;
4931 * - Negative: software error value;
4932 * - Nonzero positive: failure status code defined in Table 48.
4933 */
4934 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4935 {
4936 unsigned long flags;
4937 u64 res;
4938 int ret;
4939
4940 if (!cap_ecmds(iommu->cap))
4941 return -ENODEV;
4942
4943 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4944
4945 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4946 if (res & DMA_ECMD_ECRSP_IP) {
4947 ret = -EBUSY;
4948 goto err;
4949 }
4950
4951 /*
4952 * Unconditionally write the operand B, because
4953 * - There is no side effect if an ecmd doesn't require an
4954 * operand B, but we set the register to some value.
4955 * - It's not invoked in any critical path. The extra MMIO
4956 * write doesn't bring any performance concerns.
4957 */
4958 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4959 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4960
4961 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4962 !(res & DMA_ECMD_ECRSP_IP), res);
4963
4964 if (res & DMA_ECMD_ECRSP_IP) {
4965 ret = -ETIMEDOUT;
4966 goto err;
4967 }
4968
4969 ret = ecmd_get_status_code(res);
4970 err:
4971 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4972
4973 return ret;
4974 }
4975