• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
49 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51 
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
55 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60 
61 /*
62  * set to 1 to panic kernel if can't successfully enable VT-d
63  * (used when kernel is launched w/ TXT)
64  */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68 
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
root_entry_lctp(struct root_entry * re)75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
root_entry_uctp(struct root_entry * re)87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
device_rid_cmp_key(const void * key,const struct rb_node * node)95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
device_rid_cmp(struct rb_node * lhs,const struct rb_node * rhs)110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
device_rbtree_find(struct intel_iommu * iommu,u16 rid)130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
device_rbtree_insert(struct intel_iommu * iommu,struct device_domain_info * info)145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
device_rbtree_remove(struct device_domain_info * info)160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
170 struct dmar_rmrr_unit {
171 	struct list_head list;		/* list of rmrr units	*/
172 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
173 	u64	base_address;		/* reserved base address*/
174 	u64	end_address;		/* reserved end address */
175 	struct dmar_dev_scope *devices;	/* target devices */
176 	int	devices_cnt;		/* target device count */
177 };
178 
179 struct dmar_atsr_unit {
180 	struct list_head list;		/* list of ATSR units */
181 	struct acpi_dmar_header *hdr;	/* ACPI header */
182 	struct dmar_dev_scope *devices;	/* target devices */
183 	int devices_cnt;		/* target device count */
184 	u8 include_all:1;		/* include all ports */
185 };
186 
187 struct dmar_satc_unit {
188 	struct list_head list;		/* list of SATC units */
189 	struct acpi_dmar_header *hdr;	/* ACPI header */
190 	struct dmar_dev_scope *devices;	/* target devices */
191 	struct intel_iommu *iommu;	/* the corresponding iommu */
192 	int devices_cnt;		/* target device count */
193 	u8 atc_required:1;		/* ATS is required */
194 };
195 
196 static LIST_HEAD(dmar_atsr_units);
197 static LIST_HEAD(dmar_rmrr_units);
198 static LIST_HEAD(dmar_satc_units);
199 
200 #define for_each_rmrr_units(rmrr) \
201 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202 
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
204 
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207 
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210 
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
215 
216 #define IDENTMAP_AZALIA		4
217 
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
220 
translation_pre_enabled(struct intel_iommu * iommu)221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225 
clear_translation_pre_enabled(struct intel_iommu * iommu)226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230 
init_translation_status(struct intel_iommu * iommu)231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233 	u32 gsts;
234 
235 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 	if (gsts & DMA_GSTS_TES)
237 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
intel_iommu_setup(char * str)240 static int __init intel_iommu_setup(char *str)
241 {
242 	if (!str)
243 		return -EINVAL;
244 
245 	while (*str) {
246 		if (!strncmp(str, "on", 2)) {
247 			dmar_disabled = 0;
248 			pr_info("IOMMU enabled\n");
249 		} else if (!strncmp(str, "off", 3)) {
250 			dmar_disabled = 1;
251 			no_platform_optin = 1;
252 			pr_info("IOMMU disabled\n");
253 		} else if (!strncmp(str, "igfx_off", 8)) {
254 			disable_igfx_iommu = 1;
255 			pr_info("Disable GFX device mapping\n");
256 		} else if (!strncmp(str, "forcedac", 8)) {
257 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 			iommu_dma_forcedac = true;
259 		} else if (!strncmp(str, "strict", 6)) {
260 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 			iommu_set_dma_strict();
262 		} else if (!strncmp(str, "sp_off", 6)) {
263 			pr_info("Disable supported super page\n");
264 			intel_iommu_superpage = 0;
265 		} else if (!strncmp(str, "sm_on", 5)) {
266 			pr_info("Enable scalable mode if hardware supports\n");
267 			intel_iommu_sm = 1;
268 		} else if (!strncmp(str, "sm_off", 6)) {
269 			pr_info("Scalable mode is disallowed\n");
270 			intel_iommu_sm = 0;
271 		} else if (!strncmp(str, "tboot_noforce", 13)) {
272 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 			intel_iommu_tboot_noforce = 1;
274 		} else {
275 			pr_notice("Unknown option - '%s'\n", str);
276 		}
277 
278 		str += strcspn(str, ",");
279 		while (*str == ',')
280 			str++;
281 	}
282 
283 	return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
286 
domain_pfn_supported(struct dmar_domain * domain,unsigned long pfn)287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290 
291 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293 
294 /*
295  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297  * the returned SAGAW.
298  */
__iommu_calculate_sagaw(struct intel_iommu * iommu)299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 	unsigned long fl_sagaw, sl_sagaw;
302 
303 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 	sl_sagaw = cap_sagaw(iommu->cap);
305 
306 	/* Second level only. */
307 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 		return sl_sagaw;
309 
310 	/* First level only. */
311 	if (!ecap_slts(iommu->ecap))
312 		return fl_sagaw;
313 
314 	return fl_sagaw & sl_sagaw;
315 }
316 
__iommu_calculate_agaw(struct intel_iommu * iommu,int max_gaw)317 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
318 {
319 	unsigned long sagaw;
320 	int agaw;
321 
322 	sagaw = __iommu_calculate_sagaw(iommu);
323 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
324 		if (test_bit(agaw, &sagaw))
325 			break;
326 	}
327 
328 	return agaw;
329 }
330 
331 /*
332  * Calculate max SAGAW for each iommu.
333  */
iommu_calculate_max_sagaw(struct intel_iommu * iommu)334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338 
339 /*
340  * calculate agaw for each iommu.
341  * "SAGAW" may be different across iommus, use a default agaw, and
342  * get a supported less agaw for iommus that don't support the default agaw.
343  */
iommu_calculate_agaw(struct intel_iommu * iommu)344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
348 
iommu_paging_structure_coherency(struct intel_iommu * iommu)349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 	return sm_supported(iommu) ?
352 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354 
domain_update_iommu_coherency(struct dmar_domain * domain)355 static void domain_update_iommu_coherency(struct dmar_domain *domain)
356 {
357 	struct iommu_domain_info *info;
358 	struct dmar_drhd_unit *drhd;
359 	struct intel_iommu *iommu;
360 	bool found = false;
361 	unsigned long i;
362 
363 	domain->iommu_coherency = true;
364 	xa_for_each(&domain->iommu_array, i, info) {
365 		found = true;
366 		if (!iommu_paging_structure_coherency(info->iommu)) {
367 			domain->iommu_coherency = false;
368 			break;
369 		}
370 	}
371 	if (found)
372 		return;
373 
374 	/* No hardware attached; use lowest common denominator */
375 	rcu_read_lock();
376 	for_each_active_iommu(iommu, drhd) {
377 		if (!iommu_paging_structure_coherency(iommu)) {
378 			domain->iommu_coherency = false;
379 			break;
380 		}
381 	}
382 	rcu_read_unlock();
383 }
384 
domain_update_iommu_superpage(struct dmar_domain * domain,struct intel_iommu * skip)385 static int domain_update_iommu_superpage(struct dmar_domain *domain,
386 					 struct intel_iommu *skip)
387 {
388 	struct dmar_drhd_unit *drhd;
389 	struct intel_iommu *iommu;
390 	int mask = 0x3;
391 
392 	if (!intel_iommu_superpage)
393 		return 0;
394 
395 	/* set iommu_superpage to the smallest common denominator */
396 	rcu_read_lock();
397 	for_each_active_iommu(iommu, drhd) {
398 		if (iommu != skip) {
399 			if (domain && domain->use_first_level) {
400 				if (!cap_fl1gp_support(iommu->cap))
401 					mask = 0x1;
402 			} else {
403 				mask &= cap_super_page_val(iommu->cap);
404 			}
405 
406 			if (!mask)
407 				break;
408 		}
409 	}
410 	rcu_read_unlock();
411 
412 	return fls(mask);
413 }
414 
domain_update_device_node(struct dmar_domain * domain)415 static int domain_update_device_node(struct dmar_domain *domain)
416 {
417 	struct device_domain_info *info;
418 	int nid = NUMA_NO_NODE;
419 	unsigned long flags;
420 
421 	spin_lock_irqsave(&domain->lock, flags);
422 	list_for_each_entry(info, &domain->devices, link) {
423 		/*
424 		 * There could possibly be multiple device numa nodes as devices
425 		 * within the same domain may sit behind different IOMMUs. There
426 		 * isn't perfect answer in such situation, so we select first
427 		 * come first served policy.
428 		 */
429 		nid = dev_to_node(info->dev);
430 		if (nid != NUMA_NO_NODE)
431 			break;
432 	}
433 	spin_unlock_irqrestore(&domain->lock, flags);
434 
435 	return nid;
436 }
437 
438 /* Return the super pagesize bitmap if supported. */
domain_super_pgsize_bitmap(struct dmar_domain * domain)439 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
440 {
441 	unsigned long bitmap = 0;
442 
443 	/*
444 	 * 1-level super page supports page size of 2MiB, 2-level super page
445 	 * supports page size of both 2MiB and 1GiB.
446 	 */
447 	if (domain->iommu_superpage == 1)
448 		bitmap |= SZ_2M;
449 	else if (domain->iommu_superpage == 2)
450 		bitmap |= SZ_2M | SZ_1G;
451 
452 	return bitmap;
453 }
454 
455 /* Some capabilities may be different across iommus */
domain_update_iommu_cap(struct dmar_domain * domain)456 void domain_update_iommu_cap(struct dmar_domain *domain)
457 {
458 	domain_update_iommu_coherency(domain);
459 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
460 
461 	/*
462 	 * If RHSA is missing, we should default to the device numa domain
463 	 * as fall back.
464 	 */
465 	if (domain->nid == NUMA_NO_NODE)
466 		domain->nid = domain_update_device_node(domain);
467 
468 	/*
469 	 * First-level translation restricts the input-address to a
470 	 * canonical address (i.e., address bits 63:N have the same
471 	 * value as address bit [N-1], where N is 48-bits with 4-level
472 	 * paging and 57-bits with 5-level paging). Hence, skip bit
473 	 * [N-1].
474 	 */
475 	if (domain->use_first_level)
476 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
477 	else
478 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
479 
480 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
481 }
482 
iommu_context_addr(struct intel_iommu * iommu,u8 bus,u8 devfn,int alloc)483 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
484 					 u8 devfn, int alloc)
485 {
486 	struct root_entry *root = &iommu->root_entry[bus];
487 	struct context_entry *context;
488 	u64 *entry;
489 
490 	/*
491 	 * Except that the caller requested to allocate a new entry,
492 	 * returning a copied context entry makes no sense.
493 	 */
494 	if (!alloc && context_copied(iommu, bus, devfn))
495 		return NULL;
496 
497 	entry = &root->lo;
498 	if (sm_supported(iommu)) {
499 		if (devfn >= 0x80) {
500 			devfn -= 0x80;
501 			entry = &root->hi;
502 		}
503 		devfn *= 2;
504 	}
505 	if (*entry & 1)
506 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
507 	else {
508 		unsigned long phy_addr;
509 		if (!alloc)
510 			return NULL;
511 
512 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
513 		if (!context)
514 			return NULL;
515 
516 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517 		phy_addr = virt_to_phys((void *)context);
518 		*entry = phy_addr | 1;
519 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
520 	}
521 	return &context[devfn];
522 }
523 
524 /**
525  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
526  *				 sub-hierarchy of a candidate PCI-PCI bridge
527  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
528  * @bridge: the candidate PCI-PCI bridge
529  *
530  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
531  */
532 static bool
is_downstream_to_pci_bridge(struct device * dev,struct device * bridge)533 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
534 {
535 	struct pci_dev *pdev, *pbridge;
536 
537 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
538 		return false;
539 
540 	pdev = to_pci_dev(dev);
541 	pbridge = to_pci_dev(bridge);
542 
543 	if (pbridge->subordinate &&
544 	    pbridge->subordinate->number <= pdev->bus->number &&
545 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
546 		return true;
547 
548 	return false;
549 }
550 
quirk_ioat_snb_local_iommu(struct pci_dev * pdev)551 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
552 {
553 	struct dmar_drhd_unit *drhd;
554 	u32 vtbar;
555 	int rc;
556 
557 	/* We know that this device on this chipset has its own IOMMU.
558 	 * If we find it under a different IOMMU, then the BIOS is lying
559 	 * to us. Hope that the IOMMU for this device is actually
560 	 * disabled, and it needs no translation...
561 	 */
562 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
563 	if (rc) {
564 		/* "can't" happen */
565 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
566 		return false;
567 	}
568 	vtbar &= 0xffff0000;
569 
570 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
571 	drhd = dmar_find_matched_drhd_unit(pdev);
572 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
575 		return true;
576 	}
577 
578 	return false;
579 }
580 
iommu_is_dummy(struct intel_iommu * iommu,struct device * dev)581 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
582 {
583 	if (!iommu || iommu->drhd->ignored)
584 		return true;
585 
586 	if (dev_is_pci(dev)) {
587 		struct pci_dev *pdev = to_pci_dev(dev);
588 
589 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591 		    quirk_ioat_snb_local_iommu(pdev))
592 			return true;
593 	}
594 
595 	return false;
596 }
597 
device_lookup_iommu(struct device * dev,u8 * bus,u8 * devfn)598 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
599 {
600 	struct dmar_drhd_unit *drhd = NULL;
601 	struct pci_dev *pdev = NULL;
602 	struct intel_iommu *iommu;
603 	struct device *tmp;
604 	u16 segment = 0;
605 	int i;
606 
607 	if (!dev)
608 		return NULL;
609 
610 	if (dev_is_pci(dev)) {
611 		struct pci_dev *pf_pdev;
612 
613 		pdev = pci_real_dma_dev(to_pci_dev(dev));
614 
615 		/* VFs aren't listed in scope tables; we need to look up
616 		 * the PF instead to find the IOMMU. */
617 		pf_pdev = pci_physfn(pdev);
618 		dev = &pf_pdev->dev;
619 		segment = pci_domain_nr(pdev->bus);
620 	} else if (has_acpi_companion(dev))
621 		dev = &ACPI_COMPANION(dev)->dev;
622 
623 	rcu_read_lock();
624 	for_each_iommu(iommu, drhd) {
625 		if (pdev && segment != drhd->segment)
626 			continue;
627 
628 		for_each_active_dev_scope(drhd->devices,
629 					  drhd->devices_cnt, i, tmp) {
630 			if (tmp == dev) {
631 				/* For a VF use its original BDF# not that of the PF
632 				 * which we used for the IOMMU lookup. Strictly speaking
633 				 * we could do this for all PCI devices; we only need to
634 				 * get the BDF# from the scope table for ACPI matches. */
635 				if (pdev && pdev->is_virtfn)
636 					goto got_pdev;
637 
638 				if (bus && devfn) {
639 					*bus = drhd->devices[i].bus;
640 					*devfn = drhd->devices[i].devfn;
641 				}
642 				goto out;
643 			}
644 
645 			if (is_downstream_to_pci_bridge(dev, tmp))
646 				goto got_pdev;
647 		}
648 
649 		if (pdev && drhd->include_all) {
650 got_pdev:
651 			if (bus && devfn) {
652 				*bus = pdev->bus->number;
653 				*devfn = pdev->devfn;
654 			}
655 			goto out;
656 		}
657 	}
658 	iommu = NULL;
659 out:
660 	if (iommu_is_dummy(iommu, dev))
661 		iommu = NULL;
662 
663 	rcu_read_unlock();
664 
665 	return iommu;
666 }
667 
domain_flush_cache(struct dmar_domain * domain,void * addr,int size)668 static void domain_flush_cache(struct dmar_domain *domain,
669 			       void *addr, int size)
670 {
671 	if (!domain->iommu_coherency)
672 		clflush_cache_range(addr, size);
673 }
674 
free_context_table(struct intel_iommu * iommu)675 static void free_context_table(struct intel_iommu *iommu)
676 {
677 	struct context_entry *context;
678 	int i;
679 
680 	if (!iommu->root_entry)
681 		return;
682 
683 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
684 		context = iommu_context_addr(iommu, i, 0, 0);
685 		if (context)
686 			iommu_free_page(context);
687 
688 		if (!sm_supported(iommu))
689 			continue;
690 
691 		context = iommu_context_addr(iommu, i, 0x80, 0);
692 		if (context)
693 			iommu_free_page(context);
694 	}
695 
696 	iommu_free_page(iommu->root_entry);
697 	iommu->root_entry = NULL;
698 }
699 
700 #ifdef CONFIG_DMAR_DEBUG
pgtable_walk(struct intel_iommu * iommu,unsigned long pfn,u8 bus,u8 devfn,struct dma_pte * parent,int level)701 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
702 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
703 {
704 	struct dma_pte *pte;
705 	int offset;
706 
707 	while (1) {
708 		offset = pfn_level_offset(pfn, level);
709 		pte = &parent[offset];
710 
711 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
712 
713 		if (!dma_pte_present(pte)) {
714 			pr_info("page table not present at level %d\n", level - 1);
715 			break;
716 		}
717 
718 		if (level == 1 || dma_pte_superpage(pte))
719 			break;
720 
721 		parent = phys_to_virt(dma_pte_addr(pte));
722 		level--;
723 	}
724 }
725 
dmar_fault_dump_ptes(struct intel_iommu * iommu,u16 source_id,unsigned long long addr,u32 pasid)726 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
727 			  unsigned long long addr, u32 pasid)
728 {
729 	struct pasid_dir_entry *dir, *pde;
730 	struct pasid_entry *entries, *pte;
731 	struct context_entry *ctx_entry;
732 	struct root_entry *rt_entry;
733 	int i, dir_index, index, level;
734 	u8 devfn = source_id & 0xff;
735 	u8 bus = source_id >> 8;
736 	struct dma_pte *pgtable;
737 
738 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
739 
740 	/* root entry dump */
741 	if (!iommu->root_entry) {
742 		pr_info("root table is not present\n");
743 		return;
744 	}
745 	rt_entry = &iommu->root_entry[bus];
746 
747 	if (sm_supported(iommu))
748 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
749 			rt_entry->hi, rt_entry->lo);
750 	else
751 		pr_info("root entry: 0x%016llx", rt_entry->lo);
752 
753 	/* context entry dump */
754 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
755 	if (!ctx_entry) {
756 		pr_info("context table is not present\n");
757 		return;
758 	}
759 
760 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
761 		ctx_entry->hi, ctx_entry->lo);
762 
763 	/* legacy mode does not require PASID entries */
764 	if (!sm_supported(iommu)) {
765 		if (!context_present(ctx_entry)) {
766 			pr_info("legacy mode page table is not present\n");
767 			return;
768 		}
769 		level = agaw_to_level(ctx_entry->hi & 7);
770 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
771 		goto pgtable_walk;
772 	}
773 
774 	if (!context_present(ctx_entry)) {
775 		pr_info("pasid directory table is not present\n");
776 		return;
777 	}
778 
779 	/* get the pointer to pasid directory entry */
780 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
781 
782 	/* For request-without-pasid, get the pasid from context entry */
783 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
784 		pasid = IOMMU_NO_PASID;
785 
786 	dir_index = pasid >> PASID_PDE_SHIFT;
787 	pde = &dir[dir_index];
788 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
789 
790 	/* get the pointer to the pasid table entry */
791 	entries = get_pasid_table_from_pde(pde);
792 	if (!entries) {
793 		pr_info("pasid table is not present\n");
794 		return;
795 	}
796 	index = pasid & PASID_PTE_MASK;
797 	pte = &entries[index];
798 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
799 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
800 
801 	if (!pasid_pte_is_present(pte)) {
802 		pr_info("scalable mode page table is not present\n");
803 		return;
804 	}
805 
806 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
807 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
808 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
809 	} else {
810 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
811 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
812 	}
813 
814 pgtable_walk:
815 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
816 }
817 #endif
818 
pfn_to_dma_pte(struct dmar_domain * domain,unsigned long pfn,int * target_level,gfp_t gfp)819 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
820 				      unsigned long pfn, int *target_level,
821 				      gfp_t gfp)
822 {
823 	struct dma_pte *parent, *pte;
824 	int level = agaw_to_level(domain->agaw);
825 	int offset;
826 
827 	if (!domain_pfn_supported(domain, pfn))
828 		/* Address beyond IOMMU's addressing capabilities. */
829 		return NULL;
830 
831 	parent = domain->pgd;
832 
833 	while (1) {
834 		void *tmp_page;
835 
836 		offset = pfn_level_offset(pfn, level);
837 		pte = &parent[offset];
838 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
839 			break;
840 		if (level == *target_level)
841 			break;
842 
843 		if (!dma_pte_present(pte)) {
844 			uint64_t pteval, tmp;
845 
846 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
847 
848 			if (!tmp_page)
849 				return NULL;
850 
851 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
852 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
853 			if (domain->use_first_level)
854 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
855 
856 			tmp = 0ULL;
857 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
858 				/* Someone else set it while we were thinking; use theirs. */
859 				iommu_free_page(tmp_page);
860 			else
861 				domain_flush_cache(domain, pte, sizeof(*pte));
862 		}
863 		if (level == 1)
864 			break;
865 
866 		parent = phys_to_virt(dma_pte_addr(pte));
867 		level--;
868 	}
869 
870 	if (!*target_level)
871 		*target_level = level;
872 
873 	return pte;
874 }
875 
876 /* return address's pte at specific level */
dma_pfn_level_pte(struct dmar_domain * domain,unsigned long pfn,int level,int * large_page)877 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
878 					 unsigned long pfn,
879 					 int level, int *large_page)
880 {
881 	struct dma_pte *parent, *pte;
882 	int total = agaw_to_level(domain->agaw);
883 	int offset;
884 
885 	parent = domain->pgd;
886 	while (level <= total) {
887 		offset = pfn_level_offset(pfn, total);
888 		pte = &parent[offset];
889 		if (level == total)
890 			return pte;
891 
892 		if (!dma_pte_present(pte)) {
893 			*large_page = total;
894 			break;
895 		}
896 
897 		if (dma_pte_superpage(pte)) {
898 			*large_page = total;
899 			return pte;
900 		}
901 
902 		parent = phys_to_virt(dma_pte_addr(pte));
903 		total--;
904 	}
905 	return NULL;
906 }
907 
908 /* clear last level pte, a tlb flush should be followed */
dma_pte_clear_range(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn)909 static void dma_pte_clear_range(struct dmar_domain *domain,
910 				unsigned long start_pfn,
911 				unsigned long last_pfn)
912 {
913 	unsigned int large_page;
914 	struct dma_pte *first_pte, *pte;
915 
916 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
917 	    WARN_ON(start_pfn > last_pfn))
918 		return;
919 
920 	/* we don't need lock here; nobody else touches the iova range */
921 	do {
922 		large_page = 1;
923 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
924 		if (!pte) {
925 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
926 			continue;
927 		}
928 		do {
929 			dma_clear_pte(pte);
930 			start_pfn += lvl_to_nr_pages(large_page);
931 			pte++;
932 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
933 
934 		domain_flush_cache(domain, first_pte,
935 				   (void *)pte - (void *)first_pte);
936 
937 	} while (start_pfn && start_pfn <= last_pfn);
938 }
939 
dma_pte_free_level(struct dmar_domain * domain,int level,int retain_level,struct dma_pte * pte,unsigned long pfn,unsigned long start_pfn,unsigned long last_pfn)940 static void dma_pte_free_level(struct dmar_domain *domain, int level,
941 			       int retain_level, struct dma_pte *pte,
942 			       unsigned long pfn, unsigned long start_pfn,
943 			       unsigned long last_pfn)
944 {
945 	pfn = max(start_pfn, pfn);
946 	pte = &pte[pfn_level_offset(pfn, level)];
947 
948 	do {
949 		unsigned long level_pfn;
950 		struct dma_pte *level_pte;
951 
952 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
953 			goto next;
954 
955 		level_pfn = pfn & level_mask(level);
956 		level_pte = phys_to_virt(dma_pte_addr(pte));
957 
958 		if (level > 2) {
959 			dma_pte_free_level(domain, level - 1, retain_level,
960 					   level_pte, level_pfn, start_pfn,
961 					   last_pfn);
962 		}
963 
964 		/*
965 		 * Free the page table if we're below the level we want to
966 		 * retain and the range covers the entire table.
967 		 */
968 		if (level < retain_level && !(start_pfn > level_pfn ||
969 		      last_pfn < level_pfn + level_size(level) - 1)) {
970 			dma_clear_pte(pte);
971 			domain_flush_cache(domain, pte, sizeof(*pte));
972 			iommu_free_page(level_pte);
973 		}
974 next:
975 		pfn += level_size(level);
976 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
977 }
978 
979 /*
980  * clear last level (leaf) ptes and free page table pages below the
981  * level we wish to keep intact.
982  */
dma_pte_free_pagetable(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn,int retain_level)983 static void dma_pte_free_pagetable(struct dmar_domain *domain,
984 				   unsigned long start_pfn,
985 				   unsigned long last_pfn,
986 				   int retain_level)
987 {
988 	dma_pte_clear_range(domain, start_pfn, last_pfn);
989 
990 	/* We don't need lock here; nobody else touches the iova range */
991 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
992 			   domain->pgd, 0, start_pfn, last_pfn);
993 
994 	/* free pgd */
995 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
996 		iommu_free_page(domain->pgd);
997 		domain->pgd = NULL;
998 	}
999 }
1000 
1001 /* When a page at a given level is being unlinked from its parent, we don't
1002    need to *modify* it at all. All we need to do is make a list of all the
1003    pages which can be freed just as soon as we've flushed the IOTLB and we
1004    know the hardware page-walk will no longer touch them.
1005    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1006    be freed. */
dma_pte_list_pagetables(struct dmar_domain * domain,int level,struct dma_pte * pte,struct list_head * freelist)1007 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1008 				    int level, struct dma_pte *pte,
1009 				    struct list_head *freelist)
1010 {
1011 	struct page *pg;
1012 
1013 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1014 	list_add_tail(&pg->lru, freelist);
1015 
1016 	if (level == 1)
1017 		return;
1018 
1019 	pte = page_address(pg);
1020 	do {
1021 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1022 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1023 		pte++;
1024 	} while (!first_pte_in_page(pte));
1025 }
1026 
dma_pte_clear_level(struct dmar_domain * domain,int level,struct dma_pte * pte,unsigned long pfn,unsigned long start_pfn,unsigned long last_pfn,struct list_head * freelist)1027 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1028 				struct dma_pte *pte, unsigned long pfn,
1029 				unsigned long start_pfn, unsigned long last_pfn,
1030 				struct list_head *freelist)
1031 {
1032 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1033 
1034 	pfn = max(start_pfn, pfn);
1035 	pte = &pte[pfn_level_offset(pfn, level)];
1036 
1037 	do {
1038 		unsigned long level_pfn = pfn & level_mask(level);
1039 
1040 		if (!dma_pte_present(pte))
1041 			goto next;
1042 
1043 		/* If range covers entire pagetable, free it */
1044 		if (start_pfn <= level_pfn &&
1045 		    last_pfn >= level_pfn + level_size(level) - 1) {
1046 			/* These suborbinate page tables are going away entirely. Don't
1047 			   bother to clear them; we're just going to *free* them. */
1048 			if (level > 1 && !dma_pte_superpage(pte))
1049 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1050 
1051 			dma_clear_pte(pte);
1052 			if (!first_pte)
1053 				first_pte = pte;
1054 			last_pte = pte;
1055 		} else if (level > 1) {
1056 			/* Recurse down into a level that isn't *entirely* obsolete */
1057 			dma_pte_clear_level(domain, level - 1,
1058 					    phys_to_virt(dma_pte_addr(pte)),
1059 					    level_pfn, start_pfn, last_pfn,
1060 					    freelist);
1061 		}
1062 next:
1063 		pfn = level_pfn + level_size(level);
1064 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1065 
1066 	if (first_pte)
1067 		domain_flush_cache(domain, first_pte,
1068 				   (void *)++last_pte - (void *)first_pte);
1069 }
1070 
1071 /* We can't just free the pages because the IOMMU may still be walking
1072    the page tables, and may have cached the intermediate levels. The
1073    pages can only be freed after the IOTLB flush has been done. */
domain_unmap(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn,struct list_head * freelist)1074 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1075 			 unsigned long last_pfn, struct list_head *freelist)
1076 {
1077 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1078 	    WARN_ON(start_pfn > last_pfn))
1079 		return;
1080 
1081 	/* we don't need lock here; nobody else touches the iova range */
1082 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1083 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1084 
1085 	/* free pgd */
1086 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 		struct page *pgd_page = virt_to_page(domain->pgd);
1088 		list_add_tail(&pgd_page->lru, freelist);
1089 		domain->pgd = NULL;
1090 	}
1091 }
1092 
1093 /* iommu handling */
iommu_alloc_root_entry(struct intel_iommu * iommu)1094 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1095 {
1096 	struct root_entry *root;
1097 
1098 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1099 	if (!root) {
1100 		pr_err("Allocating root entry for %s failed\n",
1101 			iommu->name);
1102 		return -ENOMEM;
1103 	}
1104 
1105 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1106 	iommu->root_entry = root;
1107 
1108 	return 0;
1109 }
1110 
iommu_set_root_entry(struct intel_iommu * iommu)1111 static void iommu_set_root_entry(struct intel_iommu *iommu)
1112 {
1113 	u64 addr;
1114 	u32 sts;
1115 	unsigned long flag;
1116 
1117 	addr = virt_to_phys(iommu->root_entry);
1118 	if (sm_supported(iommu))
1119 		addr |= DMA_RTADDR_SMT;
1120 
1121 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1122 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1123 
1124 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1125 
1126 	/* Make sure hardware complete it */
1127 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1128 		      readl, (sts & DMA_GSTS_RTPS), sts);
1129 
1130 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1131 
1132 	/*
1133 	 * Hardware invalidates all DMA remapping hardware translation
1134 	 * caches as part of SRTP flow.
1135 	 */
1136 	if (cap_esrtps(iommu->cap))
1137 		return;
1138 
1139 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1140 	if (sm_supported(iommu))
1141 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1142 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1143 }
1144 
iommu_flush_write_buffer(struct intel_iommu * iommu)1145 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1146 {
1147 	u32 val;
1148 	unsigned long flag;
1149 
1150 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1151 		return;
1152 
1153 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1154 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1155 
1156 	/* Make sure hardware complete it */
1157 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1158 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1159 
1160 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1161 }
1162 
1163 /* return value determine if we need a write buffer flush */
__iommu_flush_context(struct intel_iommu * iommu,u16 did,u16 source_id,u8 function_mask,u64 type)1164 static void __iommu_flush_context(struct intel_iommu *iommu,
1165 				  u16 did, u16 source_id, u8 function_mask,
1166 				  u64 type)
1167 {
1168 	u64 val = 0;
1169 	unsigned long flag;
1170 
1171 	switch (type) {
1172 	case DMA_CCMD_GLOBAL_INVL:
1173 		val = DMA_CCMD_GLOBAL_INVL;
1174 		break;
1175 	case DMA_CCMD_DOMAIN_INVL:
1176 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1177 		break;
1178 	case DMA_CCMD_DEVICE_INVL:
1179 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1180 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1181 		break;
1182 	default:
1183 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1184 			iommu->name, type);
1185 		return;
1186 	}
1187 	val |= DMA_CCMD_ICC;
1188 
1189 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1190 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1191 
1192 	/* Make sure hardware complete it */
1193 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1194 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1195 
1196 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1197 }
1198 
__iommu_flush_iotlb(struct intel_iommu * iommu,u16 did,u64 addr,unsigned int size_order,u64 type)1199 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1200 			 unsigned int size_order, u64 type)
1201 {
1202 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1203 	u64 val = 0, val_iva = 0;
1204 	unsigned long flag;
1205 
1206 	switch (type) {
1207 	case DMA_TLB_GLOBAL_FLUSH:
1208 		/* global flush doesn't need set IVA_REG */
1209 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1210 		break;
1211 	case DMA_TLB_DSI_FLUSH:
1212 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1213 		break;
1214 	case DMA_TLB_PSI_FLUSH:
1215 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1216 		/* IH bit is passed in as part of address */
1217 		val_iva = size_order | addr;
1218 		break;
1219 	default:
1220 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1221 			iommu->name, type);
1222 		return;
1223 	}
1224 
1225 	if (cap_write_drain(iommu->cap))
1226 		val |= DMA_TLB_WRITE_DRAIN;
1227 
1228 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 	/* Note: Only uses first TLB reg currently */
1230 	if (val_iva)
1231 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1232 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1233 
1234 	/* Make sure hardware complete it */
1235 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1236 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1237 
1238 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 
1240 	/* check IOTLB invalidation granularity */
1241 	if (DMA_TLB_IAIG(val) == 0)
1242 		pr_err("Flush IOTLB failed\n");
1243 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1244 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1245 			(unsigned long long)DMA_TLB_IIRG(type),
1246 			(unsigned long long)DMA_TLB_IAIG(val));
1247 }
1248 
1249 static struct device_domain_info *
domain_lookup_dev_info(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1250 domain_lookup_dev_info(struct dmar_domain *domain,
1251 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1252 {
1253 	struct device_domain_info *info;
1254 	unsigned long flags;
1255 
1256 	spin_lock_irqsave(&domain->lock, flags);
1257 	list_for_each_entry(info, &domain->devices, link) {
1258 		if (info->iommu == iommu && info->bus == bus &&
1259 		    info->devfn == devfn) {
1260 			spin_unlock_irqrestore(&domain->lock, flags);
1261 			return info;
1262 		}
1263 	}
1264 	spin_unlock_irqrestore(&domain->lock, flags);
1265 
1266 	return NULL;
1267 }
1268 
1269 /*
1270  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1271  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1272  * check because it applies only to the built-in QAT devices and it doesn't
1273  * grant additional privileges.
1274  */
1275 #define BUGGY_QAT_DEVID_MASK 0x4940
dev_needs_extra_dtlb_flush(struct pci_dev * pdev)1276 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1277 {
1278 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1279 		return false;
1280 
1281 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1282 		return false;
1283 
1284 	return true;
1285 }
1286 
iommu_enable_pci_caps(struct device_domain_info * info)1287 static void iommu_enable_pci_caps(struct device_domain_info *info)
1288 {
1289 	struct pci_dev *pdev;
1290 
1291 	if (!dev_is_pci(info->dev))
1292 		return;
1293 
1294 	pdev = to_pci_dev(info->dev);
1295 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1296 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1297 		info->ats_enabled = 1;
1298 }
1299 
iommu_disable_pci_caps(struct device_domain_info * info)1300 static void iommu_disable_pci_caps(struct device_domain_info *info)
1301 {
1302 	struct pci_dev *pdev;
1303 
1304 	if (!dev_is_pci(info->dev))
1305 		return;
1306 
1307 	pdev = to_pci_dev(info->dev);
1308 
1309 	if (info->ats_enabled) {
1310 		pci_disable_ats(pdev);
1311 		info->ats_enabled = 0;
1312 	}
1313 }
1314 
intel_flush_iotlb_all(struct iommu_domain * domain)1315 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1316 {
1317 	cache_tag_flush_all(to_dmar_domain(domain));
1318 }
1319 
iommu_disable_protect_mem_regions(struct intel_iommu * iommu)1320 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1321 {
1322 	u32 pmen;
1323 	unsigned long flags;
1324 
1325 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1326 		return;
1327 
1328 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1329 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1330 	pmen &= ~DMA_PMEN_EPM;
1331 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1332 
1333 	/* wait for the protected region status bit to clear */
1334 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1335 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1336 
1337 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1338 }
1339 
iommu_enable_translation(struct intel_iommu * iommu)1340 static void iommu_enable_translation(struct intel_iommu *iommu)
1341 {
1342 	u32 sts;
1343 	unsigned long flags;
1344 
1345 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1346 	iommu->gcmd |= DMA_GCMD_TE;
1347 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1348 
1349 	/* Make sure hardware complete it */
1350 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1351 		      readl, (sts & DMA_GSTS_TES), sts);
1352 
1353 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1354 }
1355 
iommu_disable_translation(struct intel_iommu * iommu)1356 static void iommu_disable_translation(struct intel_iommu *iommu)
1357 {
1358 	u32 sts;
1359 	unsigned long flag;
1360 
1361 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1362 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1363 		return;
1364 
1365 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1366 	iommu->gcmd &= ~DMA_GCMD_TE;
1367 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1368 
1369 	/* Make sure hardware complete it */
1370 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1371 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1372 
1373 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1374 }
1375 
iommu_init_domains(struct intel_iommu * iommu)1376 static int iommu_init_domains(struct intel_iommu *iommu)
1377 {
1378 	u32 ndomains;
1379 
1380 	ndomains = cap_ndoms(iommu->cap);
1381 	pr_debug("%s: Number of Domains supported <%d>\n",
1382 		 iommu->name, ndomains);
1383 
1384 	spin_lock_init(&iommu->lock);
1385 
1386 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1387 	if (!iommu->domain_ids)
1388 		return -ENOMEM;
1389 
1390 	/*
1391 	 * If Caching mode is set, then invalid translations are tagged
1392 	 * with domain-id 0, hence we need to pre-allocate it. We also
1393 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1394 	 * make sure it is not used for a real domain.
1395 	 */
1396 	set_bit(0, iommu->domain_ids);
1397 
1398 	/*
1399 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1400 	 * entry for first-level or pass-through translation modes should
1401 	 * be programmed with a domain id different from those used for
1402 	 * second-level or nested translation. We reserve a domain id for
1403 	 * this purpose. This domain id is also used for identity domain
1404 	 * in legacy mode.
1405 	 */
1406 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1407 
1408 	return 0;
1409 }
1410 
disable_dmar_iommu(struct intel_iommu * iommu)1411 static void disable_dmar_iommu(struct intel_iommu *iommu)
1412 {
1413 	if (!iommu->domain_ids)
1414 		return;
1415 
1416 	/*
1417 	 * All iommu domains must have been detached from the devices,
1418 	 * hence there should be no domain IDs in use.
1419 	 */
1420 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1421 		    > NUM_RESERVED_DID))
1422 		return;
1423 
1424 	if (iommu->gcmd & DMA_GCMD_TE)
1425 		iommu_disable_translation(iommu);
1426 }
1427 
free_dmar_iommu(struct intel_iommu * iommu)1428 static void free_dmar_iommu(struct intel_iommu *iommu)
1429 {
1430 	if (iommu->domain_ids) {
1431 		bitmap_free(iommu->domain_ids);
1432 		iommu->domain_ids = NULL;
1433 	}
1434 
1435 	if (iommu->copied_tables) {
1436 		bitmap_free(iommu->copied_tables);
1437 		iommu->copied_tables = NULL;
1438 	}
1439 
1440 	/* free context mapping */
1441 	free_context_table(iommu);
1442 
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 	if (pasid_supported(iommu)) {
1445 		if (ecap_prs(iommu->ecap))
1446 			intel_svm_finish_prq(iommu);
1447 	}
1448 #endif
1449 }
1450 
1451 /*
1452  * Check and return whether first level is used by default for
1453  * DMA translation.
1454  */
first_level_by_default(unsigned int type)1455 static bool first_level_by_default(unsigned int type)
1456 {
1457 	/* Only SL is available in legacy mode */
1458 	if (!scalable_mode_support())
1459 		return false;
1460 
1461 	/* Only level (either FL or SL) is available, just use it */
1462 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1463 		return intel_cap_flts_sanity();
1464 
1465 	/* Both levels are available, decide it based on domain type */
1466 	return type != IOMMU_DOMAIN_UNMANAGED;
1467 }
1468 
alloc_domain(unsigned int type)1469 static struct dmar_domain *alloc_domain(unsigned int type)
1470 {
1471 	struct dmar_domain *domain;
1472 
1473 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1474 	if (!domain)
1475 		return NULL;
1476 
1477 	domain->nid = NUMA_NO_NODE;
1478 	if (first_level_by_default(type))
1479 		domain->use_first_level = true;
1480 	INIT_LIST_HEAD(&domain->devices);
1481 	INIT_LIST_HEAD(&domain->dev_pasids);
1482 	INIT_LIST_HEAD(&domain->cache_tags);
1483 	spin_lock_init(&domain->lock);
1484 	spin_lock_init(&domain->cache_lock);
1485 	xa_init(&domain->iommu_array);
1486 
1487 	return domain;
1488 }
1489 
domain_attach_iommu(struct dmar_domain * domain,struct intel_iommu * iommu)1490 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1491 {
1492 	struct iommu_domain_info *info, *curr;
1493 	unsigned long ndomains;
1494 	int num, ret = -ENOSPC;
1495 
1496 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1497 		return 0;
1498 
1499 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1500 	if (!info)
1501 		return -ENOMEM;
1502 
1503 	spin_lock(&iommu->lock);
1504 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1505 	if (curr) {
1506 		curr->refcnt++;
1507 		spin_unlock(&iommu->lock);
1508 		kfree(info);
1509 		return 0;
1510 	}
1511 
1512 	ndomains = cap_ndoms(iommu->cap);
1513 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1514 	if (num >= ndomains) {
1515 		pr_err("%s: No free domain ids\n", iommu->name);
1516 		goto err_unlock;
1517 	}
1518 
1519 	set_bit(num, iommu->domain_ids);
1520 	info->refcnt	= 1;
1521 	info->did	= num;
1522 	info->iommu	= iommu;
1523 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1524 			  NULL, info, GFP_ATOMIC);
1525 	if (curr) {
1526 		ret = xa_err(curr) ? : -EBUSY;
1527 		goto err_clear;
1528 	}
1529 	domain_update_iommu_cap(domain);
1530 
1531 	spin_unlock(&iommu->lock);
1532 	return 0;
1533 
1534 err_clear:
1535 	clear_bit(info->did, iommu->domain_ids);
1536 err_unlock:
1537 	spin_unlock(&iommu->lock);
1538 	kfree(info);
1539 	return ret;
1540 }
1541 
domain_detach_iommu(struct dmar_domain * domain,struct intel_iommu * iommu)1542 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1543 {
1544 	struct iommu_domain_info *info;
1545 
1546 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1547 		return;
1548 
1549 	spin_lock(&iommu->lock);
1550 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1551 	if (--info->refcnt == 0) {
1552 		clear_bit(info->did, iommu->domain_ids);
1553 		xa_erase(&domain->iommu_array, iommu->seq_id);
1554 		domain->nid = NUMA_NO_NODE;
1555 		domain_update_iommu_cap(domain);
1556 		kfree(info);
1557 	}
1558 	spin_unlock(&iommu->lock);
1559 }
1560 
guestwidth_to_adjustwidth(int gaw)1561 static int guestwidth_to_adjustwidth(int gaw)
1562 {
1563 	int agaw;
1564 	int r = (gaw - 12) % 9;
1565 
1566 	if (r == 0)
1567 		agaw = gaw;
1568 	else
1569 		agaw = gaw + 9 - r;
1570 	if (agaw > 64)
1571 		agaw = 64;
1572 	return agaw;
1573 }
1574 
domain_exit(struct dmar_domain * domain)1575 static void domain_exit(struct dmar_domain *domain)
1576 {
1577 	if (domain->pgd) {
1578 		LIST_HEAD(freelist);
1579 
1580 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1581 		iommu_put_pages_list(&freelist);
1582 	}
1583 
1584 	if (WARN_ON(!list_empty(&domain->devices)))
1585 		return;
1586 
1587 	kfree(domain->qi_batch);
1588 	kfree(domain);
1589 }
1590 
1591 /*
1592  * For kdump cases, old valid entries may be cached due to the
1593  * in-flight DMA and copied pgtable, but there is no unmapping
1594  * behaviour for them, thus we need an explicit cache flush for
1595  * the newly-mapped device. For kdump, at this point, the device
1596  * is supposed to finish reset at its driver probe stage, so no
1597  * in-flight DMA will exist, and we don't need to worry anymore
1598  * hereafter.
1599  */
copied_context_tear_down(struct intel_iommu * iommu,struct context_entry * context,u8 bus,u8 devfn)1600 static void copied_context_tear_down(struct intel_iommu *iommu,
1601 				     struct context_entry *context,
1602 				     u8 bus, u8 devfn)
1603 {
1604 	u16 did_old;
1605 
1606 	if (!context_copied(iommu, bus, devfn))
1607 		return;
1608 
1609 	assert_spin_locked(&iommu->lock);
1610 
1611 	did_old = context_domain_id(context);
1612 	context_clear_entry(context);
1613 
1614 	if (did_old < cap_ndoms(iommu->cap)) {
1615 		iommu->flush.flush_context(iommu, did_old,
1616 					   (((u16)bus) << 8) | devfn,
1617 					   DMA_CCMD_MASK_NOBIT,
1618 					   DMA_CCMD_DEVICE_INVL);
1619 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1620 					 DMA_TLB_DSI_FLUSH);
1621 	}
1622 
1623 	clear_context_copied(iommu, bus, devfn);
1624 }
1625 
1626 /*
1627  * It's a non-present to present mapping. If hardware doesn't cache
1628  * non-present entry we only need to flush the write-buffer. If the
1629  * _does_ cache non-present entries, then it does so in the special
1630  * domain #0, which we have to flush:
1631  */
context_present_cache_flush(struct intel_iommu * iommu,u16 did,u8 bus,u8 devfn)1632 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1633 					u8 bus, u8 devfn)
1634 {
1635 	if (cap_caching_mode(iommu->cap)) {
1636 		iommu->flush.flush_context(iommu, 0,
1637 					   (((u16)bus) << 8) | devfn,
1638 					   DMA_CCMD_MASK_NOBIT,
1639 					   DMA_CCMD_DEVICE_INVL);
1640 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1641 	} else {
1642 		iommu_flush_write_buffer(iommu);
1643 	}
1644 }
1645 
domain_context_mapping_one(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1646 static int domain_context_mapping_one(struct dmar_domain *domain,
1647 				      struct intel_iommu *iommu,
1648 				      u8 bus, u8 devfn)
1649 {
1650 	struct device_domain_info *info =
1651 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1652 	u16 did = domain_id_iommu(domain, iommu);
1653 	int translation = CONTEXT_TT_MULTI_LEVEL;
1654 	struct dma_pte *pgd = domain->pgd;
1655 	struct context_entry *context;
1656 	int agaw, ret;
1657 
1658 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1659 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1660 
1661 	spin_lock(&iommu->lock);
1662 	ret = -ENOMEM;
1663 	context = iommu_context_addr(iommu, bus, devfn, 1);
1664 	if (!context)
1665 		goto out_unlock;
1666 
1667 	ret = 0;
1668 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1669 		goto out_unlock;
1670 
1671 	copied_context_tear_down(iommu, context, bus, devfn);
1672 	context_clear_entry(context);
1673 
1674 	context_set_domain_id(context, did);
1675 
1676 	/*
1677 	 * Skip top levels of page tables for iommu which has
1678 	 * less agaw than default. Unnecessary for PT mode.
1679 	 */
1680 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1681 		ret = -ENOMEM;
1682 		pgd = phys_to_virt(dma_pte_addr(pgd));
1683 		if (!dma_pte_present(pgd))
1684 			goto out_unlock;
1685 	}
1686 
1687 	if (info && info->ats_supported)
1688 		translation = CONTEXT_TT_DEV_IOTLB;
1689 	else
1690 		translation = CONTEXT_TT_MULTI_LEVEL;
1691 
1692 	context_set_address_root(context, virt_to_phys(pgd));
1693 	context_set_address_width(context, agaw);
1694 	context_set_translation_type(context, translation);
1695 	context_set_fault_enable(context);
1696 	context_set_present(context);
1697 	if (!ecap_coherent(iommu->ecap))
1698 		clflush_cache_range(context, sizeof(*context));
1699 	context_present_cache_flush(iommu, did, bus, devfn);
1700 	ret = 0;
1701 
1702 out_unlock:
1703 	spin_unlock(&iommu->lock);
1704 
1705 	return ret;
1706 }
1707 
domain_context_mapping_cb(struct pci_dev * pdev,u16 alias,void * opaque)1708 static int domain_context_mapping_cb(struct pci_dev *pdev,
1709 				     u16 alias, void *opaque)
1710 {
1711 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1712 	struct intel_iommu *iommu = info->iommu;
1713 	struct dmar_domain *domain = opaque;
1714 
1715 	return domain_context_mapping_one(domain, iommu,
1716 					  PCI_BUS_NUM(alias), alias & 0xff);
1717 }
1718 
1719 static int
domain_context_mapping(struct dmar_domain * domain,struct device * dev)1720 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1721 {
1722 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1723 	struct intel_iommu *iommu = info->iommu;
1724 	u8 bus = info->bus, devfn = info->devfn;
1725 
1726 	if (!dev_is_pci(dev))
1727 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1728 
1729 	return pci_for_each_dma_alias(to_pci_dev(dev),
1730 				      domain_context_mapping_cb, domain);
1731 }
1732 
1733 /* Return largest possible superpage level for a given mapping */
hardware_largepage_caps(struct dmar_domain * domain,unsigned long iov_pfn,unsigned long phy_pfn,unsigned long pages)1734 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1735 				   unsigned long phy_pfn, unsigned long pages)
1736 {
1737 	int support, level = 1;
1738 	unsigned long pfnmerge;
1739 
1740 	support = domain->iommu_superpage;
1741 
1742 	/* To use a large page, the virtual *and* physical addresses
1743 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744 	   of them will mean we have to use smaller pages. So just
1745 	   merge them and check both at once. */
1746 	pfnmerge = iov_pfn | phy_pfn;
1747 
1748 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749 		pages >>= VTD_STRIDE_SHIFT;
1750 		if (!pages)
1751 			break;
1752 		pfnmerge >>= VTD_STRIDE_SHIFT;
1753 		level++;
1754 		support--;
1755 	}
1756 	return level;
1757 }
1758 
1759 /*
1760  * Ensure that old small page tables are removed to make room for superpage(s).
1761  * We're going to add new large pages, so make sure we don't remove their parent
1762  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1763  */
switch_to_super_page(struct dmar_domain * domain,unsigned long start_pfn,unsigned long end_pfn,int level)1764 static void switch_to_super_page(struct dmar_domain *domain,
1765 				 unsigned long start_pfn,
1766 				 unsigned long end_pfn, int level)
1767 {
1768 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1769 	struct dma_pte *pte = NULL;
1770 
1771 	if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
1772 		    !IS_ALIGNED(end_pfn + 1, lvl_pages)))
1773 		return;
1774 
1775 	while (start_pfn <= end_pfn) {
1776 		if (!pte)
1777 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1778 					     GFP_ATOMIC);
1779 
1780 		if (dma_pte_present(pte)) {
1781 			dma_pte_free_pagetable(domain, start_pfn,
1782 					       start_pfn + lvl_pages - 1,
1783 					       level + 1);
1784 
1785 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1786 					      end_pfn << VTD_PAGE_SHIFT, 0);
1787 		}
1788 
1789 		pte++;
1790 		start_pfn += lvl_pages;
1791 		if (first_pte_in_page(pte))
1792 			pte = NULL;
1793 	}
1794 }
1795 
1796 static int
__domain_mapping(struct dmar_domain * domain,unsigned long iov_pfn,unsigned long phys_pfn,unsigned long nr_pages,int prot,gfp_t gfp)1797 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1798 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1799 		 gfp_t gfp)
1800 {
1801 	struct dma_pte *first_pte = NULL, *pte = NULL;
1802 	unsigned int largepage_lvl = 0;
1803 	unsigned long lvl_pages = 0;
1804 	phys_addr_t pteval;
1805 	u64 attr;
1806 
1807 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1808 		return -EINVAL;
1809 
1810 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1811 		return -EINVAL;
1812 
1813 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1814 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1815 		return -EINVAL;
1816 	}
1817 
1818 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1819 	attr |= DMA_FL_PTE_PRESENT;
1820 	if (domain->use_first_level) {
1821 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1822 		if (prot & DMA_PTE_WRITE)
1823 			attr |= DMA_FL_PTE_DIRTY;
1824 	}
1825 
1826 	domain->has_mappings = true;
1827 
1828 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1829 
1830 	while (nr_pages > 0) {
1831 		uint64_t tmp;
1832 
1833 		if (!pte) {
1834 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1835 					phys_pfn, nr_pages);
1836 
1837 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1838 					     gfp);
1839 			if (!pte)
1840 				return -ENOMEM;
1841 			first_pte = pte;
1842 
1843 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1844 
1845 			/* It is large page*/
1846 			if (largepage_lvl > 1) {
1847 				unsigned long end_pfn;
1848 				unsigned long pages_to_remove;
1849 
1850 				pteval |= DMA_PTE_LARGE_PAGE;
1851 				pages_to_remove = min_t(unsigned long,
1852 							round_down(nr_pages, lvl_pages),
1853 							nr_pte_to_next_page(pte) * lvl_pages);
1854 				end_pfn = iov_pfn + pages_to_remove - 1;
1855 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1856 			} else {
1857 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1858 			}
1859 
1860 		}
1861 		/* We don't need lock here, nobody else
1862 		 * touches the iova range
1863 		 */
1864 		tmp = 0ULL;
1865 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1866 			static int dumps = 5;
1867 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1868 				iov_pfn, tmp, (unsigned long long)pteval);
1869 			if (dumps) {
1870 				dumps--;
1871 				debug_dma_dump_mappings(NULL);
1872 			}
1873 			WARN_ON(1);
1874 		}
1875 
1876 		nr_pages -= lvl_pages;
1877 		iov_pfn += lvl_pages;
1878 		phys_pfn += lvl_pages;
1879 		pteval += lvl_pages * VTD_PAGE_SIZE;
1880 
1881 		/* If the next PTE would be the first in a new page, then we
1882 		 * need to flush the cache on the entries we've just written.
1883 		 * And then we'll need to recalculate 'pte', so clear it and
1884 		 * let it get set again in the if (!pte) block above.
1885 		 *
1886 		 * If we're done (!nr_pages) we need to flush the cache too.
1887 		 *
1888 		 * Also if we've been setting superpages, we may need to
1889 		 * recalculate 'pte' and switch back to smaller pages for the
1890 		 * end of the mapping, if the trailing size is not enough to
1891 		 * use another superpage (i.e. nr_pages < lvl_pages).
1892 		 */
1893 		pte++;
1894 		if (!nr_pages || first_pte_in_page(pte) ||
1895 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1896 			domain_flush_cache(domain, first_pte,
1897 					   (void *)pte - (void *)first_pte);
1898 			pte = NULL;
1899 		}
1900 	}
1901 
1902 	return 0;
1903 }
1904 
domain_context_clear_one(struct device_domain_info * info,u8 bus,u8 devfn)1905 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1906 {
1907 	struct intel_iommu *iommu = info->iommu;
1908 	struct context_entry *context;
1909 	u16 did;
1910 
1911 	spin_lock(&iommu->lock);
1912 	context = iommu_context_addr(iommu, bus, devfn, 0);
1913 	if (!context) {
1914 		spin_unlock(&iommu->lock);
1915 		return;
1916 	}
1917 
1918 	did = context_domain_id(context);
1919 	context_clear_entry(context);
1920 	__iommu_flush_cache(iommu, context, sizeof(*context));
1921 	spin_unlock(&iommu->lock);
1922 	intel_context_flush_present(info, context, did, true);
1923 }
1924 
domain_setup_first_level(struct intel_iommu * iommu,struct dmar_domain * domain,struct device * dev,u32 pasid)1925 static int domain_setup_first_level(struct intel_iommu *iommu,
1926 				    struct dmar_domain *domain,
1927 				    struct device *dev,
1928 				    u32 pasid)
1929 {
1930 	struct dma_pte *pgd = domain->pgd;
1931 	int agaw, level;
1932 	int flags = 0;
1933 
1934 	/*
1935 	 * Skip top levels of page tables for iommu which has
1936 	 * less agaw than default. Unnecessary for PT mode.
1937 	 */
1938 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1939 		pgd = phys_to_virt(dma_pte_addr(pgd));
1940 		if (!dma_pte_present(pgd))
1941 			return -ENOMEM;
1942 	}
1943 
1944 	level = agaw_to_level(agaw);
1945 	if (level != 4 && level != 5)
1946 		return -EINVAL;
1947 
1948 	if (level == 5)
1949 		flags |= PASID_FLAG_FL5LP;
1950 
1951 	if (domain->force_snooping)
1952 		flags |= PASID_FLAG_PAGE_SNOOP;
1953 
1954 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1955 					     domain_id_iommu(domain, iommu),
1956 					     flags);
1957 }
1958 
dev_is_real_dma_subdevice(struct device * dev)1959 static bool dev_is_real_dma_subdevice(struct device *dev)
1960 {
1961 	return dev && dev_is_pci(dev) &&
1962 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1963 }
1964 
domain_need_iotlb_sync_map(struct dmar_domain * domain,struct intel_iommu * iommu)1965 static bool domain_need_iotlb_sync_map(struct dmar_domain *domain,
1966 				       struct intel_iommu *iommu)
1967 {
1968 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1969 		return true;
1970 
1971 	if (rwbf_quirk || cap_rwbf(iommu->cap))
1972 		return true;
1973 
1974 	return false;
1975 }
1976 
dmar_domain_attach_device(struct dmar_domain * domain,struct device * dev)1977 static int dmar_domain_attach_device(struct dmar_domain *domain,
1978 				     struct device *dev)
1979 {
1980 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 	struct intel_iommu *iommu = info->iommu;
1982 	unsigned long flags;
1983 	int ret;
1984 
1985 	ret = domain_attach_iommu(domain, iommu);
1986 	if (ret)
1987 		return ret;
1988 
1989 	info->domain = domain;
1990 	info->domain_attached = true;
1991 	spin_lock_irqsave(&domain->lock, flags);
1992 	list_add(&info->link, &domain->devices);
1993 	spin_unlock_irqrestore(&domain->lock, flags);
1994 
1995 	if (dev_is_real_dma_subdevice(dev))
1996 		return 0;
1997 
1998 	if (!sm_supported(iommu))
1999 		ret = domain_context_mapping(domain, dev);
2000 	else if (domain->use_first_level)
2001 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2002 	else
2003 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2004 
2005 	if (ret)
2006 		goto out_block_translation;
2007 
2008 	iommu_enable_pci_caps(info);
2009 
2010 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
2011 	if (ret)
2012 		goto out_block_translation;
2013 
2014 	domain->iotlb_sync_map |= domain_need_iotlb_sync_map(domain, iommu);
2015 
2016 	return 0;
2017 
2018 out_block_translation:
2019 	device_block_translation(dev);
2020 	return ret;
2021 }
2022 
2023 /**
2024  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2025  * is relaxable (ie. is allowed to be not enforced under some conditions)
2026  * @dev: device handle
2027  *
2028  * We assume that PCI USB devices with RMRRs have them largely
2029  * for historical reasons and that the RMRR space is not actively used post
2030  * boot.  This exclusion may change if vendors begin to abuse it.
2031  *
2032  * The same exception is made for graphics devices, with the requirement that
2033  * any use of the RMRR regions will be torn down before assigning the device
2034  * to a guest.
2035  *
2036  * Return: true if the RMRR is relaxable, false otherwise
2037  */
device_rmrr_is_relaxable(struct device * dev)2038 static bool device_rmrr_is_relaxable(struct device *dev)
2039 {
2040 	struct pci_dev *pdev;
2041 
2042 	if (!dev_is_pci(dev))
2043 		return false;
2044 
2045 	pdev = to_pci_dev(dev);
2046 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2047 		return true;
2048 	else
2049 		return false;
2050 }
2051 
device_def_domain_type(struct device * dev)2052 static int device_def_domain_type(struct device *dev)
2053 {
2054 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2055 	struct intel_iommu *iommu = info->iommu;
2056 
2057 	/*
2058 	 * Hardware does not support the passthrough translation mode.
2059 	 * Always use a dynamaic mapping domain.
2060 	 */
2061 	if (!ecap_pass_through(iommu->ecap))
2062 		return IOMMU_DOMAIN_DMA;
2063 
2064 	if (dev_is_pci(dev)) {
2065 		struct pci_dev *pdev = to_pci_dev(dev);
2066 
2067 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2068 			return IOMMU_DOMAIN_IDENTITY;
2069 	}
2070 
2071 	return 0;
2072 }
2073 
intel_iommu_init_qi(struct intel_iommu * iommu)2074 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2075 {
2076 	/*
2077 	 * Start from the sane iommu hardware state.
2078 	 * If the queued invalidation is already initialized by us
2079 	 * (for example, while enabling interrupt-remapping) then
2080 	 * we got the things already rolling from a sane state.
2081 	 */
2082 	if (!iommu->qi) {
2083 		/*
2084 		 * Clear any previous faults.
2085 		 */
2086 		dmar_fault(-1, iommu);
2087 		/*
2088 		 * Disable queued invalidation if supported and already enabled
2089 		 * before OS handover.
2090 		 */
2091 		dmar_disable_qi(iommu);
2092 	}
2093 
2094 	if (dmar_enable_qi(iommu)) {
2095 		/*
2096 		 * Queued Invalidate not enabled, use Register Based Invalidate
2097 		 */
2098 		iommu->flush.flush_context = __iommu_flush_context;
2099 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2100 		pr_info("%s: Using Register based invalidation\n",
2101 			iommu->name);
2102 	} else {
2103 		iommu->flush.flush_context = qi_flush_context;
2104 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2105 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2106 	}
2107 }
2108 
copy_context_table(struct intel_iommu * iommu,struct root_entry * old_re,struct context_entry ** tbl,int bus,bool ext)2109 static int copy_context_table(struct intel_iommu *iommu,
2110 			      struct root_entry *old_re,
2111 			      struct context_entry **tbl,
2112 			      int bus, bool ext)
2113 {
2114 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2115 	struct context_entry *new_ce = NULL, ce;
2116 	struct context_entry *old_ce = NULL;
2117 	struct root_entry re;
2118 	phys_addr_t old_ce_phys;
2119 
2120 	tbl_idx = ext ? bus * 2 : bus;
2121 	memcpy(&re, old_re, sizeof(re));
2122 
2123 	for (devfn = 0; devfn < 256; devfn++) {
2124 		/* First calculate the correct index */
2125 		idx = (ext ? devfn * 2 : devfn) % 256;
2126 
2127 		if (idx == 0) {
2128 			/* First save what we may have and clean up */
2129 			if (new_ce) {
2130 				tbl[tbl_idx] = new_ce;
2131 				__iommu_flush_cache(iommu, new_ce,
2132 						    VTD_PAGE_SIZE);
2133 				pos = 1;
2134 			}
2135 
2136 			if (old_ce)
2137 				memunmap(old_ce);
2138 
2139 			ret = 0;
2140 			if (devfn < 0x80)
2141 				old_ce_phys = root_entry_lctp(&re);
2142 			else
2143 				old_ce_phys = root_entry_uctp(&re);
2144 
2145 			if (!old_ce_phys) {
2146 				if (ext && devfn == 0) {
2147 					/* No LCTP, try UCTP */
2148 					devfn = 0x7f;
2149 					continue;
2150 				} else {
2151 					goto out;
2152 				}
2153 			}
2154 
2155 			ret = -ENOMEM;
2156 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2157 					MEMREMAP_WB);
2158 			if (!old_ce)
2159 				goto out;
2160 
2161 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2162 			if (!new_ce)
2163 				goto out_unmap;
2164 
2165 			ret = 0;
2166 		}
2167 
2168 		/* Now copy the context entry */
2169 		memcpy(&ce, old_ce + idx, sizeof(ce));
2170 
2171 		if (!context_present(&ce))
2172 			continue;
2173 
2174 		did = context_domain_id(&ce);
2175 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2176 			set_bit(did, iommu->domain_ids);
2177 
2178 		set_context_copied(iommu, bus, devfn);
2179 		new_ce[idx] = ce;
2180 	}
2181 
2182 	tbl[tbl_idx + pos] = new_ce;
2183 
2184 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2185 
2186 out_unmap:
2187 	memunmap(old_ce);
2188 
2189 out:
2190 	return ret;
2191 }
2192 
copy_translation_tables(struct intel_iommu * iommu)2193 static int copy_translation_tables(struct intel_iommu *iommu)
2194 {
2195 	struct context_entry **ctxt_tbls;
2196 	struct root_entry *old_rt;
2197 	phys_addr_t old_rt_phys;
2198 	int ctxt_table_entries;
2199 	u64 rtaddr_reg;
2200 	int bus, ret;
2201 	bool new_ext, ext;
2202 
2203 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2204 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2205 	new_ext    = !!sm_supported(iommu);
2206 
2207 	/*
2208 	 * The RTT bit can only be changed when translation is disabled,
2209 	 * but disabling translation means to open a window for data
2210 	 * corruption. So bail out and don't copy anything if we would
2211 	 * have to change the bit.
2212 	 */
2213 	if (new_ext != ext)
2214 		return -EINVAL;
2215 
2216 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2217 	if (!iommu->copied_tables)
2218 		return -ENOMEM;
2219 
2220 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2221 	if (!old_rt_phys)
2222 		return -EINVAL;
2223 
2224 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2225 	if (!old_rt)
2226 		return -ENOMEM;
2227 
2228 	/* This is too big for the stack - allocate it from slab */
2229 	ctxt_table_entries = ext ? 512 : 256;
2230 	ret = -ENOMEM;
2231 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2232 	if (!ctxt_tbls)
2233 		goto out_unmap;
2234 
2235 	for (bus = 0; bus < 256; bus++) {
2236 		ret = copy_context_table(iommu, &old_rt[bus],
2237 					 ctxt_tbls, bus, ext);
2238 		if (ret) {
2239 			pr_err("%s: Failed to copy context table for bus %d\n",
2240 				iommu->name, bus);
2241 			continue;
2242 		}
2243 	}
2244 
2245 	spin_lock(&iommu->lock);
2246 
2247 	/* Context tables are copied, now write them to the root_entry table */
2248 	for (bus = 0; bus < 256; bus++) {
2249 		int idx = ext ? bus * 2 : bus;
2250 		u64 val;
2251 
2252 		if (ctxt_tbls[idx]) {
2253 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2254 			iommu->root_entry[bus].lo = val;
2255 		}
2256 
2257 		if (!ext || !ctxt_tbls[idx + 1])
2258 			continue;
2259 
2260 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2261 		iommu->root_entry[bus].hi = val;
2262 	}
2263 
2264 	spin_unlock(&iommu->lock);
2265 
2266 	kfree(ctxt_tbls);
2267 
2268 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2269 
2270 	ret = 0;
2271 
2272 out_unmap:
2273 	memunmap(old_rt);
2274 
2275 	return ret;
2276 }
2277 
init_dmars(void)2278 static int __init init_dmars(void)
2279 {
2280 	struct dmar_drhd_unit *drhd;
2281 	struct intel_iommu *iommu;
2282 	int ret;
2283 
2284 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2285 	if (ret)
2286 		goto free_iommu;
2287 
2288 	for_each_iommu(iommu, drhd) {
2289 		if (drhd->ignored) {
2290 			iommu_disable_translation(iommu);
2291 			continue;
2292 		}
2293 
2294 		/*
2295 		 * Find the max pasid size of all IOMMU's in the system.
2296 		 * We need to ensure the system pasid table is no bigger
2297 		 * than the smallest supported.
2298 		 */
2299 		if (pasid_supported(iommu)) {
2300 			u32 temp = 2 << ecap_pss(iommu->ecap);
2301 
2302 			intel_pasid_max_id = min_t(u32, temp,
2303 						   intel_pasid_max_id);
2304 		}
2305 
2306 		intel_iommu_init_qi(iommu);
2307 
2308 		ret = iommu_init_domains(iommu);
2309 		if (ret)
2310 			goto free_iommu;
2311 
2312 		init_translation_status(iommu);
2313 
2314 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2315 			iommu_disable_translation(iommu);
2316 			clear_translation_pre_enabled(iommu);
2317 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2318 				iommu->name);
2319 		}
2320 
2321 		/*
2322 		 * TBD:
2323 		 * we could share the same root & context tables
2324 		 * among all IOMMU's. Need to Split it later.
2325 		 */
2326 		ret = iommu_alloc_root_entry(iommu);
2327 		if (ret)
2328 			goto free_iommu;
2329 
2330 		if (translation_pre_enabled(iommu)) {
2331 			pr_info("Translation already enabled - trying to copy translation structures\n");
2332 
2333 			ret = copy_translation_tables(iommu);
2334 			if (ret) {
2335 				/*
2336 				 * We found the IOMMU with translation
2337 				 * enabled - but failed to copy over the
2338 				 * old root-entry table. Try to proceed
2339 				 * by disabling translation now and
2340 				 * allocating a clean root-entry table.
2341 				 * This might cause DMAR faults, but
2342 				 * probably the dump will still succeed.
2343 				 */
2344 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2345 				       iommu->name);
2346 				iommu_disable_translation(iommu);
2347 				clear_translation_pre_enabled(iommu);
2348 			} else {
2349 				pr_info("Copied translation tables from previous kernel for %s\n",
2350 					iommu->name);
2351 			}
2352 		}
2353 
2354 		intel_svm_check(iommu);
2355 	}
2356 
2357 	/*
2358 	 * Now that qi is enabled on all iommus, set the root entry and flush
2359 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2360 	 * flush_context function will loop forever and the boot hangs.
2361 	 */
2362 	for_each_active_iommu(iommu, drhd) {
2363 		iommu_flush_write_buffer(iommu);
2364 		iommu_set_root_entry(iommu);
2365 	}
2366 
2367 	check_tylersburg_isoch();
2368 
2369 	/*
2370 	 * for each drhd
2371 	 *   enable fault log
2372 	 *   global invalidate context cache
2373 	 *   global invalidate iotlb
2374 	 *   enable translation
2375 	 */
2376 	for_each_iommu(iommu, drhd) {
2377 		if (drhd->ignored) {
2378 			/*
2379 			 * we always have to disable PMRs or DMA may fail on
2380 			 * this device
2381 			 */
2382 			if (force_on)
2383 				iommu_disable_protect_mem_regions(iommu);
2384 			continue;
2385 		}
2386 
2387 		iommu_flush_write_buffer(iommu);
2388 
2389 #ifdef CONFIG_INTEL_IOMMU_SVM
2390 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2391 			/*
2392 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2393 			 * could cause possible lock race condition.
2394 			 */
2395 			up_write(&dmar_global_lock);
2396 			ret = intel_svm_enable_prq(iommu);
2397 			down_write(&dmar_global_lock);
2398 			if (ret)
2399 				goto free_iommu;
2400 		}
2401 #endif
2402 		ret = dmar_set_interrupt(iommu);
2403 		if (ret)
2404 			goto free_iommu;
2405 	}
2406 
2407 	return 0;
2408 
2409 free_iommu:
2410 	for_each_active_iommu(iommu, drhd) {
2411 		disable_dmar_iommu(iommu);
2412 		free_dmar_iommu(iommu);
2413 	}
2414 
2415 	return ret;
2416 }
2417 
init_no_remapping_devices(void)2418 static void __init init_no_remapping_devices(void)
2419 {
2420 	struct dmar_drhd_unit *drhd;
2421 	struct device *dev;
2422 	int i;
2423 
2424 	for_each_drhd_unit(drhd) {
2425 		if (!drhd->include_all) {
2426 			for_each_active_dev_scope(drhd->devices,
2427 						  drhd->devices_cnt, i, dev)
2428 				break;
2429 			/* ignore DMAR unit if no devices exist */
2430 			if (i == drhd->devices_cnt)
2431 				drhd->ignored = 1;
2432 		}
2433 	}
2434 
2435 	for_each_active_drhd_unit(drhd) {
2436 		if (drhd->include_all)
2437 			continue;
2438 
2439 		for_each_active_dev_scope(drhd->devices,
2440 					  drhd->devices_cnt, i, dev)
2441 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2442 				break;
2443 		if (i < drhd->devices_cnt)
2444 			continue;
2445 
2446 		/* This IOMMU has *only* gfx devices. Either bypass it or
2447 		   set the gfx_mapped flag, as appropriate */
2448 		drhd->gfx_dedicated = 1;
2449 		if (disable_igfx_iommu)
2450 			drhd->ignored = 1;
2451 	}
2452 }
2453 
2454 #ifdef CONFIG_SUSPEND
init_iommu_hw(void)2455 static int init_iommu_hw(void)
2456 {
2457 	struct dmar_drhd_unit *drhd;
2458 	struct intel_iommu *iommu = NULL;
2459 	int ret;
2460 
2461 	for_each_active_iommu(iommu, drhd) {
2462 		if (iommu->qi) {
2463 			ret = dmar_reenable_qi(iommu);
2464 			if (ret)
2465 				return ret;
2466 		}
2467 	}
2468 
2469 	for_each_iommu(iommu, drhd) {
2470 		if (drhd->ignored) {
2471 			/*
2472 			 * we always have to disable PMRs or DMA may fail on
2473 			 * this device
2474 			 */
2475 			if (force_on)
2476 				iommu_disable_protect_mem_regions(iommu);
2477 			continue;
2478 		}
2479 
2480 		iommu_flush_write_buffer(iommu);
2481 		iommu_set_root_entry(iommu);
2482 		iommu_enable_translation(iommu);
2483 		iommu_disable_protect_mem_regions(iommu);
2484 	}
2485 
2486 	return 0;
2487 }
2488 
iommu_flush_all(void)2489 static void iommu_flush_all(void)
2490 {
2491 	struct dmar_drhd_unit *drhd;
2492 	struct intel_iommu *iommu;
2493 
2494 	for_each_active_iommu(iommu, drhd) {
2495 		iommu->flush.flush_context(iommu, 0, 0, 0,
2496 					   DMA_CCMD_GLOBAL_INVL);
2497 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2498 					 DMA_TLB_GLOBAL_FLUSH);
2499 	}
2500 }
2501 
iommu_suspend(void)2502 static int iommu_suspend(void)
2503 {
2504 	struct dmar_drhd_unit *drhd;
2505 	struct intel_iommu *iommu = NULL;
2506 	unsigned long flag;
2507 
2508 	iommu_flush_all();
2509 
2510 	for_each_active_iommu(iommu, drhd) {
2511 		iommu_disable_translation(iommu);
2512 
2513 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2514 
2515 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2516 			readl(iommu->reg + DMAR_FECTL_REG);
2517 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2518 			readl(iommu->reg + DMAR_FEDATA_REG);
2519 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2520 			readl(iommu->reg + DMAR_FEADDR_REG);
2521 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2522 			readl(iommu->reg + DMAR_FEUADDR_REG);
2523 
2524 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2525 	}
2526 	return 0;
2527 }
2528 
iommu_resume(void)2529 static void iommu_resume(void)
2530 {
2531 	struct dmar_drhd_unit *drhd;
2532 	struct intel_iommu *iommu = NULL;
2533 	unsigned long flag;
2534 
2535 	if (init_iommu_hw()) {
2536 		if (force_on)
2537 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2538 		else
2539 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2540 		return;
2541 	}
2542 
2543 	for_each_active_iommu(iommu, drhd) {
2544 
2545 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2546 
2547 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2548 			iommu->reg + DMAR_FECTL_REG);
2549 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2550 			iommu->reg + DMAR_FEDATA_REG);
2551 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2552 			iommu->reg + DMAR_FEADDR_REG);
2553 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2554 			iommu->reg + DMAR_FEUADDR_REG);
2555 
2556 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2557 	}
2558 }
2559 
2560 static struct syscore_ops iommu_syscore_ops = {
2561 	.resume		= iommu_resume,
2562 	.suspend	= iommu_suspend,
2563 };
2564 
init_iommu_pm_ops(void)2565 static void __init init_iommu_pm_ops(void)
2566 {
2567 	register_syscore_ops(&iommu_syscore_ops);
2568 }
2569 
2570 #else
init_iommu_pm_ops(void)2571 static inline void init_iommu_pm_ops(void) {}
2572 #endif	/* CONFIG_PM */
2573 
rmrr_sanity_check(struct acpi_dmar_reserved_memory * rmrr)2574 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2575 {
2576 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2577 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2578 	    rmrr->end_address <= rmrr->base_address ||
2579 	    arch_rmrr_sanity_check(rmrr))
2580 		return -EINVAL;
2581 
2582 	return 0;
2583 }
2584 
dmar_parse_one_rmrr(struct acpi_dmar_header * header,void * arg)2585 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2586 {
2587 	struct acpi_dmar_reserved_memory *rmrr;
2588 	struct dmar_rmrr_unit *rmrru;
2589 
2590 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2591 	if (rmrr_sanity_check(rmrr)) {
2592 		pr_warn(FW_BUG
2593 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2594 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2595 			   rmrr->base_address, rmrr->end_address,
2596 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2597 			   dmi_get_system_info(DMI_BIOS_VERSION),
2598 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2599 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2600 	}
2601 
2602 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2603 	if (!rmrru)
2604 		goto out;
2605 
2606 	rmrru->hdr = header;
2607 
2608 	rmrru->base_address = rmrr->base_address;
2609 	rmrru->end_address = rmrr->end_address;
2610 
2611 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2612 				((void *)rmrr) + rmrr->header.length,
2613 				&rmrru->devices_cnt);
2614 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2615 		goto free_rmrru;
2616 
2617 	list_add(&rmrru->list, &dmar_rmrr_units);
2618 
2619 	return 0;
2620 free_rmrru:
2621 	kfree(rmrru);
2622 out:
2623 	return -ENOMEM;
2624 }
2625 
dmar_find_atsr(struct acpi_dmar_atsr * atsr)2626 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2627 {
2628 	struct dmar_atsr_unit *atsru;
2629 	struct acpi_dmar_atsr *tmp;
2630 
2631 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2632 				dmar_rcu_check()) {
2633 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2634 		if (atsr->segment != tmp->segment)
2635 			continue;
2636 		if (atsr->header.length != tmp->header.length)
2637 			continue;
2638 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2639 			return atsru;
2640 	}
2641 
2642 	return NULL;
2643 }
2644 
dmar_parse_one_atsr(struct acpi_dmar_header * hdr,void * arg)2645 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2646 {
2647 	struct acpi_dmar_atsr *atsr;
2648 	struct dmar_atsr_unit *atsru;
2649 
2650 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2651 		return 0;
2652 
2653 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2654 	atsru = dmar_find_atsr(atsr);
2655 	if (atsru)
2656 		return 0;
2657 
2658 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2659 	if (!atsru)
2660 		return -ENOMEM;
2661 
2662 	/*
2663 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2664 	 * copy the memory content because the memory buffer will be freed
2665 	 * on return.
2666 	 */
2667 	atsru->hdr = (void *)(atsru + 1);
2668 	memcpy(atsru->hdr, hdr, hdr->length);
2669 	atsru->include_all = atsr->flags & 0x1;
2670 	if (!atsru->include_all) {
2671 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2672 				(void *)atsr + atsr->header.length,
2673 				&atsru->devices_cnt);
2674 		if (atsru->devices_cnt && atsru->devices == NULL) {
2675 			kfree(atsru);
2676 			return -ENOMEM;
2677 		}
2678 	}
2679 
2680 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2681 
2682 	return 0;
2683 }
2684 
intel_iommu_free_atsr(struct dmar_atsr_unit * atsru)2685 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2686 {
2687 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2688 	kfree(atsru);
2689 }
2690 
dmar_release_one_atsr(struct acpi_dmar_header * hdr,void * arg)2691 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2692 {
2693 	struct acpi_dmar_atsr *atsr;
2694 	struct dmar_atsr_unit *atsru;
2695 
2696 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2697 	atsru = dmar_find_atsr(atsr);
2698 	if (atsru) {
2699 		list_del_rcu(&atsru->list);
2700 		synchronize_rcu();
2701 		intel_iommu_free_atsr(atsru);
2702 	}
2703 
2704 	return 0;
2705 }
2706 
dmar_check_one_atsr(struct acpi_dmar_header * hdr,void * arg)2707 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2708 {
2709 	int i;
2710 	struct device *dev;
2711 	struct acpi_dmar_atsr *atsr;
2712 	struct dmar_atsr_unit *atsru;
2713 
2714 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2715 	atsru = dmar_find_atsr(atsr);
2716 	if (!atsru)
2717 		return 0;
2718 
2719 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2720 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2721 					  i, dev)
2722 			return -EBUSY;
2723 	}
2724 
2725 	return 0;
2726 }
2727 
dmar_find_satc(struct acpi_dmar_satc * satc)2728 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2729 {
2730 	struct dmar_satc_unit *satcu;
2731 	struct acpi_dmar_satc *tmp;
2732 
2733 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2734 				dmar_rcu_check()) {
2735 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2736 		if (satc->segment != tmp->segment)
2737 			continue;
2738 		if (satc->header.length != tmp->header.length)
2739 			continue;
2740 		if (memcmp(satc, tmp, satc->header.length) == 0)
2741 			return satcu;
2742 	}
2743 
2744 	return NULL;
2745 }
2746 
dmar_parse_one_satc(struct acpi_dmar_header * hdr,void * arg)2747 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2748 {
2749 	struct acpi_dmar_satc *satc;
2750 	struct dmar_satc_unit *satcu;
2751 
2752 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2753 		return 0;
2754 
2755 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2756 	satcu = dmar_find_satc(satc);
2757 	if (satcu)
2758 		return 0;
2759 
2760 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2761 	if (!satcu)
2762 		return -ENOMEM;
2763 
2764 	satcu->hdr = (void *)(satcu + 1);
2765 	memcpy(satcu->hdr, hdr, hdr->length);
2766 	satcu->atc_required = satc->flags & 0x1;
2767 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2768 					      (void *)satc + satc->header.length,
2769 					      &satcu->devices_cnt);
2770 	if (satcu->devices_cnt && !satcu->devices) {
2771 		kfree(satcu);
2772 		return -ENOMEM;
2773 	}
2774 	list_add_rcu(&satcu->list, &dmar_satc_units);
2775 
2776 	return 0;
2777 }
2778 
intel_iommu_add(struct dmar_drhd_unit * dmaru)2779 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2780 {
2781 	int sp, ret;
2782 	struct intel_iommu *iommu = dmaru->iommu;
2783 
2784 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2785 	if (ret)
2786 		goto out;
2787 
2788 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2789 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2790 		pr_warn("%s: Doesn't support large page.\n",
2791 			iommu->name);
2792 		return -ENXIO;
2793 	}
2794 
2795 	/*
2796 	 * Disable translation if already enabled prior to OS handover.
2797 	 */
2798 	if (iommu->gcmd & DMA_GCMD_TE)
2799 		iommu_disable_translation(iommu);
2800 
2801 	ret = iommu_init_domains(iommu);
2802 	if (ret == 0)
2803 		ret = iommu_alloc_root_entry(iommu);
2804 	if (ret)
2805 		goto out;
2806 
2807 	intel_svm_check(iommu);
2808 
2809 	if (dmaru->ignored) {
2810 		/*
2811 		 * we always have to disable PMRs or DMA may fail on this device
2812 		 */
2813 		if (force_on)
2814 			iommu_disable_protect_mem_regions(iommu);
2815 		return 0;
2816 	}
2817 
2818 	intel_iommu_init_qi(iommu);
2819 	iommu_flush_write_buffer(iommu);
2820 
2821 #ifdef CONFIG_INTEL_IOMMU_SVM
2822 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2823 		ret = intel_svm_enable_prq(iommu);
2824 		if (ret)
2825 			goto disable_iommu;
2826 	}
2827 #endif
2828 	ret = dmar_set_interrupt(iommu);
2829 	if (ret)
2830 		goto disable_iommu;
2831 
2832 	iommu_set_root_entry(iommu);
2833 	iommu_enable_translation(iommu);
2834 
2835 	iommu_disable_protect_mem_regions(iommu);
2836 	return 0;
2837 
2838 disable_iommu:
2839 	disable_dmar_iommu(iommu);
2840 out:
2841 	free_dmar_iommu(iommu);
2842 	return ret;
2843 }
2844 
dmar_iommu_hotplug(struct dmar_drhd_unit * dmaru,bool insert)2845 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2846 {
2847 	int ret = 0;
2848 	struct intel_iommu *iommu = dmaru->iommu;
2849 
2850 	if (!intel_iommu_enabled)
2851 		return 0;
2852 	if (iommu == NULL)
2853 		return -EINVAL;
2854 
2855 	if (insert) {
2856 		ret = intel_iommu_add(dmaru);
2857 	} else {
2858 		disable_dmar_iommu(iommu);
2859 		free_dmar_iommu(iommu);
2860 	}
2861 
2862 	return ret;
2863 }
2864 
intel_iommu_free_dmars(void)2865 static void intel_iommu_free_dmars(void)
2866 {
2867 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2868 	struct dmar_atsr_unit *atsru, *atsr_n;
2869 	struct dmar_satc_unit *satcu, *satc_n;
2870 
2871 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2872 		list_del(&rmrru->list);
2873 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2874 		kfree(rmrru);
2875 	}
2876 
2877 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2878 		list_del(&atsru->list);
2879 		intel_iommu_free_atsr(atsru);
2880 	}
2881 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2882 		list_del(&satcu->list);
2883 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2884 		kfree(satcu);
2885 	}
2886 }
2887 
dmar_find_matched_satc_unit(struct pci_dev * dev)2888 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2889 {
2890 	struct dmar_satc_unit *satcu;
2891 	struct acpi_dmar_satc *satc;
2892 	struct device *tmp;
2893 	int i;
2894 
2895 	dev = pci_physfn(dev);
2896 	rcu_read_lock();
2897 
2898 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2899 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2900 		if (satc->segment != pci_domain_nr(dev->bus))
2901 			continue;
2902 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2903 			if (to_pci_dev(tmp) == dev)
2904 				goto out;
2905 	}
2906 	satcu = NULL;
2907 out:
2908 	rcu_read_unlock();
2909 	return satcu;
2910 }
2911 
dmar_ats_supported(struct pci_dev * dev,struct intel_iommu * iommu)2912 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2913 {
2914 	int i, ret = 1;
2915 	struct pci_bus *bus;
2916 	struct pci_dev *bridge = NULL;
2917 	struct device *tmp;
2918 	struct acpi_dmar_atsr *atsr;
2919 	struct dmar_atsr_unit *atsru;
2920 	struct dmar_satc_unit *satcu;
2921 
2922 	dev = pci_physfn(dev);
2923 	satcu = dmar_find_matched_satc_unit(dev);
2924 	if (satcu)
2925 		/*
2926 		 * This device supports ATS as it is in SATC table.
2927 		 * When IOMMU is in legacy mode, enabling ATS is done
2928 		 * automatically by HW for the device that requires
2929 		 * ATS, hence OS should not enable this device ATS
2930 		 * to avoid duplicated TLB invalidation.
2931 		 */
2932 		return !(satcu->atc_required && !sm_supported(iommu));
2933 
2934 	for (bus = dev->bus; bus; bus = bus->parent) {
2935 		bridge = bus->self;
2936 		/* If it's an integrated device, allow ATS */
2937 		if (!bridge)
2938 			return 1;
2939 		/* Connected via non-PCIe: no ATS */
2940 		if (!pci_is_pcie(bridge) ||
2941 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2942 			return 0;
2943 		/* If we found the root port, look it up in the ATSR */
2944 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2945 			break;
2946 	}
2947 
2948 	rcu_read_lock();
2949 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2950 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2951 		if (atsr->segment != pci_domain_nr(dev->bus))
2952 			continue;
2953 
2954 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2955 			if (tmp == &bridge->dev)
2956 				goto out;
2957 
2958 		if (atsru->include_all)
2959 			goto out;
2960 	}
2961 	ret = 0;
2962 out:
2963 	rcu_read_unlock();
2964 
2965 	return ret;
2966 }
2967 
dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info * info)2968 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2969 {
2970 	int ret;
2971 	struct dmar_rmrr_unit *rmrru;
2972 	struct dmar_atsr_unit *atsru;
2973 	struct dmar_satc_unit *satcu;
2974 	struct acpi_dmar_atsr *atsr;
2975 	struct acpi_dmar_reserved_memory *rmrr;
2976 	struct acpi_dmar_satc *satc;
2977 
2978 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2979 		return 0;
2980 
2981 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2982 		rmrr = container_of(rmrru->hdr,
2983 				    struct acpi_dmar_reserved_memory, header);
2984 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2985 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2986 				((void *)rmrr) + rmrr->header.length,
2987 				rmrr->segment, rmrru->devices,
2988 				rmrru->devices_cnt);
2989 			if (ret < 0)
2990 				return ret;
2991 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2992 			dmar_remove_dev_scope(info, rmrr->segment,
2993 				rmrru->devices, rmrru->devices_cnt);
2994 		}
2995 	}
2996 
2997 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2998 		if (atsru->include_all)
2999 			continue;
3000 
3001 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3002 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3003 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3004 					(void *)atsr + atsr->header.length,
3005 					atsr->segment, atsru->devices,
3006 					atsru->devices_cnt);
3007 			if (ret > 0)
3008 				break;
3009 			else if (ret < 0)
3010 				return ret;
3011 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3012 			if (dmar_remove_dev_scope(info, atsr->segment,
3013 					atsru->devices, atsru->devices_cnt))
3014 				break;
3015 		}
3016 	}
3017 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3018 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3019 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3020 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3021 					(void *)satc + satc->header.length,
3022 					satc->segment, satcu->devices,
3023 					satcu->devices_cnt);
3024 			if (ret > 0)
3025 				break;
3026 			else if (ret < 0)
3027 				return ret;
3028 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3029 			if (dmar_remove_dev_scope(info, satc->segment,
3030 					satcu->devices, satcu->devices_cnt))
3031 				break;
3032 		}
3033 	}
3034 
3035 	return 0;
3036 }
3037 
intel_disable_iommus(void)3038 static void intel_disable_iommus(void)
3039 {
3040 	struct intel_iommu *iommu = NULL;
3041 	struct dmar_drhd_unit *drhd;
3042 
3043 	for_each_iommu(iommu, drhd)
3044 		iommu_disable_translation(iommu);
3045 }
3046 
intel_iommu_shutdown(void)3047 void intel_iommu_shutdown(void)
3048 {
3049 	struct dmar_drhd_unit *drhd;
3050 	struct intel_iommu *iommu = NULL;
3051 
3052 	if (no_iommu || dmar_disabled)
3053 		return;
3054 
3055 	down_write(&dmar_global_lock);
3056 
3057 	/* Disable PMRs explicitly here. */
3058 	for_each_iommu(iommu, drhd)
3059 		iommu_disable_protect_mem_regions(iommu);
3060 
3061 	/* Make sure the IOMMUs are switched off */
3062 	intel_disable_iommus();
3063 
3064 	up_write(&dmar_global_lock);
3065 }
3066 
dev_to_intel_iommu(struct device * dev)3067 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3068 {
3069 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3070 
3071 	return container_of(iommu_dev, struct intel_iommu, iommu);
3072 }
3073 
version_show(struct device * dev,struct device_attribute * attr,char * buf)3074 static ssize_t version_show(struct device *dev,
3075 			    struct device_attribute *attr, char *buf)
3076 {
3077 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3078 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3079 	return sysfs_emit(buf, "%d:%d\n",
3080 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3081 }
3082 static DEVICE_ATTR_RO(version);
3083 
address_show(struct device * dev,struct device_attribute * attr,char * buf)3084 static ssize_t address_show(struct device *dev,
3085 			    struct device_attribute *attr, char *buf)
3086 {
3087 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3089 }
3090 static DEVICE_ATTR_RO(address);
3091 
cap_show(struct device * dev,struct device_attribute * attr,char * buf)3092 static ssize_t cap_show(struct device *dev,
3093 			struct device_attribute *attr, char *buf)
3094 {
3095 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3096 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3097 }
3098 static DEVICE_ATTR_RO(cap);
3099 
ecap_show(struct device * dev,struct device_attribute * attr,char * buf)3100 static ssize_t ecap_show(struct device *dev,
3101 			 struct device_attribute *attr, char *buf)
3102 {
3103 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3104 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3105 }
3106 static DEVICE_ATTR_RO(ecap);
3107 
domains_supported_show(struct device * dev,struct device_attribute * attr,char * buf)3108 static ssize_t domains_supported_show(struct device *dev,
3109 				      struct device_attribute *attr, char *buf)
3110 {
3111 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3112 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3113 }
3114 static DEVICE_ATTR_RO(domains_supported);
3115 
domains_used_show(struct device * dev,struct device_attribute * attr,char * buf)3116 static ssize_t domains_used_show(struct device *dev,
3117 				 struct device_attribute *attr, char *buf)
3118 {
3119 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3120 	return sysfs_emit(buf, "%d\n",
3121 			  bitmap_weight(iommu->domain_ids,
3122 					cap_ndoms(iommu->cap)));
3123 }
3124 static DEVICE_ATTR_RO(domains_used);
3125 
3126 static struct attribute *intel_iommu_attrs[] = {
3127 	&dev_attr_version.attr,
3128 	&dev_attr_address.attr,
3129 	&dev_attr_cap.attr,
3130 	&dev_attr_ecap.attr,
3131 	&dev_attr_domains_supported.attr,
3132 	&dev_attr_domains_used.attr,
3133 	NULL,
3134 };
3135 
3136 static struct attribute_group intel_iommu_group = {
3137 	.name = "intel-iommu",
3138 	.attrs = intel_iommu_attrs,
3139 };
3140 
3141 const struct attribute_group *intel_iommu_groups[] = {
3142 	&intel_iommu_group,
3143 	NULL,
3144 };
3145 
has_external_pci(void)3146 static bool has_external_pci(void)
3147 {
3148 	struct pci_dev *pdev = NULL;
3149 
3150 	for_each_pci_dev(pdev)
3151 		if (pdev->external_facing) {
3152 			pci_dev_put(pdev);
3153 			return true;
3154 		}
3155 
3156 	return false;
3157 }
3158 
platform_optin_force_iommu(void)3159 static int __init platform_optin_force_iommu(void)
3160 {
3161 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3162 		return 0;
3163 
3164 	if (no_iommu || dmar_disabled)
3165 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3166 
3167 	/*
3168 	 * If Intel-IOMMU is disabled by default, we will apply identity
3169 	 * map for all devices except those marked as requiring DMA protection.
3170 	 */
3171 	if (dmar_disabled)
3172 		iommu_set_default_passthrough(false);
3173 
3174 	dmar_disabled = 0;
3175 	no_iommu = 0;
3176 
3177 	return 1;
3178 }
3179 
probe_acpi_namespace_devices(void)3180 static int __init probe_acpi_namespace_devices(void)
3181 {
3182 	struct dmar_drhd_unit *drhd;
3183 	/* To avoid a -Wunused-but-set-variable warning. */
3184 	struct intel_iommu *iommu __maybe_unused;
3185 	struct device *dev;
3186 	int i, ret = 0;
3187 
3188 	for_each_active_iommu(iommu, drhd) {
3189 		for_each_active_dev_scope(drhd->devices,
3190 					  drhd->devices_cnt, i, dev) {
3191 			struct acpi_device_physical_node *pn;
3192 			struct acpi_device *adev;
3193 
3194 			if (dev->bus != &acpi_bus_type)
3195 				continue;
3196 
3197 			up_read(&dmar_global_lock);
3198 			adev = to_acpi_device(dev);
3199 			mutex_lock(&adev->physical_node_lock);
3200 			list_for_each_entry(pn,
3201 					    &adev->physical_node_list, node) {
3202 				ret = iommu_probe_device(pn->dev);
3203 				if (ret)
3204 					break;
3205 			}
3206 			mutex_unlock(&adev->physical_node_lock);
3207 			down_read(&dmar_global_lock);
3208 
3209 			if (ret)
3210 				return ret;
3211 		}
3212 	}
3213 
3214 	return 0;
3215 }
3216 
tboot_force_iommu(void)3217 static __init int tboot_force_iommu(void)
3218 {
3219 	if (!tboot_enabled())
3220 		return 0;
3221 
3222 	if (no_iommu || dmar_disabled)
3223 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3224 
3225 	dmar_disabled = 0;
3226 	no_iommu = 0;
3227 
3228 	return 1;
3229 }
3230 
intel_iommu_init(void)3231 int __init intel_iommu_init(void)
3232 {
3233 	int ret = -ENODEV;
3234 	struct dmar_drhd_unit *drhd;
3235 	struct intel_iommu *iommu;
3236 
3237 	/*
3238 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3239 	 * opt in, so enforce that.
3240 	 */
3241 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3242 		    platform_optin_force_iommu();
3243 
3244 	down_write(&dmar_global_lock);
3245 	if (dmar_table_init()) {
3246 		if (force_on)
3247 			panic("tboot: Failed to initialize DMAR table\n");
3248 		goto out_free_dmar;
3249 	}
3250 
3251 	if (dmar_dev_scope_init() < 0) {
3252 		if (force_on)
3253 			panic("tboot: Failed to initialize DMAR device scope\n");
3254 		goto out_free_dmar;
3255 	}
3256 
3257 	up_write(&dmar_global_lock);
3258 
3259 	/*
3260 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3261 	 * complain later when we register it under the lock.
3262 	 */
3263 	dmar_register_bus_notifier();
3264 
3265 	down_write(&dmar_global_lock);
3266 
3267 	if (!no_iommu)
3268 		intel_iommu_debugfs_init();
3269 
3270 	if (no_iommu || dmar_disabled) {
3271 		/*
3272 		 * We exit the function here to ensure IOMMU's remapping and
3273 		 * mempool aren't setup, which means that the IOMMU's PMRs
3274 		 * won't be disabled via the call to init_dmars(). So disable
3275 		 * it explicitly here. The PMRs were setup by tboot prior to
3276 		 * calling SENTER, but the kernel is expected to reset/tear
3277 		 * down the PMRs.
3278 		 */
3279 		if (intel_iommu_tboot_noforce) {
3280 			for_each_iommu(iommu, drhd)
3281 				iommu_disable_protect_mem_regions(iommu);
3282 		}
3283 
3284 		/*
3285 		 * Make sure the IOMMUs are switched off, even when we
3286 		 * boot into a kexec kernel and the previous kernel left
3287 		 * them enabled
3288 		 */
3289 		intel_disable_iommus();
3290 		goto out_free_dmar;
3291 	}
3292 
3293 	if (list_empty(&dmar_rmrr_units))
3294 		pr_info("No RMRR found\n");
3295 
3296 	if (list_empty(&dmar_atsr_units))
3297 		pr_info("No ATSR found\n");
3298 
3299 	if (list_empty(&dmar_satc_units))
3300 		pr_info("No SATC found\n");
3301 
3302 	init_no_remapping_devices();
3303 
3304 	ret = init_dmars();
3305 	if (ret) {
3306 		if (force_on)
3307 			panic("tboot: Failed to initialize DMARs\n");
3308 		pr_err("Initialization failed\n");
3309 		goto out_free_dmar;
3310 	}
3311 	up_write(&dmar_global_lock);
3312 
3313 	init_iommu_pm_ops();
3314 
3315 	down_read(&dmar_global_lock);
3316 	for_each_active_iommu(iommu, drhd) {
3317 		/*
3318 		 * The flush queue implementation does not perform
3319 		 * page-selective invalidations that are required for efficient
3320 		 * TLB flushes in virtual environments.  The benefit of batching
3321 		 * is likely to be much lower than the overhead of synchronizing
3322 		 * the virtual and physical IOMMU page-tables.
3323 		 */
3324 		if (cap_caching_mode(iommu->cap) &&
3325 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3326 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3327 			iommu_set_dma_strict();
3328 		}
3329 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3330 				       intel_iommu_groups,
3331 				       "%s", iommu->name);
3332 		/*
3333 		 * The iommu device probe is protected by the iommu_probe_device_lock.
3334 		 * Release the dmar_global_lock before entering the device probe path
3335 		 * to avoid unnecessary lock order splat.
3336 		 */
3337 		up_read(&dmar_global_lock);
3338 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3339 		down_read(&dmar_global_lock);
3340 
3341 		iommu_pmu_register(iommu);
3342 	}
3343 
3344 	if (probe_acpi_namespace_devices())
3345 		pr_warn("ACPI name space devices didn't probe correctly\n");
3346 
3347 	/* Finally, we enable the DMA remapping hardware. */
3348 	for_each_iommu(iommu, drhd) {
3349 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3350 			iommu_enable_translation(iommu);
3351 
3352 		iommu_disable_protect_mem_regions(iommu);
3353 	}
3354 	up_read(&dmar_global_lock);
3355 
3356 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3357 
3358 	intel_iommu_enabled = 1;
3359 
3360 	return 0;
3361 
3362 out_free_dmar:
3363 	intel_iommu_free_dmars();
3364 	up_write(&dmar_global_lock);
3365 	return ret;
3366 }
3367 
domain_context_clear_one_cb(struct pci_dev * pdev,u16 alias,void * opaque)3368 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3369 {
3370 	struct device_domain_info *info = opaque;
3371 
3372 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3373 	return 0;
3374 }
3375 
3376 /*
3377  * NB - intel-iommu lacks any sort of reference counting for the users of
3378  * dependent devices.  If multiple endpoints have intersecting dependent
3379  * devices, unbinding the driver from any one of them will possibly leave
3380  * the others unable to operate.
3381  */
domain_context_clear(struct device_domain_info * info)3382 static void domain_context_clear(struct device_domain_info *info)
3383 {
3384 	if (!dev_is_pci(info->dev)) {
3385 		domain_context_clear_one(info, info->bus, info->devfn);
3386 		return;
3387 	}
3388 
3389 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3390 			       &domain_context_clear_one_cb, info);
3391 }
3392 
3393 /*
3394  * Clear the page table pointer in context or pasid table entries so that
3395  * all DMA requests without PASID from the device are blocked. If the page
3396  * table has been set, clean up the data structures.
3397  */
device_block_translation(struct device * dev)3398 void device_block_translation(struct device *dev)
3399 {
3400 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3401 	struct intel_iommu *iommu = info->iommu;
3402 	unsigned long flags;
3403 
3404 	/* Device in DMA blocking state. Noting to do. */
3405 	if (!info->domain_attached)
3406 		return;
3407 
3408 	if (info->domain)
3409 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3410 
3411 	iommu_disable_pci_caps(info);
3412 	if (!dev_is_real_dma_subdevice(dev)) {
3413 		if (sm_supported(iommu))
3414 			intel_pasid_tear_down_entry(iommu, dev,
3415 						    IOMMU_NO_PASID, false);
3416 		else
3417 			domain_context_clear(info);
3418 	}
3419 
3420 	/* Device now in DMA blocking state. */
3421 	info->domain_attached = false;
3422 
3423 	if (!info->domain)
3424 		return;
3425 
3426 	spin_lock_irqsave(&info->domain->lock, flags);
3427 	list_del(&info->link);
3428 	spin_unlock_irqrestore(&info->domain->lock, flags);
3429 
3430 	domain_detach_iommu(info->domain, iommu);
3431 	info->domain = NULL;
3432 }
3433 
md_domain_init(struct dmar_domain * domain,int guest_width)3434 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3435 {
3436 	int adjust_width;
3437 
3438 	/* calculate AGAW */
3439 	domain->gaw = guest_width;
3440 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3441 	domain->agaw = width_to_agaw(adjust_width);
3442 
3443 	domain->iommu_coherency = false;
3444 	domain->iommu_superpage = 0;
3445 	domain->max_addr = 0;
3446 
3447 	/* always allocate the top pgd */
3448 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3449 	if (!domain->pgd)
3450 		return -ENOMEM;
3451 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3452 	return 0;
3453 }
3454 
blocking_domain_attach_dev(struct iommu_domain * domain,struct device * dev)3455 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3456 				      struct device *dev)
3457 {
3458 	device_block_translation(dev);
3459 	return 0;
3460 }
3461 
3462 static struct iommu_domain blocking_domain = {
3463 	.type = IOMMU_DOMAIN_BLOCKED,
3464 	.ops = &(const struct iommu_domain_ops) {
3465 		.attach_dev	= blocking_domain_attach_dev,
3466 	}
3467 };
3468 
iommu_superpage_capability(struct intel_iommu * iommu,bool first_stage)3469 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3470 {
3471 	if (!intel_iommu_superpage)
3472 		return 0;
3473 
3474 	if (first_stage)
3475 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3476 
3477 	return fls(cap_super_page_val(iommu->cap));
3478 }
3479 
paging_domain_alloc(struct device * dev,bool first_stage)3480 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3481 {
3482 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3483 	struct intel_iommu *iommu = info->iommu;
3484 	struct dmar_domain *domain;
3485 	int addr_width;
3486 
3487 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3488 	if (!domain)
3489 		return ERR_PTR(-ENOMEM);
3490 
3491 	INIT_LIST_HEAD(&domain->devices);
3492 	INIT_LIST_HEAD(&domain->dev_pasids);
3493 	INIT_LIST_HEAD(&domain->cache_tags);
3494 	spin_lock_init(&domain->lock);
3495 	spin_lock_init(&domain->cache_lock);
3496 	xa_init(&domain->iommu_array);
3497 
3498 	domain->nid = dev_to_node(dev);
3499 	domain->use_first_level = first_stage;
3500 
3501 	/* calculate the address width */
3502 	addr_width = agaw_to_width(iommu->agaw);
3503 	if (addr_width > cap_mgaw(iommu->cap))
3504 		addr_width = cap_mgaw(iommu->cap);
3505 	domain->gaw = addr_width;
3506 	domain->agaw = iommu->agaw;
3507 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3508 
3509 	/* iommu memory access coherency */
3510 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3511 
3512 	/* pagesize bitmap */
3513 	domain->domain.pgsize_bitmap = SZ_4K;
3514 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3515 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3516 
3517 	/*
3518 	 * IOVA aperture: First-level translation restricts the input-address
3519 	 * to a canonical address (i.e., address bits 63:N have the same value
3520 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3521 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3522 	 */
3523 	domain->domain.geometry.force_aperture = true;
3524 	domain->domain.geometry.aperture_start = 0;
3525 	if (first_stage)
3526 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3527 	else
3528 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3529 
3530 	/* always allocate the top pgd */
3531 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3532 	if (!domain->pgd) {
3533 		kfree(domain);
3534 		return ERR_PTR(-ENOMEM);
3535 	}
3536 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3537 
3538 	return domain;
3539 }
3540 
intel_iommu_domain_alloc(unsigned type)3541 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3542 {
3543 	struct dmar_domain *dmar_domain;
3544 	struct iommu_domain *domain;
3545 
3546 	switch (type) {
3547 	case IOMMU_DOMAIN_DMA:
3548 	case IOMMU_DOMAIN_UNMANAGED:
3549 		dmar_domain = alloc_domain(type);
3550 		if (!dmar_domain) {
3551 			pr_err("Can't allocate dmar_domain\n");
3552 			return NULL;
3553 		}
3554 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3555 			pr_err("Domain initialization failed\n");
3556 			domain_exit(dmar_domain);
3557 			return NULL;
3558 		}
3559 
3560 		domain = &dmar_domain->domain;
3561 		domain->geometry.aperture_start = 0;
3562 		domain->geometry.aperture_end   =
3563 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3564 		domain->geometry.force_aperture = true;
3565 
3566 		return domain;
3567 	default:
3568 		return NULL;
3569 	}
3570 
3571 	return NULL;
3572 }
3573 
3574 static struct iommu_domain *
intel_iommu_domain_alloc_user(struct device * dev,u32 flags,struct iommu_domain * parent,const struct iommu_user_data * user_data)3575 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3576 			      struct iommu_domain *parent,
3577 			      const struct iommu_user_data *user_data)
3578 {
3579 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3580 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3581 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3582 	struct intel_iommu *iommu = info->iommu;
3583 	struct dmar_domain *dmar_domain;
3584 	struct iommu_domain *domain;
3585 
3586 	/* Must be NESTING domain */
3587 	if (parent) {
3588 		if (!nested_supported(iommu) || flags)
3589 			return ERR_PTR(-EOPNOTSUPP);
3590 		return intel_nested_domain_alloc(parent, user_data);
3591 	}
3592 
3593 	if (flags &
3594 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3595 		return ERR_PTR(-EOPNOTSUPP);
3596 	if (nested_parent && !nested_supported(iommu))
3597 		return ERR_PTR(-EOPNOTSUPP);
3598 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3599 		return ERR_PTR(-EOPNOTSUPP);
3600 
3601 	/* Do not use first stage for user domain translation. */
3602 	dmar_domain = paging_domain_alloc(dev, false);
3603 	if (IS_ERR(dmar_domain))
3604 		return ERR_CAST(dmar_domain);
3605 	domain = &dmar_domain->domain;
3606 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3607 	domain->owner = &intel_iommu_ops;
3608 	domain->ops = intel_iommu_ops.default_domain_ops;
3609 
3610 	if (nested_parent) {
3611 		dmar_domain->nested_parent = true;
3612 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3613 		spin_lock_init(&dmar_domain->s1_lock);
3614 	}
3615 
3616 	if (dirty_tracking) {
3617 		if (dmar_domain->use_first_level) {
3618 			iommu_domain_free(domain);
3619 			return ERR_PTR(-EOPNOTSUPP);
3620 		}
3621 		domain->dirty_ops = &intel_dirty_ops;
3622 	}
3623 
3624 	return domain;
3625 }
3626 
intel_iommu_domain_free(struct iommu_domain * domain)3627 static void intel_iommu_domain_free(struct iommu_domain *domain)
3628 {
3629 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3630 
3631 	WARN_ON(dmar_domain->nested_parent &&
3632 		!list_empty(&dmar_domain->s1_domains));
3633 	domain_exit(dmar_domain);
3634 }
3635 
prepare_domain_attach_device(struct iommu_domain * domain,struct device * dev)3636 int prepare_domain_attach_device(struct iommu_domain *domain,
3637 				 struct device *dev)
3638 {
3639 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3640 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3641 	struct intel_iommu *iommu = info->iommu;
3642 	int addr_width;
3643 
3644 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3645 		return -EINVAL;
3646 
3647 	if (domain->dirty_ops && !ssads_supported(iommu))
3648 		return -EINVAL;
3649 
3650 	/* check if this iommu agaw is sufficient for max mapped address */
3651 	addr_width = agaw_to_width(iommu->agaw);
3652 	if (addr_width > cap_mgaw(iommu->cap))
3653 		addr_width = cap_mgaw(iommu->cap);
3654 
3655 	if (dmar_domain->max_addr > (1LL << addr_width))
3656 		return -EINVAL;
3657 	dmar_domain->gaw = addr_width;
3658 
3659 	/*
3660 	 * Knock out extra levels of page tables if necessary
3661 	 */
3662 	while (iommu->agaw < dmar_domain->agaw) {
3663 		struct dma_pte *pte;
3664 
3665 		pte = dmar_domain->pgd;
3666 		if (dma_pte_present(pte)) {
3667 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3668 			iommu_free_page(pte);
3669 		}
3670 		dmar_domain->agaw--;
3671 	}
3672 
3673 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3674 	    context_copied(iommu, info->bus, info->devfn))
3675 		return intel_pasid_setup_sm_context(dev);
3676 
3677 	return 0;
3678 }
3679 
intel_iommu_attach_device(struct iommu_domain * domain,struct device * dev)3680 static int intel_iommu_attach_device(struct iommu_domain *domain,
3681 				     struct device *dev)
3682 {
3683 	int ret;
3684 
3685 	device_block_translation(dev);
3686 
3687 	ret = prepare_domain_attach_device(domain, dev);
3688 	if (ret)
3689 		return ret;
3690 
3691 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3692 }
3693 
intel_iommu_map(struct iommu_domain * domain,unsigned long iova,phys_addr_t hpa,size_t size,int iommu_prot,gfp_t gfp)3694 static int intel_iommu_map(struct iommu_domain *domain,
3695 			   unsigned long iova, phys_addr_t hpa,
3696 			   size_t size, int iommu_prot, gfp_t gfp)
3697 {
3698 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3699 	u64 max_addr;
3700 	int prot = 0;
3701 
3702 	if (iommu_prot & IOMMU_READ)
3703 		prot |= DMA_PTE_READ;
3704 	if (iommu_prot & IOMMU_WRITE)
3705 		prot |= DMA_PTE_WRITE;
3706 	if (dmar_domain->set_pte_snp)
3707 		prot |= DMA_PTE_SNP;
3708 
3709 	max_addr = iova + size;
3710 	if (dmar_domain->max_addr < max_addr) {
3711 		u64 end;
3712 
3713 		/* check if minimum agaw is sufficient for mapped address */
3714 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3715 		if (end < max_addr) {
3716 			pr_err("%s: iommu width (%d) is not "
3717 			       "sufficient for the mapped address (%llx)\n",
3718 			       __func__, dmar_domain->gaw, max_addr);
3719 			return -EFAULT;
3720 		}
3721 		dmar_domain->max_addr = max_addr;
3722 	}
3723 	/* Round up size to next multiple of PAGE_SIZE, if it and
3724 	   the low bits of hpa would take us onto the next page */
3725 	size = aligned_nrpages(hpa, size);
3726 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3727 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3728 }
3729 
intel_iommu_map_pages(struct iommu_domain * domain,unsigned long iova,phys_addr_t paddr,size_t pgsize,size_t pgcount,int prot,gfp_t gfp,size_t * mapped)3730 static int intel_iommu_map_pages(struct iommu_domain *domain,
3731 				 unsigned long iova, phys_addr_t paddr,
3732 				 size_t pgsize, size_t pgcount,
3733 				 int prot, gfp_t gfp, size_t *mapped)
3734 {
3735 	unsigned long pgshift = __ffs(pgsize);
3736 	size_t size = pgcount << pgshift;
3737 	int ret;
3738 
3739 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3740 		return -EINVAL;
3741 
3742 	if (!IS_ALIGNED(iova | paddr, pgsize))
3743 		return -EINVAL;
3744 
3745 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3746 	if (!ret && mapped)
3747 		*mapped = size;
3748 
3749 	return ret;
3750 }
3751 
intel_iommu_unmap(struct iommu_domain * domain,unsigned long iova,size_t size,struct iommu_iotlb_gather * gather)3752 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3753 				unsigned long iova, size_t size,
3754 				struct iommu_iotlb_gather *gather)
3755 {
3756 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3757 	unsigned long start_pfn, last_pfn;
3758 	int level = 0;
3759 
3760 	/* Cope with horrid API which requires us to unmap more than the
3761 	   size argument if it happens to be a large-page mapping. */
3762 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3763 				     &level, GFP_ATOMIC)))
3764 		return 0;
3765 
3766 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3767 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3768 
3769 	start_pfn = iova >> VTD_PAGE_SHIFT;
3770 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3771 
3772 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3773 
3774 	if (dmar_domain->max_addr == iova + size)
3775 		dmar_domain->max_addr = iova;
3776 
3777 	/*
3778 	 * We do not use page-selective IOTLB invalidation in flush queue,
3779 	 * so there is no need to track page and sync iotlb.
3780 	 */
3781 	if (!iommu_iotlb_gather_queued(gather))
3782 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3783 
3784 	return size;
3785 }
3786 
intel_iommu_unmap_pages(struct iommu_domain * domain,unsigned long iova,size_t pgsize,size_t pgcount,struct iommu_iotlb_gather * gather)3787 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3788 				      unsigned long iova,
3789 				      size_t pgsize, size_t pgcount,
3790 				      struct iommu_iotlb_gather *gather)
3791 {
3792 	unsigned long pgshift = __ffs(pgsize);
3793 	size_t size = pgcount << pgshift;
3794 
3795 	return intel_iommu_unmap(domain, iova, size, gather);
3796 }
3797 
intel_iommu_tlb_sync(struct iommu_domain * domain,struct iommu_iotlb_gather * gather)3798 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3799 				 struct iommu_iotlb_gather *gather)
3800 {
3801 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3802 			      gather->end, list_empty(&gather->freelist));
3803 	iommu_put_pages_list(&gather->freelist);
3804 }
3805 
intel_iommu_iova_to_phys(struct iommu_domain * domain,dma_addr_t iova)3806 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3807 					    dma_addr_t iova)
3808 {
3809 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3810 	struct dma_pte *pte;
3811 	int level = 0;
3812 	u64 phys = 0;
3813 
3814 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3815 			     GFP_ATOMIC);
3816 	if (pte && dma_pte_present(pte))
3817 		phys = dma_pte_addr(pte) +
3818 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3819 						VTD_PAGE_SHIFT) - 1));
3820 
3821 	return phys;
3822 }
3823 
domain_support_force_snooping(struct dmar_domain * domain)3824 static bool domain_support_force_snooping(struct dmar_domain *domain)
3825 {
3826 	struct device_domain_info *info;
3827 	bool support = true;
3828 
3829 	assert_spin_locked(&domain->lock);
3830 	list_for_each_entry(info, &domain->devices, link) {
3831 		if (!ecap_sc_support(info->iommu->ecap)) {
3832 			support = false;
3833 			break;
3834 		}
3835 	}
3836 
3837 	return support;
3838 }
3839 
domain_set_force_snooping(struct dmar_domain * domain)3840 static void domain_set_force_snooping(struct dmar_domain *domain)
3841 {
3842 	struct device_domain_info *info;
3843 
3844 	assert_spin_locked(&domain->lock);
3845 	/*
3846 	 * Second level page table supports per-PTE snoop control. The
3847 	 * iommu_map() interface will handle this by setting SNP bit.
3848 	 */
3849 	if (!domain->use_first_level) {
3850 		domain->set_pte_snp = true;
3851 		return;
3852 	}
3853 
3854 	list_for_each_entry(info, &domain->devices, link)
3855 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3856 						     IOMMU_NO_PASID);
3857 }
3858 
intel_iommu_enforce_cache_coherency(struct iommu_domain * domain)3859 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3860 {
3861 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3862 	unsigned long flags;
3863 
3864 	if (dmar_domain->force_snooping)
3865 		return true;
3866 
3867 	spin_lock_irqsave(&dmar_domain->lock, flags);
3868 	if (!domain_support_force_snooping(dmar_domain) ||
3869 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3870 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3871 		return false;
3872 	}
3873 
3874 	domain_set_force_snooping(dmar_domain);
3875 	dmar_domain->force_snooping = true;
3876 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3877 
3878 	return true;
3879 }
3880 
intel_iommu_capable(struct device * dev,enum iommu_cap cap)3881 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3882 {
3883 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3884 
3885 	switch (cap) {
3886 	case IOMMU_CAP_CACHE_COHERENCY:
3887 	case IOMMU_CAP_DEFERRED_FLUSH:
3888 		return true;
3889 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3890 		return dmar_platform_optin();
3891 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3892 		return ecap_sc_support(info->iommu->ecap);
3893 	case IOMMU_CAP_DIRTY_TRACKING:
3894 		return ssads_supported(info->iommu);
3895 	default:
3896 		return false;
3897 	}
3898 }
3899 
intel_iommu_probe_device(struct device * dev)3900 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3901 {
3902 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3903 	struct device_domain_info *info;
3904 	struct intel_iommu *iommu;
3905 	u8 bus, devfn;
3906 	int ret;
3907 
3908 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3909 	if (!iommu || !iommu->iommu.ops)
3910 		return ERR_PTR(-ENODEV);
3911 
3912 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3913 	if (!info)
3914 		return ERR_PTR(-ENOMEM);
3915 
3916 	if (dev_is_real_dma_subdevice(dev)) {
3917 		info->bus = pdev->bus->number;
3918 		info->devfn = pdev->devfn;
3919 		info->segment = pci_domain_nr(pdev->bus);
3920 	} else {
3921 		info->bus = bus;
3922 		info->devfn = devfn;
3923 		info->segment = iommu->segment;
3924 	}
3925 
3926 	info->dev = dev;
3927 	info->iommu = iommu;
3928 	if (dev_is_pci(dev)) {
3929 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3930 		    pci_ats_supported(pdev) &&
3931 		    dmar_ats_supported(pdev, iommu)) {
3932 			info->ats_supported = 1;
3933 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3934 
3935 			/*
3936 			 * For IOMMU that supports device IOTLB throttling
3937 			 * (DIT), we assign PFSID to the invalidation desc
3938 			 * of a VF such that IOMMU HW can gauge queue depth
3939 			 * at PF level. If DIT is not set, PFSID will be
3940 			 * treated as reserved, which should be set to 0.
3941 			 */
3942 			if (ecap_dit(iommu->ecap))
3943 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3944 			info->ats_qdep = pci_ats_queue_depth(pdev);
3945 		}
3946 		if (sm_supported(iommu)) {
3947 			if (pasid_supported(iommu)) {
3948 				int features = pci_pasid_features(pdev);
3949 
3950 				if (features >= 0)
3951 					info->pasid_supported = features | 1;
3952 			}
3953 
3954 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3955 			    pci_pri_supported(pdev))
3956 				info->pri_supported = 1;
3957 		}
3958 	}
3959 
3960 	dev_iommu_priv_set(dev, info);
3961 	if (pdev && pci_ats_supported(pdev)) {
3962 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3963 		ret = device_rbtree_insert(iommu, info);
3964 		if (ret)
3965 			goto free;
3966 	}
3967 
3968 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3969 		ret = intel_pasid_alloc_table(dev);
3970 		if (ret) {
3971 			dev_err(dev, "PASID table allocation failed\n");
3972 			goto clear_rbtree;
3973 		}
3974 
3975 		if (!context_copied(iommu, info->bus, info->devfn)) {
3976 			ret = intel_pasid_setup_sm_context(dev);
3977 			if (ret)
3978 				goto free_table;
3979 		}
3980 	}
3981 
3982 	intel_iommu_debugfs_create_dev(info);
3983 
3984 	/*
3985 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3986 	 * device is undefined if you enable PASID support after ATS support.
3987 	 * So always enable PASID support on devices which have it, even if
3988 	 * we can't yet know if we're ever going to use it.
3989 	 */
3990 	if (info->pasid_supported &&
3991 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3992 		info->pasid_enabled = 1;
3993 
3994 	return &iommu->iommu;
3995 free_table:
3996 	intel_pasid_free_table(dev);
3997 clear_rbtree:
3998 	device_rbtree_remove(info);
3999 free:
4000 	kfree(info);
4001 
4002 	return ERR_PTR(ret);
4003 }
4004 
intel_iommu_release_device(struct device * dev)4005 static void intel_iommu_release_device(struct device *dev)
4006 {
4007 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4008 	struct intel_iommu *iommu = info->iommu;
4009 
4010 	if (info->pasid_enabled) {
4011 		pci_disable_pasid(to_pci_dev(dev));
4012 		info->pasid_enabled = 0;
4013 	}
4014 
4015 	mutex_lock(&iommu->iopf_lock);
4016 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4017 		device_rbtree_remove(info);
4018 	mutex_unlock(&iommu->iopf_lock);
4019 
4020 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4021 	    !context_copied(iommu, info->bus, info->devfn))
4022 		intel_pasid_teardown_sm_context(dev);
4023 
4024 	intel_pasid_free_table(dev);
4025 	intel_iommu_debugfs_remove_dev(info);
4026 	kfree(info);
4027 	set_dma_ops(dev, NULL);
4028 }
4029 
intel_iommu_get_resv_regions(struct device * device,struct list_head * head)4030 static void intel_iommu_get_resv_regions(struct device *device,
4031 					 struct list_head *head)
4032 {
4033 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4034 	struct iommu_resv_region *reg;
4035 	struct dmar_rmrr_unit *rmrr;
4036 	struct device *i_dev;
4037 	int i;
4038 
4039 	rcu_read_lock();
4040 	for_each_rmrr_units(rmrr) {
4041 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4042 					  i, i_dev) {
4043 			struct iommu_resv_region *resv;
4044 			enum iommu_resv_type type;
4045 			size_t length;
4046 
4047 			if (i_dev != device &&
4048 			    !is_downstream_to_pci_bridge(device, i_dev))
4049 				continue;
4050 
4051 			length = rmrr->end_address - rmrr->base_address + 1;
4052 
4053 			type = device_rmrr_is_relaxable(device) ?
4054 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4055 
4056 			resv = iommu_alloc_resv_region(rmrr->base_address,
4057 						       length, prot, type,
4058 						       GFP_ATOMIC);
4059 			if (!resv)
4060 				break;
4061 
4062 			list_add_tail(&resv->list, head);
4063 		}
4064 	}
4065 	rcu_read_unlock();
4066 
4067 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4068 	if (dev_is_pci(device)) {
4069 		struct pci_dev *pdev = to_pci_dev(device);
4070 
4071 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4072 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4073 					IOMMU_RESV_DIRECT_RELAXABLE,
4074 					GFP_KERNEL);
4075 			if (reg)
4076 				list_add_tail(&reg->list, head);
4077 		}
4078 	}
4079 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4080 
4081 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4082 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4083 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4084 	if (!reg)
4085 		return;
4086 	list_add_tail(&reg->list, head);
4087 }
4088 
intel_iommu_device_group(struct device * dev)4089 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4090 {
4091 	if (dev_is_pci(dev))
4092 		return pci_device_group(dev);
4093 	return generic_device_group(dev);
4094 }
4095 
intel_iommu_enable_sva(struct device * dev)4096 static int intel_iommu_enable_sva(struct device *dev)
4097 {
4098 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4099 	struct intel_iommu *iommu;
4100 
4101 	if (!info || dmar_disabled)
4102 		return -EINVAL;
4103 
4104 	iommu = info->iommu;
4105 	if (!iommu)
4106 		return -EINVAL;
4107 
4108 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4109 		return -ENODEV;
4110 
4111 	if (!info->pasid_enabled || !info->ats_enabled)
4112 		return -EINVAL;
4113 
4114 	/*
4115 	 * Devices having device-specific I/O fault handling should not
4116 	 * support PCI/PRI. The IOMMU side has no means to check the
4117 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4118 	 * default that if the device driver enables SVA on a non-PRI
4119 	 * device, it will handle IOPF in its own way.
4120 	 */
4121 	if (!info->pri_supported)
4122 		return 0;
4123 
4124 	/* Devices supporting PRI should have it enabled. */
4125 	if (!info->pri_enabled)
4126 		return -EINVAL;
4127 
4128 	return 0;
4129 }
4130 
context_flip_pri(struct device_domain_info * info,bool enable)4131 static int context_flip_pri(struct device_domain_info *info, bool enable)
4132 {
4133 	struct intel_iommu *iommu = info->iommu;
4134 	u8 bus = info->bus, devfn = info->devfn;
4135 	struct context_entry *context;
4136 	u16 did;
4137 
4138 	spin_lock(&iommu->lock);
4139 	if (context_copied(iommu, bus, devfn)) {
4140 		spin_unlock(&iommu->lock);
4141 		return -EINVAL;
4142 	}
4143 
4144 	context = iommu_context_addr(iommu, bus, devfn, false);
4145 	if (!context || !context_present(context)) {
4146 		spin_unlock(&iommu->lock);
4147 		return -ENODEV;
4148 	}
4149 	did = context_domain_id(context);
4150 
4151 	if (enable)
4152 		context_set_sm_pre(context);
4153 	else
4154 		context_clear_sm_pre(context);
4155 
4156 	if (!ecap_coherent(iommu->ecap))
4157 		clflush_cache_range(context, sizeof(*context));
4158 	intel_context_flush_present(info, context, did, true);
4159 	spin_unlock(&iommu->lock);
4160 
4161 	return 0;
4162 }
4163 
intel_iommu_enable_iopf(struct device * dev)4164 static int intel_iommu_enable_iopf(struct device *dev)
4165 {
4166 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4167 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4168 	struct intel_iommu *iommu;
4169 	int ret;
4170 
4171 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4172 		return -ENODEV;
4173 
4174 	if (info->pri_enabled)
4175 		return -EBUSY;
4176 
4177 	iommu = info->iommu;
4178 	if (!iommu)
4179 		return -EINVAL;
4180 
4181 	/* PASID is required in PRG Response Message. */
4182 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4183 		return -EINVAL;
4184 
4185 	ret = pci_reset_pri(pdev);
4186 	if (ret)
4187 		return ret;
4188 
4189 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4190 	if (ret)
4191 		return ret;
4192 
4193 	ret = context_flip_pri(info, true);
4194 	if (ret)
4195 		goto err_remove_device;
4196 
4197 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4198 	if (ret)
4199 		goto err_clear_pri;
4200 
4201 	info->pri_enabled = 1;
4202 
4203 	return 0;
4204 err_clear_pri:
4205 	context_flip_pri(info, false);
4206 err_remove_device:
4207 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4208 
4209 	return ret;
4210 }
4211 
intel_iommu_disable_iopf(struct device * dev)4212 static int intel_iommu_disable_iopf(struct device *dev)
4213 {
4214 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4215 	struct intel_iommu *iommu = info->iommu;
4216 
4217 	if (!info->pri_enabled)
4218 		return -EINVAL;
4219 
4220 	/* Disable new PRI reception: */
4221 	context_flip_pri(info, false);
4222 
4223 	/*
4224 	 * Remove device from fault queue and acknowledge all outstanding
4225 	 * PRQs to the device:
4226 	 */
4227 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4228 
4229 	/*
4230 	 * PCIe spec states that by clearing PRI enable bit, the Page
4231 	 * Request Interface will not issue new page requests, but has
4232 	 * outstanding page requests that have been transmitted or are
4233 	 * queued for transmission. This is supposed to be called after
4234 	 * the device driver has stopped DMA, all PASIDs have been
4235 	 * unbound and the outstanding PRQs have been drained.
4236 	 */
4237 	pci_disable_pri(to_pci_dev(dev));
4238 	info->pri_enabled = 0;
4239 
4240 	return 0;
4241 }
4242 
4243 static int
intel_iommu_dev_enable_feat(struct device * dev,enum iommu_dev_features feat)4244 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4245 {
4246 	switch (feat) {
4247 	case IOMMU_DEV_FEAT_IOPF:
4248 		return intel_iommu_enable_iopf(dev);
4249 
4250 	case IOMMU_DEV_FEAT_SVA:
4251 		return intel_iommu_enable_sva(dev);
4252 
4253 	default:
4254 		return -ENODEV;
4255 	}
4256 }
4257 
4258 static int
intel_iommu_dev_disable_feat(struct device * dev,enum iommu_dev_features feat)4259 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4260 {
4261 	switch (feat) {
4262 	case IOMMU_DEV_FEAT_IOPF:
4263 		return intel_iommu_disable_iopf(dev);
4264 
4265 	case IOMMU_DEV_FEAT_SVA:
4266 		return 0;
4267 
4268 	default:
4269 		return -ENODEV;
4270 	}
4271 }
4272 
intel_iommu_is_attach_deferred(struct device * dev)4273 static bool intel_iommu_is_attach_deferred(struct device *dev)
4274 {
4275 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4276 
4277 	return translation_pre_enabled(info->iommu) && !info->domain;
4278 }
4279 
4280 /*
4281  * Check that the device does not require DMA protection. Such devices should
4282  * not be able to apply quirks and thus not be able to bypass the IOMMU
4283  * restrictions.
4284  */
risky_device(struct pci_dev * pdev)4285 static bool risky_device(struct pci_dev *pdev)
4286 {
4287 	if (pdev->requires_dma_protection) {
4288 		pci_info(pdev,
4289 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4290 			 pdev->vendor, pdev->device);
4291 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4292 		return true;
4293 	}
4294 	return false;
4295 }
4296 
intel_iommu_iotlb_sync_map(struct iommu_domain * domain,unsigned long iova,size_t size)4297 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4298 				      unsigned long iova, size_t size)
4299 {
4300 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4301 
4302 	if (dmar_domain->iotlb_sync_map)
4303 		cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
4304 
4305 	return 0;
4306 }
4307 
intel_iommu_remove_dev_pasid(struct device * dev,ioasid_t pasid,struct iommu_domain * domain)4308 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4309 					 struct iommu_domain *domain)
4310 {
4311 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4312 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4313 	struct intel_iommu *iommu = info->iommu;
4314 	struct dmar_domain *dmar_domain;
4315 	unsigned long flags;
4316 
4317 	if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4318 		intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4319 		return;
4320 	}
4321 
4322 	dmar_domain = to_dmar_domain(domain);
4323 	spin_lock_irqsave(&dmar_domain->lock, flags);
4324 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4325 		if (curr->dev == dev && curr->pasid == pasid) {
4326 			list_del(&curr->link_domain);
4327 			dev_pasid = curr;
4328 			break;
4329 		}
4330 	}
4331 	WARN_ON_ONCE(!dev_pasid);
4332 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4333 
4334 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4335 	domain_detach_iommu(dmar_domain, iommu);
4336 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4337 	kfree(dev_pasid);
4338 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4339 	intel_drain_pasid_prq(dev, pasid);
4340 }
4341 
intel_iommu_set_dev_pasid(struct iommu_domain * domain,struct device * dev,ioasid_t pasid)4342 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4343 				     struct device *dev, ioasid_t pasid)
4344 {
4345 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4346 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4347 	struct intel_iommu *iommu = info->iommu;
4348 	struct dev_pasid_info *dev_pasid;
4349 	unsigned long flags;
4350 	int ret;
4351 
4352 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4353 		return -EOPNOTSUPP;
4354 
4355 	if (domain->dirty_ops)
4356 		return -EINVAL;
4357 
4358 	if (context_copied(iommu, info->bus, info->devfn))
4359 		return -EBUSY;
4360 
4361 	ret = prepare_domain_attach_device(domain, dev);
4362 	if (ret)
4363 		return ret;
4364 
4365 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4366 	if (!dev_pasid)
4367 		return -ENOMEM;
4368 
4369 	ret = domain_attach_iommu(dmar_domain, iommu);
4370 	if (ret)
4371 		goto out_free;
4372 
4373 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4374 	if (ret)
4375 		goto out_detach_iommu;
4376 
4377 	if (dmar_domain->use_first_level)
4378 		ret = domain_setup_first_level(iommu, dmar_domain,
4379 					       dev, pasid);
4380 	else
4381 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4382 						     dev, pasid);
4383 	if (ret)
4384 		goto out_unassign_tag;
4385 
4386 	dev_pasid->dev = dev;
4387 	dev_pasid->pasid = pasid;
4388 	spin_lock_irqsave(&dmar_domain->lock, flags);
4389 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4390 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4391 
4392 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4393 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4394 
4395 	return 0;
4396 out_unassign_tag:
4397 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4398 out_detach_iommu:
4399 	domain_detach_iommu(dmar_domain, iommu);
4400 out_free:
4401 	kfree(dev_pasid);
4402 	return ret;
4403 }
4404 
intel_iommu_hw_info(struct device * dev,u32 * length,u32 * type)4405 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4406 {
4407 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4408 	struct intel_iommu *iommu = info->iommu;
4409 	struct iommu_hw_info_vtd *vtd;
4410 
4411 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4412 	if (!vtd)
4413 		return ERR_PTR(-ENOMEM);
4414 
4415 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4416 	vtd->cap_reg = iommu->cap;
4417 	vtd->ecap_reg = iommu->ecap;
4418 	*length = sizeof(*vtd);
4419 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4420 	return vtd;
4421 }
4422 
4423 /*
4424  * Set dirty tracking for the device list of a domain. The caller must
4425  * hold the domain->lock when calling it.
4426  */
device_set_dirty_tracking(struct list_head * devices,bool enable)4427 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4428 {
4429 	struct device_domain_info *info;
4430 	int ret = 0;
4431 
4432 	list_for_each_entry(info, devices, link) {
4433 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4434 						       IOMMU_NO_PASID, enable);
4435 		if (ret)
4436 			break;
4437 	}
4438 
4439 	return ret;
4440 }
4441 
parent_domain_set_dirty_tracking(struct dmar_domain * domain,bool enable)4442 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4443 					    bool enable)
4444 {
4445 	struct dmar_domain *s1_domain;
4446 	unsigned long flags;
4447 	int ret;
4448 
4449 	spin_lock(&domain->s1_lock);
4450 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4451 		spin_lock_irqsave(&s1_domain->lock, flags);
4452 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4453 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4454 		if (ret)
4455 			goto err_unwind;
4456 	}
4457 	spin_unlock(&domain->s1_lock);
4458 	return 0;
4459 
4460 err_unwind:
4461 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4462 		spin_lock_irqsave(&s1_domain->lock, flags);
4463 		device_set_dirty_tracking(&s1_domain->devices,
4464 					  domain->dirty_tracking);
4465 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4466 	}
4467 	spin_unlock(&domain->s1_lock);
4468 	return ret;
4469 }
4470 
intel_iommu_set_dirty_tracking(struct iommu_domain * domain,bool enable)4471 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4472 					  bool enable)
4473 {
4474 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4475 	int ret;
4476 
4477 	spin_lock(&dmar_domain->lock);
4478 	if (dmar_domain->dirty_tracking == enable)
4479 		goto out_unlock;
4480 
4481 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4482 	if (ret)
4483 		goto err_unwind;
4484 
4485 	if (dmar_domain->nested_parent) {
4486 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4487 		if (ret)
4488 			goto err_unwind;
4489 	}
4490 
4491 	dmar_domain->dirty_tracking = enable;
4492 out_unlock:
4493 	spin_unlock(&dmar_domain->lock);
4494 
4495 	return 0;
4496 
4497 err_unwind:
4498 	device_set_dirty_tracking(&dmar_domain->devices,
4499 				  dmar_domain->dirty_tracking);
4500 	spin_unlock(&dmar_domain->lock);
4501 	return ret;
4502 }
4503 
intel_iommu_read_and_clear_dirty(struct iommu_domain * domain,unsigned long iova,size_t size,unsigned long flags,struct iommu_dirty_bitmap * dirty)4504 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4505 					    unsigned long iova, size_t size,
4506 					    unsigned long flags,
4507 					    struct iommu_dirty_bitmap *dirty)
4508 {
4509 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4510 	unsigned long end = iova + size - 1;
4511 	unsigned long pgsize;
4512 
4513 	/*
4514 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4515 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4516 	 * have occurred when we stopped dirty tracking. This ensures that we
4517 	 * never inherit dirtied bits from a previous cycle.
4518 	 */
4519 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4520 		return -EINVAL;
4521 
4522 	do {
4523 		struct dma_pte *pte;
4524 		int lvl = 0;
4525 
4526 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4527 				     GFP_ATOMIC);
4528 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4529 		if (!pte || !dma_pte_present(pte)) {
4530 			iova += pgsize;
4531 			continue;
4532 		}
4533 
4534 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4535 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4536 		iova += pgsize;
4537 	} while (iova < end);
4538 
4539 	return 0;
4540 }
4541 
4542 static const struct iommu_dirty_ops intel_dirty_ops = {
4543 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4544 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4545 };
4546 
context_setup_pass_through(struct device * dev,u8 bus,u8 devfn)4547 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4548 {
4549 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4550 	struct intel_iommu *iommu = info->iommu;
4551 	struct context_entry *context;
4552 
4553 	spin_lock(&iommu->lock);
4554 	context = iommu_context_addr(iommu, bus, devfn, 1);
4555 	if (!context) {
4556 		spin_unlock(&iommu->lock);
4557 		return -ENOMEM;
4558 	}
4559 
4560 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4561 		spin_unlock(&iommu->lock);
4562 		return 0;
4563 	}
4564 
4565 	copied_context_tear_down(iommu, context, bus, devfn);
4566 	context_clear_entry(context);
4567 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4568 
4569 	/*
4570 	 * In pass through mode, AW must be programmed to indicate the largest
4571 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4572 	 */
4573 	context_set_address_width(context, iommu->msagaw);
4574 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4575 	context_set_fault_enable(context);
4576 	context_set_present(context);
4577 	if (!ecap_coherent(iommu->ecap))
4578 		clflush_cache_range(context, sizeof(*context));
4579 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4580 	spin_unlock(&iommu->lock);
4581 
4582 	return 0;
4583 }
4584 
context_setup_pass_through_cb(struct pci_dev * pdev,u16 alias,void * data)4585 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4586 {
4587 	struct device *dev = data;
4588 
4589 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4590 }
4591 
device_setup_pass_through(struct device * dev)4592 static int device_setup_pass_through(struct device *dev)
4593 {
4594 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4595 
4596 	if (!dev_is_pci(dev))
4597 		return context_setup_pass_through(dev, info->bus, info->devfn);
4598 
4599 	return pci_for_each_dma_alias(to_pci_dev(dev),
4600 				      context_setup_pass_through_cb, dev);
4601 }
4602 
identity_domain_attach_dev(struct iommu_domain * domain,struct device * dev)4603 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4604 {
4605 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4606 	struct intel_iommu *iommu = info->iommu;
4607 	int ret;
4608 
4609 	device_block_translation(dev);
4610 
4611 	if (dev_is_real_dma_subdevice(dev))
4612 		return 0;
4613 
4614 	if (sm_supported(iommu)) {
4615 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4616 		if (!ret)
4617 			iommu_enable_pci_caps(info);
4618 	} else {
4619 		ret = device_setup_pass_through(dev);
4620 	}
4621 
4622 	if (!ret)
4623 		info->domain_attached = true;
4624 
4625 	return ret;
4626 }
4627 
identity_domain_set_dev_pasid(struct iommu_domain * domain,struct device * dev,ioasid_t pasid)4628 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4629 					 struct device *dev, ioasid_t pasid)
4630 {
4631 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4632 	struct intel_iommu *iommu = info->iommu;
4633 
4634 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4635 		return -EOPNOTSUPP;
4636 
4637 	return intel_pasid_setup_pass_through(iommu, dev, pasid);
4638 }
4639 
4640 static struct iommu_domain identity_domain = {
4641 	.type = IOMMU_DOMAIN_IDENTITY,
4642 	.ops = &(const struct iommu_domain_ops) {
4643 		.attach_dev	= identity_domain_attach_dev,
4644 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4645 	},
4646 };
4647 
4648 const struct iommu_ops intel_iommu_ops = {
4649 	.blocked_domain		= &blocking_domain,
4650 	.release_domain		= &blocking_domain,
4651 	.identity_domain	= &identity_domain,
4652 	.capable		= intel_iommu_capable,
4653 	.hw_info		= intel_iommu_hw_info,
4654 	.domain_alloc		= intel_iommu_domain_alloc,
4655 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4656 	.domain_alloc_sva	= intel_svm_domain_alloc,
4657 	.probe_device		= intel_iommu_probe_device,
4658 	.release_device		= intel_iommu_release_device,
4659 	.get_resv_regions	= intel_iommu_get_resv_regions,
4660 	.device_group		= intel_iommu_device_group,
4661 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4662 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4663 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4664 	.def_domain_type	= device_def_domain_type,
4665 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4666 	.pgsize_bitmap		= SZ_4K,
4667 #ifdef CONFIG_INTEL_IOMMU_SVM
4668 	.page_response		= intel_svm_page_response,
4669 #endif
4670 	.default_domain_ops = &(const struct iommu_domain_ops) {
4671 		.attach_dev		= intel_iommu_attach_device,
4672 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4673 		.map_pages		= intel_iommu_map_pages,
4674 		.unmap_pages		= intel_iommu_unmap_pages,
4675 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4676 		.flush_iotlb_all        = intel_flush_iotlb_all,
4677 		.iotlb_sync		= intel_iommu_tlb_sync,
4678 		.iova_to_phys		= intel_iommu_iova_to_phys,
4679 		.free			= intel_iommu_domain_free,
4680 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4681 	}
4682 };
4683 
quirk_iommu_igfx(struct pci_dev * dev)4684 static void quirk_iommu_igfx(struct pci_dev *dev)
4685 {
4686 	if (risky_device(dev))
4687 		return;
4688 
4689 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4690 	disable_igfx_iommu = 1;
4691 }
4692 
4693 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4701 
4702 /* QM57/QS57 integrated gfx malfunctions with dmar */
4703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4704 
4705 /* Broadwell igfx malfunctions with dmar */
4706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4730 
quirk_iommu_rwbf(struct pci_dev * dev)4731 static void quirk_iommu_rwbf(struct pci_dev *dev)
4732 {
4733 	if (risky_device(dev))
4734 		return;
4735 
4736 	/*
4737 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4738 	 * but needs it. Same seems to hold for the desktop versions.
4739 	 */
4740 	pci_info(dev, "Forcing write-buffer flush capability\n");
4741 	rwbf_quirk = 1;
4742 }
4743 
4744 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4745 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4746 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4747 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4748 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4749 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4750 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4751 
4752 #define GGC 0x52
4753 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4754 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4755 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4756 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4757 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4758 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4759 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4760 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4761 
quirk_calpella_no_shadow_gtt(struct pci_dev * dev)4762 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4763 {
4764 	unsigned short ggc;
4765 
4766 	if (risky_device(dev))
4767 		return;
4768 
4769 	if (pci_read_config_word(dev, GGC, &ggc))
4770 		return;
4771 
4772 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4773 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4774 		disable_igfx_iommu = 1;
4775 	} else if (!disable_igfx_iommu) {
4776 		/* we have to ensure the gfx device is idle before we flush */
4777 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4778 		iommu_set_dma_strict();
4779 	}
4780 }
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4784 
quirk_igfx_skip_te_disable(struct pci_dev * dev)4785 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4786 {
4787 	unsigned short ver;
4788 
4789 	if (!IS_GFX_DEVICE(dev))
4790 		return;
4791 
4792 	ver = (dev->device >> 8) & 0xff;
4793 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4794 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4795 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4796 		return;
4797 
4798 	if (risky_device(dev))
4799 		return;
4800 
4801 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4802 	iommu_skip_te_disable = 1;
4803 }
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4805 
4806 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4807    ISOCH DMAR unit for the Azalia sound device, but not give it any
4808    TLB entries, which causes it to deadlock. Check for that.  We do
4809    this in a function called from init_dmars(), instead of in a PCI
4810    quirk, because we don't want to print the obnoxious "BIOS broken"
4811    message if VT-d is actually disabled.
4812 */
check_tylersburg_isoch(void)4813 static void __init check_tylersburg_isoch(void)
4814 {
4815 	struct pci_dev *pdev;
4816 	uint32_t vtisochctrl;
4817 
4818 	/* If there's no Azalia in the system anyway, forget it. */
4819 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4820 	if (!pdev)
4821 		return;
4822 
4823 	if (risky_device(pdev)) {
4824 		pci_dev_put(pdev);
4825 		return;
4826 	}
4827 
4828 	pci_dev_put(pdev);
4829 
4830 	/* System Management Registers. Might be hidden, in which case
4831 	   we can't do the sanity check. But that's OK, because the
4832 	   known-broken BIOSes _don't_ actually hide it, so far. */
4833 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4834 	if (!pdev)
4835 		return;
4836 
4837 	if (risky_device(pdev)) {
4838 		pci_dev_put(pdev);
4839 		return;
4840 	}
4841 
4842 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4843 		pci_dev_put(pdev);
4844 		return;
4845 	}
4846 
4847 	pci_dev_put(pdev);
4848 
4849 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4850 	if (vtisochctrl & 1)
4851 		return;
4852 
4853 	/* Drop all bits other than the number of TLB entries */
4854 	vtisochctrl &= 0x1c;
4855 
4856 	/* If we have the recommended number of TLB entries (16), fine. */
4857 	if (vtisochctrl == 0x10)
4858 		return;
4859 
4860 	/* Zero TLB entries? You get to ride the short bus to school. */
4861 	if (!vtisochctrl) {
4862 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4863 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4864 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4865 		     dmi_get_system_info(DMI_BIOS_VERSION),
4866 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4867 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4868 		return;
4869 	}
4870 
4871 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4872 	       vtisochctrl);
4873 }
4874 
4875 /*
4876  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4877  * invalidation completion before posted writes initiated with translated address
4878  * that utilized translations matching the invalidation address range, violating
4879  * the invalidation completion ordering.
4880  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4881  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4882  * under the control of the trusted/privileged host device driver must use this
4883  * quirk.
4884  * Device TLBs are invalidated under the following six conditions:
4885  * 1. Device driver does DMA API unmap IOVA
4886  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4887  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4888  *    exit_mmap() due to crash
4889  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4890  *    VM has to free pages that were unmapped
4891  * 5. Userspace driver unmaps a DMA buffer
4892  * 6. Cache invalidation in vSVA usage (upcoming)
4893  *
4894  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4895  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4896  * invalidate TLB the same way as normal user unmap which will use this quirk.
4897  * The dTLB invalidation after PASID cache flush does not need this quirk.
4898  *
4899  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4900  */
quirk_extra_dev_tlb_flush(struct device_domain_info * info,unsigned long address,unsigned long mask,u32 pasid,u16 qdep)4901 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4902 			       unsigned long address, unsigned long mask,
4903 			       u32 pasid, u16 qdep)
4904 {
4905 	u16 sid;
4906 
4907 	if (likely(!info->dtlb_extra_inval))
4908 		return;
4909 
4910 	sid = PCI_DEVID(info->bus, info->devfn);
4911 	if (pasid == IOMMU_NO_PASID) {
4912 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4913 				   qdep, address, mask);
4914 	} else {
4915 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4916 					 pasid, qdep, address, mask);
4917 	}
4918 }
4919 
4920 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4921 
4922 /*
4923  * Function to submit a command to the enhanced command interface. The
4924  * valid enhanced command descriptions are defined in Table 47 of the
4925  * VT-d spec. The VT-d hardware implementation may support some but not
4926  * all commands, which can be determined by checking the Enhanced
4927  * Command Capability Register.
4928  *
4929  * Return values:
4930  *  - 0: Command successful without any error;
4931  *  - Negative: software error value;
4932  *  - Nonzero positive: failure status code defined in Table 48.
4933  */
ecmd_submit_sync(struct intel_iommu * iommu,u8 ecmd,u64 oa,u64 ob)4934 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4935 {
4936 	unsigned long flags;
4937 	u64 res;
4938 	int ret;
4939 
4940 	if (!cap_ecmds(iommu->cap))
4941 		return -ENODEV;
4942 
4943 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4944 
4945 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4946 	if (res & DMA_ECMD_ECRSP_IP) {
4947 		ret = -EBUSY;
4948 		goto err;
4949 	}
4950 
4951 	/*
4952 	 * Unconditionally write the operand B, because
4953 	 * - There is no side effect if an ecmd doesn't require an
4954 	 *   operand B, but we set the register to some value.
4955 	 * - It's not invoked in any critical path. The extra MMIO
4956 	 *   write doesn't bring any performance concerns.
4957 	 */
4958 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4959 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4960 
4961 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4962 		      !(res & DMA_ECMD_ECRSP_IP), res);
4963 
4964 	if (res & DMA_ECMD_ECRSP_IP) {
4965 		ret = -ETIMEDOUT;
4966 		goto err;
4967 	}
4968 
4969 	ret = ecmd_get_status_code(res);
4970 err:
4971 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4972 
4973 	return ret;
4974 }
4975