1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
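
/*
 * Example (illustrative only, assuming VTD_PAGE_SHIFT == 12): for a guest
 * address width of 48 bits,
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1 == 0xfffffffff
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1 == 0xffffffffffff
 *	DOMAIN_MAX_ADDR(48)   == 0xfffffffff << 12 == 0xfffffffff000
 *
 * On a 64-bit kernel DOMAIN_MAX_PFN(48) equals __DOMAIN_MAX_PFN(48), since
 * the value already fits in an unsigned long.
 */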
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
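
/*
 * Example (illustrative only): with agaw == 2 the helpers above describe a
 * 4-level page table covering a 48-bit address width:
 *
 *	agaw_to_level(2) == 4, agaw_to_width(2) == 48, width_to_agaw(48) == 2
 *
 * and for pfn == 0x12345 the 9-bit table index at each level is
 *
 *	pfn_level_offset(0x12345, 1) == 0x145
 *	pfn_level_offset(0x12345, 2) == 0x091
 *	pfn_level_offset(0x12345, 3) == 0x000
 */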
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
134  * (used when the kernel is launched with TXT).
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
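
/*
 * Taken together, the setters above describe (roughly) the legacy context
 * entry layout; see the VT-d specification for the authoritative format:
 *
 *	lo[0]      present
 *	lo[1]      fault processing disable (cleared by context_set_fault_enable)
 *	lo[3:2]    translation type
 *	lo[63:12]  page-table (address root) pointer
 *	hi[2:0]    address width (AGAW)
 *	hi[23:8]   domain id
 */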
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237 
238 /*
239  * This domain is a static identity mapping domain.
240  *	1. This domain creates a static 1:1 mapping of all usable memory.
241  *	2. It maps to each iommu if successful.
242  *	3. Each iommu maps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void device_block_translation(struct device *dev);
281 
282 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
283 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
284 
285 int intel_iommu_enabled = 0;
286 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
287 
288 static int dmar_map_gfx = 1;
289 static int intel_iommu_superpage = 1;
290 static int iommu_identity_mapping;
291 static int iommu_skip_te_disable;
292 
293 #define IDENTMAP_GFX		2
294 #define IDENTMAP_AZALIA		4
295 
296 const struct iommu_ops intel_iommu_ops;
297 
298 static bool translation_pre_enabled(struct intel_iommu *iommu)
299 {
300 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
301 }
302 
303 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
304 {
305 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
306 }
307 
308 static void init_translation_status(struct intel_iommu *iommu)
309 {
310 	u32 gsts;
311 
312 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
313 	if (gsts & DMA_GSTS_TES)
314 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
315 }
316 
317 static int __init intel_iommu_setup(char *str)
318 {
319 	if (!str)
320 		return -EINVAL;
321 
322 	while (*str) {
323 		if (!strncmp(str, "on", 2)) {
324 			dmar_disabled = 0;
325 			pr_info("IOMMU enabled\n");
326 		} else if (!strncmp(str, "off", 3)) {
327 			dmar_disabled = 1;
328 			no_platform_optin = 1;
329 			pr_info("IOMMU disabled\n");
330 		} else if (!strncmp(str, "igfx_off", 8)) {
331 			dmar_map_gfx = 0;
332 			pr_info("Disable GFX device mapping\n");
333 		} else if (!strncmp(str, "forcedac", 8)) {
334 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
335 			iommu_dma_forcedac = true;
336 		} else if (!strncmp(str, "strict", 6)) {
337 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
338 			iommu_set_dma_strict();
339 		} else if (!strncmp(str, "sp_off", 6)) {
340 			pr_info("Disable supported super page\n");
341 			intel_iommu_superpage = 0;
342 		} else if (!strncmp(str, "sm_on", 5)) {
343 			pr_info("Enable scalable mode if hardware supports\n");
344 			intel_iommu_sm = 1;
345 		} else if (!strncmp(str, "sm_off", 6)) {
346 			pr_info("Scalable mode is disallowed\n");
347 			intel_iommu_sm = 0;
348 		} else if (!strncmp(str, "tboot_noforce", 13)) {
349 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
350 			intel_iommu_tboot_noforce = 1;
351 		} else {
352 			pr_notice("Unknown option - '%s'\n", str);
353 		}
354 
355 		str += strcspn(str, ",");
356 		while (*str == ',')
357 			str++;
358 	}
359 
360 	return 1;
361 }
362 __setup("intel_iommu=", intel_iommu_setup);
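
/*
 * Example usage (illustrative): several options may be combined on the
 * kernel command line, separated by commas, e.g.
 *
 *	intel_iommu=on,sm_on
 *	intel_iommu=off
 *
 * Unknown options are reported with pr_notice() and otherwise ignored.
 */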
363 
364 void *alloc_pgtable_page(int node)
365 {
366 	struct page *page;
367 	void *vaddr = NULL;
368 
369 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
370 	if (page)
371 		vaddr = page_address(page);
372 	return vaddr;
373 }
374 
375 void free_pgtable_page(void *vaddr)
376 {
377 	free_page((unsigned long)vaddr);
378 }
379 
380 static inline int domain_type_is_si(struct dmar_domain *domain)
381 {
382 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
383 }
384 
385 static inline bool domain_use_first_level(struct dmar_domain *domain)
386 {
387 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
388 }
389 
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391 				       unsigned long pfn)
392 {
393 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394 
395 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397 
398 /*
399  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401  * the returned SAGAW.
402  */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 	unsigned long fl_sagaw, sl_sagaw;
406 
407 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 	sl_sagaw = cap_sagaw(iommu->cap);
409 
410 	/* Second level only. */
411 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 		return sl_sagaw;
413 
414 	/* First level only. */
415 	if (!ecap_slts(iommu->ecap))
416 		return fl_sagaw;
417 
418 	return fl_sagaw & sl_sagaw;
419 }
420 
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 	unsigned long sagaw;
424 	int agaw;
425 
426 	sagaw = __iommu_calculate_sagaw(iommu);
427 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 		if (test_bit(agaw, &sagaw))
429 			break;
430 	}
431 
432 	return agaw;
433 }
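
/*
 * Example (illustrative): consider hardware whose SAGAW reports only 4-level
 * second-level tables (cap_sagaw() == BIT(2)) and no 5-level first-level
 * paging. __iommu_calculate_sagaw() then returns BIT(2), and
 * __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH) walks down from
 *
 *	width_to_agaw(57) == 3
 *
 * to the first supported value, agaw == 2, i.e. a 4-level table with a
 * 48-bit address width.
 */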
434 
435 /*
436  * Calculate max SAGAW for each iommu.
437  */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442 
443 /*
444  * Calculate the agaw for each iommu.
445  * "SAGAW" may differ across iommus, so use a default agaw and fall
446  * back to a smaller supported agaw for iommus that don't support it.
447  */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
452 
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455 	return sm_supported(iommu) ?
456 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458 
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 	struct iommu_domain_info *info;
462 	struct dmar_drhd_unit *drhd;
463 	struct intel_iommu *iommu;
464 	bool found = false;
465 	unsigned long i;
466 
467 	domain->iommu_coherency = true;
468 	xa_for_each(&domain->iommu_array, i, info) {
469 		found = true;
470 		if (!iommu_paging_structure_coherency(info->iommu)) {
471 			domain->iommu_coherency = false;
472 			break;
473 		}
474 	}
475 	if (found)
476 		return;
477 
478 	/* No hardware attached; use lowest common denominator */
479 	rcu_read_lock();
480 	for_each_active_iommu(iommu, drhd) {
481 		if (!iommu_paging_structure_coherency(iommu)) {
482 			domain->iommu_coherency = false;
483 			break;
484 		}
485 	}
486 	rcu_read_unlock();
487 }
488 
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490 					 struct intel_iommu *skip)
491 {
492 	struct dmar_drhd_unit *drhd;
493 	struct intel_iommu *iommu;
494 	int mask = 0x3;
495 
496 	if (!intel_iommu_superpage)
497 		return 0;
498 
499 	/* set iommu_superpage to the smallest common denominator */
500 	rcu_read_lock();
501 	for_each_active_iommu(iommu, drhd) {
502 		if (iommu != skip) {
503 			if (domain && domain_use_first_level(domain)) {
504 				if (!cap_fl1gp_support(iommu->cap))
505 					mask = 0x1;
506 			} else {
507 				mask &= cap_super_page_val(iommu->cap);
508 			}
509 
510 			if (!mask)
511 				break;
512 		}
513 	}
514 	rcu_read_unlock();
515 
516 	return fls(mask);
517 }
518 
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 	struct device_domain_info *info;
522 	int nid = NUMA_NO_NODE;
523 	unsigned long flags;
524 
525 	spin_lock_irqsave(&domain->lock, flags);
526 	list_for_each_entry(info, &domain->devices, link) {
527 		/*
528 		 * There could be multiple device NUMA nodes, as devices within
529 		 * the same domain may sit behind different IOMMUs. There is no
530 		 * perfect answer in such a situation, so we use a first-come,
531 		 * first-served policy.
532 		 */
533 		nid = dev_to_node(info->dev);
534 		if (nid != NUMA_NO_NODE)
535 			break;
536 	}
537 	spin_unlock_irqrestore(&domain->lock, flags);
538 
539 	return nid;
540 }
541 
542 static void domain_update_iotlb(struct dmar_domain *domain);
543 
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 	unsigned long bitmap = 0;
548 
549 	/*
550 	 * 1-level super page supports page size of 2MiB, 2-level super page
551 	 * supports page size of both 2MiB and 1GiB.
552 	 */
553 	if (domain->iommu_superpage == 1)
554 		bitmap |= SZ_2M;
555 	else if (domain->iommu_superpage == 2)
556 		bitmap |= SZ_2M | SZ_1G;
557 
558 	return bitmap;
559 }
560 
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 	domain_update_iommu_coherency(domain);
565 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566 
567 	/*
568 	 * If RHSA is missing, we should default to the device numa domain
569 	 * as a fallback.
570 	 */
571 	if (domain->nid == NUMA_NO_NODE)
572 		domain->nid = domain_update_device_node(domain);
573 
574 	/*
575 	 * First-level translation restricts the input-address to a
576 	 * canonical address (i.e., address bits 63:N have the same
577 	 * value as address bit [N-1], where N is 48-bits with 4-level
578 	 * paging and 57-bits with 5-level paging). Hence, skip bit
579 	 * [N-1].
580 	 */
581 	if (domain_use_first_level(domain))
582 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 	else
584 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585 
586 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 	domain_update_iotlb(domain);
588 }
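
/*
 * Example (illustrative): for a domain with gaw == 48 the aperture ends at
 *
 *	__DOMAIN_MAX_ADDR(47) == 0x7fffffffffff	(first level, bit 47 skipped)
 *	__DOMAIN_MAX_ADDR(48) == 0xffffffffffff	(second level)
 *
 * reflecting the canonical-address restriction described above.
 */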
589 
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591 					 u8 devfn, int alloc)
592 {
593 	struct root_entry *root = &iommu->root_entry[bus];
594 	struct context_entry *context;
595 	u64 *entry;
596 
597 	/*
598 	 * Unless the caller requested to allocate a new entry,
599 	 * returning a copied context entry makes no sense.
600 	 */
601 	if (!alloc && context_copied(iommu, bus, devfn))
602 		return NULL;
603 
604 	entry = &root->lo;
605 	if (sm_supported(iommu)) {
606 		if (devfn >= 0x80) {
607 			devfn -= 0x80;
608 			entry = &root->hi;
609 		}
610 		devfn *= 2;
611 	}
612 	if (*entry & 1)
613 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 	else {
615 		unsigned long phy_addr;
616 		if (!alloc)
617 			return NULL;
618 
619 		context = alloc_pgtable_page(iommu->node);
620 		if (!context)
621 			return NULL;
622 
623 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 		phy_addr = virt_to_phys((void *)context);
625 		*entry = phy_addr | 1;
626 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
627 	}
628 	return &context[devfn];
629 }
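
/*
 * Example (illustrative): in scalable mode each half of a root entry points
 * to a context table covering 128 device functions, and each scalable-mode
 * context entry spans two 16-byte slots (hence devfn *= 2 above). For
 * bus 0x12, devfn 0x85 the lookup uses root_entry[0x12].hi and returns
 * &context[(0x85 - 0x80) * 2] == &context[0x0a].
 */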
630 
631 /**
632  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633  *				 sub-hierarchy of a candidate PCI-PCI bridge
634  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635  * @bridge: the candidate PCI-PCI bridge
636  *
637  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638  */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 	struct pci_dev *pdev, *pbridge;
643 
644 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 		return false;
646 
647 	pdev = to_pci_dev(dev);
648 	pbridge = to_pci_dev(bridge);
649 
650 	if (pbridge->subordinate &&
651 	    pbridge->subordinate->number <= pdev->bus->number &&
652 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 		return true;
654 
655 	return false;
656 }
657 
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 	struct dmar_drhd_unit *drhd;
661 	u32 vtbar;
662 	int rc;
663 
664 	/* We know that this device on this chipset has its own IOMMU.
665 	 * If we find it under a different IOMMU, then the BIOS is lying
666 	 * to us. Hope that the IOMMU for this device is actually
667 	 * disabled, and it needs no translation...
668 	 */
669 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 	if (rc) {
671 		/* "can't" happen */
672 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 		return false;
674 	}
675 	vtbar &= 0xffff0000;
676 
677 	/* we know that this iommu should be at offset 0xa000 from vtbar */
678 	drhd = dmar_find_matched_drhd_unit(pdev);
679 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 		return true;
683 	}
684 
685 	return false;
686 }
687 
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 	if (!iommu || iommu->drhd->ignored)
691 		return true;
692 
693 	if (dev_is_pci(dev)) {
694 		struct pci_dev *pdev = to_pci_dev(dev);
695 
696 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 		    quirk_ioat_snb_local_iommu(pdev))
699 			return true;
700 	}
701 
702 	return false;
703 }
704 
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 	struct dmar_drhd_unit *drhd = NULL;
708 	struct pci_dev *pdev = NULL;
709 	struct intel_iommu *iommu;
710 	struct device *tmp;
711 	u16 segment = 0;
712 	int i;
713 
714 	if (!dev)
715 		return NULL;
716 
717 	if (dev_is_pci(dev)) {
718 		struct pci_dev *pf_pdev;
719 
720 		pdev = pci_real_dma_dev(to_pci_dev(dev));
721 
722 		/* VFs aren't listed in scope tables; we need to look up
723 		 * the PF instead to find the IOMMU. */
724 		pf_pdev = pci_physfn(pdev);
725 		dev = &pf_pdev->dev;
726 		segment = pci_domain_nr(pdev->bus);
727 	} else if (has_acpi_companion(dev))
728 		dev = &ACPI_COMPANION(dev)->dev;
729 
730 	rcu_read_lock();
731 	for_each_iommu(iommu, drhd) {
732 		if (pdev && segment != drhd->segment)
733 			continue;
734 
735 		for_each_active_dev_scope(drhd->devices,
736 					  drhd->devices_cnt, i, tmp) {
737 			if (tmp == dev) {
738 				/* For a VF use its original BDF# not that of the PF
739 				 * which we used for the IOMMU lookup. Strictly speaking
740 				 * we could do this for all PCI devices; we only need to
741 				 * get the BDF# from the scope table for ACPI matches. */
742 				if (pdev && pdev->is_virtfn)
743 					goto got_pdev;
744 
745 				if (bus && devfn) {
746 					*bus = drhd->devices[i].bus;
747 					*devfn = drhd->devices[i].devfn;
748 				}
749 				goto out;
750 			}
751 
752 			if (is_downstream_to_pci_bridge(dev, tmp))
753 				goto got_pdev;
754 		}
755 
756 		if (pdev && drhd->include_all) {
757 got_pdev:
758 			if (bus && devfn) {
759 				*bus = pdev->bus->number;
760 				*devfn = pdev->devfn;
761 			}
762 			goto out;
763 		}
764 	}
765 	iommu = NULL;
766 out:
767 	if (iommu_is_dummy(iommu, dev))
768 		iommu = NULL;
769 
770 	rcu_read_unlock();
771 
772 	return iommu;
773 }
774 
775 static void domain_flush_cache(struct dmar_domain *domain,
776 			       void *addr, int size)
777 {
778 	if (!domain->iommu_coherency)
779 		clflush_cache_range(addr, size);
780 }
781 
782 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
783 {
784 	struct context_entry *context;
785 	int ret = 0;
786 
787 	spin_lock(&iommu->lock);
788 	context = iommu_context_addr(iommu, bus, devfn, 0);
789 	if (context)
790 		ret = context_present(context);
791 	spin_unlock(&iommu->lock);
792 	return ret;
793 }
794 
795 static void free_context_table(struct intel_iommu *iommu)
796 {
797 	struct context_entry *context;
798 	int i;
799 
800 	if (!iommu->root_entry)
801 		return;
802 
803 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
804 		context = iommu_context_addr(iommu, i, 0, 0);
805 		if (context)
806 			free_pgtable_page(context);
807 
808 		if (!sm_supported(iommu))
809 			continue;
810 
811 		context = iommu_context_addr(iommu, i, 0x80, 0);
812 		if (context)
813 			free_pgtable_page(context);
814 	}
815 
816 	free_pgtable_page(iommu->root_entry);
817 	iommu->root_entry = NULL;
818 }
819 
820 #ifdef CONFIG_DMAR_DEBUG
821 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
822 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
823 {
824 	struct dma_pte *pte;
825 	int offset;
826 
827 	while (1) {
828 		offset = pfn_level_offset(pfn, level);
829 		pte = &parent[offset];
830 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
831 			pr_info("PTE not present at level %d\n", level);
832 			break;
833 		}
834 
835 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
836 
837 		if (level == 1)
838 			break;
839 
840 		parent = phys_to_virt(dma_pte_addr(pte));
841 		level--;
842 	}
843 }
844 
845 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
846 			  unsigned long long addr, u32 pasid)
847 {
848 	struct pasid_dir_entry *dir, *pde;
849 	struct pasid_entry *entries, *pte;
850 	struct context_entry *ctx_entry;
851 	struct root_entry *rt_entry;
852 	int i, dir_index, index, level;
853 	u8 devfn = source_id & 0xff;
854 	u8 bus = source_id >> 8;
855 	struct dma_pte *pgtable;
856 
857 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
858 
859 	/* root entry dump */
860 	rt_entry = &iommu->root_entry[bus];
861 	if (!rt_entry) {
862 		pr_info("root table entry is not present\n");
863 		return;
864 	}
865 
866 	if (sm_supported(iommu))
867 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
868 			rt_entry->hi, rt_entry->lo);
869 	else
870 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
871 
872 	/* context entry dump */
873 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
874 	if (!ctx_entry) {
875 		pr_info("context table entry is not present\n");
876 		return;
877 	}
878 
879 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
880 		ctx_entry->hi, ctx_entry->lo);
881 
882 	/* legacy mode does not require PASID entries */
883 	if (!sm_supported(iommu)) {
884 		level = agaw_to_level(ctx_entry->hi & 7);
885 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886 		goto pgtable_walk;
887 	}
888 
889 	/* get the pointer to pasid directory entry */
890 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
891 	if (!dir) {
892 		pr_info("pasid directory entry is not present\n");
893 		return;
894 	}
895 	/* For request-without-pasid, get the pasid from context entry */
896 	if (intel_iommu_sm && pasid == INVALID_IOASID)
897 		pasid = PASID_RID2PASID;
898 
899 	dir_index = pasid >> PASID_PDE_SHIFT;
900 	pde = &dir[dir_index];
901 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
902 
903 	/* get the pointer to the pasid table entry */
904 	entries = get_pasid_table_from_pde(pde);
905 	if (!entries) {
906 		pr_info("pasid table entry is not present\n");
907 		return;
908 	}
909 	index = pasid & PASID_PTE_MASK;
910 	pte = &entries[index];
911 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
912 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
913 
914 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 	} else {
918 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 	}
921 
922 pgtable_walk:
923 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926 
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928 				      unsigned long pfn, int *target_level)
929 {
930 	struct dma_pte *parent, *pte;
931 	int level = agaw_to_level(domain->agaw);
932 	int offset;
933 
934 	BUG_ON(!domain->pgd);
935 
936 	if (!domain_pfn_supported(domain, pfn))
937 		/* Address beyond IOMMU's addressing capabilities. */
938 		return NULL;
939 
940 	parent = domain->pgd;
941 
942 	while (1) {
943 		void *tmp_page;
944 
945 		offset = pfn_level_offset(pfn, level);
946 		pte = &parent[offset];
947 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
948 			break;
949 		if (level == *target_level)
950 			break;
951 
952 		if (!dma_pte_present(pte)) {
953 			uint64_t pteval;
954 
955 			tmp_page = alloc_pgtable_page(domain->nid);
956 
957 			if (!tmp_page)
958 				return NULL;
959 
960 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
961 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
962 			if (domain_use_first_level(domain))
963 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
964 
965 			if (cmpxchg64(&pte->val, 0ULL, pteval))
966 				/* Someone else set it while we were thinking; use theirs. */
967 				free_pgtable_page(tmp_page);
968 			else
969 				domain_flush_cache(domain, pte, sizeof(*pte));
970 		}
971 		if (level == 1)
972 			break;
973 
974 		parent = phys_to_virt(dma_pte_addr(pte));
975 		level--;
976 	}
977 
978 	if (!*target_level)
979 		*target_level = level;
980 
981 	return pte;
982 }
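
/*
 * Example (illustrative): for a 4-level domain (agaw == 2) and
 * *target_level == 1, pfn_to_dma_pte(domain, 0x12345, &level) descends
 * level 4 -> 3 -> 2, indexing each table with pfn_level_offset(), allocating
 * any missing intermediate table and publishing it with cmpxchg64() so that
 * a concurrent mapper racing on the same slot simply reuses the winner's
 * page, and finally returns the leaf PTE at level 1.
 */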
983 
984 /* return address's pte at specific level */
985 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
986 					 unsigned long pfn,
987 					 int level, int *large_page)
988 {
989 	struct dma_pte *parent, *pte;
990 	int total = agaw_to_level(domain->agaw);
991 	int offset;
992 
993 	parent = domain->pgd;
994 	while (level <= total) {
995 		offset = pfn_level_offset(pfn, total);
996 		pte = &parent[offset];
997 		if (level == total)
998 			return pte;
999 
1000 		if (!dma_pte_present(pte)) {
1001 			*large_page = total;
1002 			break;
1003 		}
1004 
1005 		if (dma_pte_superpage(pte)) {
1006 			*large_page = total;
1007 			return pte;
1008 		}
1009 
1010 		parent = phys_to_virt(dma_pte_addr(pte));
1011 		total--;
1012 	}
1013 	return NULL;
1014 }
1015 
1016 /* clear last level pte, a tlb flush should be followed */
1017 static void dma_pte_clear_range(struct dmar_domain *domain,
1018 				unsigned long start_pfn,
1019 				unsigned long last_pfn)
1020 {
1021 	unsigned int large_page;
1022 	struct dma_pte *first_pte, *pte;
1023 
1024 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1025 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1026 	BUG_ON(start_pfn > last_pfn);
1027 
1028 	/* we don't need lock here; nobody else touches the iova range */
1029 	do {
1030 		large_page = 1;
1031 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1032 		if (!pte) {
1033 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1034 			continue;
1035 		}
1036 		do {
1037 			dma_clear_pte(pte);
1038 			start_pfn += lvl_to_nr_pages(large_page);
1039 			pte++;
1040 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1041 
1042 		domain_flush_cache(domain, first_pte,
1043 				   (void *)pte - (void *)first_pte);
1044 
1045 	} while (start_pfn && start_pfn <= last_pfn);
1046 }
1047 
1048 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1049 			       int retain_level, struct dma_pte *pte,
1050 			       unsigned long pfn, unsigned long start_pfn,
1051 			       unsigned long last_pfn)
1052 {
1053 	pfn = max(start_pfn, pfn);
1054 	pte = &pte[pfn_level_offset(pfn, level)];
1055 
1056 	do {
1057 		unsigned long level_pfn;
1058 		struct dma_pte *level_pte;
1059 
1060 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1061 			goto next;
1062 
1063 		level_pfn = pfn & level_mask(level);
1064 		level_pte = phys_to_virt(dma_pte_addr(pte));
1065 
1066 		if (level > 2) {
1067 			dma_pte_free_level(domain, level - 1, retain_level,
1068 					   level_pte, level_pfn, start_pfn,
1069 					   last_pfn);
1070 		}
1071 
1072 		/*
1073 		 * Free the page table if we're below the level we want to
1074 		 * retain and the range covers the entire table.
1075 		 */
1076 		if (level < retain_level && !(start_pfn > level_pfn ||
1077 		      last_pfn < level_pfn + level_size(level) - 1)) {
1078 			dma_clear_pte(pte);
1079 			domain_flush_cache(domain, pte, sizeof(*pte));
1080 			free_pgtable_page(level_pte);
1081 		}
1082 next:
1083 		pfn += level_size(level);
1084 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1085 }
1086 
1087 /*
1088  * clear last level (leaf) ptes and free page table pages below the
1089  * level we wish to keep intact.
1090  */
1091 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1092 				   unsigned long start_pfn,
1093 				   unsigned long last_pfn,
1094 				   int retain_level)
1095 {
1096 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1097 
1098 	/* We don't need lock here; nobody else touches the iova range */
1099 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1100 			   domain->pgd, 0, start_pfn, last_pfn);
1101 
1102 	/* free pgd */
1103 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1104 		free_pgtable_page(domain->pgd);
1105 		domain->pgd = NULL;
1106 	}
1107 }
1108 
1109 /* When a page at a given level is being unlinked from its parent, we don't
1110    need to *modify* it at all. All we need to do is make a list of all the
1111    pages which can be freed just as soon as we've flushed the IOTLB and we
1112    know the hardware page-walk will no longer touch them.
1113    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1114    be freed. */
1115 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1116 				    int level, struct dma_pte *pte,
1117 				    struct list_head *freelist)
1118 {
1119 	struct page *pg;
1120 
1121 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1122 	list_add_tail(&pg->lru, freelist);
1123 
1124 	if (level == 1)
1125 		return;
1126 
1127 	pte = page_address(pg);
1128 	do {
1129 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1130 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1131 		pte++;
1132 	} while (!first_pte_in_page(pte));
1133 }
1134 
1135 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1136 				struct dma_pte *pte, unsigned long pfn,
1137 				unsigned long start_pfn, unsigned long last_pfn,
1138 				struct list_head *freelist)
1139 {
1140 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1141 
1142 	pfn = max(start_pfn, pfn);
1143 	pte = &pte[pfn_level_offset(pfn, level)];
1144 
1145 	do {
1146 		unsigned long level_pfn = pfn & level_mask(level);
1147 
1148 		if (!dma_pte_present(pte))
1149 			goto next;
1150 
1151 		/* If range covers entire pagetable, free it */
1152 		if (start_pfn <= level_pfn &&
1153 		    last_pfn >= level_pfn + level_size(level) - 1) {
1154 			/* These subordinate page tables are going away entirely. Don't
1155 			   bother to clear them; we're just going to *free* them. */
1156 			if (level > 1 && !dma_pte_superpage(pte))
1157 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1158 
1159 			dma_clear_pte(pte);
1160 			if (!first_pte)
1161 				first_pte = pte;
1162 			last_pte = pte;
1163 		} else if (level > 1) {
1164 			/* Recurse down into a level that isn't *entirely* obsolete */
1165 			dma_pte_clear_level(domain, level - 1,
1166 					    phys_to_virt(dma_pte_addr(pte)),
1167 					    level_pfn, start_pfn, last_pfn,
1168 					    freelist);
1169 		}
1170 next:
1171 		pfn = level_pfn + level_size(level);
1172 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1173 
1174 	if (first_pte)
1175 		domain_flush_cache(domain, first_pte,
1176 				   (void *)++last_pte - (void *)first_pte);
1177 }
1178 
1179 /* We can't just free the pages because the IOMMU may still be walking
1180    the page tables, and may have cached the intermediate levels. The
1181    pages can only be freed after the IOTLB flush has been done. */
1182 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1183 			 unsigned long last_pfn, struct list_head *freelist)
1184 {
1185 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1186 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1187 	BUG_ON(start_pfn > last_pfn);
1188 
1189 	/* we don't need lock here; nobody else touches the iova range */
1190 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1191 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1192 
1193 	/* free pgd */
1194 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1195 		struct page *pgd_page = virt_to_page(domain->pgd);
1196 		list_add_tail(&pgd_page->lru, freelist);
1197 		domain->pgd = NULL;
1198 	}
1199 }
1200 
1201 /* iommu handling */
1202 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1203 {
1204 	struct root_entry *root;
1205 
1206 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1207 	if (!root) {
1208 		pr_err("Allocating root entry for %s failed\n",
1209 			iommu->name);
1210 		return -ENOMEM;
1211 	}
1212 
1213 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1214 	iommu->root_entry = root;
1215 
1216 	return 0;
1217 }
1218 
1219 static void iommu_set_root_entry(struct intel_iommu *iommu)
1220 {
1221 	u64 addr;
1222 	u32 sts;
1223 	unsigned long flag;
1224 
1225 	addr = virt_to_phys(iommu->root_entry);
1226 	if (sm_supported(iommu))
1227 		addr |= DMA_RTADDR_SMT;
1228 
1229 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1230 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1231 
1232 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1233 
1234 	/* Make sure hardware complete it */
1235 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1236 		      readl, (sts & DMA_GSTS_RTPS), sts);
1237 
1238 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 
1240 	/*
1241 	 * Hardware invalidates all DMA remapping hardware translation
1242 	 * caches as part of SRTP flow.
1243 	 */
1244 	if (cap_esrtps(iommu->cap))
1245 		return;
1246 
1247 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1248 	if (sm_supported(iommu))
1249 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1250 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1251 }
1252 
1253 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1254 {
1255 	u32 val;
1256 	unsigned long flag;
1257 
1258 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1259 		return;
1260 
1261 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1262 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1263 
1264 	/* Make sure hardware complete it */
1265 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1266 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1267 
1268 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1269 }
1270 
1271 /* return value determines if we need a write buffer flush */
1272 static void __iommu_flush_context(struct intel_iommu *iommu,
1273 				  u16 did, u16 source_id, u8 function_mask,
1274 				  u64 type)
1275 {
1276 	u64 val = 0;
1277 	unsigned long flag;
1278 
1279 	switch (type) {
1280 	case DMA_CCMD_GLOBAL_INVL:
1281 		val = DMA_CCMD_GLOBAL_INVL;
1282 		break;
1283 	case DMA_CCMD_DOMAIN_INVL:
1284 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1285 		break;
1286 	case DMA_CCMD_DEVICE_INVL:
1287 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1288 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1289 		break;
1290 	default:
1291 		BUG();
1292 	}
1293 	val |= DMA_CCMD_ICC;
1294 
1295 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1296 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1297 
1298 	/* Make sure hardware complete it */
1299 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1300 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1301 
1302 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1303 }
1304 
1305 /* return value determines if we need a write buffer flush */
1306 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1307 				u64 addr, unsigned int size_order, u64 type)
1308 {
1309 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1310 	u64 val = 0, val_iva = 0;
1311 	unsigned long flag;
1312 
1313 	switch (type) {
1314 	case DMA_TLB_GLOBAL_FLUSH:
1315 		/* global flush doesn't need to set IVA_REG */
1316 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1317 		break;
1318 	case DMA_TLB_DSI_FLUSH:
1319 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1320 		break;
1321 	case DMA_TLB_PSI_FLUSH:
1322 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1323 		/* IH bit is passed in as part of address */
1324 		val_iva = size_order | addr;
1325 		break;
1326 	default:
1327 		BUG();
1328 	}
1329 	/* Note: set drain read/write */
1330 #if 0
1331 	/*
1332 	 * This is probably meant to be extra safe. It looks like we can
1333 	 * ignore it without any impact.
1334 	 */
1335 	if (cap_read_drain(iommu->cap))
1336 		val |= DMA_TLB_READ_DRAIN;
1337 #endif
1338 	if (cap_write_drain(iommu->cap))
1339 		val |= DMA_TLB_WRITE_DRAIN;
1340 
1341 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1342 	/* Note: Only uses first TLB reg currently */
1343 	if (val_iva)
1344 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1345 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1346 
1347 	/* Make sure hardware complete it */
1348 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1349 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1350 
1351 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1352 
1353 	/* check IOTLB invalidation granularity */
1354 	if (DMA_TLB_IAIG(val) == 0)
1355 		pr_err("Flush IOTLB failed\n");
1356 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1357 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1358 			(unsigned long long)DMA_TLB_IIRG(type),
1359 			(unsigned long long)DMA_TLB_IAIG(val));
1360 }
1361 
1362 static struct device_domain_info *
1363 domain_lookup_dev_info(struct dmar_domain *domain,
1364 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1365 {
1366 	struct device_domain_info *info;
1367 	unsigned long flags;
1368 
1369 	spin_lock_irqsave(&domain->lock, flags);
1370 	list_for_each_entry(info, &domain->devices, link) {
1371 		if (info->iommu == iommu && info->bus == bus &&
1372 		    info->devfn == devfn) {
1373 			spin_unlock_irqrestore(&domain->lock, flags);
1374 			return info;
1375 		}
1376 	}
1377 	spin_unlock_irqrestore(&domain->lock, flags);
1378 
1379 	return NULL;
1380 }
1381 
1382 static void domain_update_iotlb(struct dmar_domain *domain)
1383 {
1384 	struct device_domain_info *info;
1385 	bool has_iotlb_device = false;
1386 	unsigned long flags;
1387 
1388 	spin_lock_irqsave(&domain->lock, flags);
1389 	list_for_each_entry(info, &domain->devices, link) {
1390 		if (info->ats_enabled) {
1391 			has_iotlb_device = true;
1392 			break;
1393 		}
1394 	}
1395 	domain->has_iotlb_device = has_iotlb_device;
1396 	spin_unlock_irqrestore(&domain->lock, flags);
1397 }
1398 
1399 /*
1400  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1401  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1402  * check because it applies only to the built-in QAT devices and it doesn't
1403  * grant additional privileges.
1404  */
1405 #define BUGGY_QAT_DEVID_MASK 0x4940
1406 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1407 {
1408 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1409 		return false;
1410 
1411 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1412 		return false;
1413 
1414 	return true;
1415 }
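
/*
 * Example (illustrative): masking with 0xfffc matches exactly the four
 * device IDs 0x4940-0x4943, e.g. (0x4943 & 0xfffc) == 0x4940, whereas
 * (0x4944 & 0xfffc) == 0x4944 and is rejected.
 */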
1416 
1417 static void iommu_enable_pci_caps(struct device_domain_info *info)
1418 {
1419 	struct pci_dev *pdev;
1420 
1421 	if (!dev_is_pci(info->dev))
1422 		return;
1423 
1424 	pdev = to_pci_dev(info->dev);
1425 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1426 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1427 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1428 	 * reserved, which should be set to 0.
1429 	 */
1430 	if (!ecap_dit(info->iommu->ecap))
1431 		info->pfsid = 0;
1432 	else {
1433 		struct pci_dev *pf_pdev;
1434 
1435 		/* pdev will be returned if the device is not a VF */
1436 		pf_pdev = pci_physfn(pdev);
1437 		info->pfsid = pci_dev_id(pf_pdev);
1438 	}
1439 
1440 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1441 	   the device if you enable PASID support after ATS support is
1442 	   undefined. So always enable PASID support on devices which
1443 	   have it, even if we can't yet know if we're ever going to
1444 	   use it. */
1445 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1446 		info->pasid_enabled = 1;
1447 
1448 	if (info->pri_supported &&
1449 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1450 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1451 		info->pri_enabled = 1;
1452 
1453 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1454 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1455 		info->ats_enabled = 1;
1456 		domain_update_iotlb(info->domain);
1457 		info->ats_qdep = pci_ats_queue_depth(pdev);
1458 	}
1459 }
1460 
1461 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1462 {
1463 	struct pci_dev *pdev;
1464 
1465 	if (!dev_is_pci(info->dev))
1466 		return;
1467 
1468 	pdev = to_pci_dev(info->dev);
1469 
1470 	if (info->ats_enabled) {
1471 		pci_disable_ats(pdev);
1472 		info->ats_enabled = 0;
1473 		domain_update_iotlb(info->domain);
1474 	}
1475 
1476 	if (info->pri_enabled) {
1477 		pci_disable_pri(pdev);
1478 		info->pri_enabled = 0;
1479 	}
1480 
1481 	if (info->pasid_enabled) {
1482 		pci_disable_pasid(pdev);
1483 		info->pasid_enabled = 0;
1484 	}
1485 }
1486 
1487 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1488 				    u64 addr, unsigned int mask)
1489 {
1490 	u16 sid, qdep;
1491 
1492 	if (!info || !info->ats_enabled)
1493 		return;
1494 
1495 	sid = info->bus << 8 | info->devfn;
1496 	qdep = info->ats_qdep;
1497 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1498 			   qdep, addr, mask);
1499 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1500 }
1501 
1502 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1503 				  u64 addr, unsigned mask)
1504 {
1505 	struct device_domain_info *info;
1506 	unsigned long flags;
1507 
1508 	if (!domain->has_iotlb_device)
1509 		return;
1510 
1511 	spin_lock_irqsave(&domain->lock, flags);
1512 	list_for_each_entry(info, &domain->devices, link)
1513 		__iommu_flush_dev_iotlb(info, addr, mask);
1514 	spin_unlock_irqrestore(&domain->lock, flags);
1515 }
1516 
1517 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1518 				  struct dmar_domain *domain,
1519 				  unsigned long pfn, unsigned int pages,
1520 				  int ih, int map)
1521 {
1522 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1523 	unsigned int mask = ilog2(aligned_pages);
1524 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1525 	u16 did = domain_id_iommu(domain, iommu);
1526 
1527 	BUG_ON(pages == 0);
1528 
1529 	if (ih)
1530 		ih = 1 << 6;
1531 
1532 	if (domain_use_first_level(domain)) {
1533 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1534 	} else {
1535 		unsigned long bitmask = aligned_pages - 1;
1536 
1537 		/*
1538 		 * PSI masks the low order bits of the base address. If the
1539 		 * address isn't aligned to the mask, then compute a mask value
1540 		 * needed to ensure the target range is flushed.
1541 		 */
1542 		if (unlikely(bitmask & pfn)) {
1543 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1544 
1545 			/*
1546 			 * Since end_pfn <= pfn + bitmask, the only way bits
1547 			 * higher than bitmask can differ in pfn and end_pfn is
1548 			 * by carrying. This means after masking out bitmask,
1549 			 * high bits starting with the first set bit in
1550 			 * shared_bits are all equal in both pfn and end_pfn.
1551 			 */
1552 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1553 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1554 		}
1555 
1556 		/*
1557 	 * Fall back to domain selective flush if no PSI support or
1558 		 * the size is too big.
1559 		 */
1560 		if (!cap_pgsel_inv(iommu->cap) ||
1561 		    mask > cap_max_amask_val(iommu->cap))
1562 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1563 							DMA_TLB_DSI_FLUSH);
1564 		else
1565 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1566 							DMA_TLB_PSI_FLUSH);
1567 	}
1568 
1569 	/*
1570 	 * In caching mode, changes of pages from non-present to present require
1571 	 * a flush. However, the device IOTLB doesn't need to be flushed here.
1572 	 */
1573 	if (!cap_caching_mode(iommu->cap) || !map)
1574 		iommu_flush_dev_iotlb(domain, addr, mask);
1575 }
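
/*
 * Worked example (illustrative): flushing pages == 4 starting at
 * pfn == 0x1003 gives aligned_pages == 4, mask == 2 and bitmask == 0x3.
 * Since pfn is not aligned to bitmask, end_pfn == 0x1006 and
 *
 *	shared_bits == ~(0x1003 ^ 0x1006) & ~0x3, whose lowest set bit is 3,
 *
 * so mask is widened to 3: a single PSI covering the eight pages at
 * 0x1000-0x1007, which includes the whole 0x1003-0x1006 range.
 */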
1576 
1577 /* Notification for newly created mappings */
1578 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1579 					struct dmar_domain *domain,
1580 					unsigned long pfn, unsigned int pages)
1581 {
1582 	/*
1583 	 * It's a non-present to present mapping. Only flush if caching mode
1584 	 * and second level.
1585 	 */
1586 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1587 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1588 	else
1589 		iommu_flush_write_buffer(iommu);
1590 }
1591 
1592 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1593 {
1594 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1595 	struct iommu_domain_info *info;
1596 	unsigned long idx;
1597 
1598 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1599 		struct intel_iommu *iommu = info->iommu;
1600 		u16 did = domain_id_iommu(dmar_domain, iommu);
1601 
1602 		if (domain_use_first_level(dmar_domain))
1603 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1604 		else
1605 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1606 						 DMA_TLB_DSI_FLUSH);
1607 
1608 		if (!cap_caching_mode(iommu->cap))
1609 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1610 	}
1611 }
1612 
1613 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1614 {
1615 	u32 pmen;
1616 	unsigned long flags;
1617 
1618 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1619 		return;
1620 
1621 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1623 	pmen &= ~DMA_PMEN_EPM;
1624 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1625 
1626 	/* wait for the protected region status bit to clear */
1627 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1628 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1629 
1630 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1631 }
1632 
1633 static void iommu_enable_translation(struct intel_iommu *iommu)
1634 {
1635 	u32 sts;
1636 	unsigned long flags;
1637 
1638 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1639 	iommu->gcmd |= DMA_GCMD_TE;
1640 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1641 
1642 	/* Make sure hardware complete it */
1643 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1644 		      readl, (sts & DMA_GSTS_TES), sts);
1645 
1646 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1647 }
1648 
1649 static void iommu_disable_translation(struct intel_iommu *iommu)
1650 {
1651 	u32 sts;
1652 	unsigned long flag;
1653 
1654 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1655 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1656 		return;
1657 
1658 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1659 	iommu->gcmd &= ~DMA_GCMD_TE;
1660 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1661 
1662 	/* Make sure hardware complete it */
1663 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1664 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1665 
1666 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1667 }
1668 
1669 static int iommu_init_domains(struct intel_iommu *iommu)
1670 {
1671 	u32 ndomains;
1672 
1673 	ndomains = cap_ndoms(iommu->cap);
1674 	pr_debug("%s: Number of Domains supported <%d>\n",
1675 		 iommu->name, ndomains);
1676 
1677 	spin_lock_init(&iommu->lock);
1678 
1679 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1680 	if (!iommu->domain_ids)
1681 		return -ENOMEM;
1682 
1683 	/*
1684 	 * If Caching mode is set, then invalid translations are tagged
1685 	 * with domain-id 0, hence we need to pre-allocate it. We also
1686 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1687 	 * make sure it is not used for a real domain.
1688 	 */
1689 	set_bit(0, iommu->domain_ids);
1690 
1691 	/*
1692 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1693 	 * entry for first-level or pass-through translation modes be
1694 	 * programmed with a domain id different from those used for
1695 	 * second-level or nested translation. We reserve a domain id for
1696 	 * this purpose.
1697 	 */
1698 	if (sm_supported(iommu))
1699 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1700 
1701 	return 0;
1702 }
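/*
 * Note on the reservations above: domain-id 0 is always marked used, and
 * FLPT_DEFAULT_DID is additionally reserved when scalable mode is
 * supported, so a freshly initialized bitmap already carries up to
 * NUM_RESERVED_DID set bits - exactly what the sanity check in
 * disable_dmar_iommu() below tolerates.
 */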
1703 
1704 static void disable_dmar_iommu(struct intel_iommu *iommu)
1705 {
1706 	if (!iommu->domain_ids)
1707 		return;
1708 
1709 	/*
1710 	 * All iommu domains must have been detached from the devices,
1711 	 * hence there should be no domain IDs in use.
1712 	 */
1713 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1714 		    > NUM_RESERVED_DID))
1715 		return;
1716 
1717 	if (iommu->gcmd & DMA_GCMD_TE)
1718 		iommu_disable_translation(iommu);
1719 }
1720 
1721 static void free_dmar_iommu(struct intel_iommu *iommu)
1722 {
1723 	if (iommu->domain_ids) {
1724 		bitmap_free(iommu->domain_ids);
1725 		iommu->domain_ids = NULL;
1726 	}
1727 
1728 	if (iommu->copied_tables) {
1729 		bitmap_free(iommu->copied_tables);
1730 		iommu->copied_tables = NULL;
1731 	}
1732 
1733 	/* free context mapping */
1734 	free_context_table(iommu);
1735 
1736 #ifdef CONFIG_INTEL_IOMMU_SVM
1737 	if (pasid_supported(iommu)) {
1738 		if (ecap_prs(iommu->ecap))
1739 			intel_svm_finish_prq(iommu);
1740 	}
1741 	if (vccap_pasid(iommu->vccap))
1742 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1743 
1744 #endif
1745 }
1746 
1747 /*
1748  * Check and return whether first level is used by default for
1749  * DMA translation.
1750  */
1751 static bool first_level_by_default(unsigned int type)
1752 {
1753 	/* Only SL is available in legacy mode */
1754 	if (!scalable_mode_support())
1755 		return false;
1756 
1757 	/* Only one level (either FL or SL) is available, just use it */
1758 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1759 		return intel_cap_flts_sanity();
1760 
1761 	/* Both levels are available, decide it based on domain type */
1762 	return type != IOMMU_DOMAIN_UNMANAGED;
1763 }
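/*
 * The XOR above reduces to a small decision table:
 *
 *   FL supported  SL supported  result
 *   no            yes           second level
 *   yes           no            first level
 *   yes           yes           first level for DMA-API/identity domains,
 *                               second level for IOMMU_DOMAIN_UNMANAGED
 */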
1764 
1765 static struct dmar_domain *alloc_domain(unsigned int type)
1766 {
1767 	struct dmar_domain *domain;
1768 
1769 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1770 	if (!domain)
1771 		return NULL;
1772 
1773 	domain->nid = NUMA_NO_NODE;
1774 	if (first_level_by_default(type))
1775 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1776 	domain->has_iotlb_device = false;
1777 	INIT_LIST_HEAD(&domain->devices);
1778 	spin_lock_init(&domain->lock);
1779 	xa_init(&domain->iommu_array);
1780 
1781 	return domain;
1782 }
1783 
1784 static int domain_attach_iommu(struct dmar_domain *domain,
1785 			       struct intel_iommu *iommu)
1786 {
1787 	struct iommu_domain_info *info, *curr;
1788 	unsigned long ndomains;
1789 	int num, ret = -ENOSPC;
1790 
1791 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1792 	if (!info)
1793 		return -ENOMEM;
1794 
1795 	spin_lock(&iommu->lock);
1796 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1797 	if (curr) {
1798 		curr->refcnt++;
1799 		spin_unlock(&iommu->lock);
1800 		kfree(info);
1801 		return 0;
1802 	}
1803 
1804 	ndomains = cap_ndoms(iommu->cap);
1805 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1806 	if (num >= ndomains) {
1807 		pr_err("%s: No free domain ids\n", iommu->name);
1808 		goto err_unlock;
1809 	}
1810 
1811 	set_bit(num, iommu->domain_ids);
1812 	info->refcnt	= 1;
1813 	info->did	= num;
1814 	info->iommu	= iommu;
1815 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1816 			  NULL, info, GFP_ATOMIC);
1817 	if (curr) {
1818 		ret = xa_err(curr) ? : -EBUSY;
1819 		goto err_clear;
1820 	}
1821 	domain_update_iommu_cap(domain);
1822 
1823 	spin_unlock(&iommu->lock);
1824 	return 0;
1825 
1826 err_clear:
1827 	clear_bit(info->did, iommu->domain_ids);
1828 err_unlock:
1829 	spin_unlock(&iommu->lock);
1830 	kfree(info);
1831 	return ret;
1832 }
1833 
1834 static void domain_detach_iommu(struct dmar_domain *domain,
1835 				struct intel_iommu *iommu)
1836 {
1837 	struct iommu_domain_info *info;
1838 
1839 	spin_lock(&iommu->lock);
1840 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1841 	if (--info->refcnt == 0) {
1842 		clear_bit(info->did, iommu->domain_ids);
1843 		xa_erase(&domain->iommu_array, iommu->seq_id);
1844 		domain->nid = NUMA_NO_NODE;
1845 		domain_update_iommu_cap(domain);
1846 		kfree(info);
1847 	}
1848 	spin_unlock(&iommu->lock);
1849 }
1850 
1851 static inline int guestwidth_to_adjustwidth(int gaw)
1852 {
1853 	int agaw;
1854 	int r = (gaw - 12) % 9;
1855 
1856 	if (r == 0)
1857 		agaw = gaw;
1858 	else
1859 		agaw = gaw + 9 - r;
1860 	if (agaw > 64)
1861 		agaw = 64;
1862 	return agaw;
1863 }
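/*
 * The adjusted width is the guest width rounded up to the next 9-bit
 * page-table level boundary above the 12-bit page offset, capped at 64.
 * Examples: gaw = 48 -> r = 0 -> agaw = 48
 *           gaw = 36 -> r = 6 -> agaw = 39
 *           gaw = 57 -> r = 0 -> agaw = 57
 */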
1864 
1865 static void domain_exit(struct dmar_domain *domain)
1866 {
1867 	if (domain->pgd) {
1868 		LIST_HEAD(freelist);
1869 
1870 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1871 		put_pages_list(&freelist);
1872 	}
1873 
1874 	if (WARN_ON(!list_empty(&domain->devices)))
1875 		return;
1876 
1877 	kfree(domain);
1878 }
1879 
1880 /*
1881  * Get the PASID directory size for a scalable mode context entry.
1882  * A value of X in the PDTS field of a scalable mode context entry
1883  * indicates a PASID directory with 2^(X + 7) entries.
1884  */
1885 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1886 {
1887 	unsigned long pds, max_pde;
1888 
1889 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1890 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1891 	if (pds < 7)
1892 		return 0;
1893 
1894 	return pds - 7;
1895 }
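/*
 * Example: if max_pde works out to a power of two, find_first_bit()
 * returns its bit position N and the returned pds of N - 7 encodes a
 * directory with 2^N entries; the "pds < 7" clamp means the smallest
 * encodable directory is 2^7 = 128 entries.
 */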
1896 
1897 /*
1898  * Set the RID_PASID field of a scalable mode context entry. The
1899  * IOMMU hardware will use the PASID value set in this field for
1900  * translating DMA requests that carry no PASID.
1901  */
1902 static inline void
1903 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1904 {
1905 	context->hi |= pasid & ((1 << 20) - 1);
1906 }
1907 
1908 /*
1909  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1910  * entry.
1911  */
1912 static inline void context_set_sm_dte(struct context_entry *context)
1913 {
1914 	context->lo |= (1 << 2);
1915 }
1916 
1917 /*
1918  * Set the PRE(Page Request Enable) field of a scalable mode context
1919  * entry.
1920  */
1921 static inline void context_set_sm_pre(struct context_entry *context)
1922 {
1923 	context->lo |= (1 << 4);
1924 }
1925 
1926 /* Convert value to context PASID directory size field coding. */
1927 #define context_pdts(pds)	(((pds) & 0x7) << 9)
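/* Example: pds = 2 is encoded as 2 << 9 = 0x400, i.e. bits 11:9 of context->lo. */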
1928 
1929 static int domain_context_mapping_one(struct dmar_domain *domain,
1930 				      struct intel_iommu *iommu,
1931 				      struct pasid_table *table,
1932 				      u8 bus, u8 devfn)
1933 {
1934 	struct device_domain_info *info =
1935 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1936 	u16 did = domain_id_iommu(domain, iommu);
1937 	int translation = CONTEXT_TT_MULTI_LEVEL;
1938 	struct context_entry *context;
1939 	int ret;
1940 
1941 	WARN_ON(did == 0);
1942 
1943 	if (hw_pass_through && domain_type_is_si(domain))
1944 		translation = CONTEXT_TT_PASS_THROUGH;
1945 
1946 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1947 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1948 
1949 	BUG_ON(!domain->pgd);
1950 
1951 	spin_lock(&iommu->lock);
1952 	ret = -ENOMEM;
1953 	context = iommu_context_addr(iommu, bus, devfn, 1);
1954 	if (!context)
1955 		goto out_unlock;
1956 
1957 	ret = 0;
1958 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1959 		goto out_unlock;
1960 
1961 	/*
1962 	 * For kdump cases, old valid entries may be cached due to in-flight
1963 	 * DMA and the copied pgtable, but there is no unmapping behaviour
1964 	 * for them, thus we need an explicit cache flush for the
1965 	 * newly-mapped device. For kdump, at this point, the device is
1966 	 * supposed to have finished reset at its driver probe stage, so no
1967 	 * in-flight DMA will exist, and we don't need to worry about it
1968 	 * hereafter.
1969 	 */
1970 	if (context_copied(iommu, bus, devfn)) {
1971 		u16 did_old = context_domain_id(context);
1972 
1973 		if (did_old < cap_ndoms(iommu->cap)) {
1974 			iommu->flush.flush_context(iommu, did_old,
1975 						   (((u16)bus) << 8) | devfn,
1976 						   DMA_CCMD_MASK_NOBIT,
1977 						   DMA_CCMD_DEVICE_INVL);
1978 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1979 						 DMA_TLB_DSI_FLUSH);
1980 		}
1981 
1982 		clear_context_copied(iommu, bus, devfn);
1983 	}
1984 
1985 	context_clear_entry(context);
1986 
1987 	if (sm_supported(iommu)) {
1988 		unsigned long pds;
1989 
1990 		WARN_ON(!table);
1991 
1992 		/* Setup the PASID DIR pointer: */
1993 		pds = context_get_sm_pds(table);
1994 		context->lo = (u64)virt_to_phys(table->table) |
1995 				context_pdts(pds);
1996 
1997 		/* Setup the RID_PASID field: */
1998 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1999 
2000 		/*
2001 		 * Setup the Device-TLB enable bit and Page request
2002 		 * Enable bit:
2003 		 */
2004 		if (info && info->ats_supported)
2005 			context_set_sm_dte(context);
2006 		if (info && info->pri_supported)
2007 			context_set_sm_pre(context);
2008 		if (info && info->pasid_supported)
2009 			context_set_pasid(context);
2010 	} else {
2011 		struct dma_pte *pgd = domain->pgd;
2012 		int agaw;
2013 
2014 		context_set_domain_id(context, did);
2015 
2016 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2017 			/*
2018 			 * Skip top levels of page tables for iommu which has
2019 			 * less agaw than default. Unnecessary for PT mode.
2020 			 */
2021 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2022 				ret = -ENOMEM;
2023 				pgd = phys_to_virt(dma_pte_addr(pgd));
2024 				if (!dma_pte_present(pgd))
2025 					goto out_unlock;
2026 			}
2027 
2028 			if (info && info->ats_supported)
2029 				translation = CONTEXT_TT_DEV_IOTLB;
2030 			else
2031 				translation = CONTEXT_TT_MULTI_LEVEL;
2032 
2033 			context_set_address_root(context, virt_to_phys(pgd));
2034 			context_set_address_width(context, agaw);
2035 		} else {
2036 			/*
2037 			 * In pass-through mode, AW must be programmed to
2038 			 * indicate the largest AGAW value supported by
2039 			 * hardware, and ASR is ignored by hardware.
2040 			 */
2041 			context_set_address_width(context, iommu->msagaw);
2042 		}
2043 
2044 		context_set_translation_type(context, translation);
2045 	}
2046 
2047 	context_set_fault_enable(context);
2048 	context_set_present(context);
2049 	if (!ecap_coherent(iommu->ecap))
2050 		clflush_cache_range(context, sizeof(*context));
2051 
2052 	/*
2053 	 * It's a non-present to present mapping. If hardware doesn't cache
2054 	 * non-present entries we only need to flush the write-buffer. If it
2055 	 * _does_ cache non-present entries, then it does so in the special
2056 	 * domain #0, which we have to flush:
2057 	 */
2058 	if (cap_caching_mode(iommu->cap)) {
2059 		iommu->flush.flush_context(iommu, 0,
2060 					   (((u16)bus) << 8) | devfn,
2061 					   DMA_CCMD_MASK_NOBIT,
2062 					   DMA_CCMD_DEVICE_INVL);
2063 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2064 	} else {
2065 		iommu_flush_write_buffer(iommu);
2066 	}
2067 
2068 	ret = 0;
2069 
2070 out_unlock:
2071 	spin_unlock(&iommu->lock);
2072 
2073 	return ret;
2074 }
2075 
2076 struct domain_context_mapping_data {
2077 	struct dmar_domain *domain;
2078 	struct intel_iommu *iommu;
2079 	struct pasid_table *table;
2080 };
2081 
2082 static int domain_context_mapping_cb(struct pci_dev *pdev,
2083 				     u16 alias, void *opaque)
2084 {
2085 	struct domain_context_mapping_data *data = opaque;
2086 
2087 	return domain_context_mapping_one(data->domain, data->iommu,
2088 					  data->table, PCI_BUS_NUM(alias),
2089 					  alias & 0xff);
2090 }
2091 
2092 static int
2093 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2094 {
2095 	struct domain_context_mapping_data data;
2096 	struct pasid_table *table;
2097 	struct intel_iommu *iommu;
2098 	u8 bus, devfn;
2099 
2100 	iommu = device_to_iommu(dev, &bus, &devfn);
2101 	if (!iommu)
2102 		return -ENODEV;
2103 
2104 	table = intel_pasid_get_table(dev);
2105 
2106 	if (!dev_is_pci(dev))
2107 		return domain_context_mapping_one(domain, iommu, table,
2108 						  bus, devfn);
2109 
2110 	data.domain = domain;
2111 	data.iommu = iommu;
2112 	data.table = table;
2113 
2114 	return pci_for_each_dma_alias(to_pci_dev(dev),
2115 				      &domain_context_mapping_cb, &data);
2116 }
2117 
2118 static int domain_context_mapped_cb(struct pci_dev *pdev,
2119 				    u16 alias, void *opaque)
2120 {
2121 	struct intel_iommu *iommu = opaque;
2122 
2123 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2124 }
2125 
2126 static int domain_context_mapped(struct device *dev)
2127 {
2128 	struct intel_iommu *iommu;
2129 	u8 bus, devfn;
2130 
2131 	iommu = device_to_iommu(dev, &bus, &devfn);
2132 	if (!iommu)
2133 		return -ENODEV;
2134 
2135 	if (!dev_is_pci(dev))
2136 		return device_context_mapped(iommu, bus, devfn);
2137 
2138 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2139 				       domain_context_mapped_cb, iommu);
2140 }
2141 
2142 /* Returns a number of VTD pages, but aligned to MM page size */
2143 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2144 					    size_t size)
2145 {
2146 	host_addr &= ~PAGE_MASK;
2147 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2148 }
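/*
 * Example (assuming 4KiB MM pages): host_addr = 0x1234, size = 0x2000
 * gives an in-page offset of 0x234, PAGE_ALIGN(0x2234) = 0x3000, i.e.
 * three VT-d pages even though the size alone only spans two.
 */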
2149 
2150 /* Return largest possible superpage level for a given mapping */
2151 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2152 					  unsigned long iov_pfn,
2153 					  unsigned long phy_pfn,
2154 					  unsigned long pages)
2155 {
2156 	int support, level = 1;
2157 	unsigned long pfnmerge;
2158 
2159 	support = domain->iommu_superpage;
2160 
2161 	/* To use a large page, the virtual *and* physical addresses
2162 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2163 	   of them will mean we have to use smaller pages. So just
2164 	   merge them and check both at once. */
2165 	pfnmerge = iov_pfn | phy_pfn;
2166 
2167 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2168 		pages >>= VTD_STRIDE_SHIFT;
2169 		if (!pages)
2170 			break;
2171 		pfnmerge >>= VTD_STRIDE_SHIFT;
2172 		level++;
2173 		support--;
2174 	}
2175 	return level;
2176 }
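/*
 * Example: with iov_pfn and phy_pfn both 2MiB aligned (low 9 PFN bits
 * clear), at least 512 pages to map and domain->iommu_superpage >= 1,
 * the loop promotes the mapping to level 2, i.e. a 2MiB superpage; it
 * only reaches level 3 when everything is additionally 1GiB aligned
 * and sized, otherwise it stays at level 1 (4KiB).
 */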
2177 
2178 /*
2179  * Ensure that old small page tables are removed to make room for superpage(s).
2180  * We're going to add new large pages, so make sure we don't remove their parent
2181  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2182  */
2183 static void switch_to_super_page(struct dmar_domain *domain,
2184 				 unsigned long start_pfn,
2185 				 unsigned long end_pfn, int level)
2186 {
2187 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2188 	struct iommu_domain_info *info;
2189 	struct dma_pte *pte = NULL;
2190 	unsigned long i;
2191 
2192 	while (start_pfn <= end_pfn) {
2193 		if (!pte)
2194 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2195 
2196 		if (dma_pte_present(pte)) {
2197 			dma_pte_free_pagetable(domain, start_pfn,
2198 					       start_pfn + lvl_pages - 1,
2199 					       level + 1);
2200 
2201 			xa_for_each(&domain->iommu_array, i, info)
2202 				iommu_flush_iotlb_psi(info->iommu, domain,
2203 						      start_pfn, lvl_pages,
2204 						      0, 0);
2205 		}
2206 
2207 		pte++;
2208 		start_pfn += lvl_pages;
2209 		if (first_pte_in_page(pte))
2210 			pte = NULL;
2211 	}
2212 }
2213 
2214 static int
2215 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2216 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2217 {
2218 	struct dma_pte *first_pte = NULL, *pte = NULL;
2219 	unsigned int largepage_lvl = 0;
2220 	unsigned long lvl_pages = 0;
2221 	phys_addr_t pteval;
2222 	u64 attr;
2223 
2224 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2225 
2226 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2227 		return -EINVAL;
2228 
2229 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2230 	attr |= DMA_FL_PTE_PRESENT;
2231 	if (domain_use_first_level(domain)) {
2232 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2233 		if (prot & DMA_PTE_WRITE)
2234 			attr |= DMA_FL_PTE_DIRTY;
2235 	}
2236 
2237 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2238 
2239 	while (nr_pages > 0) {
2240 		uint64_t tmp;
2241 
2242 		if (!pte) {
2243 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2244 					phys_pfn, nr_pages);
2245 
2246 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2247 			if (!pte)
2248 				return -ENOMEM;
2249 			first_pte = pte;
2250 
2251 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2252 
2253 			/* It is a large page */
2254 			if (largepage_lvl > 1) {
2255 				unsigned long end_pfn;
2256 				unsigned long pages_to_remove;
2257 
2258 				pteval |= DMA_PTE_LARGE_PAGE;
2259 				pages_to_remove = min_t(unsigned long, nr_pages,
2260 							nr_pte_to_next_page(pte) * lvl_pages);
2261 				end_pfn = iov_pfn + pages_to_remove - 1;
2262 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2263 			} else {
2264 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2265 			}
2266 
2267 		}
2268 		/* We don't need a lock here; nobody else
2269 		 * touches the iova range.
2270 		 */
2271 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2272 		if (tmp) {
2273 			static int dumps = 5;
2274 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2275 				iov_pfn, tmp, (unsigned long long)pteval);
2276 			if (dumps) {
2277 				dumps--;
2278 				debug_dma_dump_mappings(NULL);
2279 			}
2280 			WARN_ON(1);
2281 		}
2282 
2283 		nr_pages -= lvl_pages;
2284 		iov_pfn += lvl_pages;
2285 		phys_pfn += lvl_pages;
2286 		pteval += lvl_pages * VTD_PAGE_SIZE;
2287 
2288 		/* If the next PTE would be the first in a new page, then we
2289 		 * need to flush the cache on the entries we've just written.
2290 		 * And then we'll need to recalculate 'pte', so clear it and
2291 		 * let it get set again in the if (!pte) block above.
2292 		 *
2293 		 * If we're done (!nr_pages) we need to flush the cache too.
2294 		 *
2295 		 * Also if we've been setting superpages, we may need to
2296 		 * recalculate 'pte' and switch back to smaller pages for the
2297 		 * end of the mapping, if the trailing size is not enough to
2298 		 * use another superpage (i.e. nr_pages < lvl_pages).
2299 		 */
2300 		pte++;
2301 		if (!nr_pages || first_pte_in_page(pte) ||
2302 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2303 			domain_flush_cache(domain, first_pte,
2304 					   (void *)pte - (void *)first_pte);
2305 			pte = NULL;
2306 		}
2307 	}
2308 
2309 	return 0;
2310 }
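/*
 * In short: __domain_mapping() walks the IOVA range once, picking the
 * largest superpage level the alignment, size and hardware allow for
 * each chunk, installs PTEs with cmpxchg64_local() so an already-present
 * entry is reported rather than silently overwritten, and flushes the
 * CPU cache for each page of PTEs it finishes writing.
 */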
2311 
2312 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2313 {
2314 	struct intel_iommu *iommu = info->iommu;
2315 	struct context_entry *context;
2316 	u16 did_old;
2317 
2318 	if (!iommu)
2319 		return;
2320 
2321 	spin_lock(&iommu->lock);
2322 	context = iommu_context_addr(iommu, bus, devfn, 0);
2323 	if (!context) {
2324 		spin_unlock(&iommu->lock);
2325 		return;
2326 	}
2327 
2328 	if (sm_supported(iommu)) {
2329 		if (hw_pass_through && domain_type_is_si(info->domain))
2330 			did_old = FLPT_DEFAULT_DID;
2331 		else
2332 			did_old = domain_id_iommu(info->domain, iommu);
2333 	} else {
2334 		did_old = context_domain_id(context);
2335 	}
2336 
2337 	context_clear_entry(context);
2338 	__iommu_flush_cache(iommu, context, sizeof(*context));
2339 	spin_unlock(&iommu->lock);
2340 	iommu->flush.flush_context(iommu,
2341 				   did_old,
2342 				   (((u16)bus) << 8) | devfn,
2343 				   DMA_CCMD_MASK_NOBIT,
2344 				   DMA_CCMD_DEVICE_INVL);
2345 
2346 	if (sm_supported(iommu))
2347 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2348 
2349 	iommu->flush.flush_iotlb(iommu,
2350 				 did_old,
2351 				 0,
2352 				 0,
2353 				 DMA_TLB_DSI_FLUSH);
2354 
2355 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2356 }
2357 
2358 static int domain_setup_first_level(struct intel_iommu *iommu,
2359 				    struct dmar_domain *domain,
2360 				    struct device *dev,
2361 				    u32 pasid)
2362 {
2363 	struct dma_pte *pgd = domain->pgd;
2364 	int agaw, level;
2365 	int flags = 0;
2366 
2367 	/*
2368 	 * Skip top levels of page tables for iommu which has
2369 	 * less agaw than default. Unnecessary for PT mode.
2370 	 */
2371 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2372 		pgd = phys_to_virt(dma_pte_addr(pgd));
2373 		if (!dma_pte_present(pgd))
2374 			return -ENOMEM;
2375 	}
2376 
2377 	level = agaw_to_level(agaw);
2378 	if (level != 4 && level != 5)
2379 		return -EINVAL;
2380 
2381 	if (pasid != PASID_RID2PASID)
2382 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2383 	if (level == 5)
2384 		flags |= PASID_FLAG_FL5LP;
2385 
2386 	if (domain->force_snooping)
2387 		flags |= PASID_FLAG_PAGE_SNOOP;
2388 
2389 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2390 					     domain_id_iommu(domain, iommu),
2391 					     flags);
2392 }
2393 
2394 static bool dev_is_real_dma_subdevice(struct device *dev)
2395 {
2396 	return dev && dev_is_pci(dev) &&
2397 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2398 }
2399 
2400 static int iommu_domain_identity_map(struct dmar_domain *domain,
2401 				     unsigned long first_vpfn,
2402 				     unsigned long last_vpfn)
2403 {
2404 	/*
2405 	 * The RMRR range might overlap with the physical memory range,
2406 	 * so clear it first.
2407 	 */
2408 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2409 
2410 	return __domain_mapping(domain, first_vpfn,
2411 				first_vpfn, last_vpfn - first_vpfn + 1,
2412 				DMA_PTE_READ|DMA_PTE_WRITE);
2413 }
2414 
2415 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2416 
2417 static int __init si_domain_init(int hw)
2418 {
2419 	struct dmar_rmrr_unit *rmrr;
2420 	struct device *dev;
2421 	int i, nid, ret;
2422 
2423 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2424 	if (!si_domain)
2425 		return -EFAULT;
2426 
2427 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2428 		domain_exit(si_domain);
2429 		si_domain = NULL;
2430 		return -EFAULT;
2431 	}
2432 
2433 	if (hw)
2434 		return 0;
2435 
2436 	for_each_online_node(nid) {
2437 		unsigned long start_pfn, end_pfn;
2438 		int i;
2439 
2440 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2441 			ret = iommu_domain_identity_map(si_domain,
2442 					mm_to_dma_pfn(start_pfn),
2443 					mm_to_dma_pfn(end_pfn));
2444 			if (ret)
2445 				return ret;
2446 		}
2447 	}
2448 
2449 	/*
2450 	 * Identity map the RMRRs so that devices with RMRRs can also use
2451 	 * the si_domain.
2452 	 */
2453 	for_each_rmrr_units(rmrr) {
2454 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2455 					  i, dev) {
2456 			unsigned long long start = rmrr->base_address;
2457 			unsigned long long end = rmrr->end_address;
2458 
2459 			if (WARN_ON(end < start ||
2460 				    end >> agaw_to_width(si_domain->agaw)))
2461 				continue;
2462 
2463 			ret = iommu_domain_identity_map(si_domain,
2464 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2465 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2466 			if (ret)
2467 				return ret;
2468 		}
2469 	}
2470 
2471 	return 0;
2472 }
2473 
2474 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2475 {
2476 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2477 	struct intel_iommu *iommu;
2478 	unsigned long flags;
2479 	u8 bus, devfn;
2480 	int ret;
2481 
2482 	iommu = device_to_iommu(dev, &bus, &devfn);
2483 	if (!iommu)
2484 		return -ENODEV;
2485 
2486 	ret = domain_attach_iommu(domain, iommu);
2487 	if (ret)
2488 		return ret;
2489 	info->domain = domain;
2490 	spin_lock_irqsave(&domain->lock, flags);
2491 	list_add(&info->link, &domain->devices);
2492 	spin_unlock_irqrestore(&domain->lock, flags);
2493 
2494 	/* PASID table is mandatory for a PCI device in scalable mode. */
2495 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2496 		/* Setup the PASID entry for requests without PASID: */
2497 		if (hw_pass_through && domain_type_is_si(domain))
2498 			ret = intel_pasid_setup_pass_through(iommu, domain,
2499 					dev, PASID_RID2PASID);
2500 		else if (domain_use_first_level(domain))
2501 			ret = domain_setup_first_level(iommu, domain, dev,
2502 					PASID_RID2PASID);
2503 		else
2504 			ret = intel_pasid_setup_second_level(iommu, domain,
2505 					dev, PASID_RID2PASID);
2506 		if (ret) {
2507 			dev_err(dev, "Setup RID2PASID failed\n");
2508 			device_block_translation(dev);
2509 			return ret;
2510 		}
2511 	}
2512 
2513 	ret = domain_context_mapping(domain, dev);
2514 	if (ret) {
2515 		dev_err(dev, "Domain context map failed\n");
2516 		device_block_translation(dev);
2517 		return ret;
2518 	}
2519 
2520 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2521 		iommu_enable_pci_caps(info);
2522 
2523 	return 0;
2524 }
2525 
2526 static bool device_has_rmrr(struct device *dev)
2527 {
2528 	struct dmar_rmrr_unit *rmrr;
2529 	struct device *tmp;
2530 	int i;
2531 
2532 	rcu_read_lock();
2533 	for_each_rmrr_units(rmrr) {
2534 		/*
2535 		 * Return TRUE if this RMRR contains the device that
2536 		 * is passed in.
2537 		 */
2538 		for_each_active_dev_scope(rmrr->devices,
2539 					  rmrr->devices_cnt, i, tmp)
2540 			if (tmp == dev ||
2541 			    is_downstream_to_pci_bridge(dev, tmp)) {
2542 				rcu_read_unlock();
2543 				return true;
2544 			}
2545 	}
2546 	rcu_read_unlock();
2547 	return false;
2548 }
2549 
2550 /**
2551  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2552  * is relaxable (ie. is allowed to be not enforced under some conditions)
2553  * @dev: device handle
2554  *
2555  * We assume that PCI USB devices with RMRRs have them largely
2556  * for historical reasons and that the RMRR space is not actively used post
2557  * boot.  This exclusion may change if vendors begin to abuse it.
2558  *
2559  * The same exception is made for graphics devices, with the requirement that
2560  * any use of the RMRR regions will be torn down before assigning the device
2561  * to a guest.
2562  *
2563  * Return: true if the RMRR is relaxable, false otherwise
2564  */
2565 static bool device_rmrr_is_relaxable(struct device *dev)
2566 {
2567 	struct pci_dev *pdev;
2568 
2569 	if (!dev_is_pci(dev))
2570 		return false;
2571 
2572 	pdev = to_pci_dev(dev);
2573 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2574 		return true;
2575 	else
2576 		return false;
2577 }
2578 
2579 /*
2580  * There are a couple cases where we need to restrict the functionality of
2581  * devices associated with RMRRs.  The first is when evaluating a device for
2582  * identity mapping because problems exist when devices are moved in and out
2583  * of domains and their respective RMRR information is lost.  This means that
2584  * a device with associated RMRRs will never be in a "passthrough" domain.
2585  * The second is use of the device through the IOMMU API.  This interface
2586  * expects to have full control of the IOVA space for the device.  We cannot
2587  * satisfy both the requirement that RMRR access is maintained and have an
2588  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2589  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2590  * We therefore prevent devices associated with an RMRR from participating in
2591  * the IOMMU API, which eliminates them from device assignment.
2592  *
2593  * In both cases, devices which have relaxable RMRRs are not concerned by this
2594  * restriction. See device_rmrr_is_relaxable comment.
2595  */
2596 static bool device_is_rmrr_locked(struct device *dev)
2597 {
2598 	if (!device_has_rmrr(dev))
2599 		return false;
2600 
2601 	if (device_rmrr_is_relaxable(dev))
2602 		return false;
2603 
2604 	return true;
2605 }
2606 
2607 /*
2608  * Return the required default domain type for a specific device.
2609  *
2610  * @dev: the device in query
2611  * @startup: true if this is during early boot
2612  *
2613  * Returns:
2614  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2615  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2616  *  - 0: both identity and dynamic domains work for this device
2617  */
2618 static int device_def_domain_type(struct device *dev)
2619 {
2620 	if (dev_is_pci(dev)) {
2621 		struct pci_dev *pdev = to_pci_dev(dev);
2622 
2623 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2624 			return IOMMU_DOMAIN_IDENTITY;
2625 
2626 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2627 			return IOMMU_DOMAIN_IDENTITY;
2628 	}
2629 
2630 	return 0;
2631 }
2632 
2633 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2634 {
2635 	/*
2636 	 * Start from a sane iommu hardware state.
2637 	 * If queued invalidation was already initialized by us
2638 	 * (for example, while enabling interrupt-remapping) then
2639 	 * things are already rolling from a sane state.
2640 	 */
2641 	if (!iommu->qi) {
2642 		/*
2643 		 * Clear any previous faults.
2644 		 */
2645 		dmar_fault(-1, iommu);
2646 		/*
2647 		 * Disable queued invalidation if supported and already enabled
2648 		 * before OS handover.
2649 		 */
2650 		dmar_disable_qi(iommu);
2651 	}
2652 
2653 	if (dmar_enable_qi(iommu)) {
2654 		/*
2655 		 * Queued invalidation is not enabled, so use register-based invalidation.
2656 		 */
2657 		iommu->flush.flush_context = __iommu_flush_context;
2658 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2659 		pr_info("%s: Using Register based invalidation\n",
2660 			iommu->name);
2661 	} else {
2662 		iommu->flush.flush_context = qi_flush_context;
2663 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2664 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2665 	}
2666 }
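/*
 * Note that the PASID-based and device-IOTLB invalidations used elsewhere
 * in this file (e.g. qi_flush_piotlb() in intel_flush_iotlb_all()) are only
 * available through the queued invalidation interface; the register-based
 * path is a legacy fallback for the basic context and IOTLB flushes set up
 * here.
 */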
2667 
2668 static int copy_context_table(struct intel_iommu *iommu,
2669 			      struct root_entry *old_re,
2670 			      struct context_entry **tbl,
2671 			      int bus, bool ext)
2672 {
2673 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2674 	struct context_entry *new_ce = NULL, ce;
2675 	struct context_entry *old_ce = NULL;
2676 	struct root_entry re;
2677 	phys_addr_t old_ce_phys;
2678 
2679 	tbl_idx = ext ? bus * 2 : bus;
2680 	memcpy(&re, old_re, sizeof(re));
2681 
2682 	for (devfn = 0; devfn < 256; devfn++) {
2683 		/* First calculate the correct index */
2684 		idx = (ext ? devfn * 2 : devfn) % 256;
2685 
2686 		if (idx == 0) {
2687 			/* First save what we may have and clean up */
2688 			if (new_ce) {
2689 				tbl[tbl_idx] = new_ce;
2690 				__iommu_flush_cache(iommu, new_ce,
2691 						    VTD_PAGE_SIZE);
2692 				pos = 1;
2693 			}
2694 
2695 			if (old_ce)
2696 				memunmap(old_ce);
2697 
2698 			ret = 0;
2699 			if (devfn < 0x80)
2700 				old_ce_phys = root_entry_lctp(&re);
2701 			else
2702 				old_ce_phys = root_entry_uctp(&re);
2703 
2704 			if (!old_ce_phys) {
2705 				if (ext && devfn == 0) {
2706 					/* No LCTP, try UCTP */
2707 					devfn = 0x7f;
2708 					continue;
2709 				} else {
2710 					goto out;
2711 				}
2712 			}
2713 
2714 			ret = -ENOMEM;
2715 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2716 					MEMREMAP_WB);
2717 			if (!old_ce)
2718 				goto out;
2719 
2720 			new_ce = alloc_pgtable_page(iommu->node);
2721 			if (!new_ce)
2722 				goto out_unmap;
2723 
2724 			ret = 0;
2725 		}
2726 
2727 		/* Now copy the context entry */
2728 		memcpy(&ce, old_ce + idx, sizeof(ce));
2729 
2730 		if (!context_present(&ce))
2731 			continue;
2732 
2733 		did = context_domain_id(&ce);
2734 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2735 			set_bit(did, iommu->domain_ids);
2736 
2737 		set_context_copied(iommu, bus, devfn);
2738 		new_ce[idx] = ce;
2739 	}
2740 
2741 	tbl[tbl_idx + pos] = new_ce;
2742 
2743 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2744 
2745 out_unmap:
2746 	memunmap(old_ce);
2747 
2748 out:
2749 	return ret;
2750 }
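/*
 * Index math above, by example: in scalable ("ext") mode each bus owns two
 * context tables, so bus 3 copies into ctxt_tbls[6] and ctxt_tbls[7].
 * devfn is doubled and taken modulo 256, so devfn 0x00-0x7f comes from the
 * old lower context table (root_entry_lctp()) and lands in the first of
 * the pair, while devfn 0x80-0xff comes from the upper table
 * (root_entry_uctp()) and lands in the second.
 */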
2751 
2752 static int copy_translation_tables(struct intel_iommu *iommu)
2753 {
2754 	struct context_entry **ctxt_tbls;
2755 	struct root_entry *old_rt;
2756 	phys_addr_t old_rt_phys;
2757 	int ctxt_table_entries;
2758 	u64 rtaddr_reg;
2759 	int bus, ret;
2760 	bool new_ext, ext;
2761 
2762 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2763 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2764 	new_ext    = !!sm_supported(iommu);
2765 
2766 	/*
2767 	 * The RTT bit can only be changed when translation is disabled,
2768 	 * but disabling translation would open a window for data
2769 	 * corruption. So bail out and don't copy anything if we would
2770 	 * have to change the bit.
2771 	 */
2772 	if (new_ext != ext)
2773 		return -EINVAL;
2774 
2775 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2776 	if (!iommu->copied_tables)
2777 		return -ENOMEM;
2778 
2779 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2780 	if (!old_rt_phys)
2781 		return -EINVAL;
2782 
2783 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2784 	if (!old_rt)
2785 		return -ENOMEM;
2786 
2787 	/* This is too big for the stack - allocate it from slab */
2788 	ctxt_table_entries = ext ? 512 : 256;
2789 	ret = -ENOMEM;
2790 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2791 	if (!ctxt_tbls)
2792 		goto out_unmap;
2793 
2794 	for (bus = 0; bus < 256; bus++) {
2795 		ret = copy_context_table(iommu, &old_rt[bus],
2796 					 ctxt_tbls, bus, ext);
2797 		if (ret) {
2798 			pr_err("%s: Failed to copy context table for bus %d\n",
2799 				iommu->name, bus);
2800 			continue;
2801 		}
2802 	}
2803 
2804 	spin_lock(&iommu->lock);
2805 
2806 	/* Context tables are copied, now write them to the root_entry table */
2807 	for (bus = 0; bus < 256; bus++) {
2808 		int idx = ext ? bus * 2 : bus;
2809 		u64 val;
2810 
2811 		if (ctxt_tbls[idx]) {
2812 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2813 			iommu->root_entry[bus].lo = val;
2814 		}
2815 
2816 		if (!ext || !ctxt_tbls[idx + 1])
2817 			continue;
2818 
2819 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2820 		iommu->root_entry[bus].hi = val;
2821 	}
2822 
2823 	spin_unlock(&iommu->lock);
2824 
2825 	kfree(ctxt_tbls);
2826 
2827 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2828 
2829 	ret = 0;
2830 
2831 out_unmap:
2832 	memunmap(old_rt);
2833 
2834 	return ret;
2835 }
2836 
2837 #ifdef CONFIG_INTEL_IOMMU_SVM
2838 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2839 {
2840 	struct intel_iommu *iommu = data;
2841 	ioasid_t ioasid;
2842 
2843 	if (!iommu)
2844 		return INVALID_IOASID;
2845 	/*
2846 	 * The VT-d virtual command interface always uses the full 20-bit
2847 	 * PASID range. The host can partition the guest PASID range based
2848 	 * on its policies, but this is out of the guest's control.
2849 	 */
2850 	if (min < PASID_MIN || max > intel_pasid_max_id)
2851 		return INVALID_IOASID;
2852 
2853 	if (vcmd_alloc_pasid(iommu, &ioasid))
2854 		return INVALID_IOASID;
2855 
2856 	return ioasid;
2857 }
2858 
2859 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2860 {
2861 	struct intel_iommu *iommu = data;
2862 
2863 	if (!iommu)
2864 		return;
2865 	/*
2866 	 * Sanity checking of the ioasid owner is done at an upper layer,
2867 	 * e.g. VFIO. We can only free the PASID when all the devices are unbound.
2868 	 */
2869 	if (ioasid_find(NULL, ioasid, NULL)) {
2870 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2871 		return;
2872 	}
2873 	vcmd_free_pasid(iommu, ioasid);
2874 }
2875 
2876 static void register_pasid_allocator(struct intel_iommu *iommu)
2877 {
2878 	/*
2879 	 * If we are running in the host, there is no need for a custom
2880 	 * allocator because PASIDs are allocated from the host system-wide.
2881 	 */
2882 	if (!cap_caching_mode(iommu->cap))
2883 		return;
2884 
2885 	if (!sm_supported(iommu)) {
2886 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2887 		return;
2888 	}
2889 
2890 	/*
2891 	 * Register a custom PASID allocator if we are running in a guest,
2892 	 * where guest PASIDs must be obtained via the virtual command
2893 	 * interface. There can be multiple vIOMMUs in each guest but only
2894 	 * one allocator is active. All vIOMMU allocators eventually call
2895 	 * the same host allocator.
2896 	 */
2897 	if (!vccap_pasid(iommu->vccap))
2898 		return;
2899 
2900 	pr_info("Register custom PASID allocator\n");
2901 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2902 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2903 	iommu->pasid_allocator.pdata = (void *)iommu;
2904 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2905 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2906 		/*
2907 		 * Disable scalable mode on this IOMMU if there
2908 		 * is no custom allocator. Mixing SM-capable and
2909 		 * non-SM vIOMMUs is not supported.
2910 		 */
2911 		intel_iommu_sm = 0;
2912 	}
2913 }
2914 #endif
2915 
2916 static int __init init_dmars(void)
2917 {
2918 	struct dmar_drhd_unit *drhd;
2919 	struct intel_iommu *iommu;
2920 	int ret;
2921 
2922 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2923 	if (ret)
2924 		goto free_iommu;
2925 
2926 	for_each_iommu(iommu, drhd) {
2927 		if (drhd->ignored) {
2928 			iommu_disable_translation(iommu);
2929 			continue;
2930 		}
2931 
2932 		/*
2933 		 * Find the max PASID size of all IOMMUs in the system.
2934 		 * We need to ensure the system PASID table is no bigger
2935 		 * than the smallest supported size.
2936 		 */
2937 		if (pasid_supported(iommu)) {
2938 			u32 temp = 2 << ecap_pss(iommu->ecap);
2939 
2940 			intel_pasid_max_id = min_t(u32, temp,
2941 						   intel_pasid_max_id);
2942 		}
2943 
2944 		intel_iommu_init_qi(iommu);
2945 
2946 		ret = iommu_init_domains(iommu);
2947 		if (ret)
2948 			goto free_iommu;
2949 
2950 		init_translation_status(iommu);
2951 
2952 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2953 			iommu_disable_translation(iommu);
2954 			clear_translation_pre_enabled(iommu);
2955 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2956 				iommu->name);
2957 		}
2958 
2959 		/*
2960 		 * TBD:
2961 		 * we could share the same root & context tables
2962 		 * among all IOMMUs. Needs to be split out later.
2963 		 */
2964 		ret = iommu_alloc_root_entry(iommu);
2965 		if (ret)
2966 			goto free_iommu;
2967 
2968 		if (translation_pre_enabled(iommu)) {
2969 			pr_info("Translation already enabled - trying to copy translation structures\n");
2970 
2971 			ret = copy_translation_tables(iommu);
2972 			if (ret) {
2973 				/*
2974 				 * We found the IOMMU with translation
2975 				 * enabled - but failed to copy over the
2976 				 * old root-entry table. Try to proceed
2977 				 * by disabling translation now and
2978 				 * allocating a clean root-entry table.
2979 				 * This might cause DMAR faults, but
2980 				 * probably the dump will still succeed.
2981 				 */
2982 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2983 				       iommu->name);
2984 				iommu_disable_translation(iommu);
2985 				clear_translation_pre_enabled(iommu);
2986 			} else {
2987 				pr_info("Copied translation tables from previous kernel for %s\n",
2988 					iommu->name);
2989 			}
2990 		}
2991 
2992 		if (!ecap_pass_through(iommu->ecap))
2993 			hw_pass_through = 0;
2994 		intel_svm_check(iommu);
2995 	}
2996 
2997 	/*
2998 	 * Now that qi is enabled on all iommus, set the root entry and flush
2999 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3000 	 * flush_context function will loop forever and the boot hangs.
3001 	 */
3002 	for_each_active_iommu(iommu, drhd) {
3003 		iommu_flush_write_buffer(iommu);
3004 #ifdef CONFIG_INTEL_IOMMU_SVM
3005 		register_pasid_allocator(iommu);
3006 #endif
3007 		iommu_set_root_entry(iommu);
3008 	}
3009 
3010 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3011 	dmar_map_gfx = 0;
3012 #endif
3013 
3014 	if (!dmar_map_gfx)
3015 		iommu_identity_mapping |= IDENTMAP_GFX;
3016 
3017 	check_tylersburg_isoch();
3018 
3019 	ret = si_domain_init(hw_pass_through);
3020 	if (ret)
3021 		goto free_iommu;
3022 
3023 	/*
3024 	 * for each drhd
3025 	 *   enable fault log
3026 	 *   global invalidate context cache
3027 	 *   global invalidate iotlb
3028 	 *   enable translation
3029 	 */
3030 	for_each_iommu(iommu, drhd) {
3031 		if (drhd->ignored) {
3032 			/*
3033 			 * we always have to disable PMRs or DMA may fail on
3034 			 * this device
3035 			 */
3036 			if (force_on)
3037 				iommu_disable_protect_mem_regions(iommu);
3038 			continue;
3039 		}
3040 
3041 		iommu_flush_write_buffer(iommu);
3042 
3043 #ifdef CONFIG_INTEL_IOMMU_SVM
3044 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3045 			/*
3046 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3047 			 * could cause a lock race, so drop the lock around it.
3048 			 */
3049 			up_write(&dmar_global_lock);
3050 			ret = intel_svm_enable_prq(iommu);
3051 			down_write(&dmar_global_lock);
3052 			if (ret)
3053 				goto free_iommu;
3054 		}
3055 #endif
3056 		ret = dmar_set_interrupt(iommu);
3057 		if (ret)
3058 			goto free_iommu;
3059 	}
3060 
3061 	return 0;
3062 
3063 free_iommu:
3064 	for_each_active_iommu(iommu, drhd) {
3065 		disable_dmar_iommu(iommu);
3066 		free_dmar_iommu(iommu);
3067 	}
3068 	if (si_domain) {
3069 		domain_exit(si_domain);
3070 		si_domain = NULL;
3071 	}
3072 
3073 	return ret;
3074 }
3075 
3076 static void __init init_no_remapping_devices(void)
3077 {
3078 	struct dmar_drhd_unit *drhd;
3079 	struct device *dev;
3080 	int i;
3081 
3082 	for_each_drhd_unit(drhd) {
3083 		if (!drhd->include_all) {
3084 			for_each_active_dev_scope(drhd->devices,
3085 						  drhd->devices_cnt, i, dev)
3086 				break;
3087 			/* ignore DMAR unit if no devices exist */
3088 			if (i == drhd->devices_cnt)
3089 				drhd->ignored = 1;
3090 		}
3091 	}
3092 
3093 	for_each_active_drhd_unit(drhd) {
3094 		if (drhd->include_all)
3095 			continue;
3096 
3097 		for_each_active_dev_scope(drhd->devices,
3098 					  drhd->devices_cnt, i, dev)
3099 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3100 				break;
3101 		if (i < drhd->devices_cnt)
3102 			continue;
3103 
3104 		/* This IOMMU has *only* gfx devices. Either bypass it or
3105 		   set the gfx_dedicated flag, as appropriate */
3106 		drhd->gfx_dedicated = 1;
3107 		if (!dmar_map_gfx)
3108 			drhd->ignored = 1;
3109 	}
3110 }
3111 
3112 #ifdef CONFIG_SUSPEND
3113 static int init_iommu_hw(void)
3114 {
3115 	struct dmar_drhd_unit *drhd;
3116 	struct intel_iommu *iommu = NULL;
3117 
3118 	for_each_active_iommu(iommu, drhd)
3119 		if (iommu->qi)
3120 			dmar_reenable_qi(iommu);
3121 
3122 	for_each_iommu(iommu, drhd) {
3123 		if (drhd->ignored) {
3124 			/*
3125 			 * we always have to disable PMRs or DMA may fail on
3126 			 * this device
3127 			 */
3128 			if (force_on)
3129 				iommu_disable_protect_mem_regions(iommu);
3130 			continue;
3131 		}
3132 
3133 		iommu_flush_write_buffer(iommu);
3134 		iommu_set_root_entry(iommu);
3135 		iommu_enable_translation(iommu);
3136 		iommu_disable_protect_mem_regions(iommu);
3137 	}
3138 
3139 	return 0;
3140 }
3141 
3142 static void iommu_flush_all(void)
3143 {
3144 	struct dmar_drhd_unit *drhd;
3145 	struct intel_iommu *iommu;
3146 
3147 	for_each_active_iommu(iommu, drhd) {
3148 		iommu->flush.flush_context(iommu, 0, 0, 0,
3149 					   DMA_CCMD_GLOBAL_INVL);
3150 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3151 					 DMA_TLB_GLOBAL_FLUSH);
3152 	}
3153 }
3154 
3155 static int iommu_suspend(void)
3156 {
3157 	struct dmar_drhd_unit *drhd;
3158 	struct intel_iommu *iommu = NULL;
3159 	unsigned long flag;
3160 
3161 	iommu_flush_all();
3162 
3163 	for_each_active_iommu(iommu, drhd) {
3164 		iommu_disable_translation(iommu);
3165 
3166 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3167 
3168 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3169 			readl(iommu->reg + DMAR_FECTL_REG);
3170 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3171 			readl(iommu->reg + DMAR_FEDATA_REG);
3172 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3173 			readl(iommu->reg + DMAR_FEADDR_REG);
3174 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3175 			readl(iommu->reg + DMAR_FEUADDR_REG);
3176 
3177 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3178 	}
3179 	return 0;
3180 }
3181 
3182 static void iommu_resume(void)
3183 {
3184 	struct dmar_drhd_unit *drhd;
3185 	struct intel_iommu *iommu = NULL;
3186 	unsigned long flag;
3187 
3188 	if (init_iommu_hw()) {
3189 		if (force_on)
3190 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3191 		else
3192 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3193 		return;
3194 	}
3195 
3196 	for_each_active_iommu(iommu, drhd) {
3197 
3198 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3199 
3200 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3201 			iommu->reg + DMAR_FECTL_REG);
3202 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3203 			iommu->reg + DMAR_FEDATA_REG);
3204 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3205 			iommu->reg + DMAR_FEADDR_REG);
3206 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3207 			iommu->reg + DMAR_FEUADDR_REG);
3208 
3209 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3210 	}
3211 }
3212 
3213 static struct syscore_ops iommu_syscore_ops = {
3214 	.resume		= iommu_resume,
3215 	.suspend	= iommu_suspend,
3216 };
3217 
3218 static void __init init_iommu_pm_ops(void)
3219 {
3220 	register_syscore_ops(&iommu_syscore_ops);
3221 }
3222 
3223 #else
3224 static inline void init_iommu_pm_ops(void) {}
3225 #endif	/* CONFIG_SUSPEND */
3226 
3227 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3228 {
3229 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3230 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3231 	    rmrr->end_address <= rmrr->base_address ||
3232 	    arch_rmrr_sanity_check(rmrr))
3233 		return -EINVAL;
3234 
3235 	return 0;
3236 }
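/*
 * Example: an RMRR of [0x000e0000, 0x000effff] passes (both the base and
 * end + 1 are page aligned and the end lies above the base), while one
 * reported as [0x000e0000, 0x000e0000] is rejected because end_address
 * must be strictly greater than base_address.
 */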
3237 
3238 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3239 {
3240 	struct acpi_dmar_reserved_memory *rmrr;
3241 	struct dmar_rmrr_unit *rmrru;
3242 
3243 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3244 	if (rmrr_sanity_check(rmrr)) {
3245 		pr_warn(FW_BUG
3246 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3247 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3248 			   rmrr->base_address, rmrr->end_address,
3249 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3250 			   dmi_get_system_info(DMI_BIOS_VERSION),
3251 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3252 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3253 	}
3254 
3255 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3256 	if (!rmrru)
3257 		goto out;
3258 
3259 	rmrru->hdr = header;
3260 
3261 	rmrru->base_address = rmrr->base_address;
3262 	rmrru->end_address = rmrr->end_address;
3263 
3264 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3265 				((void *)rmrr) + rmrr->header.length,
3266 				&rmrru->devices_cnt);
3267 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3268 		goto free_rmrru;
3269 
3270 	list_add(&rmrru->list, &dmar_rmrr_units);
3271 
3272 	return 0;
3273 free_rmrru:
3274 	kfree(rmrru);
3275 out:
3276 	return -ENOMEM;
3277 }
3278 
3279 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3280 {
3281 	struct dmar_atsr_unit *atsru;
3282 	struct acpi_dmar_atsr *tmp;
3283 
3284 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3285 				dmar_rcu_check()) {
3286 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3287 		if (atsr->segment != tmp->segment)
3288 			continue;
3289 		if (atsr->header.length != tmp->header.length)
3290 			continue;
3291 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3292 			return atsru;
3293 	}
3294 
3295 	return NULL;
3296 }
3297 
3298 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3299 {
3300 	struct acpi_dmar_atsr *atsr;
3301 	struct dmar_atsr_unit *atsru;
3302 
3303 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3304 		return 0;
3305 
3306 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3307 	atsru = dmar_find_atsr(atsr);
3308 	if (atsru)
3309 		return 0;
3310 
3311 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3312 	if (!atsru)
3313 		return -ENOMEM;
3314 
3315 	/*
3316 	 * If the memory was allocated from slab by an ACPI _DSM method, we
3317 	 * need to copy the content because the memory buffer will be freed
3318 	 * on return.
3319 	 */
3320 	atsru->hdr = (void *)(atsru + 1);
3321 	memcpy(atsru->hdr, hdr, hdr->length);
3322 	atsru->include_all = atsr->flags & 0x1;
3323 	if (!atsru->include_all) {
3324 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3325 				(void *)atsr + atsr->header.length,
3326 				&atsru->devices_cnt);
3327 		if (atsru->devices_cnt && atsru->devices == NULL) {
3328 			kfree(atsru);
3329 			return -ENOMEM;
3330 		}
3331 	}
3332 
3333 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3334 
3335 	return 0;
3336 }
3337 
3338 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3339 {
3340 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3341 	kfree(atsru);
3342 }
3343 
3344 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3345 {
3346 	struct acpi_dmar_atsr *atsr;
3347 	struct dmar_atsr_unit *atsru;
3348 
3349 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3350 	atsru = dmar_find_atsr(atsr);
3351 	if (atsru) {
3352 		list_del_rcu(&atsru->list);
3353 		synchronize_rcu();
3354 		intel_iommu_free_atsr(atsru);
3355 	}
3356 
3357 	return 0;
3358 }
3359 
3360 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3361 {
3362 	int i;
3363 	struct device *dev;
3364 	struct acpi_dmar_atsr *atsr;
3365 	struct dmar_atsr_unit *atsru;
3366 
3367 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3368 	atsru = dmar_find_atsr(atsr);
3369 	if (!atsru)
3370 		return 0;
3371 
3372 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3373 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3374 					  i, dev)
3375 			return -EBUSY;
3376 	}
3377 
3378 	return 0;
3379 }
3380 
3381 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3382 {
3383 	struct dmar_satc_unit *satcu;
3384 	struct acpi_dmar_satc *tmp;
3385 
3386 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3387 				dmar_rcu_check()) {
3388 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3389 		if (satc->segment != tmp->segment)
3390 			continue;
3391 		if (satc->header.length != tmp->header.length)
3392 			continue;
3393 		if (memcmp(satc, tmp, satc->header.length) == 0)
3394 			return satcu;
3395 	}
3396 
3397 	return NULL;
3398 }
3399 
3400 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3401 {
3402 	struct acpi_dmar_satc *satc;
3403 	struct dmar_satc_unit *satcu;
3404 
3405 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3406 		return 0;
3407 
3408 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3409 	satcu = dmar_find_satc(satc);
3410 	if (satcu)
3411 		return 0;
3412 
3413 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3414 	if (!satcu)
3415 		return -ENOMEM;
3416 
3417 	satcu->hdr = (void *)(satcu + 1);
3418 	memcpy(satcu->hdr, hdr, hdr->length);
3419 	satcu->atc_required = satc->flags & 0x1;
3420 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3421 					      (void *)satc + satc->header.length,
3422 					      &satcu->devices_cnt);
3423 	if (satcu->devices_cnt && !satcu->devices) {
3424 		kfree(satcu);
3425 		return -ENOMEM;
3426 	}
3427 	list_add_rcu(&satcu->list, &dmar_satc_units);
3428 
3429 	return 0;
3430 }
3431 
3432 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3433 {
3434 	int sp, ret;
3435 	struct intel_iommu *iommu = dmaru->iommu;
3436 
3437 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3438 	if (ret)
3439 		goto out;
3440 
3441 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3442 		pr_warn("%s: Doesn't support hardware pass through.\n",
3443 			iommu->name);
3444 		return -ENXIO;
3445 	}
3446 
3447 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3448 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3449 		pr_warn("%s: Doesn't support large page.\n",
3450 			iommu->name);
3451 		return -ENXIO;
3452 	}
3453 
3454 	/*
3455 	 * Disable translation if already enabled prior to OS handover.
3456 	 */
3457 	if (iommu->gcmd & DMA_GCMD_TE)
3458 		iommu_disable_translation(iommu);
3459 
3460 	ret = iommu_init_domains(iommu);
3461 	if (ret == 0)
3462 		ret = iommu_alloc_root_entry(iommu);
3463 	if (ret)
3464 		goto out;
3465 
3466 	intel_svm_check(iommu);
3467 
3468 	if (dmaru->ignored) {
3469 		/*
3470 		 * we always have to disable PMRs or DMA may fail on this device
3471 		 */
3472 		if (force_on)
3473 			iommu_disable_protect_mem_regions(iommu);
3474 		return 0;
3475 	}
3476 
3477 	intel_iommu_init_qi(iommu);
3478 	iommu_flush_write_buffer(iommu);
3479 
3480 #ifdef CONFIG_INTEL_IOMMU_SVM
3481 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3482 		ret = intel_svm_enable_prq(iommu);
3483 		if (ret)
3484 			goto disable_iommu;
3485 	}
3486 #endif
3487 	ret = dmar_set_interrupt(iommu);
3488 	if (ret)
3489 		goto disable_iommu;
3490 
3491 	iommu_set_root_entry(iommu);
3492 	iommu_enable_translation(iommu);
3493 
3494 	iommu_disable_protect_mem_regions(iommu);
3495 	return 0;
3496 
3497 disable_iommu:
3498 	disable_dmar_iommu(iommu);
3499 out:
3500 	free_dmar_iommu(iommu);
3501 	return ret;
3502 }
3503 
3504 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3505 {
3506 	int ret = 0;
3507 	struct intel_iommu *iommu = dmaru->iommu;
3508 
3509 	if (!intel_iommu_enabled)
3510 		return 0;
3511 	if (iommu == NULL)
3512 		return -EINVAL;
3513 
3514 	if (insert) {
3515 		ret = intel_iommu_add(dmaru);
3516 	} else {
3517 		disable_dmar_iommu(iommu);
3518 		free_dmar_iommu(iommu);
3519 	}
3520 
3521 	return ret;
3522 }
3523 
3524 static void intel_iommu_free_dmars(void)
3525 {
3526 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3527 	struct dmar_atsr_unit *atsru, *atsr_n;
3528 	struct dmar_satc_unit *satcu, *satc_n;
3529 
3530 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3531 		list_del(&rmrru->list);
3532 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3533 		kfree(rmrru);
3534 	}
3535 
3536 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3537 		list_del(&atsru->list);
3538 		intel_iommu_free_atsr(atsru);
3539 	}
3540 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3541 		list_del(&satcu->list);
3542 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3543 		kfree(satcu);
3544 	}
3545 }
3546 
3547 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3548 {
3549 	struct dmar_satc_unit *satcu;
3550 	struct acpi_dmar_satc *satc;
3551 	struct device *tmp;
3552 	int i;
3553 
3554 	dev = pci_physfn(dev);
3555 	rcu_read_lock();
3556 
3557 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3558 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3559 		if (satc->segment != pci_domain_nr(dev->bus))
3560 			continue;
3561 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3562 			if (to_pci_dev(tmp) == dev)
3563 				goto out;
3564 	}
3565 	satcu = NULL;
3566 out:
3567 	rcu_read_unlock();
3568 	return satcu;
3569 }
3570 
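/*
 * Decide whether the OS may enable ATS for @dev. A SATC entry, if present,
 * is authoritative; otherwise walk up to the root port and check whether it
 * is listed in (or covered by an include-all) ATSR unit for the segment.
 */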
3571 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3572 {
3573 	int i, ret = 1;
3574 	struct pci_bus *bus;
3575 	struct pci_dev *bridge = NULL;
3576 	struct device *tmp;
3577 	struct acpi_dmar_atsr *atsr;
3578 	struct dmar_atsr_unit *atsru;
3579 	struct dmar_satc_unit *satcu;
3580 
3581 	dev = pci_physfn(dev);
3582 	satcu = dmar_find_matched_satc_unit(dev);
3583 	if (satcu)
3584 		/*
3585 		 * This device supports ATS as it is in the SATC table.
3586 		 * When the IOMMU is in legacy mode, enabling ATS is done
3587 		 * automatically by HW for the devices that require it,
3588 		 * hence the OS should not also enable ATS for this device,
3589 		 * to avoid duplicated TLB invalidations.
3590 		 */
3591 		return !(satcu->atc_required && !sm_supported(iommu));
3592 
3593 	for (bus = dev->bus; bus; bus = bus->parent) {
3594 		bridge = bus->self;
3595 		/* If it's an integrated device, allow ATS */
3596 		if (!bridge)
3597 			return 1;
3598 		/* Connected via non-PCIe: no ATS */
3599 		if (!pci_is_pcie(bridge) ||
3600 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3601 			return 0;
3602 		/* If we found the root port, look it up in the ATSR */
3603 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3604 			break;
3605 	}
3606 
3607 	rcu_read_lock();
3608 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3609 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3610 		if (atsr->segment != pci_domain_nr(dev->bus))
3611 			continue;
3612 
3613 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3614 			if (tmp == &bridge->dev)
3615 				goto out;
3616 
3617 		if (atsru->include_all)
3618 			goto out;
3619 	}
3620 	ret = 0;
3621 out:
3622 	rcu_read_unlock();
3623 
3624 	return ret;
3625 }
3626 
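/*
 * Keep the RMRR, ATSR and SATC device-scope lists in sync when PCI devices
 * are added to or removed from the system.
 */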
3627 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3628 {
3629 	int ret;
3630 	struct dmar_rmrr_unit *rmrru;
3631 	struct dmar_atsr_unit *atsru;
3632 	struct dmar_satc_unit *satcu;
3633 	struct acpi_dmar_atsr *atsr;
3634 	struct acpi_dmar_reserved_memory *rmrr;
3635 	struct acpi_dmar_satc *satc;
3636 
3637 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3638 		return 0;
3639 
3640 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3641 		rmrr = container_of(rmrru->hdr,
3642 				    struct acpi_dmar_reserved_memory, header);
3643 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3644 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3645 				((void *)rmrr) + rmrr->header.length,
3646 				rmrr->segment, rmrru->devices,
3647 				rmrru->devices_cnt);
3648 			if (ret < 0)
3649 				return ret;
3650 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3651 			dmar_remove_dev_scope(info, rmrr->segment,
3652 				rmrru->devices, rmrru->devices_cnt);
3653 		}
3654 	}
3655 
3656 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3657 		if (atsru->include_all)
3658 			continue;
3659 
3660 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3661 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3662 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3663 					(void *)atsr + atsr->header.length,
3664 					atsr->segment, atsru->devices,
3665 					atsru->devices_cnt);
3666 			if (ret > 0)
3667 				break;
3668 			else if (ret < 0)
3669 				return ret;
3670 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3671 			if (dmar_remove_dev_scope(info, atsr->segment,
3672 					atsru->devices, atsru->devices_cnt))
3673 				break;
3674 		}
3675 	}
3676 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3677 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3678 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3679 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3680 					(void *)satc + satc->header.length,
3681 					satc->segment, satcu->devices,
3682 					satcu->devices_cnt);
3683 			if (ret > 0)
3684 				break;
3685 			else if (ret < 0)
3686 				return ret;
3687 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3688 			if (dmar_remove_dev_scope(info, satc->segment,
3689 					satcu->devices, satcu->devices_cnt))
3690 				break;
3691 		}
3692 	}
3693 
3694 	return 0;
3695 }
3696 
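/*
 * Memory hotplug callback: extend the si_domain identity map when a memory
 * block goes online, and unmap plus flush the range again when the block is
 * taken offline (or onlining is cancelled).
 */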
3697 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3698 				       unsigned long val, void *v)
3699 {
3700 	struct memory_notify *mhp = v;
3701 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3702 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3703 			mhp->nr_pages - 1);
3704 
3705 	switch (val) {
3706 	case MEM_GOING_ONLINE:
3707 		if (iommu_domain_identity_map(si_domain,
3708 					      start_vpfn, last_vpfn)) {
3709 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3710 				start_vpfn, last_vpfn);
3711 			return NOTIFY_BAD;
3712 		}
3713 		break;
3714 
3715 	case MEM_OFFLINE:
3716 	case MEM_CANCEL_ONLINE:
3717 		{
3718 			struct dmar_drhd_unit *drhd;
3719 			struct intel_iommu *iommu;
3720 			LIST_HEAD(freelist);
3721 
3722 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3723 
3724 			rcu_read_lock();
3725 			for_each_active_iommu(iommu, drhd)
3726 				iommu_flush_iotlb_psi(iommu, si_domain,
3727 					start_vpfn, mhp->nr_pages,
3728 					list_empty(&freelist), 0);
3729 			rcu_read_unlock();
3730 			put_pages_list(&freelist);
3731 		}
3732 		break;
3733 	}
3734 
3735 	return NOTIFY_OK;
3736 }
3737 
3738 static struct notifier_block intel_iommu_memory_nb = {
3739 	.notifier_call = intel_iommu_memory_notifier,
3740 	.priority = 0
3741 };
3742 
3743 static void intel_disable_iommus(void)
3744 {
3745 	struct intel_iommu *iommu = NULL;
3746 	struct dmar_drhd_unit *drhd;
3747 
3748 	for_each_iommu(iommu, drhd)
3749 		iommu_disable_translation(iommu);
3750 }
3751 
3752 void intel_iommu_shutdown(void)
3753 {
3754 	struct dmar_drhd_unit *drhd;
3755 	struct intel_iommu *iommu = NULL;
3756 
3757 	if (no_iommu || dmar_disabled)
3758 		return;
3759 
3760 	down_write(&dmar_global_lock);
3761 
3762 	/* Disable PMRs explicitly here. */
3763 	for_each_iommu(iommu, drhd)
3764 		iommu_disable_protect_mem_regions(iommu);
3765 
3766 	/* Make sure the IOMMUs are switched off */
3767 	intel_disable_iommus();
3768 
3769 	up_write(&dmar_global_lock);
3770 }
3771 
3772 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3773 {
3774 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3775 
3776 	return container_of(iommu_dev, struct intel_iommu, iommu);
3777 }
3778 
3779 static ssize_t version_show(struct device *dev,
3780 			    struct device_attribute *attr, char *buf)
3781 {
3782 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3783 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3784 	return sprintf(buf, "%d:%d\n",
3785 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3786 }
3787 static DEVICE_ATTR_RO(version);
3788 
3789 static ssize_t address_show(struct device *dev,
3790 			    struct device_attribute *attr, char *buf)
3791 {
3792 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3793 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3794 }
3795 static DEVICE_ATTR_RO(address);
3796 
3797 static ssize_t cap_show(struct device *dev,
3798 			struct device_attribute *attr, char *buf)
3799 {
3800 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3801 	return sprintf(buf, "%llx\n", iommu->cap);
3802 }
3803 static DEVICE_ATTR_RO(cap);
3804 
3805 static ssize_t ecap_show(struct device *dev,
3806 			 struct device_attribute *attr, char *buf)
3807 {
3808 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3809 	return sprintf(buf, "%llx\n", iommu->ecap);
3810 }
3811 static DEVICE_ATTR_RO(ecap);
3812 
3813 static ssize_t domains_supported_show(struct device *dev,
3814 				      struct device_attribute *attr, char *buf)
3815 {
3816 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3817 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3818 }
3819 static DEVICE_ATTR_RO(domains_supported);
3820 
3821 static ssize_t domains_used_show(struct device *dev,
3822 				 struct device_attribute *attr, char *buf)
3823 {
3824 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3825 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3826 						  cap_ndoms(iommu->cap)));
3827 }
3828 static DEVICE_ATTR_RO(domains_used);
3829 
3830 static struct attribute *intel_iommu_attrs[] = {
3831 	&dev_attr_version.attr,
3832 	&dev_attr_address.attr,
3833 	&dev_attr_cap.attr,
3834 	&dev_attr_ecap.attr,
3835 	&dev_attr_domains_supported.attr,
3836 	&dev_attr_domains_used.attr,
3837 	NULL,
3838 };
3839 
3840 static struct attribute_group intel_iommu_group = {
3841 	.name = "intel-iommu",
3842 	.attrs = intel_iommu_attrs,
3843 };
3844 
3845 const struct attribute_group *intel_iommu_groups[] = {
3846 	&intel_iommu_group,
3847 	NULL,
3848 };
3849 
3850 static inline bool has_external_pci(void)
3851 {
3852 	struct pci_dev *pdev = NULL;
3853 
3854 	for_each_pci_dev(pdev)
3855 		if (pdev->external_facing) {
3856 			pci_dev_put(pdev);
3857 			return true;
3858 		}
3859 
3860 	return false;
3861 }
3862 
3863 static int __init platform_optin_force_iommu(void)
3864 {
3865 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3866 		return 0;
3867 
3868 	if (no_iommu || dmar_disabled)
3869 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3870 
3871 	/*
3872 	 * If Intel-IOMMU is disabled by default, we will apply identity
3873 	 * map for all devices except those marked as being untrusted.
3874 	 */
3875 	if (dmar_disabled)
3876 		iommu_set_default_passthrough(false);
3877 
3878 	dmar_disabled = 0;
3879 	no_iommu = 0;
3880 
3881 	return 1;
3882 }
3883 
3884 static int __init probe_acpi_namespace_devices(void)
3885 {
3886 	struct dmar_drhd_unit *drhd;
3887 	/* To avoid a -Wunused-but-set-variable warning. */
3888 	struct intel_iommu *iommu __maybe_unused;
3889 	struct device *dev;
3890 	int i, ret = 0;
3891 
3892 	for_each_active_iommu(iommu, drhd) {
3893 		for_each_active_dev_scope(drhd->devices,
3894 					  drhd->devices_cnt, i, dev) {
3895 			struct acpi_device_physical_node *pn;
3896 			struct iommu_group *group;
3897 			struct acpi_device *adev;
3898 
3899 			if (dev->bus != &acpi_bus_type)
3900 				continue;
3901 
3902 			adev = to_acpi_device(dev);
3903 			mutex_lock(&adev->physical_node_lock);
3904 			list_for_each_entry(pn,
3905 					    &adev->physical_node_list, node) {
3906 				group = iommu_group_get(pn->dev);
3907 				if (group) {
3908 					iommu_group_put(group);
3909 					continue;
3910 				}
3911 
3912 				ret = iommu_probe_device(pn->dev);
3913 				if (ret)
3914 					break;
3915 			}
3916 			mutex_unlock(&adev->physical_node_lock);
3917 
3918 			if (ret)
3919 				return ret;
3920 		}
3921 	}
3922 
3923 	return 0;
3924 }
3925 
3926 static __init int tboot_force_iommu(void)
3927 {
3928 	if (!tboot_enabled())
3929 		return 0;
3930 
3931 	if (no_iommu || dmar_disabled)
3932 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3933 
3934 	dmar_disabled = 0;
3935 	no_iommu = 0;
3936 
3937 	return 1;
3938 }
3939 
3940 int __init intel_iommu_init(void)
3941 {
3942 	int ret = -ENODEV;
3943 	struct dmar_drhd_unit *drhd;
3944 	struct intel_iommu *iommu;
3945 
3946 	/*
3947 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3948 	 * opt in, so enforce that.
3949 	 */
3950 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3951 		    platform_optin_force_iommu();
3952 
3953 	down_write(&dmar_global_lock);
3954 	if (dmar_table_init()) {
3955 		if (force_on)
3956 			panic("tboot: Failed to initialize DMAR table\n");
3957 		goto out_free_dmar;
3958 	}
3959 
3960 	if (dmar_dev_scope_init() < 0) {
3961 		if (force_on)
3962 			panic("tboot: Failed to initialize DMAR device scope\n");
3963 		goto out_free_dmar;
3964 	}
3965 
3966 	up_write(&dmar_global_lock);
3967 
3968 	/*
3969 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3970 	 * complain later when we register it under the lock.
3971 	 */
3972 	dmar_register_bus_notifier();
3973 
3974 	down_write(&dmar_global_lock);
3975 
3976 	if (!no_iommu)
3977 		intel_iommu_debugfs_init();
3978 
3979 	if (no_iommu || dmar_disabled) {
3980 		/*
3981 		 * We exit the function here to ensure the IOMMU's remapping and
3982 		 * mempool aren't set up, which means that the IOMMU's PMRs
3983 		 * won't be disabled via the call to init_dmars(). So disable
3984 		 * them explicitly here. The PMRs were set up by tboot prior to
3985 		 * calling SENTER, but the kernel is expected to reset/tear
3986 		 * down the PMRs.
3987 		 */
3988 		if (intel_iommu_tboot_noforce) {
3989 			for_each_iommu(iommu, drhd)
3990 				iommu_disable_protect_mem_regions(iommu);
3991 		}
3992 
3993 		/*
3994 		 * Make sure the IOMMUs are switched off, even when we
3995 		 * boot into a kexec kernel and the previous kernel left
3996 		 * them enabled
3997 		 */
3998 		intel_disable_iommus();
3999 		goto out_free_dmar;
4000 	}
4001 
4002 	if (list_empty(&dmar_rmrr_units))
4003 		pr_info("No RMRR found\n");
4004 
4005 	if (list_empty(&dmar_atsr_units))
4006 		pr_info("No ATSR found\n");
4007 
4008 	if (list_empty(&dmar_satc_units))
4009 		pr_info("No SATC found\n");
4010 
4011 	init_no_remapping_devices();
4012 
4013 	ret = init_dmars();
4014 	if (ret) {
4015 		if (force_on)
4016 			panic("tboot: Failed to initialize DMARs\n");
4017 		pr_err("Initialization failed\n");
4018 		goto out_free_dmar;
4019 	}
4020 	up_write(&dmar_global_lock);
4021 
4022 	init_iommu_pm_ops();
4023 
4024 	down_read(&dmar_global_lock);
4025 	for_each_active_iommu(iommu, drhd) {
4026 		/*
4027 		 * The flush queue implementation does not perform
4028 		 * page-selective invalidations that are required for efficient
4029 		 * TLB flushes in virtual environments.  The benefit of batching
4030 		 * is likely to be much lower than the overhead of synchronizing
4031 		 * the virtual and physical IOMMU page-tables.
4032 		 */
4033 		if (cap_caching_mode(iommu->cap) &&
4034 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
4035 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4036 			iommu_set_dma_strict();
4037 		}
4038 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4039 				       intel_iommu_groups,
4040 				       "%s", iommu->name);
4041 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4042 	}
4043 	up_read(&dmar_global_lock);
4044 
4045 	if (si_domain && !hw_pass_through)
4046 		register_memory_notifier(&intel_iommu_memory_nb);
4047 
4048 	down_read(&dmar_global_lock);
4049 	if (probe_acpi_namespace_devices())
4050 		pr_warn("ACPI name space devices didn't probe correctly\n");
4051 
4052 	/* Finally, we enable the DMA remapping hardware. */
4053 	for_each_iommu(iommu, drhd) {
4054 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4055 			iommu_enable_translation(iommu);
4056 
4057 		iommu_disable_protect_mem_regions(iommu);
4058 	}
4059 	up_read(&dmar_global_lock);
4060 
4061 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4062 
4063 	intel_iommu_enabled = 1;
4064 
4065 	return 0;
4066 
4067 out_free_dmar:
4068 	intel_iommu_free_dmars();
4069 	up_write(&dmar_global_lock);
4070 	return ret;
4071 }
4072 
4073 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4074 {
4075 	struct device_domain_info *info = opaque;
4076 
4077 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4078 	return 0;
4079 }
4080 
4081 /*
4082  * NB - intel-iommu lacks any sort of reference counting for the users of
4083  * dependent devices.  If multiple endpoints have intersecting dependent
4084  * devices, unbinding the driver from any one of them will possibly leave
4085  * the others unable to operate.
4086  */
4087 static void domain_context_clear(struct device_domain_info *info)
4088 {
4089 	if (!dev_is_pci(info->dev)) {
4090 		domain_context_clear_one(info, info->bus, info->devfn);
		/* Non-PCI devices have no DMA aliases to walk. */
		return;
	}
4091 
4092 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4093 			       &domain_context_clear_one_cb, info);
4094 }
4095 
4096 static void dmar_remove_one_dev_info(struct device *dev)
4097 {
4098 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4099 	struct dmar_domain *domain = info->domain;
4100 	struct intel_iommu *iommu = info->iommu;
4101 	unsigned long flags;
4102 
4103 	if (!dev_is_real_dma_subdevice(info->dev)) {
4104 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4105 			intel_pasid_tear_down_entry(iommu, info->dev,
4106 					PASID_RID2PASID, false);
4107 
4108 		iommu_disable_dev_iotlb(info);
4109 		domain_context_clear(info);
4110 	}
4111 
4112 	spin_lock_irqsave(&domain->lock, flags);
4113 	list_del(&info->link);
4114 	spin_unlock_irqrestore(&domain->lock, flags);
4115 
4116 	domain_detach_iommu(domain, iommu);
4117 	info->domain = NULL;
4118 }
4119 
4120 /*
4121  * Clear the page table pointer in context or pasid table entries so that
4122  * all DMA requests without PASID from the device are blocked. If the page
4123  * table has been set, clean up the data structures.
4124  */
4125 static void device_block_translation(struct device *dev)
4126 {
4127 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4128 	struct intel_iommu *iommu = info->iommu;
4129 	unsigned long flags;
4130 
4131 	iommu_disable_dev_iotlb(info);
4132 	if (!dev_is_real_dma_subdevice(dev)) {
4133 		if (sm_supported(iommu))
4134 			intel_pasid_tear_down_entry(iommu, dev,
4135 						    PASID_RID2PASID, false);
4136 		else
4137 			domain_context_clear(info);
4138 	}
4139 
4140 	if (!info->domain)
4141 		return;
4142 
4143 	spin_lock_irqsave(&info->domain->lock, flags);
4144 	list_del(&info->link);
4145 	spin_unlock_irqrestore(&info->domain->lock, flags);
4146 
4147 	domain_detach_iommu(info->domain, iommu);
4148 	info->domain = NULL;
4149 }
4150 
4151 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4152 {
4153 	int adjust_width;
4154 
4155 	/* calculate AGAW */
4156 	domain->gaw = guest_width;
4157 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4158 	domain->agaw = width_to_agaw(adjust_width);
4159 
4160 	domain->iommu_coherency = false;
4161 	domain->iommu_superpage = 0;
4162 	domain->max_addr = 0;
4163 
4164 	/* always allocate the top pgd */
4165 	domain->pgd = alloc_pgtable_page(domain->nid);
4166 	if (!domain->pgd)
4167 		return -ENOMEM;
4168 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4169 	return 0;
4170 }
4171 
4172 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4173 {
4174 	struct dmar_domain *dmar_domain;
4175 	struct iommu_domain *domain;
4176 
4177 	switch (type) {
4178 	case IOMMU_DOMAIN_DMA:
4179 	case IOMMU_DOMAIN_DMA_FQ:
4180 	case IOMMU_DOMAIN_UNMANAGED:
4181 		dmar_domain = alloc_domain(type);
4182 		if (!dmar_domain) {
4183 			pr_err("Can't allocate dmar_domain\n");
4184 			return NULL;
4185 		}
4186 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4187 			pr_err("Domain initialization failed\n");
4188 			domain_exit(dmar_domain);
4189 			return NULL;
4190 		}
4191 
4192 		domain = &dmar_domain->domain;
4193 		domain->geometry.aperture_start = 0;
4194 		domain->geometry.aperture_end   =
4195 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4196 		domain->geometry.force_aperture = true;
4197 
4198 		return domain;
4199 	case IOMMU_DOMAIN_IDENTITY:
4200 		return &si_domain->domain;
4201 	case IOMMU_DOMAIN_SVA:
4202 		return intel_svm_domain_alloc();
4203 	default:
4204 		return NULL;
4205 	}
4206 
4207 	return NULL;
4208 }
4209 
4210 static void intel_iommu_domain_free(struct iommu_domain *domain)
4211 {
4212 	if (domain != &si_domain->domain)
4213 		domain_exit(to_dmar_domain(domain));
4214 }
4215 
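/*
 * Check that @dev's IOMMU can back this domain (snoop control, address
 * width) and, if the IOMMU supports fewer page-table levels than the
 * domain currently has, strip the unused top levels before attaching.
 */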
4216 static int prepare_domain_attach_device(struct iommu_domain *domain,
4217 					struct device *dev)
4218 {
4219 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4220 	struct intel_iommu *iommu;
4221 	int addr_width;
4222 
4223 	iommu = device_to_iommu(dev, NULL, NULL);
4224 	if (!iommu)
4225 		return -ENODEV;
4226 
4227 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4228 		return -EOPNOTSUPP;
4229 
4230 	/* check if this iommu agaw is sufficient for max mapped address */
4231 	addr_width = agaw_to_width(iommu->agaw);
4232 	if (addr_width > cap_mgaw(iommu->cap))
4233 		addr_width = cap_mgaw(iommu->cap);
4234 
4235 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4236 		dev_err(dev, "%s: iommu width (%d) is not "
4237 		        "sufficient for the mapped address (%llx)\n",
4238 		        __func__, addr_width, dmar_domain->max_addr);
4239 		return -EFAULT;
4240 	}
4241 	dmar_domain->gaw = addr_width;
4242 
4243 	/*
4244 	 * Knock out extra levels of page tables if necessary
4245 	 */
4246 	while (iommu->agaw < dmar_domain->agaw) {
4247 		struct dma_pte *pte;
4248 
4249 		pte = dmar_domain->pgd;
4250 		if (dma_pte_present(pte)) {
4251 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4252 			free_pgtable_page(pte);
4253 		}
4254 		dmar_domain->agaw--;
4255 	}
4256 
4257 	return 0;
4258 }
4259 
4260 static int intel_iommu_attach_device(struct iommu_domain *domain,
4261 				     struct device *dev)
4262 {
4263 	int ret;
4264 
4265 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4266 	    device_is_rmrr_locked(dev)) {
4267 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4268 		return -EPERM;
4269 	}
4270 
4271 	/* normally dev is not mapped */
4272 	if (unlikely(domain_context_mapped(dev))) {
4273 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4274 
4275 		if (info->domain)
4276 			device_block_translation(dev);
4277 	}
4278 
4279 	ret = prepare_domain_attach_device(domain, dev);
4280 	if (ret)
4281 		return ret;
4282 
4283 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4284 }
4285 
4286 static void intel_iommu_detach_device(struct iommu_domain *domain,
4287 				      struct device *dev)
4288 {
4289 	dmar_remove_one_dev_info(dev);
4290 }
4291 
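/*
 * Map [hpa, hpa + size) at @iova: convert the IOMMU_* prot flags into DMA
 * PTE bits, make sure the end address fits within the domain's address
 * width, and install the mapping in VTD page units.
 */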
4292 static int intel_iommu_map(struct iommu_domain *domain,
4293 			   unsigned long iova, phys_addr_t hpa,
4294 			   size_t size, int iommu_prot, gfp_t gfp)
4295 {
4296 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4297 	u64 max_addr;
4298 	int prot = 0;
4299 
4300 	if (iommu_prot & IOMMU_READ)
4301 		prot |= DMA_PTE_READ;
4302 	if (iommu_prot & IOMMU_WRITE)
4303 		prot |= DMA_PTE_WRITE;
4304 	if (dmar_domain->set_pte_snp)
4305 		prot |= DMA_PTE_SNP;
4306 
4307 	max_addr = iova + size;
4308 	if (dmar_domain->max_addr < max_addr) {
4309 		u64 end;
4310 
4311 		/* check if minimum agaw is sufficient for mapped address */
4312 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4313 		if (end < max_addr) {
4314 			pr_err("%s: iommu width (%d) is not "
4315 			       "sufficient for the mapped address (%llx)\n",
4316 			       __func__, dmar_domain->gaw, max_addr);
4317 			return -EFAULT;
4318 		}
4319 		dmar_domain->max_addr = max_addr;
4320 	}
4321 	/* Round up size to next multiple of PAGE_SIZE, if it and
4322 	   the low bits of hpa would take us onto the next page */
4323 	size = aligned_nrpages(hpa, size);
4324 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4325 				hpa >> VTD_PAGE_SHIFT, size, prot);
4326 }
4327 
4328 static int intel_iommu_map_pages(struct iommu_domain *domain,
4329 				 unsigned long iova, phys_addr_t paddr,
4330 				 size_t pgsize, size_t pgcount,
4331 				 int prot, gfp_t gfp, size_t *mapped)
4332 {
4333 	unsigned long pgshift = __ffs(pgsize);
4334 	size_t size = pgcount << pgshift;
4335 	int ret;
4336 
4337 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4338 		return -EINVAL;
4339 
4340 	if (!IS_ALIGNED(iova | paddr, pgsize))
4341 		return -EINVAL;
4342 
4343 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4344 	if (!ret && mapped)
4345 		*mapped = size;
4346 
4347 	return ret;
4348 }
4349 
4350 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4351 				unsigned long iova, size_t size,
4352 				struct iommu_iotlb_gather *gather)
4353 {
4354 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4355 	unsigned long start_pfn, last_pfn;
4356 	int level = 0;
4357 
4358 	/* Cope with horrid API which requires us to unmap more than the
4359 	   size argument if it happens to be a large-page mapping. */
4360 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4361 
4362 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4363 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4364 
4365 	start_pfn = iova >> VTD_PAGE_SHIFT;
4366 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4367 
4368 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4369 
4370 	if (dmar_domain->max_addr == iova + size)
4371 		dmar_domain->max_addr = iova;
4372 
4373 	/*
4374 	 * We do not use page-selective IOTLB invalidation in the flush queue,
4375 	 * so there is no need to track pages and sync the iotlb.
4376 	 */
4377 	if (!iommu_iotlb_gather_queued(gather))
4378 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4379 
4380 	return size;
4381 }
4382 
4383 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4384 				      unsigned long iova,
4385 				      size_t pgsize, size_t pgcount,
4386 				      struct iommu_iotlb_gather *gather)
4387 {
4388 	unsigned long pgshift = __ffs(pgsize);
4389 	size_t size = pgcount << pgshift;
4390 
4391 	return intel_iommu_unmap(domain, iova, size, gather);
4392 }
4393 
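/*
 * Flush the IOTLB for the range collected in @gather on every IOMMU the
 * domain is attached to, then release the page-table pages queued for
 * freeing during unmap.
 */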
4394 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4395 				 struct iommu_iotlb_gather *gather)
4396 {
4397 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4398 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4399 	size_t size = gather->end - gather->start;
4400 	struct iommu_domain_info *info;
4401 	unsigned long start_pfn;
4402 	unsigned long nrpages;
4403 	unsigned long i;
4404 
4405 	nrpages = aligned_nrpages(gather->start, size);
4406 	start_pfn = mm_to_dma_pfn(iova_pfn);
4407 
4408 	xa_for_each(&dmar_domain->iommu_array, i, info)
4409 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4410 				      start_pfn, nrpages,
4411 				      list_empty(&gather->freelist), 0);
4412 
4413 	put_pages_list(&gather->freelist);
4414 }
4415 
4416 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4417 					    dma_addr_t iova)
4418 {
4419 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4420 	struct dma_pte *pte;
4421 	int level = 0;
4422 	u64 phys = 0;
4423 
4424 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4425 	if (pte && dma_pte_present(pte))
4426 		phys = dma_pte_addr(pte) +
4427 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4428 						VTD_PAGE_SHIFT) - 1));
4429 
4430 	return phys;
4431 }
4432 
4433 static bool domain_support_force_snooping(struct dmar_domain *domain)
4434 {
4435 	struct device_domain_info *info;
4436 	bool support = true;
4437 
4438 	assert_spin_locked(&domain->lock);
4439 	list_for_each_entry(info, &domain->devices, link) {
4440 		if (!ecap_sc_support(info->iommu->ecap)) {
4441 			support = false;
4442 			break;
4443 		}
4444 	}
4445 
4446 	return support;
4447 }
4448 
4449 static void domain_set_force_snooping(struct dmar_domain *domain)
4450 {
4451 	struct device_domain_info *info;
4452 
4453 	assert_spin_locked(&domain->lock);
4454 	/*
4455 	 * The second-level page table supports per-PTE snoop control. The
4456 	 * iommu_map() interface will handle this by setting the SNP bit.
4457 	 */
4458 	if (!domain_use_first_level(domain)) {
4459 		domain->set_pte_snp = true;
4460 		return;
4461 	}
4462 
4463 	list_for_each_entry(info, &domain->devices, link)
4464 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4465 						     PASID_RID2PASID);
4466 }
4467 
4468 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4469 {
4470 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4471 	unsigned long flags;
4472 
4473 	if (dmar_domain->force_snooping)
4474 		return true;
4475 
4476 	spin_lock_irqsave(&dmar_domain->lock, flags);
4477 	if (!domain_support_force_snooping(dmar_domain)) {
4478 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4479 		return false;
4480 	}
4481 
4482 	domain_set_force_snooping(dmar_domain);
4483 	dmar_domain->force_snooping = true;
4484 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4485 
4486 	return true;
4487 }
4488 
4489 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4490 {
4491 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4492 		return true;
4493 	if (cap == IOMMU_CAP_INTR_REMAP)
4494 		return irq_remapping_enabled == 1;
4495 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4496 		return dmar_platform_optin();
4497 
4498 	return false;
4499 }
4500 
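/*
 * Set up the per-device IOMMU state: record bus/devfn/segment, probe ATS,
 * PASID and PRI capabilities for PCI devices, stash the info in the
 * device's iommu private data, and allocate a PASID table when the IOMMU
 * is operating in scalable mode.
 */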
4501 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4502 {
4503 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4504 	struct device_domain_info *info;
4505 	struct intel_iommu *iommu;
4506 	u8 bus, devfn;
4507 	int ret;
4508 
4509 	iommu = device_to_iommu(dev, &bus, &devfn);
4510 	if (!iommu || !iommu->iommu.ops)
4511 		return ERR_PTR(-ENODEV);
4512 
4513 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4514 	if (!info)
4515 		return ERR_PTR(-ENOMEM);
4516 
4517 	if (dev_is_real_dma_subdevice(dev)) {
4518 		info->bus = pdev->bus->number;
4519 		info->devfn = pdev->devfn;
4520 		info->segment = pci_domain_nr(pdev->bus);
4521 	} else {
4522 		info->bus = bus;
4523 		info->devfn = devfn;
4524 		info->segment = iommu->segment;
4525 	}
4526 
4527 	info->dev = dev;
4528 	info->iommu = iommu;
4529 	if (dev_is_pci(dev)) {
4530 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4531 		    pci_ats_supported(pdev) &&
4532 		    dmar_ats_supported(pdev, iommu)) {
4533 			info->ats_supported = 1;
4534 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4535 		}
4536 		if (sm_supported(iommu)) {
4537 			if (pasid_supported(iommu)) {
4538 				int features = pci_pasid_features(pdev);
4539 
4540 				if (features >= 0)
4541 					info->pasid_supported = features | 1;
4542 			}
4543 
4544 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4545 			    pci_pri_supported(pdev))
4546 				info->pri_supported = 1;
4547 		}
4548 	}
4549 
4550 	dev_iommu_priv_set(dev, info);
4551 
4552 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4553 		ret = intel_pasid_alloc_table(dev);
4554 		if (ret) {
4555 			dev_err(dev, "PASID table allocation failed\n");
4556 			dev_iommu_priv_set(dev, NULL);
4557 			kfree(info);
4558 			return ERR_PTR(ret);
4559 		}
4560 	}
4561 
4562 	return &iommu->iommu;
4563 }
4564 
4565 static void intel_iommu_release_device(struct device *dev)
4566 {
4567 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4568 
4569 	dmar_remove_one_dev_info(dev);
4570 	intel_pasid_free_table(dev);
4571 	dev_iommu_priv_set(dev, NULL);
4572 	kfree(info);
4573 	set_dma_ops(dev, NULL);
4574 }
4575 
4576 static void intel_iommu_probe_finalize(struct device *dev)
4577 {
4578 	set_dma_ops(dev, NULL);
4579 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4580 }
4581 
4582 static void intel_iommu_get_resv_regions(struct device *device,
4583 					 struct list_head *head)
4584 {
4585 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4586 	struct iommu_resv_region *reg;
4587 	struct dmar_rmrr_unit *rmrr;
4588 	struct device *i_dev;
4589 	int i;
4590 
4591 	rcu_read_lock();
4592 	for_each_rmrr_units(rmrr) {
4593 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4594 					  i, i_dev) {
4595 			struct iommu_resv_region *resv;
4596 			enum iommu_resv_type type;
4597 			size_t length;
4598 
4599 			if (i_dev != device &&
4600 			    !is_downstream_to_pci_bridge(device, i_dev))
4601 				continue;
4602 
4603 			length = rmrr->end_address - rmrr->base_address + 1;
4604 
4605 			type = device_rmrr_is_relaxable(device) ?
4606 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4607 
4608 			resv = iommu_alloc_resv_region(rmrr->base_address,
4609 						       length, prot, type,
4610 						       GFP_ATOMIC);
4611 			if (!resv)
4612 				break;
4613 
4614 			list_add_tail(&resv->list, head);
4615 		}
4616 	}
4617 	rcu_read_unlock();
4618 
4619 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4620 	if (dev_is_pci(device)) {
4621 		struct pci_dev *pdev = to_pci_dev(device);
4622 
4623 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4624 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4625 					IOMMU_RESV_DIRECT_RELAXABLE,
4626 					GFP_KERNEL);
4627 			if (reg)
4628 				list_add_tail(&reg->list, head);
4629 		}
4630 	}
4631 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4632 
4633 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4634 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4635 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4636 	if (!reg)
4637 		return;
4638 	list_add_tail(&reg->list, head);
4639 }
4640 
4641 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4642 {
4643 	if (dev_is_pci(dev))
4644 		return pci_device_group(dev);
4645 	return generic_device_group(dev);
4646 }
4647 
4648 static int intel_iommu_enable_sva(struct device *dev)
4649 {
4650 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4651 	struct intel_iommu *iommu;
4652 	int ret;
4653 
4654 	if (!info || dmar_disabled)
4655 		return -EINVAL;
4656 
4657 	iommu = info->iommu;
4658 	if (!iommu)
4659 		return -EINVAL;
4660 
4661 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4662 		return -ENODEV;
4663 
4664 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4665 		return -EINVAL;
4666 
4667 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4668 	if (ret)
4669 		return ret;
4670 
4671 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4672 	if (ret)
4673 		iopf_queue_remove_device(iommu->iopf_queue, dev);
4674 
4675 	return ret;
4676 }
4677 
4678 static int intel_iommu_disable_sva(struct device *dev)
4679 {
4680 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4681 	struct intel_iommu *iommu = info->iommu;
4682 	int ret;
4683 
4684 	ret = iommu_unregister_device_fault_handler(dev);
4685 	if (ret)
4686 		return ret;
4687 
4688 	ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4689 	if (ret)
4690 		iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4691 
4692 	return ret;
4693 }
4694 
4695 static int intel_iommu_enable_iopf(struct device *dev)
4696 {
4697 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4698 
4699 	if (info && info->pri_supported)
4700 		return 0;
4701 
4702 	return -ENODEV;
4703 }
4704 
4705 static int
4706 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4707 {
4708 	switch (feat) {
4709 	case IOMMU_DEV_FEAT_IOPF:
4710 		return intel_iommu_enable_iopf(dev);
4711 
4712 	case IOMMU_DEV_FEAT_SVA:
4713 		return intel_iommu_enable_sva(dev);
4714 
4715 	default:
4716 		return -ENODEV;
4717 	}
4718 }
4719 
4720 static int
4721 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4722 {
4723 	switch (feat) {
4724 	case IOMMU_DEV_FEAT_IOPF:
4725 		return 0;
4726 
4727 	case IOMMU_DEV_FEAT_SVA:
4728 		return intel_iommu_disable_sva(dev);
4729 
4730 	default:
4731 		return -ENODEV;
4732 	}
4733 }
4734 
4735 static bool intel_iommu_is_attach_deferred(struct device *dev)
4736 {
4737 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4738 
4739 	return translation_pre_enabled(info->iommu) && !info->domain;
4740 }
4741 
4742 /*
4743  * Check that the device does not live on an external facing PCI port that is
4744  * marked as untrusted. Such devices should not be able to apply quirks and
4745  * thus not be able to bypass the IOMMU restrictions.
4746  */
4747 static bool risky_device(struct pci_dev *pdev)
4748 {
4749 	if (pdev->untrusted) {
4750 		pci_info(pdev,
4751 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4752 			 pdev->vendor, pdev->device);
4753 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4754 		return true;
4755 	}
4756 	return false;
4757 }
4758 
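/*
 * Called after new mappings are created: notify every IOMMU attached to
 * the domain about the freshly mapped range via __mapping_notify_one().
 */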
4759 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4760 				       unsigned long iova, size_t size)
4761 {
4762 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4763 	unsigned long pages = aligned_nrpages(iova, size);
4764 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4765 	struct iommu_domain_info *info;
4766 	unsigned long i;
4767 
4768 	xa_for_each(&dmar_domain->iommu_array, i, info)
4769 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4770 }
4771 
4772 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4773 {
4774 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4775 	struct iommu_domain *domain;
4776 
4777 	/* Domain type specific cleanup: */
4778 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4779 	if (domain) {
4780 		switch (domain->type) {
4781 		case IOMMU_DOMAIN_SVA:
4782 			intel_svm_remove_dev_pasid(dev, pasid);
4783 			break;
4784 		default:
4785 			/* should never reach here */
4786 			WARN_ON(1);
4787 			break;
4788 		}
4789 	}
4790 
4791 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4792 }
4793 
4794 const struct iommu_ops intel_iommu_ops = {
4795 	.capable		= intel_iommu_capable,
4796 	.domain_alloc		= intel_iommu_domain_alloc,
4797 	.probe_device		= intel_iommu_probe_device,
4798 	.probe_finalize		= intel_iommu_probe_finalize,
4799 	.release_device		= intel_iommu_release_device,
4800 	.get_resv_regions	= intel_iommu_get_resv_regions,
4801 	.device_group		= intel_iommu_device_group,
4802 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4803 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4804 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4805 	.def_domain_type	= device_def_domain_type,
4806 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4807 	.pgsize_bitmap		= SZ_4K,
4808 #ifdef CONFIG_INTEL_IOMMU_SVM
4809 	.page_response		= intel_svm_page_response,
4810 #endif
4811 	.default_domain_ops = &(const struct iommu_domain_ops) {
4812 		.attach_dev		= intel_iommu_attach_device,
4813 		.detach_dev		= intel_iommu_detach_device,
4814 		.map_pages		= intel_iommu_map_pages,
4815 		.unmap_pages		= intel_iommu_unmap_pages,
4816 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4817 		.flush_iotlb_all        = intel_flush_iotlb_all,
4818 		.iotlb_sync		= intel_iommu_tlb_sync,
4819 		.iova_to_phys		= intel_iommu_iova_to_phys,
4820 		.free			= intel_iommu_domain_free,
4821 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4822 	}
4823 };
4824 
4825 static void quirk_iommu_igfx(struct pci_dev *dev)
4826 {
4827 	if (risky_device(dev))
4828 		return;
4829 
4830 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4831 	dmar_map_gfx = 0;
4832 }
4833 
4834 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4842 
4843 /* Broadwell igfx malfunctions with dmar */
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4868 
4869 static void quirk_iommu_rwbf(struct pci_dev *dev)
4870 {
4871 	if (risky_device(dev))
4872 		return;
4873 
4874 	/*
4875 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4876 	 * but needs it. Same seems to hold for the desktop versions.
4877 	 */
4878 	pci_info(dev, "Forcing write-buffer flush capability\n");
4879 	rwbf_quirk = 1;
4880 }
4881 
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4889 
4890 #define GGC 0x52
4891 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4892 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4893 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4894 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4895 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4896 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4897 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4898 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4899 
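/*
 * If the BIOS did not enable the VT-mapped (shadow) GTT space in the GGC
 * register, DMA remapping cannot work for graphics, so disable the graphics
 * DMAR unit; otherwise fall back to strict IOTLB flushing, since the gfx
 * device must be idle before a flush.
 */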
4900 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4901 {
4902 	unsigned short ggc;
4903 
4904 	if (risky_device(dev))
4905 		return;
4906 
4907 	if (pci_read_config_word(dev, GGC, &ggc))
4908 		return;
4909 
4910 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4911 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4912 		dmar_map_gfx = 0;
4913 	} else if (dmar_map_gfx) {
4914 		/* we have to ensure the gfx device is idle before we flush */
4915 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4916 		iommu_set_dma_strict();
4917 	}
4918 }
4919 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4920 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4921 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4923 
4924 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4925 {
4926 	unsigned short ver;
4927 
4928 	if (!IS_GFX_DEVICE(dev))
4929 		return;
4930 
4931 	ver = (dev->device >> 8) & 0xff;
4932 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4933 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4934 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4935 		return;
4936 
4937 	if (risky_device(dev))
4938 		return;
4939 
4940 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4941 	iommu_skip_te_disable = 1;
4942 }
4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4944 
4945 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4946    ISOCH DMAR unit for the Azalia sound device, but not give it any
4947    TLB entries, which causes it to deadlock. Check for that.  We do
4948    this in a function called from init_dmars(), instead of in a PCI
4949    quirk, because we don't want to print the obnoxious "BIOS broken"
4950    message if VT-d is actually disabled.
4951 */
4952 static void __init check_tylersburg_isoch(void)
4953 {
4954 	struct pci_dev *pdev;
4955 	uint32_t vtisochctrl;
4956 
4957 	/* If there's no Azalia in the system anyway, forget it. */
4958 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4959 	if (!pdev)
4960 		return;
4961 
4962 	if (risky_device(pdev)) {
4963 		pci_dev_put(pdev);
4964 		return;
4965 	}
4966 
4967 	pci_dev_put(pdev);
4968 
4969 	/* System Management Registers. Might be hidden, in which case
4970 	   we can't do the sanity check. But that's OK, because the
4971 	   known-broken BIOSes _don't_ actually hide it, so far. */
4972 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4973 	if (!pdev)
4974 		return;
4975 
4976 	if (risky_device(pdev)) {
4977 		pci_dev_put(pdev);
4978 		return;
4979 	}
4980 
4981 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4982 		pci_dev_put(pdev);
4983 		return;
4984 	}
4985 
4986 	pci_dev_put(pdev);
4987 
4988 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4989 	if (vtisochctrl & 1)
4990 		return;
4991 
4992 	/* Drop all bits other than the number of TLB entries */
4993 	vtisochctrl &= 0x1c;
4994 
4995 	/* If we have the recommended number of TLB entries (16), fine. */
4996 	if (vtisochctrl == 0x10)
4997 		return;
4998 
4999 	/* Zero TLB entries? You get to ride the short bus to school. */
5000 	if (!vtisochctrl) {
5001 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5002 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5003 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5004 		     dmi_get_system_info(DMI_BIOS_VERSION),
5005 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5006 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5007 		return;
5008 	}
5009 
5010 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5011 	       vtisochctrl);
5012 }
5013 
5014 /*
5015  * Here we deal with a device TLB defect where a device may inadvertently issue an
5016  * ATS invalidation completion before posted writes initiated with a translated
5017  * address that used translations matching the invalidation address range,
5018  * violating the invalidation completion ordering.
5019  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5020  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5021  * under the control of the trusted/privileged host device driver must use this
5022  * quirk.
5023  * Device TLBs are invalidated under the following six conditions:
5024  * 1. Device driver does DMA API unmap IOVA
5025  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
5026  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5027  *    exit_mmap() due to crash
5028  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5029  *    VM has to free pages that were unmapped
5030  * 5. Userspace driver unmaps a DMA buffer
5031  * 6. Cache invalidation in vSVA usage (upcoming)
5032  *
5033  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5034  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback to
5035  * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5036  * The dTLB invalidation after PASID cache flush does not need this quirk.
5037  *
5038  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5039  */
5040 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5041 			       unsigned long address, unsigned long mask,
5042 			       u32 pasid, u16 qdep)
5043 {
5044 	u16 sid;
5045 
5046 	if (likely(!info->dtlb_extra_inval))
5047 		return;
5048 
5049 	sid = PCI_DEVID(info->bus, info->devfn);
5050 	if (pasid == PASID_RID2PASID) {
5051 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5052 				   qdep, address, mask);
5053 	} else {
5054 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5055 					 pasid, qdep, address, mask);
5056 	}
5057 }
5058