• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/intel-svm.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 
49 #include "../irq_remapping.h"
50 #include "../iommu-sva-lib.h"
51 #include "pasid.h"
52 #include "cap_audit.h"
53 
54 #define ROOT_SIZE		VTD_PAGE_SIZE
55 #define CONTEXT_SIZE		VTD_PAGE_SIZE
56 
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 
62 #define IOAPIC_RANGE_START	(0xfee00000)
63 #define IOAPIC_RANGE_END	(0xfeefffff)
64 #define IOVA_START_ADDR		(0x1000)
65 
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
77 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN		(1)
82 
83 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
84 
85 /* page table handling */
86 #define LEVEL_STRIDE		(9)
87 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
88 
agaw_to_level(int agaw)89 static inline int agaw_to_level(int agaw)
90 {
91 	return agaw + 2;
92 }
93 
agaw_to_width(int agaw)94 static inline int agaw_to_width(int agaw)
95 {
96 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
97 }
98 
width_to_agaw(int width)99 static inline int width_to_agaw(int width)
100 {
101 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
102 }
103 
level_to_offset_bits(int level)104 static inline unsigned int level_to_offset_bits(int level)
105 {
106 	return (level - 1) * LEVEL_STRIDE;
107 }
108 
pfn_level_offset(u64 pfn,int level)109 static inline int pfn_level_offset(u64 pfn, int level)
110 {
111 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
112 }
113 
level_mask(int level)114 static inline u64 level_mask(int level)
115 {
116 	return -1ULL << level_to_offset_bits(level);
117 }
118 
level_size(int level)119 static inline u64 level_size(int level)
120 {
121 	return 1ULL << level_to_offset_bits(level);
122 }
123 
align_to_level(u64 pfn,int level)124 static inline u64 align_to_level(u64 pfn, int level)
125 {
126 	return (pfn + level_size(level) - 1) & level_mask(level);
127 }
128 
lvl_to_nr_pages(unsigned int lvl)129 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
130 {
131 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
132 }
133 
134 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
135    are never going to work. */
dma_to_mm_pfn(unsigned long dma_pfn)136 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
137 {
138 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
139 }
140 
mm_to_dma_pfn(unsigned long mm_pfn)141 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
142 {
143 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
144 }
page_to_dma_pfn(struct page * pg)145 static inline unsigned long page_to_dma_pfn(struct page *pg)
146 {
147 	return mm_to_dma_pfn(page_to_pfn(pg));
148 }
virt_to_dma_pfn(void * p)149 static inline unsigned long virt_to_dma_pfn(void *p)
150 {
151 	return page_to_dma_pfn(virt_to_page(p));
152 }
153 
154 /* global iommu list, set NULL for ignored DMAR units */
155 static struct intel_iommu **g_iommus;
156 
157 static void __init check_tylersburg_isoch(void);
158 static int rwbf_quirk;
159 
160 /*
161  * set to 1 to panic kernel if can't successfully enable VT-d
162  * (used when kernel is launched w/ TXT)
163  */
164 static int force_on = 0;
165 static int intel_iommu_tboot_noforce;
166 static int no_platform_optin;
167 
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 
170 /*
171  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
172  * if marked present.
173  */
root_entry_lctp(struct root_entry * re)174 static phys_addr_t root_entry_lctp(struct root_entry *re)
175 {
176 	if (!(re->lo & 1))
177 		return 0;
178 
179 	return re->lo & VTD_PAGE_MASK;
180 }
181 
182 /*
183  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
184  * if marked present.
185  */
root_entry_uctp(struct root_entry * re)186 static phys_addr_t root_entry_uctp(struct root_entry *re)
187 {
188 	if (!(re->hi & 1))
189 		return 0;
190 
191 	return re->hi & VTD_PAGE_MASK;
192 }
193 
context_set_present(struct context_entry * context)194 static inline void context_set_present(struct context_entry *context)
195 {
196 	context->lo |= 1;
197 }
198 
context_set_fault_enable(struct context_entry * context)199 static inline void context_set_fault_enable(struct context_entry *context)
200 {
201 	context->lo &= (((u64)-1) << 2) | 1;
202 }
203 
context_set_translation_type(struct context_entry * context,unsigned long value)204 static inline void context_set_translation_type(struct context_entry *context,
205 						unsigned long value)
206 {
207 	context->lo &= (((u64)-1) << 4) | 3;
208 	context->lo |= (value & 3) << 2;
209 }
210 
context_set_address_root(struct context_entry * context,unsigned long value)211 static inline void context_set_address_root(struct context_entry *context,
212 					    unsigned long value)
213 {
214 	context->lo &= ~VTD_PAGE_MASK;
215 	context->lo |= value & VTD_PAGE_MASK;
216 }
217 
context_set_address_width(struct context_entry * context,unsigned long value)218 static inline void context_set_address_width(struct context_entry *context,
219 					     unsigned long value)
220 {
221 	context->hi |= value & 7;
222 }
223 
context_set_domain_id(struct context_entry * context,unsigned long value)224 static inline void context_set_domain_id(struct context_entry *context,
225 					 unsigned long value)
226 {
227 	context->hi |= (value & ((1 << 16) - 1)) << 8;
228 }
229 
context_domain_id(struct context_entry * c)230 static inline int context_domain_id(struct context_entry *c)
231 {
232 	return((c->hi >> 8) & 0xffff);
233 }
234 
context_clear_entry(struct context_entry * context)235 static inline void context_clear_entry(struct context_entry *context)
236 {
237 	context->lo = 0;
238 	context->hi = 0;
239 }
240 
context_copied(struct intel_iommu * iommu,u8 bus,u8 devfn)241 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
242 {
243 	if (!iommu->copied_tables)
244 		return false;
245 
246 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
247 }
248 
249 static inline void
set_context_copied(struct intel_iommu * iommu,u8 bus,u8 devfn)250 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
251 {
252 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
253 }
254 
255 static inline void
clear_context_copied(struct intel_iommu * iommu,u8 bus,u8 devfn)256 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
257 {
258 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
259 }
260 
261 /*
262  * This domain is a statically identity mapping domain.
263  *	1. This domain creats a static 1:1 mapping to all usable memory.
264  * 	2. It maps to each iommu if successful.
265  *	3. Each iommu mapps to this domain if successful.
266  */
267 static struct dmar_domain *si_domain;
268 static int hw_pass_through = 1;
269 
270 #define for_each_domain_iommu(idx, domain)			\
271 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
272 		if (domain->iommu_refcnt[idx])
273 
274 struct dmar_rmrr_unit {
275 	struct list_head list;		/* list of rmrr units	*/
276 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
277 	u64	base_address;		/* reserved base address*/
278 	u64	end_address;		/* reserved end address */
279 	struct dmar_dev_scope *devices;	/* target devices */
280 	int	devices_cnt;		/* target device count */
281 };
282 
283 struct dmar_atsr_unit {
284 	struct list_head list;		/* list of ATSR units */
285 	struct acpi_dmar_header *hdr;	/* ACPI header */
286 	struct dmar_dev_scope *devices;	/* target devices */
287 	int devices_cnt;		/* target device count */
288 	u8 include_all:1;		/* include all ports */
289 };
290 
291 struct dmar_satc_unit {
292 	struct list_head list;		/* list of SATC units */
293 	struct acpi_dmar_header *hdr;	/* ACPI header */
294 	struct dmar_dev_scope *devices;	/* target devices */
295 	struct intel_iommu *iommu;	/* the corresponding iommu */
296 	int devices_cnt;		/* target device count */
297 	u8 atc_required:1;		/* ATS is required */
298 };
299 
300 static LIST_HEAD(dmar_atsr_units);
301 static LIST_HEAD(dmar_rmrr_units);
302 static LIST_HEAD(dmar_satc_units);
303 
304 #define for_each_rmrr_units(rmrr) \
305 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
306 
307 /* bitmap for indexing intel_iommus */
308 static int g_num_of_iommus;
309 
310 static void domain_exit(struct dmar_domain *domain);
311 static void domain_remove_dev_info(struct dmar_domain *domain);
312 static void dmar_remove_one_dev_info(struct device *dev);
313 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
314 static int intel_iommu_attach_device(struct iommu_domain *domain,
315 				     struct device *dev);
316 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
317 					    dma_addr_t iova);
318 
319 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
320 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
321 
322 int intel_iommu_enabled = 0;
323 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
324 
325 static int dmar_map_gfx = 1;
326 static int intel_iommu_superpage = 1;
327 static int iommu_identity_mapping;
328 static int iommu_skip_te_disable;
329 
330 #define IDENTMAP_GFX		2
331 #define IDENTMAP_AZALIA		4
332 
333 int intel_iommu_gfx_mapped;
334 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
335 
336 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
get_domain_info(struct device * dev)337 struct device_domain_info *get_domain_info(struct device *dev)
338 {
339 	struct device_domain_info *info;
340 
341 	if (!dev)
342 		return NULL;
343 
344 	info = dev_iommu_priv_get(dev);
345 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
346 		return NULL;
347 
348 	return info;
349 }
350 
351 DEFINE_SPINLOCK(device_domain_lock);
352 static LIST_HEAD(device_domain_list);
353 
354 /*
355  * Iterate over elements in device_domain_list and call the specified
356  * callback @fn against each element.
357  */
for_each_device_domain(int (* fn)(struct device_domain_info * info,void * data),void * data)358 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
359 				     void *data), void *data)
360 {
361 	int ret = 0;
362 	unsigned long flags;
363 	struct device_domain_info *info;
364 
365 	spin_lock_irqsave(&device_domain_lock, flags);
366 	list_for_each_entry(info, &device_domain_list, global) {
367 		ret = fn(info, data);
368 		if (ret) {
369 			spin_unlock_irqrestore(&device_domain_lock, flags);
370 			return ret;
371 		}
372 	}
373 	spin_unlock_irqrestore(&device_domain_lock, flags);
374 
375 	return 0;
376 }
377 
378 const struct iommu_ops intel_iommu_ops;
379 
translation_pre_enabled(struct intel_iommu * iommu)380 static bool translation_pre_enabled(struct intel_iommu *iommu)
381 {
382 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
383 }
384 
clear_translation_pre_enabled(struct intel_iommu * iommu)385 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
386 {
387 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
388 }
389 
init_translation_status(struct intel_iommu * iommu)390 static void init_translation_status(struct intel_iommu *iommu)
391 {
392 	u32 gsts;
393 
394 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
395 	if (gsts & DMA_GSTS_TES)
396 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
397 }
398 
intel_iommu_setup(char * str)399 static int __init intel_iommu_setup(char *str)
400 {
401 	if (!str)
402 		return -EINVAL;
403 
404 	while (*str) {
405 		if (!strncmp(str, "on", 2)) {
406 			dmar_disabled = 0;
407 			pr_info("IOMMU enabled\n");
408 		} else if (!strncmp(str, "off", 3)) {
409 			dmar_disabled = 1;
410 			no_platform_optin = 1;
411 			pr_info("IOMMU disabled\n");
412 		} else if (!strncmp(str, "igfx_off", 8)) {
413 			dmar_map_gfx = 0;
414 			pr_info("Disable GFX device mapping\n");
415 		} else if (!strncmp(str, "forcedac", 8)) {
416 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
417 			iommu_dma_forcedac = true;
418 		} else if (!strncmp(str, "strict", 6)) {
419 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
420 			iommu_set_dma_strict();
421 		} else if (!strncmp(str, "sp_off", 6)) {
422 			pr_info("Disable supported super page\n");
423 			intel_iommu_superpage = 0;
424 		} else if (!strncmp(str, "sm_on", 5)) {
425 			pr_info("Enable scalable mode if hardware supports\n");
426 			intel_iommu_sm = 1;
427 		} else if (!strncmp(str, "sm_off", 6)) {
428 			pr_info("Scalable mode is disallowed\n");
429 			intel_iommu_sm = 0;
430 		} else if (!strncmp(str, "tboot_noforce", 13)) {
431 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
432 			intel_iommu_tboot_noforce = 1;
433 		} else {
434 			pr_notice("Unknown option - '%s'\n", str);
435 		}
436 
437 		str += strcspn(str, ",");
438 		while (*str == ',')
439 			str++;
440 	}
441 
442 	return 1;
443 }
444 __setup("intel_iommu=", intel_iommu_setup);
445 
446 static struct kmem_cache *iommu_domain_cache;
447 static struct kmem_cache *iommu_devinfo_cache;
448 
get_iommu_domain(struct intel_iommu * iommu,u16 did)449 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
450 {
451 	struct dmar_domain **domains;
452 	int idx = did >> 8;
453 
454 	domains = iommu->domains[idx];
455 	if (!domains)
456 		return NULL;
457 
458 	return domains[did & 0xff];
459 }
460 
set_iommu_domain(struct intel_iommu * iommu,u16 did,struct dmar_domain * domain)461 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
462 			     struct dmar_domain *domain)
463 {
464 	struct dmar_domain **domains;
465 	int idx = did >> 8;
466 
467 	if (!iommu->domains[idx]) {
468 		size_t size = 256 * sizeof(struct dmar_domain *);
469 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
470 	}
471 
472 	domains = iommu->domains[idx];
473 	if (WARN_ON(!domains))
474 		return;
475 	else
476 		domains[did & 0xff] = domain;
477 }
478 
alloc_pgtable_page(int node)479 void *alloc_pgtable_page(int node)
480 {
481 	struct page *page;
482 	void *vaddr = NULL;
483 
484 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
485 	if (page)
486 		vaddr = page_address(page);
487 	return vaddr;
488 }
489 
free_pgtable_page(void * vaddr)490 void free_pgtable_page(void *vaddr)
491 {
492 	free_page((unsigned long)vaddr);
493 }
494 
alloc_domain_mem(void)495 static inline void *alloc_domain_mem(void)
496 {
497 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
498 }
499 
free_domain_mem(void * vaddr)500 static void free_domain_mem(void *vaddr)
501 {
502 	kmem_cache_free(iommu_domain_cache, vaddr);
503 }
504 
alloc_devinfo_mem(void)505 static inline void * alloc_devinfo_mem(void)
506 {
507 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
508 }
509 
free_devinfo_mem(void * vaddr)510 static inline void free_devinfo_mem(void *vaddr)
511 {
512 	kmem_cache_free(iommu_devinfo_cache, vaddr);
513 }
514 
domain_type_is_si(struct dmar_domain * domain)515 static inline int domain_type_is_si(struct dmar_domain *domain)
516 {
517 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
518 }
519 
domain_use_first_level(struct dmar_domain * domain)520 static inline bool domain_use_first_level(struct dmar_domain *domain)
521 {
522 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
523 }
524 
domain_pfn_supported(struct dmar_domain * domain,unsigned long pfn)525 static inline int domain_pfn_supported(struct dmar_domain *domain,
526 				       unsigned long pfn)
527 {
528 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
529 
530 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
531 }
532 
533 /*
534  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
535  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
536  * the returned SAGAW.
537  */
__iommu_calculate_sagaw(struct intel_iommu * iommu)538 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
539 {
540 	unsigned long fl_sagaw, sl_sagaw;
541 
542 	fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
543 	sl_sagaw = cap_sagaw(iommu->cap);
544 
545 	/* Second level only. */
546 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
547 		return sl_sagaw;
548 
549 	/* First level only. */
550 	if (!ecap_slts(iommu->ecap))
551 		return fl_sagaw;
552 
553 	return fl_sagaw & sl_sagaw;
554 }
555 
__iommu_calculate_agaw(struct intel_iommu * iommu,int max_gaw)556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 {
558 	unsigned long sagaw;
559 	int agaw;
560 
561 	sagaw = __iommu_calculate_sagaw(iommu);
562 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
563 		if (test_bit(agaw, &sagaw))
564 			break;
565 	}
566 
567 	return agaw;
568 }
569 
570 /*
571  * Calculate max SAGAW for each iommu.
572  */
iommu_calculate_max_sagaw(struct intel_iommu * iommu)573 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
574 {
575 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
576 }
577 
578 /*
579  * calculate agaw for each iommu.
580  * "SAGAW" may be different across iommus, use a default agaw, and
581  * get a supported less agaw for iommus that don't support the default agaw.
582  */
iommu_calculate_agaw(struct intel_iommu * iommu)583 int iommu_calculate_agaw(struct intel_iommu *iommu)
584 {
585 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
586 }
587 
588 /* This functionin only returns single iommu in a domain */
domain_get_iommu(struct dmar_domain * domain)589 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
590 {
591 	int iommu_id;
592 
593 	/* si_domain and vm domain should not get here. */
594 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
595 		return NULL;
596 
597 	for_each_domain_iommu(iommu_id, domain)
598 		break;
599 
600 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
601 		return NULL;
602 
603 	return g_iommus[iommu_id];
604 }
605 
iommu_paging_structure_coherency(struct intel_iommu * iommu)606 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
607 {
608 	return sm_supported(iommu) ?
609 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
610 }
611 
domain_update_iommu_coherency(struct dmar_domain * domain)612 static void domain_update_iommu_coherency(struct dmar_domain *domain)
613 {
614 	struct dmar_drhd_unit *drhd;
615 	struct intel_iommu *iommu;
616 	bool found = false;
617 	int i;
618 
619 	domain->iommu_coherency = true;
620 
621 	for_each_domain_iommu(i, domain) {
622 		found = true;
623 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
624 			domain->iommu_coherency = false;
625 			break;
626 		}
627 	}
628 	if (found)
629 		return;
630 
631 	/* No hardware attached; use lowest common denominator */
632 	rcu_read_lock();
633 	for_each_active_iommu(iommu, drhd) {
634 		if (!iommu_paging_structure_coherency(iommu)) {
635 			domain->iommu_coherency = false;
636 			break;
637 		}
638 	}
639 	rcu_read_unlock();
640 }
641 
domain_update_iommu_snooping(struct intel_iommu * skip)642 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
643 {
644 	struct dmar_drhd_unit *drhd;
645 	struct intel_iommu *iommu;
646 	bool ret = true;
647 
648 	rcu_read_lock();
649 	for_each_active_iommu(iommu, drhd) {
650 		if (iommu != skip) {
651 			/*
652 			 * If the hardware is operating in the scalable mode,
653 			 * the snooping control is always supported since we
654 			 * always set PASID-table-entry.PGSNP bit if the domain
655 			 * is managed outside (UNMANAGED).
656 			 */
657 			if (!sm_supported(iommu) &&
658 			    !ecap_sc_support(iommu->ecap)) {
659 				ret = false;
660 				break;
661 			}
662 		}
663 	}
664 	rcu_read_unlock();
665 
666 	return ret;
667 }
668 
domain_update_iommu_superpage(struct dmar_domain * domain,struct intel_iommu * skip)669 static int domain_update_iommu_superpage(struct dmar_domain *domain,
670 					 struct intel_iommu *skip)
671 {
672 	struct dmar_drhd_unit *drhd;
673 	struct intel_iommu *iommu;
674 	int mask = 0x3;
675 
676 	if (!intel_iommu_superpage)
677 		return 0;
678 
679 	/* set iommu_superpage to the smallest common denominator */
680 	rcu_read_lock();
681 	for_each_active_iommu(iommu, drhd) {
682 		if (iommu != skip) {
683 			if (domain && domain_use_first_level(domain)) {
684 				if (!cap_fl1gp_support(iommu->cap))
685 					mask = 0x1;
686 			} else {
687 				mask &= cap_super_page_val(iommu->cap);
688 			}
689 
690 			if (!mask)
691 				break;
692 		}
693 	}
694 	rcu_read_unlock();
695 
696 	return fls(mask);
697 }
698 
domain_update_device_node(struct dmar_domain * domain)699 static int domain_update_device_node(struct dmar_domain *domain)
700 {
701 	struct device_domain_info *info;
702 	int nid = NUMA_NO_NODE;
703 
704 	assert_spin_locked(&device_domain_lock);
705 
706 	if (list_empty(&domain->devices))
707 		return NUMA_NO_NODE;
708 
709 	list_for_each_entry(info, &domain->devices, link) {
710 		if (!info->dev)
711 			continue;
712 
713 		/*
714 		 * There could possibly be multiple device numa nodes as devices
715 		 * within the same domain may sit behind different IOMMUs. There
716 		 * isn't perfect answer in such situation, so we select first
717 		 * come first served policy.
718 		 */
719 		nid = dev_to_node(info->dev);
720 		if (nid != NUMA_NO_NODE)
721 			break;
722 	}
723 
724 	return nid;
725 }
726 
727 static void domain_update_iotlb(struct dmar_domain *domain);
728 
729 /* Return the super pagesize bitmap if supported. */
domain_super_pgsize_bitmap(struct dmar_domain * domain)730 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
731 {
732 	unsigned long bitmap = 0;
733 
734 	/*
735 	 * 1-level super page supports page size of 2MiB, 2-level super page
736 	 * supports page size of both 2MiB and 1GiB.
737 	 */
738 	if (domain->iommu_superpage == 1)
739 		bitmap |= SZ_2M;
740 	else if (domain->iommu_superpage == 2)
741 		bitmap |= SZ_2M | SZ_1G;
742 
743 	return bitmap;
744 }
745 
746 /* Some capabilities may be different across iommus */
domain_update_iommu_cap(struct dmar_domain * domain)747 static void domain_update_iommu_cap(struct dmar_domain *domain)
748 {
749 	domain_update_iommu_coherency(domain);
750 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
751 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
752 
753 	/*
754 	 * If RHSA is missing, we should default to the device numa domain
755 	 * as fall back.
756 	 */
757 	if (domain->nid == NUMA_NO_NODE)
758 		domain->nid = domain_update_device_node(domain);
759 
760 	/*
761 	 * First-level translation restricts the input-address to a
762 	 * canonical address (i.e., address bits 63:N have the same
763 	 * value as address bit [N-1], where N is 48-bits with 4-level
764 	 * paging and 57-bits with 5-level paging). Hence, skip bit
765 	 * [N-1].
766 	 */
767 	if (domain_use_first_level(domain))
768 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
769 	else
770 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
771 
772 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
773 	domain_update_iotlb(domain);
774 }
775 
iommu_context_addr(struct intel_iommu * iommu,u8 bus,u8 devfn,int alloc)776 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
777 					 u8 devfn, int alloc)
778 {
779 	struct root_entry *root = &iommu->root_entry[bus];
780 	struct context_entry *context;
781 	u64 *entry;
782 
783 	/*
784 	 * Except that the caller requested to allocate a new entry,
785 	 * returning a copied context entry makes no sense.
786 	 */
787 	if (!alloc && context_copied(iommu, bus, devfn))
788 		return NULL;
789 
790 	entry = &root->lo;
791 	if (sm_supported(iommu)) {
792 		if (devfn >= 0x80) {
793 			devfn -= 0x80;
794 			entry = &root->hi;
795 		}
796 		devfn *= 2;
797 	}
798 	if (*entry & 1)
799 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
800 	else {
801 		unsigned long phy_addr;
802 		if (!alloc)
803 			return NULL;
804 
805 		context = alloc_pgtable_page(iommu->node);
806 		if (!context)
807 			return NULL;
808 
809 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
810 		phy_addr = virt_to_phys((void *)context);
811 		*entry = phy_addr | 1;
812 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
813 	}
814 	return &context[devfn];
815 }
816 
attach_deferred(struct device * dev)817 static bool attach_deferred(struct device *dev)
818 {
819 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
820 }
821 
822 /**
823  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
824  *				 sub-hierarchy of a candidate PCI-PCI bridge
825  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
826  * @bridge: the candidate PCI-PCI bridge
827  *
828  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
829  */
830 static bool
is_downstream_to_pci_bridge(struct device * dev,struct device * bridge)831 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
832 {
833 	struct pci_dev *pdev, *pbridge;
834 
835 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
836 		return false;
837 
838 	pdev = to_pci_dev(dev);
839 	pbridge = to_pci_dev(bridge);
840 
841 	if (pbridge->subordinate &&
842 	    pbridge->subordinate->number <= pdev->bus->number &&
843 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
844 		return true;
845 
846 	return false;
847 }
848 
quirk_ioat_snb_local_iommu(struct pci_dev * pdev)849 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
850 {
851 	struct dmar_drhd_unit *drhd;
852 	u32 vtbar;
853 	int rc;
854 
855 	/* We know that this device on this chipset has its own IOMMU.
856 	 * If we find it under a different IOMMU, then the BIOS is lying
857 	 * to us. Hope that the IOMMU for this device is actually
858 	 * disabled, and it needs no translation...
859 	 */
860 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
861 	if (rc) {
862 		/* "can't" happen */
863 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
864 		return false;
865 	}
866 	vtbar &= 0xffff0000;
867 
868 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
869 	drhd = dmar_find_matched_drhd_unit(pdev);
870 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
871 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
872 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
873 		return true;
874 	}
875 
876 	return false;
877 }
878 
iommu_is_dummy(struct intel_iommu * iommu,struct device * dev)879 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
880 {
881 	if (!iommu || iommu->drhd->ignored)
882 		return true;
883 
884 	if (dev_is_pci(dev)) {
885 		struct pci_dev *pdev = to_pci_dev(dev);
886 
887 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
888 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
889 		    quirk_ioat_snb_local_iommu(pdev))
890 			return true;
891 	}
892 
893 	return false;
894 }
895 
device_to_iommu(struct device * dev,u8 * bus,u8 * devfn)896 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
897 {
898 	struct dmar_drhd_unit *drhd = NULL;
899 	struct pci_dev *pdev = NULL;
900 	struct intel_iommu *iommu;
901 	struct device *tmp;
902 	u16 segment = 0;
903 	int i;
904 
905 	if (!dev)
906 		return NULL;
907 
908 	if (dev_is_pci(dev)) {
909 		struct pci_dev *pf_pdev;
910 
911 		pdev = pci_real_dma_dev(to_pci_dev(dev));
912 
913 		/* VFs aren't listed in scope tables; we need to look up
914 		 * the PF instead to find the IOMMU. */
915 		pf_pdev = pci_physfn(pdev);
916 		dev = &pf_pdev->dev;
917 		segment = pci_domain_nr(pdev->bus);
918 	} else if (has_acpi_companion(dev))
919 		dev = &ACPI_COMPANION(dev)->dev;
920 
921 	rcu_read_lock();
922 	for_each_iommu(iommu, drhd) {
923 		if (pdev && segment != drhd->segment)
924 			continue;
925 
926 		for_each_active_dev_scope(drhd->devices,
927 					  drhd->devices_cnt, i, tmp) {
928 			if (tmp == dev) {
929 				/* For a VF use its original BDF# not that of the PF
930 				 * which we used for the IOMMU lookup. Strictly speaking
931 				 * we could do this for all PCI devices; we only need to
932 				 * get the BDF# from the scope table for ACPI matches. */
933 				if (pdev && pdev->is_virtfn)
934 					goto got_pdev;
935 
936 				if (bus && devfn) {
937 					*bus = drhd->devices[i].bus;
938 					*devfn = drhd->devices[i].devfn;
939 				}
940 				goto out;
941 			}
942 
943 			if (is_downstream_to_pci_bridge(dev, tmp))
944 				goto got_pdev;
945 		}
946 
947 		if (pdev && drhd->include_all) {
948 		got_pdev:
949 			if (bus && devfn) {
950 				*bus = pdev->bus->number;
951 				*devfn = pdev->devfn;
952 			}
953 			goto out;
954 		}
955 	}
956 	iommu = NULL;
957  out:
958 	if (iommu_is_dummy(iommu, dev))
959 		iommu = NULL;
960 
961 	rcu_read_unlock();
962 
963 	return iommu;
964 }
965 
domain_flush_cache(struct dmar_domain * domain,void * addr,int size)966 static void domain_flush_cache(struct dmar_domain *domain,
967 			       void *addr, int size)
968 {
969 	if (!domain->iommu_coherency)
970 		clflush_cache_range(addr, size);
971 }
972 
device_context_mapped(struct intel_iommu * iommu,u8 bus,u8 devfn)973 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
974 {
975 	struct context_entry *context;
976 	int ret = 0;
977 	unsigned long flags;
978 
979 	spin_lock_irqsave(&iommu->lock, flags);
980 	context = iommu_context_addr(iommu, bus, devfn, 0);
981 	if (context)
982 		ret = context_present(context);
983 	spin_unlock_irqrestore(&iommu->lock, flags);
984 	return ret;
985 }
986 
free_context_table(struct intel_iommu * iommu)987 static void free_context_table(struct intel_iommu *iommu)
988 {
989 	int i;
990 	unsigned long flags;
991 	struct context_entry *context;
992 
993 	spin_lock_irqsave(&iommu->lock, flags);
994 	if (!iommu->root_entry) {
995 		goto out;
996 	}
997 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
998 		context = iommu_context_addr(iommu, i, 0, 0);
999 		if (context)
1000 			free_pgtable_page(context);
1001 
1002 		if (!sm_supported(iommu))
1003 			continue;
1004 
1005 		context = iommu_context_addr(iommu, i, 0x80, 0);
1006 		if (context)
1007 			free_pgtable_page(context);
1008 
1009 	}
1010 	free_pgtable_page(iommu->root_entry);
1011 	iommu->root_entry = NULL;
1012 out:
1013 	spin_unlock_irqrestore(&iommu->lock, flags);
1014 }
1015 
pfn_to_dma_pte(struct dmar_domain * domain,unsigned long pfn,int * target_level)1016 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1017 				      unsigned long pfn, int *target_level)
1018 {
1019 	struct dma_pte *parent, *pte;
1020 	int level = agaw_to_level(domain->agaw);
1021 	int offset;
1022 
1023 	BUG_ON(!domain->pgd);
1024 
1025 	if (!domain_pfn_supported(domain, pfn))
1026 		/* Address beyond IOMMU's addressing capabilities. */
1027 		return NULL;
1028 
1029 	parent = domain->pgd;
1030 
1031 	while (1) {
1032 		void *tmp_page;
1033 
1034 		offset = pfn_level_offset(pfn, level);
1035 		pte = &parent[offset];
1036 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1037 			break;
1038 		if (level == *target_level)
1039 			break;
1040 
1041 		if (!dma_pte_present(pte)) {
1042 			uint64_t pteval;
1043 
1044 			tmp_page = alloc_pgtable_page(domain->nid);
1045 
1046 			if (!tmp_page)
1047 				return NULL;
1048 
1049 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1050 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1051 			if (domain_use_first_level(domain))
1052 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1053 
1054 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1055 				/* Someone else set it while we were thinking; use theirs. */
1056 				free_pgtable_page(tmp_page);
1057 			else
1058 				domain_flush_cache(domain, pte, sizeof(*pte));
1059 		}
1060 		if (level == 1)
1061 			break;
1062 
1063 		parent = phys_to_virt(dma_pte_addr(pte));
1064 		level--;
1065 	}
1066 
1067 	if (!*target_level)
1068 		*target_level = level;
1069 
1070 	return pte;
1071 }
1072 
1073 /* return address's pte at specific level */
dma_pfn_level_pte(struct dmar_domain * domain,unsigned long pfn,int level,int * large_page)1074 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1075 					 unsigned long pfn,
1076 					 int level, int *large_page)
1077 {
1078 	struct dma_pte *parent, *pte;
1079 	int total = agaw_to_level(domain->agaw);
1080 	int offset;
1081 
1082 	parent = domain->pgd;
1083 	while (level <= total) {
1084 		offset = pfn_level_offset(pfn, total);
1085 		pte = &parent[offset];
1086 		if (level == total)
1087 			return pte;
1088 
1089 		if (!dma_pte_present(pte)) {
1090 			*large_page = total;
1091 			break;
1092 		}
1093 
1094 		if (dma_pte_superpage(pte)) {
1095 			*large_page = total;
1096 			return pte;
1097 		}
1098 
1099 		parent = phys_to_virt(dma_pte_addr(pte));
1100 		total--;
1101 	}
1102 	return NULL;
1103 }
1104 
1105 /* clear last level pte, a tlb flush should be followed */
dma_pte_clear_range(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn)1106 static void dma_pte_clear_range(struct dmar_domain *domain,
1107 				unsigned long start_pfn,
1108 				unsigned long last_pfn)
1109 {
1110 	unsigned int large_page;
1111 	struct dma_pte *first_pte, *pte;
1112 
1113 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1114 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1115 	BUG_ON(start_pfn > last_pfn);
1116 
1117 	/* we don't need lock here; nobody else touches the iova range */
1118 	do {
1119 		large_page = 1;
1120 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1121 		if (!pte) {
1122 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1123 			continue;
1124 		}
1125 		do {
1126 			dma_clear_pte(pte);
1127 			start_pfn += lvl_to_nr_pages(large_page);
1128 			pte++;
1129 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1130 
1131 		domain_flush_cache(domain, first_pte,
1132 				   (void *)pte - (void *)first_pte);
1133 
1134 	} while (start_pfn && start_pfn <= last_pfn);
1135 }
1136 
dma_pte_free_level(struct dmar_domain * domain,int level,int retain_level,struct dma_pte * pte,unsigned long pfn,unsigned long start_pfn,unsigned long last_pfn)1137 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1138 			       int retain_level, struct dma_pte *pte,
1139 			       unsigned long pfn, unsigned long start_pfn,
1140 			       unsigned long last_pfn)
1141 {
1142 	pfn = max(start_pfn, pfn);
1143 	pte = &pte[pfn_level_offset(pfn, level)];
1144 
1145 	do {
1146 		unsigned long level_pfn;
1147 		struct dma_pte *level_pte;
1148 
1149 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1150 			goto next;
1151 
1152 		level_pfn = pfn & level_mask(level);
1153 		level_pte = phys_to_virt(dma_pte_addr(pte));
1154 
1155 		if (level > 2) {
1156 			dma_pte_free_level(domain, level - 1, retain_level,
1157 					   level_pte, level_pfn, start_pfn,
1158 					   last_pfn);
1159 		}
1160 
1161 		/*
1162 		 * Free the page table if we're below the level we want to
1163 		 * retain and the range covers the entire table.
1164 		 */
1165 		if (level < retain_level && !(start_pfn > level_pfn ||
1166 		      last_pfn < level_pfn + level_size(level) - 1)) {
1167 			dma_clear_pte(pte);
1168 			domain_flush_cache(domain, pte, sizeof(*pte));
1169 			free_pgtable_page(level_pte);
1170 		}
1171 next:
1172 		pfn += level_size(level);
1173 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1174 }
1175 
1176 /*
1177  * clear last level (leaf) ptes and free page table pages below the
1178  * level we wish to keep intact.
1179  */
dma_pte_free_pagetable(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn,int retain_level)1180 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1181 				   unsigned long start_pfn,
1182 				   unsigned long last_pfn,
1183 				   int retain_level)
1184 {
1185 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1186 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1187 	BUG_ON(start_pfn > last_pfn);
1188 
1189 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1190 
1191 	/* We don't need lock here; nobody else touches the iova range */
1192 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1193 			   domain->pgd, 0, start_pfn, last_pfn);
1194 
1195 	/* free pgd */
1196 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1197 		free_pgtable_page(domain->pgd);
1198 		domain->pgd = NULL;
1199 	}
1200 }
1201 
1202 /* When a page at a given level is being unlinked from its parent, we don't
1203    need to *modify* it at all. All we need to do is make a list of all the
1204    pages which can be freed just as soon as we've flushed the IOTLB and we
1205    know the hardware page-walk will no longer touch them.
1206    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1207    be freed. */
dma_pte_list_pagetables(struct dmar_domain * domain,int level,struct dma_pte * pte,struct page * freelist)1208 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1209 					    int level, struct dma_pte *pte,
1210 					    struct page *freelist)
1211 {
1212 	struct page *pg;
1213 
1214 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1215 	pg->freelist = freelist;
1216 	freelist = pg;
1217 
1218 	if (level == 1)
1219 		return freelist;
1220 
1221 	pte = page_address(pg);
1222 	do {
1223 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1224 			freelist = dma_pte_list_pagetables(domain, level - 1,
1225 							   pte, freelist);
1226 		pte++;
1227 	} while (!first_pte_in_page(pte));
1228 
1229 	return freelist;
1230 }
1231 
dma_pte_clear_level(struct dmar_domain * domain,int level,struct dma_pte * pte,unsigned long pfn,unsigned long start_pfn,unsigned long last_pfn,struct page * freelist)1232 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1233 					struct dma_pte *pte, unsigned long pfn,
1234 					unsigned long start_pfn,
1235 					unsigned long last_pfn,
1236 					struct page *freelist)
1237 {
1238 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1239 
1240 	pfn = max(start_pfn, pfn);
1241 	pte = &pte[pfn_level_offset(pfn, level)];
1242 
1243 	do {
1244 		unsigned long level_pfn = pfn & level_mask(level);
1245 
1246 		if (!dma_pte_present(pte))
1247 			goto next;
1248 
1249 		/* If range covers entire pagetable, free it */
1250 		if (start_pfn <= level_pfn &&
1251 		    last_pfn >= level_pfn + level_size(level) - 1) {
1252 			/* These suborbinate page tables are going away entirely. Don't
1253 			   bother to clear them; we're just going to *free* them. */
1254 			if (level > 1 && !dma_pte_superpage(pte))
1255 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1256 
1257 			dma_clear_pte(pte);
1258 			if (!first_pte)
1259 				first_pte = pte;
1260 			last_pte = pte;
1261 		} else if (level > 1) {
1262 			/* Recurse down into a level that isn't *entirely* obsolete */
1263 			freelist = dma_pte_clear_level(domain, level - 1,
1264 						       phys_to_virt(dma_pte_addr(pte)),
1265 						       level_pfn, start_pfn, last_pfn,
1266 						       freelist);
1267 		}
1268 next:
1269 		pfn = level_pfn + level_size(level);
1270 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1271 
1272 	if (first_pte)
1273 		domain_flush_cache(domain, first_pte,
1274 				   (void *)++last_pte - (void *)first_pte);
1275 
1276 	return freelist;
1277 }
1278 
1279 /* We can't just free the pages because the IOMMU may still be walking
1280    the page tables, and may have cached the intermediate levels. The
1281    pages can only be freed after the IOTLB flush has been done. */
domain_unmap(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn,struct page * freelist)1282 static struct page *domain_unmap(struct dmar_domain *domain,
1283 				 unsigned long start_pfn,
1284 				 unsigned long last_pfn,
1285 				 struct page *freelist)
1286 {
1287 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1288 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1289 	BUG_ON(start_pfn > last_pfn);
1290 
1291 	/* we don't need lock here; nobody else touches the iova range */
1292 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1293 				       domain->pgd, 0, start_pfn, last_pfn,
1294 				       freelist);
1295 
1296 	/* free pgd */
1297 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1298 		struct page *pgd_page = virt_to_page(domain->pgd);
1299 		pgd_page->freelist = freelist;
1300 		freelist = pgd_page;
1301 
1302 		domain->pgd = NULL;
1303 	}
1304 
1305 	return freelist;
1306 }
1307 
dma_free_pagelist(struct page * freelist)1308 static void dma_free_pagelist(struct page *freelist)
1309 {
1310 	struct page *pg;
1311 
1312 	while ((pg = freelist)) {
1313 		freelist = pg->freelist;
1314 		free_pgtable_page(page_address(pg));
1315 	}
1316 }
1317 
1318 /* iommu handling */
iommu_alloc_root_entry(struct intel_iommu * iommu)1319 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1320 {
1321 	struct root_entry *root;
1322 	unsigned long flags;
1323 
1324 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1325 	if (!root) {
1326 		pr_err("Allocating root entry for %s failed\n",
1327 			iommu->name);
1328 		return -ENOMEM;
1329 	}
1330 
1331 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1332 
1333 	spin_lock_irqsave(&iommu->lock, flags);
1334 	iommu->root_entry = root;
1335 	spin_unlock_irqrestore(&iommu->lock, flags);
1336 
1337 	return 0;
1338 }
1339 
iommu_set_root_entry(struct intel_iommu * iommu)1340 static void iommu_set_root_entry(struct intel_iommu *iommu)
1341 {
1342 	u64 addr;
1343 	u32 sts;
1344 	unsigned long flag;
1345 
1346 	addr = virt_to_phys(iommu->root_entry);
1347 	if (sm_supported(iommu))
1348 		addr |= DMA_RTADDR_SMT;
1349 
1350 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1351 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1352 
1353 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1354 
1355 	/* Make sure hardware complete it */
1356 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1357 		      readl, (sts & DMA_GSTS_RTPS), sts);
1358 
1359 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1360 
1361 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1362 	if (sm_supported(iommu))
1363 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1364 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1365 }
1366 
iommu_flush_write_buffer(struct intel_iommu * iommu)1367 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1368 {
1369 	u32 val;
1370 	unsigned long flag;
1371 
1372 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1373 		return;
1374 
1375 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1376 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1377 
1378 	/* Make sure hardware complete it */
1379 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1380 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1381 
1382 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1383 }
1384 
1385 /* return value determine if we need a write buffer flush */
__iommu_flush_context(struct intel_iommu * iommu,u16 did,u16 source_id,u8 function_mask,u64 type)1386 static void __iommu_flush_context(struct intel_iommu *iommu,
1387 				  u16 did, u16 source_id, u8 function_mask,
1388 				  u64 type)
1389 {
1390 	u64 val = 0;
1391 	unsigned long flag;
1392 
1393 	switch (type) {
1394 	case DMA_CCMD_GLOBAL_INVL:
1395 		val = DMA_CCMD_GLOBAL_INVL;
1396 		break;
1397 	case DMA_CCMD_DOMAIN_INVL:
1398 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1399 		break;
1400 	case DMA_CCMD_DEVICE_INVL:
1401 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1402 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1403 		break;
1404 	default:
1405 		BUG();
1406 	}
1407 	val |= DMA_CCMD_ICC;
1408 
1409 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1410 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1411 
1412 	/* Make sure hardware complete it */
1413 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1414 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1415 
1416 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1417 }
1418 
1419 /* return value determine if we need a write buffer flush */
__iommu_flush_iotlb(struct intel_iommu * iommu,u16 did,u64 addr,unsigned int size_order,u64 type)1420 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1421 				u64 addr, unsigned int size_order, u64 type)
1422 {
1423 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1424 	u64 val = 0, val_iva = 0;
1425 	unsigned long flag;
1426 
1427 	switch (type) {
1428 	case DMA_TLB_GLOBAL_FLUSH:
1429 		/* global flush doesn't need set IVA_REG */
1430 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1431 		break;
1432 	case DMA_TLB_DSI_FLUSH:
1433 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1434 		break;
1435 	case DMA_TLB_PSI_FLUSH:
1436 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1437 		/* IH bit is passed in as part of address */
1438 		val_iva = size_order | addr;
1439 		break;
1440 	default:
1441 		BUG();
1442 	}
1443 	/* Note: set drain read/write */
1444 #if 0
1445 	/*
1446 	 * This is probably to be super secure.. Looks like we can
1447 	 * ignore it without any impact.
1448 	 */
1449 	if (cap_read_drain(iommu->cap))
1450 		val |= DMA_TLB_READ_DRAIN;
1451 #endif
1452 	if (cap_write_drain(iommu->cap))
1453 		val |= DMA_TLB_WRITE_DRAIN;
1454 
1455 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1456 	/* Note: Only uses first TLB reg currently */
1457 	if (val_iva)
1458 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1459 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1460 
1461 	/* Make sure hardware complete it */
1462 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1463 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1464 
1465 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1466 
1467 	/* check IOTLB invalidation granularity */
1468 	if (DMA_TLB_IAIG(val) == 0)
1469 		pr_err("Flush IOTLB failed\n");
1470 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1471 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1472 			(unsigned long long)DMA_TLB_IIRG(type),
1473 			(unsigned long long)DMA_TLB_IAIG(val));
1474 }
1475 
1476 static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1477 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1478 			 u8 bus, u8 devfn)
1479 {
1480 	struct device_domain_info *info;
1481 
1482 	assert_spin_locked(&device_domain_lock);
1483 
1484 	if (!iommu->qi)
1485 		return NULL;
1486 
1487 	list_for_each_entry(info, &domain->devices, link)
1488 		if (info->iommu == iommu && info->bus == bus &&
1489 		    info->devfn == devfn) {
1490 			if (info->ats_supported && info->dev)
1491 				return info;
1492 			break;
1493 		}
1494 
1495 	return NULL;
1496 }
1497 
domain_update_iotlb(struct dmar_domain * domain)1498 static void domain_update_iotlb(struct dmar_domain *domain)
1499 {
1500 	struct device_domain_info *info;
1501 	bool has_iotlb_device = false;
1502 
1503 	assert_spin_locked(&device_domain_lock);
1504 
1505 	list_for_each_entry(info, &domain->devices, link)
1506 		if (info->ats_enabled) {
1507 			has_iotlb_device = true;
1508 			break;
1509 		}
1510 
1511 	if (!has_iotlb_device) {
1512 		struct subdev_domain_info *sinfo;
1513 
1514 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1515 			info = get_domain_info(sinfo->pdev);
1516 			if (info && info->ats_enabled) {
1517 				has_iotlb_device = true;
1518 				break;
1519 			}
1520 		}
1521 	}
1522 
1523 	domain->has_iotlb_device = has_iotlb_device;
1524 }
1525 
iommu_enable_dev_iotlb(struct device_domain_info * info)1526 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1527 {
1528 	struct pci_dev *pdev;
1529 
1530 	assert_spin_locked(&device_domain_lock);
1531 
1532 	if (!info || !dev_is_pci(info->dev))
1533 		return;
1534 
1535 	pdev = to_pci_dev(info->dev);
1536 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1537 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1538 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1539 	 * reserved, which should be set to 0.
1540 	 */
1541 	if (!ecap_dit(info->iommu->ecap))
1542 		info->pfsid = 0;
1543 	else {
1544 		struct pci_dev *pf_pdev;
1545 
1546 		/* pdev will be returned if device is not a vf */
1547 		pf_pdev = pci_physfn(pdev);
1548 		info->pfsid = pci_dev_id(pf_pdev);
1549 	}
1550 
1551 #ifdef CONFIG_INTEL_IOMMU_SVM
1552 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1553 	   the device if you enable PASID support after ATS support is
1554 	   undefined. So always enable PASID support on devices which
1555 	   have it, even if we can't yet know if we're ever going to
1556 	   use it. */
1557 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1558 		info->pasid_enabled = 1;
1559 
1560 	if (info->pri_supported &&
1561 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1562 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1563 		info->pri_enabled = 1;
1564 #endif
1565 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1566 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1567 		info->ats_enabled = 1;
1568 		domain_update_iotlb(info->domain);
1569 		info->ats_qdep = pci_ats_queue_depth(pdev);
1570 	}
1571 }
1572 
iommu_disable_dev_iotlb(struct device_domain_info * info)1573 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1574 {
1575 	struct pci_dev *pdev;
1576 
1577 	assert_spin_locked(&device_domain_lock);
1578 
1579 	if (!dev_is_pci(info->dev))
1580 		return;
1581 
1582 	pdev = to_pci_dev(info->dev);
1583 
1584 	if (info->ats_enabled) {
1585 		pci_disable_ats(pdev);
1586 		info->ats_enabled = 0;
1587 		domain_update_iotlb(info->domain);
1588 	}
1589 #ifdef CONFIG_INTEL_IOMMU_SVM
1590 	if (info->pri_enabled) {
1591 		pci_disable_pri(pdev);
1592 		info->pri_enabled = 0;
1593 	}
1594 	if (info->pasid_enabled) {
1595 		pci_disable_pasid(pdev);
1596 		info->pasid_enabled = 0;
1597 	}
1598 #endif
1599 }
1600 
__iommu_flush_dev_iotlb(struct device_domain_info * info,u64 addr,unsigned int mask)1601 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1602 				    u64 addr, unsigned int mask)
1603 {
1604 	u16 sid, qdep;
1605 
1606 	if (!info || !info->ats_enabled)
1607 		return;
1608 
1609 	sid = info->bus << 8 | info->devfn;
1610 	qdep = info->ats_qdep;
1611 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1612 			   qdep, addr, mask);
1613 }
1614 
iommu_flush_dev_iotlb(struct dmar_domain * domain,u64 addr,unsigned mask)1615 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1616 				  u64 addr, unsigned mask)
1617 {
1618 	unsigned long flags;
1619 	struct device_domain_info *info;
1620 	struct subdev_domain_info *sinfo;
1621 
1622 	if (!domain->has_iotlb_device)
1623 		return;
1624 
1625 	spin_lock_irqsave(&device_domain_lock, flags);
1626 	list_for_each_entry(info, &domain->devices, link)
1627 		__iommu_flush_dev_iotlb(info, addr, mask);
1628 
1629 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1630 		info = get_domain_info(sinfo->pdev);
1631 		__iommu_flush_dev_iotlb(info, addr, mask);
1632 	}
1633 	spin_unlock_irqrestore(&device_domain_lock, flags);
1634 }
1635 
domain_flush_piotlb(struct intel_iommu * iommu,struct dmar_domain * domain,u64 addr,unsigned long npages,bool ih)1636 static void domain_flush_piotlb(struct intel_iommu *iommu,
1637 				struct dmar_domain *domain,
1638 				u64 addr, unsigned long npages, bool ih)
1639 {
1640 	u16 did = domain->iommu_did[iommu->seq_id];
1641 
1642 	if (domain->default_pasid)
1643 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1644 				addr, npages, ih);
1645 
1646 	if (!list_empty(&domain->devices))
1647 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1648 }
1649 
iommu_flush_iotlb_psi(struct intel_iommu * iommu,struct dmar_domain * domain,unsigned long pfn,unsigned int pages,int ih,int map)1650 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1651 				  struct dmar_domain *domain,
1652 				  unsigned long pfn, unsigned int pages,
1653 				  int ih, int map)
1654 {
1655 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1656 	unsigned int mask = ilog2(aligned_pages);
1657 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1658 	u16 did = domain->iommu_did[iommu->seq_id];
1659 
1660 	BUG_ON(pages == 0);
1661 
1662 	if (ih)
1663 		ih = 1 << 6;
1664 
1665 	if (domain_use_first_level(domain)) {
1666 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1667 	} else {
1668 		unsigned long bitmask = aligned_pages - 1;
1669 
1670 		/*
1671 		 * PSI masks the low order bits of the base address. If the
1672 		 * address isn't aligned to the mask, then compute a mask value
1673 		 * needed to ensure the target range is flushed.
1674 		 */
1675 		if (unlikely(bitmask & pfn)) {
1676 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1677 
1678 			/*
1679 			 * Since end_pfn <= pfn + bitmask, the only way bits
1680 			 * higher than bitmask can differ in pfn and end_pfn is
1681 			 * by carrying. This means after masking out bitmask,
1682 			 * high bits starting with the first set bit in
1683 			 * shared_bits are all equal in both pfn and end_pfn.
1684 			 */
1685 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1686 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1687 		}
1688 
1689 		/*
1690 		 * Fallback to domain selective flush if no PSI support or
1691 		 * the size is too big.
1692 		 */
1693 		if (!cap_pgsel_inv(iommu->cap) ||
1694 		    mask > cap_max_amask_val(iommu->cap))
1695 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1696 							DMA_TLB_DSI_FLUSH);
1697 		else
1698 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1699 							DMA_TLB_PSI_FLUSH);
1700 	}
1701 
1702 	/*
1703 	 * In caching mode, changes of pages from non-present to present require
1704 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1705 	 */
1706 	if (!cap_caching_mode(iommu->cap) || !map)
1707 		iommu_flush_dev_iotlb(domain, addr, mask);
1708 }
1709 
1710 /* Notification for newly created mappings */
1711 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1712 					struct dmar_domain *domain,
1713 					unsigned long pfn, unsigned int pages)
1714 {
1715 	/*
1716 	 * It's a non-present to present mapping. Only flush if caching mode
1717 	 * and second level.
1718 	 */
1719 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1720 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1721 	else
1722 		iommu_flush_write_buffer(iommu);
1723 }
1724 
1725 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1726 {
1727 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1728 	int idx;
1729 
1730 	for_each_domain_iommu(idx, dmar_domain) {
1731 		struct intel_iommu *iommu = g_iommus[idx];
1732 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1733 
1734 		if (domain_use_first_level(dmar_domain))
1735 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1736 		else
1737 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1738 						 DMA_TLB_DSI_FLUSH);
1739 
1740 		if (!cap_caching_mode(iommu->cap))
1741 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1742 					      0, MAX_AGAW_PFN_WIDTH);
1743 	}
1744 }
1745 
1746 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1747 {
1748 	u32 pmen;
1749 	unsigned long flags;
1750 
1751 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1752 		return;
1753 
1754 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1755 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1756 	pmen &= ~DMA_PMEN_EPM;
1757 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1758 
1759 	/* wait for the protected region status bit to clear */
1760 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1761 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1762 
1763 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1764 }
1765 
1766 static void iommu_enable_translation(struct intel_iommu *iommu)
1767 {
1768 	u32 sts;
1769 	unsigned long flags;
1770 
1771 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1772 	iommu->gcmd |= DMA_GCMD_TE;
1773 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1774 
1775 	/* Make sure the hardware completes it */
1776 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1777 		      readl, (sts & DMA_GSTS_TES), sts);
1778 
1779 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1780 }
1781 
1782 static void iommu_disable_translation(struct intel_iommu *iommu)
1783 {
1784 	u32 sts;
1785 	unsigned long flag;
1786 
1787 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1788 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1789 		return;
1790 
1791 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1792 	iommu->gcmd &= ~DMA_GCMD_TE;
1793 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1794 
1795 	/* Make sure the hardware completes it */
1796 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1797 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1798 
1799 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1800 }
1801 
1802 static int iommu_init_domains(struct intel_iommu *iommu)
1803 {
1804 	u32 ndomains, nlongs;
1805 	size_t size;
1806 
1807 	ndomains = cap_ndoms(iommu->cap);
1808 	pr_debug("%s: Number of Domains supported <%d>\n",
1809 		 iommu->name, ndomains);
1810 	nlongs = BITS_TO_LONGS(ndomains);
1811 
1812 	spin_lock_init(&iommu->lock);
1813 
1814 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1815 	if (!iommu->domain_ids)
1816 		return -ENOMEM;
1817 
1818 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1819 	iommu->domains = kzalloc(size, GFP_KERNEL);
1820 
1821 	if (iommu->domains) {
1822 		size = 256 * sizeof(struct dmar_domain *);
1823 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1824 	}
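	/*
	 * iommu->domains is a two-level array: each second-level chunk
	 * holds 256 domain pointers and only the first chunk is allocated
	 * up front. For example, ndomains = 65536 gives
	 * ALIGN(65536, 256) >> 8 = 256 first-level slots.
	 */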
1825 
1826 	if (!iommu->domains || !iommu->domains[0]) {
1827 		pr_err("%s: Allocating domain array failed\n",
1828 		       iommu->name);
1829 		kfree(iommu->domain_ids);
1830 		kfree(iommu->domains);
1831 		iommu->domain_ids = NULL;
1832 		iommu->domains    = NULL;
1833 		return -ENOMEM;
1834 	}
1835 
1836 	/*
1837 	 * If Caching mode is set, then invalid translations are tagged
1838 	 * with domain-id 0, hence we need to pre-allocate it. We also
1839 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1840 	 * make sure it is not used for a real domain.
1841 	 */
1842 	set_bit(0, iommu->domain_ids);
1843 
1844 	/*
1845 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1846 	 * entry for first-level or pass-through translation modes should
1847 	 * be programmed with a domain id different from those used for
1848 	 * second-level or nested translation. We reserve a domain id for
1849 	 * this purpose.
1850 	 */
1851 	if (sm_supported(iommu))
1852 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1853 
1854 	return 0;
1855 }
1856 
1857 static void disable_dmar_iommu(struct intel_iommu *iommu)
1858 {
1859 	struct device_domain_info *info, *tmp;
1860 	unsigned long flags;
1861 
1862 	if (!iommu->domains || !iommu->domain_ids)
1863 		return;
1864 
1865 	spin_lock_irqsave(&device_domain_lock, flags);
1866 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1867 		if (info->iommu != iommu)
1868 			continue;
1869 
1870 		if (!info->dev || !info->domain)
1871 			continue;
1872 
1873 		__dmar_remove_one_dev_info(info);
1874 	}
1875 	spin_unlock_irqrestore(&device_domain_lock, flags);
1876 
1877 	if (iommu->gcmd & DMA_GCMD_TE)
1878 		iommu_disable_translation(iommu);
1879 }
1880 
1881 static void free_dmar_iommu(struct intel_iommu *iommu)
1882 {
1883 	if ((iommu->domains) && (iommu->domain_ids)) {
1884 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1885 		int i;
1886 
1887 		for (i = 0; i < elems; i++)
1888 			kfree(iommu->domains[i]);
1889 		kfree(iommu->domains);
1890 		kfree(iommu->domain_ids);
1891 		iommu->domains = NULL;
1892 		iommu->domain_ids = NULL;
1893 	}
1894 
1895 	if (iommu->copied_tables) {
1896 		bitmap_free(iommu->copied_tables);
1897 		iommu->copied_tables = NULL;
1898 	}
1899 
1900 	g_iommus[iommu->seq_id] = NULL;
1901 
1902 	/* free context mapping */
1903 	free_context_table(iommu);
1904 
1905 #ifdef CONFIG_INTEL_IOMMU_SVM
1906 	if (pasid_supported(iommu)) {
1907 		if (ecap_prs(iommu->ecap))
1908 			intel_svm_finish_prq(iommu);
1909 	}
1910 	if (vccap_pasid(iommu->vccap))
1911 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1912 
1913 #endif
1914 }
1915 
1916 /*
1917  * Check and return whether first level is used by default for
1918  * DMA translation.
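 *
 * Illustrative summary: legacy mode always uses second level; if only one
 * of FL/SL is sane, that one is used; when both are available, first level
 * is used for everything except IOMMU_DOMAIN_UNMANAGED domains.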
1919  */
1920 static bool first_level_by_default(unsigned int type)
1921 {
1922 	/* Only SL is available in legacy mode */
1923 	if (!scalable_mode_support())
1924 		return false;
1925 
1926 	/* Only one level (either FL or SL) is available, just use it */
1927 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1928 		return intel_cap_flts_sanity();
1929 
1930 	/* Both levels are available, decide it based on domain type */
1931 	return type != IOMMU_DOMAIN_UNMANAGED;
1932 }
1933 
1934 static struct dmar_domain *alloc_domain(unsigned int type)
1935 {
1936 	struct dmar_domain *domain;
1937 
1938 	domain = alloc_domain_mem();
1939 	if (!domain)
1940 		return NULL;
1941 
1942 	memset(domain, 0, sizeof(*domain));
1943 	domain->nid = NUMA_NO_NODE;
1944 	if (first_level_by_default(type))
1945 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1946 	domain->has_iotlb_device = false;
1947 	INIT_LIST_HEAD(&domain->devices);
1948 	INIT_LIST_HEAD(&domain->subdevices);
1949 
1950 	return domain;
1951 }
1952 
1953 /* Must be called with iommu->lock */
1954 static int domain_attach_iommu(struct dmar_domain *domain,
1955 			       struct intel_iommu *iommu)
1956 {
1957 	unsigned long ndomains;
1958 	int num;
1959 
1960 	assert_spin_locked(&device_domain_lock);
1961 	assert_spin_locked(&iommu->lock);
1962 
1963 	domain->iommu_refcnt[iommu->seq_id] += 1;
1964 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1965 		ndomains = cap_ndoms(iommu->cap);
1966 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1967 
1968 		if (num >= ndomains) {
1969 			pr_err("%s: No free domain ids\n", iommu->name);
1970 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1971 			return -ENOSPC;
1972 		}
1973 
1974 		set_bit(num, iommu->domain_ids);
1975 		set_iommu_domain(iommu, num, domain);
1976 
1977 		domain->iommu_did[iommu->seq_id] = num;
1978 		domain->nid			 = iommu->node;
1979 
1980 		domain_update_iommu_cap(domain);
1981 	}
1982 
1983 	return 0;
1984 }
1985 
1986 static void domain_detach_iommu(struct dmar_domain *domain,
1987 				struct intel_iommu *iommu)
1988 {
1989 	int num;
1990 
1991 	assert_spin_locked(&device_domain_lock);
1992 	assert_spin_locked(&iommu->lock);
1993 
1994 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1995 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1996 		num = domain->iommu_did[iommu->seq_id];
1997 		clear_bit(num, iommu->domain_ids);
1998 		set_iommu_domain(iommu, num, NULL);
1999 
2000 		domain_update_iommu_cap(domain);
2001 		domain->iommu_did[iommu->seq_id] = 0;
2002 	}
2003 }
2004 
2005 static inline int guestwidth_to_adjustwidth(int gaw)
2006 {
2007 	int agaw;
2008 	int r = (gaw - 12) % 9;
2009 
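	/*
	 * For example (illustrative values): gaw = 39 or 48 gives r = 0 and
	 * agaw = gaw, while gaw = 40 gives r = 1 and agaw = 40 + 9 - 1 = 48,
	 * i.e. the width is rounded up to the next 9-bit page-table level.
	 */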
2010 	if (r == 0)
2011 		agaw = gaw;
2012 	else
2013 		agaw = gaw + 9 - r;
2014 	if (agaw > 64)
2015 		agaw = 64;
2016 	return agaw;
2017 }
2018 
2019 static void domain_exit(struct dmar_domain *domain)
2020 {
2021 
2022 	/* Remove associated devices and clear attached or cached domains */
2023 	domain_remove_dev_info(domain);
2024 
2025 	if (domain->pgd) {
2026 		struct page *freelist;
2027 
2028 		freelist = domain_unmap(domain, 0,
2029 					DOMAIN_MAX_PFN(domain->gaw), NULL);
2030 		dma_free_pagelist(freelist);
2031 	}
2032 
2033 	free_domain_mem(domain);
2034 }
2035 
2036 /*
2037  * Get the PASID directory size for scalable mode context entry.
2038  * Value of X in the PDTS field of a scalable mode context entry
2039  * indicates PASID directory with 2^(X + 7) entries.
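 *
 * For example (assuming PASID_PDE_SHIFT is 6): max_pasid = 0x10000 gives
 * max_pde = 0x400, whose lowest set bit is bit 10, so pds = 3 and the
 * directory has 2^(3 + 7) = 1024 entries.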
2040  */
2041 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2042 {
2043 	int pds, max_pde;
2044 
2045 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2046 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2047 	if (pds < 7)
2048 		return 0;
2049 
2050 	return pds - 7;
2051 }
2052 
2053 /*
2054  * Set the RID_PASID field of a scalable mode context entry. The
2055  * IOMMU hardware will use the PASID value set in this field for
2056  * DMA translations of DMA requests without PASID.
2057  */
2058 static inline void
2059 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2060 {
2061 	context->hi |= pasid & ((1 << 20) - 1);
2062 }
2063 
2064 /*
2065  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2066  * entry.
2067  */
2068 static inline void context_set_sm_dte(struct context_entry *context)
2069 {
2070 	context->lo |= (1 << 2);
2071 }
2072 
2073 /*
2074  * Set the PRE(Page Request Enable) field of a scalable mode context
2075  * entry.
2076  */
2077 static inline void context_set_sm_pre(struct context_entry *context)
2078 {
2079 	context->lo |= (1 << 4);
2080 }
2081 
2082 /* Convert value to context PASID directory size field coding. */
2083 #define context_pdts(pds)	(((pds) & 0x7) << 9)
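/* e.g. pds = 3 sets bits 11:9, encoding a 2^(3 + 7) = 1024-entry directory */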
2084 
2085 static int domain_context_mapping_one(struct dmar_domain *domain,
2086 				      struct intel_iommu *iommu,
2087 				      struct pasid_table *table,
2088 				      u8 bus, u8 devfn)
2089 {
2090 	u16 did = domain->iommu_did[iommu->seq_id];
2091 	int translation = CONTEXT_TT_MULTI_LEVEL;
2092 	struct device_domain_info *info = NULL;
2093 	struct context_entry *context;
2094 	unsigned long flags;
2095 	int ret;
2096 
2097 	WARN_ON(did == 0);
2098 
2099 	if (hw_pass_through && domain_type_is_si(domain))
2100 		translation = CONTEXT_TT_PASS_THROUGH;
2101 
2102 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2103 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2104 
2105 	BUG_ON(!domain->pgd);
2106 
2107 	spin_lock_irqsave(&device_domain_lock, flags);
2108 	spin_lock(&iommu->lock);
2109 
2110 	ret = -ENOMEM;
2111 	context = iommu_context_addr(iommu, bus, devfn, 1);
2112 	if (!context)
2113 		goto out_unlock;
2114 
2115 	ret = 0;
2116 	if (context_present(context) && !context_copied(iommu, bus, devfn))
2117 		goto out_unlock;
2118 
2119 	/*
2120 	 * For kdump cases, old valid entries may be cached due to the
2121 	 * in-flight DMA and copied pgtable, but there is no unmapping
2122 	 * behaviour for them, thus we need an explicit cache flush for
2123 	 * the newly-mapped device. For kdump, at this point, the device
2124 	 * is supposed to finish reset at its driver probe stage, so no
2125 	 * in-flight DMA will exist, and we don't need to worry about it
2126 	 * hereafter.
2127 	 */
2128 	if (context_copied(iommu, bus, devfn)) {
2129 		u16 did_old = context_domain_id(context);
2130 
2131 		if (did_old < cap_ndoms(iommu->cap)) {
2132 			iommu->flush.flush_context(iommu, did_old,
2133 						   (((u16)bus) << 8) | devfn,
2134 						   DMA_CCMD_MASK_NOBIT,
2135 						   DMA_CCMD_DEVICE_INVL);
2136 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2137 						 DMA_TLB_DSI_FLUSH);
2138 		}
2139 
2140 		clear_context_copied(iommu, bus, devfn);
2141 	}
2142 
2143 	context_clear_entry(context);
2144 
2145 	if (sm_supported(iommu)) {
2146 		unsigned long pds;
2147 
2148 		WARN_ON(!table);
2149 
2150 		/* Setup the PASID DIR pointer: */
2151 		pds = context_get_sm_pds(table);
2152 		context->lo = (u64)virt_to_phys(table->table) |
2153 				context_pdts(pds);
2154 
2155 		/* Setup the RID_PASID field: */
2156 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2157 
2158 		/*
2159 		 * Setup the Device-TLB enable bit and Page request
2160 		 * Enable bit:
2161 		 */
2162 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2163 		if (info && info->ats_supported)
2164 			context_set_sm_dte(context);
2165 		if (info && info->pri_supported)
2166 			context_set_sm_pre(context);
2167 	} else {
2168 		struct dma_pte *pgd = domain->pgd;
2169 		int agaw;
2170 
2171 		context_set_domain_id(context, did);
2172 
2173 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2174 			/*
2175 			 * Skip top levels of page tables for iommu which has
2176 			 * less agaw than default. Unnecessary for PT mode.
2177 			 */
2178 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2179 				ret = -ENOMEM;
2180 				pgd = phys_to_virt(dma_pte_addr(pgd));
2181 				if (!dma_pte_present(pgd))
2182 					goto out_unlock;
2183 			}
2184 
2185 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2186 			if (info && info->ats_supported)
2187 				translation = CONTEXT_TT_DEV_IOTLB;
2188 			else
2189 				translation = CONTEXT_TT_MULTI_LEVEL;
2190 
2191 			context_set_address_root(context, virt_to_phys(pgd));
2192 			context_set_address_width(context, agaw);
2193 		} else {
2194 			/*
2195 			 * In pass through mode, AW must be programmed to
2196 			 * indicate the largest AGAW value supported by
2197 			 * hardware. And ASR is ignored by hardware.
2198 			 */
2199 			context_set_address_width(context, iommu->msagaw);
2200 		}
2201 
2202 		context_set_translation_type(context, translation);
2203 	}
2204 
2205 	context_set_fault_enable(context);
2206 	context_set_present(context);
2207 	if (!ecap_coherent(iommu->ecap))
2208 		clflush_cache_range(context, sizeof(*context));
2209 
2210 	/*
2211 	 * It's a non-present to present mapping. If hardware doesn't cache
2212 	 * non-present entries, we only need to flush the write-buffer. If it
2213 	 * _does_ cache non-present entries, then it does so in the special
2214 	 * domain #0, which we have to flush:
2215 	 */
2216 	if (cap_caching_mode(iommu->cap)) {
2217 		iommu->flush.flush_context(iommu, 0,
2218 					   (((u16)bus) << 8) | devfn,
2219 					   DMA_CCMD_MASK_NOBIT,
2220 					   DMA_CCMD_DEVICE_INVL);
2221 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2222 	} else {
2223 		iommu_flush_write_buffer(iommu);
2224 	}
2225 	iommu_enable_dev_iotlb(info);
2226 
2227 	ret = 0;
2228 
2229 out_unlock:
2230 	spin_unlock(&iommu->lock);
2231 	spin_unlock_irqrestore(&device_domain_lock, flags);
2232 
2233 	return ret;
2234 }
2235 
2236 struct domain_context_mapping_data {
2237 	struct dmar_domain *domain;
2238 	struct intel_iommu *iommu;
2239 	struct pasid_table *table;
2240 };
2241 
2242 static int domain_context_mapping_cb(struct pci_dev *pdev,
2243 				     u16 alias, void *opaque)
2244 {
2245 	struct domain_context_mapping_data *data = opaque;
2246 
2247 	return domain_context_mapping_one(data->domain, data->iommu,
2248 					  data->table, PCI_BUS_NUM(alias),
2249 					  alias & 0xff);
2250 }
2251 
2252 static int
2253 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2254 {
2255 	struct domain_context_mapping_data data;
2256 	struct pasid_table *table;
2257 	struct intel_iommu *iommu;
2258 	u8 bus, devfn;
2259 
2260 	iommu = device_to_iommu(dev, &bus, &devfn);
2261 	if (!iommu)
2262 		return -ENODEV;
2263 
2264 	table = intel_pasid_get_table(dev);
2265 
2266 	if (!dev_is_pci(dev))
2267 		return domain_context_mapping_one(domain, iommu, table,
2268 						  bus, devfn);
2269 
2270 	data.domain = domain;
2271 	data.iommu = iommu;
2272 	data.table = table;
2273 
2274 	return pci_for_each_dma_alias(to_pci_dev(dev),
2275 				      &domain_context_mapping_cb, &data);
2276 }
2277 
2278 static int domain_context_mapped_cb(struct pci_dev *pdev,
2279 				    u16 alias, void *opaque)
2280 {
2281 	struct intel_iommu *iommu = opaque;
2282 
2283 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2284 }
2285 
2286 static int domain_context_mapped(struct device *dev)
2287 {
2288 	struct intel_iommu *iommu;
2289 	u8 bus, devfn;
2290 
2291 	iommu = device_to_iommu(dev, &bus, &devfn);
2292 	if (!iommu)
2293 		return -ENODEV;
2294 
2295 	if (!dev_is_pci(dev))
2296 		return device_context_mapped(iommu, bus, devfn);
2297 
2298 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2299 				       domain_context_mapped_cb, iommu);
2300 }
2301 
2302 /* Returns a number of VTD pages, but aligned to MM page size */
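/* For example (assuming 4KiB MM and VT-d pages): host_addr = 0x1234 and
   size = 0x2000 keep offset 0x234, round 0x2234 up to 0x3000 and thus
   return 3 VT-d pages. */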
2303 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2304 					    size_t size)
2305 {
2306 	host_addr &= ~PAGE_MASK;
2307 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2308 }
2309 
2310 /* Return largest possible superpage level for a given mapping */
2311 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2312 					  unsigned long iov_pfn,
2313 					  unsigned long phy_pfn,
2314 					  unsigned long pages)
2315 {
2316 	int support, level = 1;
2317 	unsigned long pfnmerge;
2318 
2319 	support = domain->iommu_superpage;
2320 
2321 	/* To use a large page, the virtual *and* physical addresses
2322 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2323 	   of them will mean we have to use smaller pages. So just
2324 	   merge them and check both at once. */
2325 	pfnmerge = iov_pfn | phy_pfn;
2326 
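	/*
	 * For example (illustrative values): with iommu_superpage >= 1,
	 * iov_pfn and phy_pfn both having their low 9 bits clear (2MiB
	 * aligned) and pages >= 512, the loop below returns level 2,
	 * i.e. a 2MiB superpage can be used.
	 */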
2327 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2328 		pages >>= VTD_STRIDE_SHIFT;
2329 		if (!pages)
2330 			break;
2331 		pfnmerge >>= VTD_STRIDE_SHIFT;
2332 		level++;
2333 		support--;
2334 	}
2335 	return level;
2336 }
2337 
2338 /*
2339  * Ensure that old small page tables are removed to make room for superpage(s).
2340  * We're going to add new large pages, so make sure we don't remove their parent
2341  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2342  */
2343 static void switch_to_super_page(struct dmar_domain *domain,
2344 				 unsigned long start_pfn,
2345 				 unsigned long end_pfn, int level)
2346 {
2347 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2348 	struct dma_pte *pte = NULL;
2349 	int i;
2350 
2351 	while (start_pfn <= end_pfn) {
2352 		if (!pte)
2353 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2354 
2355 		if (dma_pte_present(pte)) {
2356 			dma_pte_free_pagetable(domain, start_pfn,
2357 					       start_pfn + lvl_pages - 1,
2358 					       level + 1);
2359 
2360 			for_each_domain_iommu(i, domain)
2361 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2362 						      start_pfn, lvl_pages,
2363 						      0, 0);
2364 		}
2365 
2366 		pte++;
2367 		start_pfn += lvl_pages;
2368 		if (first_pte_in_page(pte))
2369 			pte = NULL;
2370 	}
2371 }
2372 
2373 static int
2374 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2375 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2376 {
2377 	struct dma_pte *first_pte = NULL, *pte = NULL;
2378 	unsigned int largepage_lvl = 0;
2379 	unsigned long lvl_pages = 0;
2380 	phys_addr_t pteval;
2381 	u64 attr;
2382 
2383 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2384 
2385 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2386 		return -EINVAL;
2387 
2388 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2389 	attr |= DMA_FL_PTE_PRESENT;
2390 	if (domain_use_first_level(domain)) {
2391 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2392 		if (prot & DMA_PTE_WRITE)
2393 			attr |= DMA_FL_PTE_DIRTY;
2394 	}
2395 
2396 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2397 
2398 	while (nr_pages > 0) {
2399 		uint64_t tmp;
2400 
2401 		if (!pte) {
2402 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2403 					phys_pfn, nr_pages);
2404 
2405 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2406 			if (!pte)
2407 				return -ENOMEM;
2408 			first_pte = pte;
2409 
2410 			/* It is a large page */
2411 			if (largepage_lvl > 1) {
2412 				unsigned long end_pfn;
2413 
2414 				pteval |= DMA_PTE_LARGE_PAGE;
2415 				end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2416 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2417 			} else {
2418 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2419 			}
2420 
2421 		}
2422 		/* We don't need lock here, nobody else
2423 		 * touches the iova range
2424 		 */
2425 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2426 		if (tmp) {
2427 			static int dumps = 5;
2428 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2429 				iov_pfn, tmp, (unsigned long long)pteval);
2430 			if (dumps) {
2431 				dumps--;
2432 				debug_dma_dump_mappings(NULL);
2433 			}
2434 			WARN_ON(1);
2435 		}
2436 
2437 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2438 
2439 		BUG_ON(nr_pages < lvl_pages);
2440 
2441 		nr_pages -= lvl_pages;
2442 		iov_pfn += lvl_pages;
2443 		phys_pfn += lvl_pages;
2444 		pteval += lvl_pages * VTD_PAGE_SIZE;
2445 
2446 		/* If the next PTE would be the first in a new page, then we
2447 		 * need to flush the cache on the entries we've just written.
2448 		 * And then we'll need to recalculate 'pte', so clear it and
2449 		 * let it get set again in the if (!pte) block above.
2450 		 *
2451 		 * If we're done (!nr_pages) we need to flush the cache too.
2452 		 *
2453 		 * Also if we've been setting superpages, we may need to
2454 		 * recalculate 'pte' and switch back to smaller pages for the
2455 		 * end of the mapping, if the trailing size is not enough to
2456 		 * use another superpage (i.e. nr_pages < lvl_pages).
2457 		 */
2458 		pte++;
2459 		if (!nr_pages || first_pte_in_page(pte) ||
2460 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2461 			domain_flush_cache(domain, first_pte,
2462 					   (void *)pte - (void *)first_pte);
2463 			pte = NULL;
2464 		}
2465 	}
2466 
2467 	return 0;
2468 }
2469 
2470 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2471 {
2472 	struct intel_iommu *iommu = info->iommu;
2473 	struct context_entry *context;
2474 	unsigned long flags;
2475 	u16 did_old;
2476 
2477 	if (!iommu)
2478 		return;
2479 
2480 	spin_lock_irqsave(&iommu->lock, flags);
2481 	context = iommu_context_addr(iommu, bus, devfn, 0);
2482 	if (!context) {
2483 		spin_unlock_irqrestore(&iommu->lock, flags);
2484 		return;
2485 	}
2486 
2487 	if (sm_supported(iommu)) {
2488 		if (hw_pass_through && domain_type_is_si(info->domain))
2489 			did_old = FLPT_DEFAULT_DID;
2490 		else
2491 			did_old = info->domain->iommu_did[iommu->seq_id];
2492 	} else {
2493 		did_old = context_domain_id(context);
2494 	}
2495 
2496 	context_clear_entry(context);
2497 	__iommu_flush_cache(iommu, context, sizeof(*context));
2498 	spin_unlock_irqrestore(&iommu->lock, flags);
2499 	iommu->flush.flush_context(iommu,
2500 				   did_old,
2501 				   (((u16)bus) << 8) | devfn,
2502 				   DMA_CCMD_MASK_NOBIT,
2503 				   DMA_CCMD_DEVICE_INVL);
2504 
2505 	if (sm_supported(iommu))
2506 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2507 
2508 	iommu->flush.flush_iotlb(iommu,
2509 				 did_old,
2510 				 0,
2511 				 0,
2512 				 DMA_TLB_DSI_FLUSH);
2513 
2514 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2515 }
2516 
2517 static inline void unlink_domain_info(struct device_domain_info *info)
2518 {
2519 	assert_spin_locked(&device_domain_lock);
2520 	list_del(&info->link);
2521 	list_del(&info->global);
2522 	if (info->dev)
2523 		dev_iommu_priv_set(info->dev, NULL);
2524 }
2525 
2526 static void domain_remove_dev_info(struct dmar_domain *domain)
2527 {
2528 	struct device_domain_info *info, *tmp;
2529 	unsigned long flags;
2530 
2531 	spin_lock_irqsave(&device_domain_lock, flags);
2532 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2533 		__dmar_remove_one_dev_info(info);
2534 	spin_unlock_irqrestore(&device_domain_lock, flags);
2535 }
2536 
2537 struct dmar_domain *find_domain(struct device *dev)
2538 {
2539 	struct device_domain_info *info;
2540 
2541 	if (unlikely(!dev || !dev->iommu))
2542 		return NULL;
2543 
2544 	if (unlikely(attach_deferred(dev)))
2545 		return NULL;
2546 
2547 	/* No lock here, assumes no domain exit in normal case */
2548 	info = get_domain_info(dev);
2549 	if (likely(info))
2550 		return info->domain;
2551 
2552 	return NULL;
2553 }
2554 
2555 static inline struct device_domain_info *
2556 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2557 {
2558 	struct device_domain_info *info;
2559 
2560 	list_for_each_entry(info, &device_domain_list, global)
2561 		if (info->segment == segment && info->bus == bus &&
2562 		    info->devfn == devfn)
2563 			return info;
2564 
2565 	return NULL;
2566 }
2567 
2568 static int domain_setup_first_level(struct intel_iommu *iommu,
2569 				    struct dmar_domain *domain,
2570 				    struct device *dev,
2571 				    u32 pasid)
2572 {
2573 	struct dma_pte *pgd = domain->pgd;
2574 	int agaw, level;
2575 	int flags = 0;
2576 
2577 	/*
2578 	 * Skip top levels of page tables for iommu which has
2579 	 * less agaw than default. Unnecessary for PT mode.
2580 	 */
2581 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2582 		pgd = phys_to_virt(dma_pte_addr(pgd));
2583 		if (!dma_pte_present(pgd))
2584 			return -ENOMEM;
2585 	}
2586 
2587 	level = agaw_to_level(agaw);
2588 	if (level != 4 && level != 5)
2589 		return -EINVAL;
2590 
2591 	if (pasid != PASID_RID2PASID)
2592 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2593 	if (level == 5)
2594 		flags |= PASID_FLAG_FL5LP;
2595 
2596 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2597 		flags |= PASID_FLAG_PAGE_SNOOP;
2598 
2599 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2600 					     domain->iommu_did[iommu->seq_id],
2601 					     flags);
2602 }
2603 
2604 static bool dev_is_real_dma_subdevice(struct device *dev)
2605 {
2606 	return dev && dev_is_pci(dev) &&
2607 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2608 }
2609 
2610 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2611 						    int bus, int devfn,
2612 						    struct device *dev,
2613 						    struct dmar_domain *domain)
2614 {
2615 	struct dmar_domain *found = NULL;
2616 	struct device_domain_info *info;
2617 	unsigned long flags;
2618 	int ret;
2619 
2620 	info = alloc_devinfo_mem();
2621 	if (!info)
2622 		return NULL;
2623 
2624 	if (!dev_is_real_dma_subdevice(dev)) {
2625 		info->bus = bus;
2626 		info->devfn = devfn;
2627 		info->segment = iommu->segment;
2628 	} else {
2629 		struct pci_dev *pdev = to_pci_dev(dev);
2630 
2631 		info->bus = pdev->bus->number;
2632 		info->devfn = pdev->devfn;
2633 		info->segment = pci_domain_nr(pdev->bus);
2634 	}
2635 
2636 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2637 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2638 	info->ats_qdep = 0;
2639 	info->dev = dev;
2640 	info->domain = domain;
2641 	info->iommu = iommu;
2642 	info->pasid_table = NULL;
2643 	info->auxd_enabled = 0;
2644 	INIT_LIST_HEAD(&info->subdevices);
2645 
2646 	if (dev && dev_is_pci(dev)) {
2647 		struct pci_dev *pdev = to_pci_dev(info->dev);
2648 
2649 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2650 		    pci_ats_supported(pdev) &&
2651 		    dmar_find_matched_atsr_unit(pdev))
2652 			info->ats_supported = 1;
2653 
2654 		if (sm_supported(iommu)) {
2655 			if (pasid_supported(iommu)) {
2656 				int features = pci_pasid_features(pdev);
2657 				if (features >= 0)
2658 					info->pasid_supported = features | 1;
2659 			}
2660 
2661 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2662 			    pci_pri_supported(pdev))
2663 				info->pri_supported = 1;
2664 		}
2665 	}
2666 
2667 	spin_lock_irqsave(&device_domain_lock, flags);
2668 	if (dev)
2669 		found = find_domain(dev);
2670 
2671 	if (!found) {
2672 		struct device_domain_info *info2;
2673 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2674 						       info->devfn);
2675 		if (info2) {
2676 			found      = info2->domain;
2677 			info2->dev = dev;
2678 		}
2679 	}
2680 
2681 	if (found) {
2682 		spin_unlock_irqrestore(&device_domain_lock, flags);
2683 		free_devinfo_mem(info);
2684 		/* Caller must free the original domain */
2685 		return found;
2686 	}
2687 
2688 	spin_lock(&iommu->lock);
2689 	ret = domain_attach_iommu(domain, iommu);
2690 	spin_unlock(&iommu->lock);
2691 
2692 	if (ret) {
2693 		spin_unlock_irqrestore(&device_domain_lock, flags);
2694 		free_devinfo_mem(info);
2695 		return NULL;
2696 	}
2697 
2698 	list_add(&info->link, &domain->devices);
2699 	list_add(&info->global, &device_domain_list);
2700 	if (dev)
2701 		dev_iommu_priv_set(dev, info);
2702 	spin_unlock_irqrestore(&device_domain_lock, flags);
2703 
2704 	/* PASID table is mandatory for a PCI device in scalable mode. */
2705 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2706 		ret = intel_pasid_alloc_table(dev);
2707 		if (ret) {
2708 			dev_err(dev, "PASID table allocation failed\n");
2709 			dmar_remove_one_dev_info(dev);
2710 			return NULL;
2711 		}
2712 
2713 		/* Setup the PASID entry for requests without PASID: */
2714 		spin_lock_irqsave(&iommu->lock, flags);
2715 		if (hw_pass_through && domain_type_is_si(domain))
2716 			ret = intel_pasid_setup_pass_through(iommu, domain,
2717 					dev, PASID_RID2PASID);
2718 		else if (domain_use_first_level(domain))
2719 			ret = domain_setup_first_level(iommu, domain, dev,
2720 					PASID_RID2PASID);
2721 		else
2722 			ret = intel_pasid_setup_second_level(iommu, domain,
2723 					dev, PASID_RID2PASID);
2724 		spin_unlock_irqrestore(&iommu->lock, flags);
2725 		if (ret) {
2726 			dev_err(dev, "Setup RID2PASID failed\n");
2727 			dmar_remove_one_dev_info(dev);
2728 			return NULL;
2729 		}
2730 	}
2731 
2732 	if (dev && domain_context_mapping(domain, dev)) {
2733 		dev_err(dev, "Domain context map failed\n");
2734 		dmar_remove_one_dev_info(dev);
2735 		return NULL;
2736 	}
2737 
2738 	return domain;
2739 }
2740 
2741 static int iommu_domain_identity_map(struct dmar_domain *domain,
2742 				     unsigned long first_vpfn,
2743 				     unsigned long last_vpfn)
2744 {
2745 	/*
2746 	 * RMRR range might have overlap with physical memory range,
2747 	 * clear it first
2748 	 */
2749 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2750 
2751 	return __domain_mapping(domain, first_vpfn,
2752 				first_vpfn, last_vpfn - first_vpfn + 1,
2753 				DMA_PTE_READ|DMA_PTE_WRITE);
2754 }
2755 
2756 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2757 
2758 static int __init si_domain_init(int hw)
2759 {
2760 	struct dmar_rmrr_unit *rmrr;
2761 	struct device *dev;
2762 	int i, nid, ret;
2763 
2764 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2765 	if (!si_domain)
2766 		return -EFAULT;
2767 
2768 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2769 		domain_exit(si_domain);
2770 		si_domain = NULL;
2771 		return -EFAULT;
2772 	}
2773 
2774 	if (hw)
2775 		return 0;
2776 
2777 	for_each_online_node(nid) {
2778 		unsigned long start_pfn, end_pfn;
2779 		int i;
2780 
2781 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2782 			ret = iommu_domain_identity_map(si_domain,
2783 					mm_to_dma_pfn(start_pfn),
2784 					mm_to_dma_pfn(end_pfn));
2785 			if (ret)
2786 				return ret;
2787 		}
2788 	}
2789 
2790 	/*
2791 	 * Identity map the RMRRs so that devices with RMRRs could also use
2792 	 * the si_domain.
2793 	 */
2794 	for_each_rmrr_units(rmrr) {
2795 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2796 					  i, dev) {
2797 			unsigned long long start = rmrr->base_address;
2798 			unsigned long long end = rmrr->end_address;
2799 
2800 			if (WARN_ON(end < start ||
2801 				    end >> agaw_to_width(si_domain->agaw)))
2802 				continue;
2803 
2804 			ret = iommu_domain_identity_map(si_domain,
2805 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2806 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2807 			if (ret)
2808 				return ret;
2809 		}
2810 	}
2811 
2812 	return 0;
2813 }
2814 
2815 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2816 {
2817 	struct dmar_domain *ndomain;
2818 	struct intel_iommu *iommu;
2819 	u8 bus, devfn;
2820 
2821 	iommu = device_to_iommu(dev, &bus, &devfn);
2822 	if (!iommu)
2823 		return -ENODEV;
2824 
2825 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2826 	if (ndomain != domain)
2827 		return -EBUSY;
2828 
2829 	return 0;
2830 }
2831 
2832 static bool device_has_rmrr(struct device *dev)
2833 {
2834 	struct dmar_rmrr_unit *rmrr;
2835 	struct device *tmp;
2836 	int i;
2837 
2838 	rcu_read_lock();
2839 	for_each_rmrr_units(rmrr) {
2840 		/*
2841 		 * Return TRUE if this RMRR contains the device that
2842 		 * is passed in.
2843 		 */
2844 		for_each_active_dev_scope(rmrr->devices,
2845 					  rmrr->devices_cnt, i, tmp)
2846 			if (tmp == dev ||
2847 			    is_downstream_to_pci_bridge(dev, tmp)) {
2848 				rcu_read_unlock();
2849 				return true;
2850 			}
2851 	}
2852 	rcu_read_unlock();
2853 	return false;
2854 }
2855 
2856 /**
2857  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2858  * is relaxable (i.e. it is allowed to not be enforced under some conditions)
2859  * @dev: device handle
2860  *
2861  * We assume that PCI USB devices with RMRRs have them largely
2862  * for historical reasons and that the RMRR space is not actively used post
2863  * boot.  This exclusion may change if vendors begin to abuse it.
2864  *
2865  * The same exception is made for graphics devices, with the requirement that
2866  * any use of the RMRR regions will be torn down before assigning the device
2867  * to a guest.
2868  *
2869  * Return: true if the RMRR is relaxable, false otherwise
2870  */
2871 static bool device_rmrr_is_relaxable(struct device *dev)
2872 {
2873 	struct pci_dev *pdev;
2874 
2875 	if (!dev_is_pci(dev))
2876 		return false;
2877 
2878 	pdev = to_pci_dev(dev);
2879 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2880 		return true;
2881 	else
2882 		return false;
2883 }
2884 
2885 /*
2886  * There are a couple of cases where we need to restrict the functionality of
2887  * devices associated with RMRRs.  The first is when evaluating a device for
2888  * identity mapping because problems exist when devices are moved in and out
2889  * of domains and their respective RMRR information is lost.  This means that
2890  * a device with associated RMRRs will never be in a "passthrough" domain.
2891  * The second is use of the device through the IOMMU API.  This interface
2892  * expects to have full control of the IOVA space for the device.  We cannot
2893  * satisfy both the requirement that RMRR access is maintained and have an
2894  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2895  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2896  * We therefore prevent devices associated with an RMRR from participating in
2897  * the IOMMU API, which eliminates them from device assignment.
2898  *
2899  * In both cases, devices which have relaxable RMRRs are not concerned by this
2900  * restriction. See device_rmrr_is_relaxable comment.
2901  */
2902 static bool device_is_rmrr_locked(struct device *dev)
2903 {
2904 	if (!device_has_rmrr(dev))
2905 		return false;
2906 
2907 	if (device_rmrr_is_relaxable(dev))
2908 		return false;
2909 
2910 	return true;
2911 }
2912 
2913 /*
2914  * Return the required default domain type for a specific device.
2915  *
2916  * @dev: the device in query
2918  *
2919  * Returns:
2920  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2921  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2922  *  - 0: both identity and dynamic domains work for this device
2923  */
2924 static int device_def_domain_type(struct device *dev)
2925 {
2926 	if (dev_is_pci(dev)) {
2927 		struct pci_dev *pdev = to_pci_dev(dev);
2928 
2929 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2930 			return IOMMU_DOMAIN_IDENTITY;
2931 
2932 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2933 			return IOMMU_DOMAIN_IDENTITY;
2934 	}
2935 
2936 	return 0;
2937 }
2938 
2939 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2940 {
2941 	/*
2942 	 * Start from a sane IOMMU hardware state.
2943 	 * If the queued invalidation is already initialized by us
2944 	 * (for example, while enabling interrupt-remapping) then
2945 	 * we got the things already rolling from a sane state.
2946 	 */
2947 	if (!iommu->qi) {
2948 		/*
2949 		 * Clear any previous faults.
2950 		 */
2951 		dmar_fault(-1, iommu);
2952 		/*
2953 		 * Disable queued invalidation if supported and already enabled
2954 		 * before OS handover.
2955 		 */
2956 		dmar_disable_qi(iommu);
2957 	}
2958 
2959 	if (dmar_enable_qi(iommu)) {
2960 		/*
2961 		 * Queued Invalidate not enabled, use Register Based Invalidate
2962 		 */
2963 		iommu->flush.flush_context = __iommu_flush_context;
2964 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2965 		pr_info("%s: Using Register based invalidation\n",
2966 			iommu->name);
2967 	} else {
2968 		iommu->flush.flush_context = qi_flush_context;
2969 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2970 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2971 	}
2972 }
2973 
2974 static int copy_context_table(struct intel_iommu *iommu,
2975 			      struct root_entry *old_re,
2976 			      struct context_entry **tbl,
2977 			      int bus, bool ext)
2978 {
2979 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2980 	struct context_entry *new_ce = NULL, ce;
2981 	struct context_entry *old_ce = NULL;
2982 	struct root_entry re;
2983 	phys_addr_t old_ce_phys;
2984 
2985 	tbl_idx = ext ? bus * 2 : bus;
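	/*
	 * In extended/scalable mode each bus uses two context tables
	 * (devfns 0x00-0x7f in the lower one, 0x80-0xff in the upper one),
	 * hence tbl_idx = bus * 2; "pos" selects the upper slot once the
	 * lower table has been saved.
	 */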
2986 	memcpy(&re, old_re, sizeof(re));
2987 
2988 	for (devfn = 0; devfn < 256; devfn++) {
2989 		/* First calculate the correct index */
2990 		idx = (ext ? devfn * 2 : devfn) % 256;
2991 
2992 		if (idx == 0) {
2993 			/* First save what we may have and clean up */
2994 			if (new_ce) {
2995 				tbl[tbl_idx] = new_ce;
2996 				__iommu_flush_cache(iommu, new_ce,
2997 						    VTD_PAGE_SIZE);
2998 				pos = 1;
2999 			}
3000 
3001 			if (old_ce)
3002 				memunmap(old_ce);
3003 
3004 			ret = 0;
3005 			if (devfn < 0x80)
3006 				old_ce_phys = root_entry_lctp(&re);
3007 			else
3008 				old_ce_phys = root_entry_uctp(&re);
3009 
3010 			if (!old_ce_phys) {
3011 				if (ext && devfn == 0) {
3012 					/* No LCTP, try UCTP */
3013 					devfn = 0x7f;
3014 					continue;
3015 				} else {
3016 					goto out;
3017 				}
3018 			}
3019 
3020 			ret = -ENOMEM;
3021 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3022 					MEMREMAP_WB);
3023 			if (!old_ce)
3024 				goto out;
3025 
3026 			new_ce = alloc_pgtable_page(iommu->node);
3027 			if (!new_ce)
3028 				goto out_unmap;
3029 
3030 			ret = 0;
3031 		}
3032 
3033 		/* Now copy the context entry */
3034 		memcpy(&ce, old_ce + idx, sizeof(ce));
3035 
3036 		if (!context_present(&ce))
3037 			continue;
3038 
3039 		did = context_domain_id(&ce);
3040 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3041 			set_bit(did, iommu->domain_ids);
3042 
3043 		set_context_copied(iommu, bus, devfn);
3044 		new_ce[idx] = ce;
3045 	}
3046 
3047 	tbl[tbl_idx + pos] = new_ce;
3048 
3049 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3050 
3051 out_unmap:
3052 	memunmap(old_ce);
3053 
3054 out:
3055 	return ret;
3056 }
3057 
3058 static int copy_translation_tables(struct intel_iommu *iommu)
3059 {
3060 	struct context_entry **ctxt_tbls;
3061 	struct root_entry *old_rt;
3062 	phys_addr_t old_rt_phys;
3063 	int ctxt_table_entries;
3064 	unsigned long flags;
3065 	u64 rtaddr_reg;
3066 	int bus, ret;
3067 	bool new_ext, ext;
3068 
3069 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3070 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
3071 	new_ext    = !!sm_supported(iommu);
3072 
3073 	/*
3074 	 * The RTT bit can only be changed when translation is disabled,
3075 	 * but disabling translation means to open a window for data
3076 	 * corruption. So bail out and don't copy anything if we would
3077 	 * have to change the bit.
3078 	 */
3079 	if (new_ext != ext)
3080 		return -EINVAL;
3081 
3082 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
3083 	if (!iommu->copied_tables)
3084 		return -ENOMEM;
3085 
3086 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3087 	if (!old_rt_phys)
3088 		return -EINVAL;
3089 
3090 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3091 	if (!old_rt)
3092 		return -ENOMEM;
3093 
3094 	/* This is too big for the stack - allocate it from slab */
3095 	ctxt_table_entries = ext ? 512 : 256;
3096 	ret = -ENOMEM;
3097 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3098 	if (!ctxt_tbls)
3099 		goto out_unmap;
3100 
3101 	for (bus = 0; bus < 256; bus++) {
3102 		ret = copy_context_table(iommu, &old_rt[bus],
3103 					 ctxt_tbls, bus, ext);
3104 		if (ret) {
3105 			pr_err("%s: Failed to copy context table for bus %d\n",
3106 				iommu->name, bus);
3107 			continue;
3108 		}
3109 	}
3110 
3111 	spin_lock_irqsave(&iommu->lock, flags);
3112 
3113 	/* Context tables are copied, now write them to the root_entry table */
3114 	for (bus = 0; bus < 256; bus++) {
3115 		int idx = ext ? bus * 2 : bus;
3116 		u64 val;
3117 
3118 		if (ctxt_tbls[idx]) {
3119 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3120 			iommu->root_entry[bus].lo = val;
3121 		}
3122 
3123 		if (!ext || !ctxt_tbls[idx + 1])
3124 			continue;
3125 
3126 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3127 		iommu->root_entry[bus].hi = val;
3128 	}
3129 
3130 	spin_unlock_irqrestore(&iommu->lock, flags);
3131 
3132 	kfree(ctxt_tbls);
3133 
3134 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3135 
3136 	ret = 0;
3137 
3138 out_unmap:
3139 	memunmap(old_rt);
3140 
3141 	return ret;
3142 }
3143 
3144 #ifdef CONFIG_INTEL_IOMMU_SVM
3145 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3146 {
3147 	struct intel_iommu *iommu = data;
3148 	ioasid_t ioasid;
3149 
3150 	if (!iommu)
3151 		return INVALID_IOASID;
3152 	/*
3153 	 * VT-d virtual command interface always uses the full 20 bit
3154 	 * PASID range. The host can partition the guest PASID range based on
3155 	 * its policies, but this is out of the guest's control.
3156 	 */
3157 	if (min < PASID_MIN || max > intel_pasid_max_id)
3158 		return INVALID_IOASID;
3159 
3160 	if (vcmd_alloc_pasid(iommu, &ioasid))
3161 		return INVALID_IOASID;
3162 
3163 	return ioasid;
3164 }
3165 
3166 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3167 {
3168 	struct intel_iommu *iommu = data;
3169 
3170 	if (!iommu)
3171 		return;
3172 	/*
3173 	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3174 	 * We can only free the PASID when all the devices are unbound.
3175 	 */
3176 	if (ioasid_find(NULL, ioasid, NULL)) {
3177 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3178 		return;
3179 	}
3180 	vcmd_free_pasid(iommu, ioasid);
3181 }
3182 
3183 static void register_pasid_allocator(struct intel_iommu *iommu)
3184 {
3185 	/*
3186 	 * If we are running in the host, there is no need for a custom
3187 	 * allocator, since PASIDs are allocated host system-wide.
3188 	 */
3189 	if (!cap_caching_mode(iommu->cap))
3190 		return;
3191 
3192 	if (!sm_supported(iommu)) {
3193 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3194 		return;
3195 	}
3196 
3197 	/*
3198 	 * Register a custom PASID allocator if we are running in a guest,
3199 	 * where guest PASIDs must be obtained via the virtual command interface.
3200 	 * There can be multiple vIOMMUs in each guest but only one allocator
3201 	 * is active. All vIOMMU allocators will eventually be calling the same
3202 	 * host allocator.
3203 	 */
3204 	if (!vccap_pasid(iommu->vccap))
3205 		return;
3206 
3207 	pr_info("Register custom PASID allocator\n");
3208 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3209 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3210 	iommu->pasid_allocator.pdata = (void *)iommu;
3211 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3212 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3213 		/*
3214 		 * Disable scalable mode on this IOMMU if there
3215 		 * is no custom allocator. Mixing SM capable vIOMMU
3216 		 * and non-SM vIOMMU is not supported.
3217 		 */
3218 		intel_iommu_sm = 0;
3219 	}
3220 }
3221 #endif
3222 
3223 static int __init init_dmars(void)
3224 {
3225 	struct dmar_drhd_unit *drhd;
3226 	struct intel_iommu *iommu;
3227 	int ret;
3228 
3229 	/*
3230 	 * for each drhd
3231 	 *    allocate root
3232 	 *    initialize and program root entry to not present
3233 	 * endfor
3234 	 */
3235 	for_each_drhd_unit(drhd) {
3236 		/*
3237 		 * lock not needed as this is only incremented in the single-
3238 		 * threaded kernel __init code path; all other accesses are
3239 		 * read only
3240 		 */
3241 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3242 			g_num_of_iommus++;
3243 			continue;
3244 		}
3245 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3246 	}
3247 
3248 	/* Preallocate enough resources for IOMMU hot-addition */
3249 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3250 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3251 
3252 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3253 			GFP_KERNEL);
3254 	if (!g_iommus) {
3255 		ret = -ENOMEM;
3256 		goto error;
3257 	}
3258 
3259 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3260 	if (ret)
3261 		goto free_iommu;
3262 
3263 	for_each_iommu(iommu, drhd) {
3264 		if (drhd->ignored) {
3265 			iommu_disable_translation(iommu);
3266 			continue;
3267 		}
3268 
3269 		/*
3270 		 * Find the max PASID size of all IOMMUs in the system.
3271 		 * We need to ensure the system pasid table is no bigger
3272 		 * than the smallest supported.
3273 		 */
3274 		if (pasid_supported(iommu)) {
3275 			u32 temp = 2 << ecap_pss(iommu->ecap);
3276 
3277 			intel_pasid_max_id = min_t(u32, temp,
3278 						   intel_pasid_max_id);
3279 		}
3280 
3281 		g_iommus[iommu->seq_id] = iommu;
3282 
3283 		intel_iommu_init_qi(iommu);
3284 
3285 		ret = iommu_init_domains(iommu);
3286 		if (ret)
3287 			goto free_iommu;
3288 
3289 		init_translation_status(iommu);
3290 
3291 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3292 			iommu_disable_translation(iommu);
3293 			clear_translation_pre_enabled(iommu);
3294 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3295 				iommu->name);
3296 		}
3297 
3298 		/*
3299 		 * TBD:
3300 		 * we could share the same root & context tables
3301 		 * among all IOMMUs. Need to split it later.
3302 		 */
3303 		ret = iommu_alloc_root_entry(iommu);
3304 		if (ret)
3305 			goto free_iommu;
3306 
3307 		if (translation_pre_enabled(iommu)) {
3308 			pr_info("Translation already enabled - trying to copy translation structures\n");
3309 
3310 			ret = copy_translation_tables(iommu);
3311 			if (ret) {
3312 				/*
3313 				 * We found the IOMMU with translation
3314 				 * enabled - but failed to copy over the
3315 				 * old root-entry table. Try to proceed
3316 				 * by disabling translation now and
3317 				 * allocating a clean root-entry table.
3318 				 * This might cause DMAR faults, but
3319 				 * probably the dump will still succeed.
3320 				 */
3321 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3322 				       iommu->name);
3323 				iommu_disable_translation(iommu);
3324 				clear_translation_pre_enabled(iommu);
3325 			} else {
3326 				pr_info("Copied translation tables from previous kernel for %s\n",
3327 					iommu->name);
3328 			}
3329 		}
3330 
3331 		if (!ecap_pass_through(iommu->ecap))
3332 			hw_pass_through = 0;
3333 		intel_svm_check(iommu);
3334 	}
3335 
3336 	/*
3337 	 * Now that qi is enabled on all iommus, set the root entry and flush
3338 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3339 	 * flush_context function will loop forever and the boot hangs.
3340 	 */
3341 	for_each_active_iommu(iommu, drhd) {
3342 		iommu_flush_write_buffer(iommu);
3343 #ifdef CONFIG_INTEL_IOMMU_SVM
3344 		register_pasid_allocator(iommu);
3345 #endif
3346 		iommu_set_root_entry(iommu);
3347 	}
3348 
3349 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3350 	dmar_map_gfx = 0;
3351 #endif
3352 
3353 	if (!dmar_map_gfx)
3354 		iommu_identity_mapping |= IDENTMAP_GFX;
3355 
3356 	check_tylersburg_isoch();
3357 
3358 	ret = si_domain_init(hw_pass_through);
3359 	if (ret)
3360 		goto free_iommu;
3361 
3362 	/*
3363 	 * for each drhd
3364 	 *   enable fault log
3365 	 *   global invalidate context cache
3366 	 *   global invalidate iotlb
3367 	 *   enable translation
3368 	 */
3369 	for_each_iommu(iommu, drhd) {
3370 		if (drhd->ignored) {
3371 			/*
3372 			 * we always have to disable PMRs or DMA may fail on
3373 			 * this device
3374 			 */
3375 			if (force_on)
3376 				iommu_disable_protect_mem_regions(iommu);
3377 			continue;
3378 		}
3379 
3380 		iommu_flush_write_buffer(iommu);
3381 
3382 #ifdef CONFIG_INTEL_IOMMU_SVM
3383 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3384 			/*
3385 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3386 			 * could cause possible lock race condition.
3387 			 */
3388 			up_write(&dmar_global_lock);
3389 			ret = intel_svm_enable_prq(iommu);
3390 			down_write(&dmar_global_lock);
3391 			if (ret)
3392 				goto free_iommu;
3393 		}
3394 #endif
3395 		ret = dmar_set_interrupt(iommu);
3396 		if (ret)
3397 			goto free_iommu;
3398 	}
3399 
3400 	return 0;
3401 
3402 free_iommu:
3403 	for_each_active_iommu(iommu, drhd) {
3404 		disable_dmar_iommu(iommu);
3405 		free_dmar_iommu(iommu);
3406 	}
3407 	if (si_domain) {
3408 		domain_exit(si_domain);
3409 		si_domain = NULL;
3410 	}
3411 
3412 	kfree(g_iommus);
3413 
3414 error:
3415 	return ret;
3416 }
3417 
3418 static inline int iommu_domain_cache_init(void)
3419 {
3420 	int ret = 0;
3421 
3422 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3423 					 sizeof(struct dmar_domain),
3424 					 0,
3425 					 SLAB_HWCACHE_ALIGN,
3426 
3427 					 NULL);
3428 	if (!iommu_domain_cache) {
3429 		pr_err("Couldn't create iommu_domain cache\n");
3430 		ret = -ENOMEM;
3431 	}
3432 
3433 	return ret;
3434 }
3435 
3436 static inline int iommu_devinfo_cache_init(void)
3437 {
3438 	int ret = 0;
3439 
3440 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3441 					 sizeof(struct device_domain_info),
3442 					 0,
3443 					 SLAB_HWCACHE_ALIGN,
3444 					 NULL);
3445 	if (!iommu_devinfo_cache) {
3446 		pr_err("Couldn't create devinfo cache\n");
3447 		ret = -ENOMEM;
3448 	}
3449 
3450 	return ret;
3451 }
3452 
3453 static int __init iommu_init_mempool(void)
3454 {
3455 	int ret;
3456 	ret = iova_cache_get();
3457 	if (ret)
3458 		return ret;
3459 
3460 	ret = iommu_domain_cache_init();
3461 	if (ret)
3462 		goto domain_error;
3463 
3464 	ret = iommu_devinfo_cache_init();
3465 	if (!ret)
3466 		return ret;
3467 
3468 	kmem_cache_destroy(iommu_domain_cache);
3469 domain_error:
3470 	iova_cache_put();
3471 
3472 	return -ENOMEM;
3473 }
3474 
3475 static void __init iommu_exit_mempool(void)
3476 {
3477 	kmem_cache_destroy(iommu_devinfo_cache);
3478 	kmem_cache_destroy(iommu_domain_cache);
3479 	iova_cache_put();
3480 }
3481 
3482 static void __init init_no_remapping_devices(void)
3483 {
3484 	struct dmar_drhd_unit *drhd;
3485 	struct device *dev;
3486 	int i;
3487 
3488 	for_each_drhd_unit(drhd) {
3489 		if (!drhd->include_all) {
3490 			for_each_active_dev_scope(drhd->devices,
3491 						  drhd->devices_cnt, i, dev)
3492 				break;
3493 			/* ignore DMAR unit if no devices exist */
3494 			if (i == drhd->devices_cnt)
3495 				drhd->ignored = 1;
3496 		}
3497 	}
3498 
3499 	for_each_active_drhd_unit(drhd) {
3500 		if (drhd->include_all)
3501 			continue;
3502 
3503 		for_each_active_dev_scope(drhd->devices,
3504 					  drhd->devices_cnt, i, dev)
3505 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3506 				break;
3507 		if (i < drhd->devices_cnt)
3508 			continue;
3509 
3510 		/* This IOMMU has *only* gfx devices. Mark it as dedicated to
3511 		   gfx and, if gfx mapping is disabled, bypass it entirely. */
3512 		drhd->gfx_dedicated = 1;
3513 		if (!dmar_map_gfx)
3514 			drhd->ignored = 1;
3515 	}
3516 }
3517 
3518 #ifdef CONFIG_SUSPEND
3519 static int init_iommu_hw(void)
3520 {
3521 	struct dmar_drhd_unit *drhd;
3522 	struct intel_iommu *iommu = NULL;
3523 
3524 	for_each_active_iommu(iommu, drhd)
3525 		if (iommu->qi)
3526 			dmar_reenable_qi(iommu);
3527 
3528 	for_each_iommu(iommu, drhd) {
3529 		if (drhd->ignored) {
3530 			/*
3531 			 * we always have to disable PMRs or DMA may fail on
3532 			 * this device
3533 			 */
3534 			if (force_on)
3535 				iommu_disable_protect_mem_regions(iommu);
3536 			continue;
3537 		}
3538 
3539 		iommu_flush_write_buffer(iommu);
3540 		iommu_set_root_entry(iommu);
3541 		iommu_enable_translation(iommu);
3542 		iommu_disable_protect_mem_regions(iommu);
3543 	}
3544 
3545 	return 0;
3546 }
3547 
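/* Globally invalidate the context caches and IOTLBs of all active IOMMUs. */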
3548 static void iommu_flush_all(void)
3549 {
3550 	struct dmar_drhd_unit *drhd;
3551 	struct intel_iommu *iommu;
3552 
3553 	for_each_active_iommu(iommu, drhd) {
3554 		iommu->flush.flush_context(iommu, 0, 0, 0,
3555 					   DMA_CCMD_GLOBAL_INVL);
3556 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3557 					 DMA_TLB_GLOBAL_FLUSH);
3558 	}
3559 }
3560 
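/*
 * Flush all caches, disable translation and save the fault-event
 * registers of each IOMMU before entering suspend.
 */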
3561 static int iommu_suspend(void)
3562 {
3563 	struct dmar_drhd_unit *drhd;
3564 	struct intel_iommu *iommu = NULL;
3565 	unsigned long flag;
3566 
3567 	iommu_flush_all();
3568 
3569 	for_each_active_iommu(iommu, drhd) {
3570 		iommu_disable_translation(iommu);
3571 
3572 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3573 
3574 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3575 			readl(iommu->reg + DMAR_FECTL_REG);
3576 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3577 			readl(iommu->reg + DMAR_FEDATA_REG);
3578 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3579 			readl(iommu->reg + DMAR_FEADDR_REG);
3580 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3581 			readl(iommu->reg + DMAR_FEUADDR_REG);
3582 
3583 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3584 	}
3585 	return 0;
3586 }
3587 
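/*
 * Re-enable the IOMMU hardware and restore the saved fault-event
 * registers on resume.
 */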
3588 static void iommu_resume(void)
3589 {
3590 	struct dmar_drhd_unit *drhd;
3591 	struct intel_iommu *iommu = NULL;
3592 	unsigned long flag;
3593 
3594 	if (init_iommu_hw()) {
3595 		if (force_on)
3596 			panic("tboot: IOMMU setup failed, DMAR cannot resume!\n");
3597 		else
3598 			WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3599 		return;
3600 	}
3601 
3602 	for_each_active_iommu(iommu, drhd) {
3603 
3604 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3605 
3606 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3607 			iommu->reg + DMAR_FECTL_REG);
3608 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3609 			iommu->reg + DMAR_FEDATA_REG);
3610 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3611 			iommu->reg + DMAR_FEADDR_REG);
3612 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3613 			iommu->reg + DMAR_FEUADDR_REG);
3614 
3615 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3616 	}
3617 }
3618 
3619 static struct syscore_ops iommu_syscore_ops = {
3620 	.resume		= iommu_resume,
3621 	.suspend	= iommu_suspend,
3622 };
3623 
3624 static void __init init_iommu_pm_ops(void)
3625 {
3626 	register_syscore_ops(&iommu_syscore_ops);
3627 }
3628 
3629 #else
3630 static inline void init_iommu_pm_ops(void) {}
3631 #endif	/* CONFIG_SUSPEND */
3632 
3633 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3634 {
3635 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3636 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3637 	    rmrr->end_address <= rmrr->base_address ||
3638 	    arch_rmrr_sanity_check(rmrr))
3639 		return -EINVAL;
3640 
3641 	return 0;
3642 }
3643 
3644 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3645 {
3646 	struct acpi_dmar_reserved_memory *rmrr;
3647 	struct dmar_rmrr_unit *rmrru;
3648 
3649 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3650 	if (rmrr_sanity_check(rmrr)) {
3651 		pr_warn(FW_BUG
3652 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3653 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3654 			   rmrr->base_address, rmrr->end_address,
3655 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3656 			   dmi_get_system_info(DMI_BIOS_VERSION),
3657 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3658 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3659 	}
3660 
3661 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3662 	if (!rmrru)
3663 		goto out;
3664 
3665 	rmrru->hdr = header;
3666 
3667 	rmrru->base_address = rmrr->base_address;
3668 	rmrru->end_address = rmrr->end_address;
3669 
3670 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3671 				((void *)rmrr) + rmrr->header.length,
3672 				&rmrru->devices_cnt);
3673 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3674 		goto free_rmrru;
3675 
3676 	list_add(&rmrru->list, &dmar_rmrr_units);
3677 
3678 	return 0;
3679 free_rmrru:
3680 	kfree(rmrru);
3681 out:
3682 	return -ENOMEM;
3683 }
3684 
3685 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3686 {
3687 	struct dmar_atsr_unit *atsru;
3688 	struct acpi_dmar_atsr *tmp;
3689 
3690 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3691 				dmar_rcu_check()) {
3692 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3693 		if (atsr->segment != tmp->segment)
3694 			continue;
3695 		if (atsr->header.length != tmp->header.length)
3696 			continue;
3697 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3698 			return atsru;
3699 	}
3700 
3701 	return NULL;
3702 }
3703 
3704 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3705 {
3706 	struct acpi_dmar_atsr *atsr;
3707 	struct dmar_atsr_unit *atsru;
3708 
3709 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3710 		return 0;
3711 
3712 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3713 	atsru = dmar_find_atsr(atsr);
3714 	if (atsru)
3715 		return 0;
3716 
3717 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3718 	if (!atsru)
3719 		return -ENOMEM;
3720 
3721 	/*
3722 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3723 	 * copy the memory content because the memory buffer will be freed
3724 	 * on return.
3725 	 */
3726 	atsru->hdr = (void *)(atsru + 1);
3727 	memcpy(atsru->hdr, hdr, hdr->length);
3728 	atsru->include_all = atsr->flags & 0x1;
3729 	if (!atsru->include_all) {
3730 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3731 				(void *)atsr + atsr->header.length,
3732 				&atsru->devices_cnt);
3733 		if (atsru->devices_cnt && atsru->devices == NULL) {
3734 			kfree(atsru);
3735 			return -ENOMEM;
3736 		}
3737 	}
3738 
3739 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3740 
3741 	return 0;
3742 }
3743 
3744 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3745 {
3746 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3747 	kfree(atsru);
3748 }
3749 
3750 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3751 {
3752 	struct acpi_dmar_atsr *atsr;
3753 	struct dmar_atsr_unit *atsru;
3754 
3755 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3756 	atsru = dmar_find_atsr(atsr);
3757 	if (atsru) {
3758 		list_del_rcu(&atsru->list);
3759 		synchronize_rcu();
3760 		intel_iommu_free_atsr(atsru);
3761 	}
3762 
3763 	return 0;
3764 }
3765 
3766 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3767 {
3768 	int i;
3769 	struct device *dev;
3770 	struct acpi_dmar_atsr *atsr;
3771 	struct dmar_atsr_unit *atsru;
3772 
3773 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3774 	atsru = dmar_find_atsr(atsr);
3775 	if (!atsru)
3776 		return 0;
3777 
3778 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3779 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3780 					  i, dev)
3781 			return -EBUSY;
3782 	}
3783 
3784 	return 0;
3785 }
3786 
3787 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3788 {
3789 	struct dmar_satc_unit *satcu;
3790 	struct acpi_dmar_satc *tmp;
3791 
3792 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3793 				dmar_rcu_check()) {
3794 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3795 		if (satc->segment != tmp->segment)
3796 			continue;
3797 		if (satc->header.length != tmp->header.length)
3798 			continue;
3799 		if (memcmp(satc, tmp, satc->header.length) == 0)
3800 			return satcu;
3801 	}
3802 
3803 	return NULL;
3804 }
3805 
3806 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3807 {
3808 	struct acpi_dmar_satc *satc;
3809 	struct dmar_satc_unit *satcu;
3810 
3811 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3812 		return 0;
3813 
3814 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3815 	satcu = dmar_find_satc(satc);
3816 	if (satcu)
3817 		return 0;
3818 
3819 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3820 	if (!satcu)
3821 		return -ENOMEM;
3822 
3823 	satcu->hdr = (void *)(satcu + 1);
3824 	memcpy(satcu->hdr, hdr, hdr->length);
3825 	satcu->atc_required = satc->flags & 0x1;
3826 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3827 					      (void *)satc + satc->header.length,
3828 					      &satcu->devices_cnt);
3829 	if (satcu->devices_cnt && !satcu->devices) {
3830 		kfree(satcu);
3831 		return -ENOMEM;
3832 	}
3833 	list_add_rcu(&satcu->list, &dmar_satc_units);
3834 
3835 	return 0;
3836 }
3837 
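/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * running configuration, allocate domain IDs and a root entry, then
 * enable queued invalidation, the page request queue (if supported),
 * fault interrupts and translation.
 */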
3838 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3839 {
3840 	int sp, ret;
3841 	struct intel_iommu *iommu = dmaru->iommu;
3842 
3843 	if (g_iommus[iommu->seq_id])
3844 		return 0;
3845 
3846 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3847 	if (ret)
3848 		goto out;
3849 
3850 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3851 		pr_warn("%s: Doesn't support hardware pass through.\n",
3852 			iommu->name);
3853 		return -ENXIO;
3854 	}
3855 	if (!ecap_sc_support(iommu->ecap) &&
3856 	    domain_update_iommu_snooping(iommu)) {
3857 		pr_warn("%s: Doesn't support snooping.\n",
3858 			iommu->name);
3859 		return -ENXIO;
3860 	}
3861 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3862 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3863 		pr_warn("%s: Doesn't support large page.\n",
3864 			iommu->name);
3865 		return -ENXIO;
3866 	}
3867 
3868 	/*
3869 	 * Disable translation if already enabled prior to OS handover.
3870 	 */
3871 	if (iommu->gcmd & DMA_GCMD_TE)
3872 		iommu_disable_translation(iommu);
3873 
3874 	g_iommus[iommu->seq_id] = iommu;
3875 	ret = iommu_init_domains(iommu);
3876 	if (ret == 0)
3877 		ret = iommu_alloc_root_entry(iommu);
3878 	if (ret)
3879 		goto out;
3880 
3881 	intel_svm_check(iommu);
3882 
3883 	if (dmaru->ignored) {
3884 		/*
3885 		 * we always have to disable PMRs or DMA may fail on this device
3886 		 */
3887 		if (force_on)
3888 			iommu_disable_protect_mem_regions(iommu);
3889 		return 0;
3890 	}
3891 
3892 	intel_iommu_init_qi(iommu);
3893 	iommu_flush_write_buffer(iommu);
3894 
3895 #ifdef CONFIG_INTEL_IOMMU_SVM
3896 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3897 		ret = intel_svm_enable_prq(iommu);
3898 		if (ret)
3899 			goto disable_iommu;
3900 	}
3901 #endif
3902 	ret = dmar_set_interrupt(iommu);
3903 	if (ret)
3904 		goto disable_iommu;
3905 
3906 	iommu_set_root_entry(iommu);
3907 	iommu_enable_translation(iommu);
3908 
3909 	iommu_disable_protect_mem_regions(iommu);
3910 	return 0;
3911 
3912 disable_iommu:
3913 	disable_dmar_iommu(iommu);
3914 out:
3915 	free_dmar_iommu(iommu);
3916 	return ret;
3917 }
3918 
3919 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3920 {
3921 	int ret = 0;
3922 	struct intel_iommu *iommu = dmaru->iommu;
3923 
3924 	if (!intel_iommu_enabled)
3925 		return 0;
3926 	if (iommu == NULL)
3927 		return -EINVAL;
3928 
3929 	if (insert) {
3930 		ret = intel_iommu_add(dmaru);
3931 	} else {
3932 		disable_dmar_iommu(iommu);
3933 		free_dmar_iommu(iommu);
3934 	}
3935 
3936 	return ret;
3937 }
3938 
3939 static void intel_iommu_free_dmars(void)
3940 {
3941 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3942 	struct dmar_atsr_unit *atsru, *atsr_n;
3943 	struct dmar_satc_unit *satcu, *satc_n;
3944 
3945 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3946 		list_del(&rmrru->list);
3947 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3948 		kfree(rmrru);
3949 	}
3950 
3951 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3952 		list_del(&atsru->list);
3953 		intel_iommu_free_atsr(atsru);
3954 	}
3955 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3956 		list_del(&satcu->list);
3957 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3958 		kfree(satcu);
3959 	}
3960 }
3961 
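/*
 * Return 1 if ATS is allowed for @dev: integrated devices (no upstream
 * bridge) always qualify, devices behind conventional PCI never do, and
 * otherwise the root port must appear in an ATSR device scope or the
 * ATSR must be include_all.
 */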
3962 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3963 {
3964 	int i, ret = 1;
3965 	struct pci_bus *bus;
3966 	struct pci_dev *bridge = NULL;
3967 	struct device *tmp;
3968 	struct acpi_dmar_atsr *atsr;
3969 	struct dmar_atsr_unit *atsru;
3970 
3971 	dev = pci_physfn(dev);
3972 	for (bus = dev->bus; bus; bus = bus->parent) {
3973 		bridge = bus->self;
3974 		/* If it's an integrated device, allow ATS */
3975 		if (!bridge)
3976 			return 1;
3977 		/* Connected via non-PCIe: no ATS */
3978 		if (!pci_is_pcie(bridge) ||
3979 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3980 			return 0;
3981 		/* If we found the root port, look it up in the ATSR */
3982 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3983 			break;
3984 	}
3985 
3986 	rcu_read_lock();
3987 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3988 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3989 		if (atsr->segment != pci_domain_nr(dev->bus))
3990 			continue;
3991 
3992 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3993 			if (tmp == &bridge->dev)
3994 				goto out;
3995 
3996 		if (atsru->include_all)
3997 			goto out;
3998 	}
3999 	ret = 0;
4000 out:
4001 	rcu_read_unlock();
4002 
4003 	return ret;
4004 }
4005 
4006 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4007 {
4008 	int ret;
4009 	struct dmar_rmrr_unit *rmrru;
4010 	struct dmar_atsr_unit *atsru;
4011 	struct dmar_satc_unit *satcu;
4012 	struct acpi_dmar_atsr *atsr;
4013 	struct acpi_dmar_reserved_memory *rmrr;
4014 	struct acpi_dmar_satc *satc;
4015 
4016 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4017 		return 0;
4018 
4019 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4020 		rmrr = container_of(rmrru->hdr,
4021 				    struct acpi_dmar_reserved_memory, header);
4022 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4023 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4024 				((void *)rmrr) + rmrr->header.length,
4025 				rmrr->segment, rmrru->devices,
4026 				rmrru->devices_cnt);
4027 			if (ret < 0)
4028 				return ret;
4029 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4030 			dmar_remove_dev_scope(info, rmrr->segment,
4031 				rmrru->devices, rmrru->devices_cnt);
4032 		}
4033 	}
4034 
4035 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4036 		if (atsru->include_all)
4037 			continue;
4038 
4039 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4040 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4041 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4042 					(void *)atsr + atsr->header.length,
4043 					atsr->segment, atsru->devices,
4044 					atsru->devices_cnt);
4045 			if (ret > 0)
4046 				break;
4047 			else if (ret < 0)
4048 				return ret;
4049 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4050 			if (dmar_remove_dev_scope(info, atsr->segment,
4051 					atsru->devices, atsru->devices_cnt))
4052 				break;
4053 		}
4054 	}
4055 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4056 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4057 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4058 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4059 					(void *)satc + satc->header.length,
4060 					satc->segment, satcu->devices,
4061 					satcu->devices_cnt);
4062 			if (ret > 0)
4063 				break;
4064 			else if (ret < 0)
4065 				return ret;
4066 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4067 			if (dmar_remove_dev_scope(info, satc->segment,
4068 					satcu->devices, satcu->devices_cnt))
4069 				break;
4070 		}
4071 	}
4072 
4073 	return 0;
4074 }
4075 
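/*
 * Keep the static identity (si) domain in sync with memory hotplug:
 * identity-map ranges coming online and unmap ranges going offline,
 * flushing the IOTLB for the removed range.
 */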
4076 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4077 				       unsigned long val, void *v)
4078 {
4079 	struct memory_notify *mhp = v;
4080 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4081 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4082 			mhp->nr_pages - 1);
4083 
4084 	switch (val) {
4085 	case MEM_GOING_ONLINE:
4086 		if (iommu_domain_identity_map(si_domain,
4087 					      start_vpfn, last_vpfn)) {
4088 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4089 				start_vpfn, last_vpfn);
4090 			return NOTIFY_BAD;
4091 		}
4092 		break;
4093 
4094 	case MEM_OFFLINE:
4095 	case MEM_CANCEL_ONLINE:
4096 		{
4097 			struct dmar_drhd_unit *drhd;
4098 			struct intel_iommu *iommu;
4099 			struct page *freelist;
4100 
4101 			freelist = domain_unmap(si_domain,
4102 						start_vpfn, last_vpfn,
4103 						NULL);
4104 
4105 			rcu_read_lock();
4106 			for_each_active_iommu(iommu, drhd)
4107 				iommu_flush_iotlb_psi(iommu, si_domain,
4108 					start_vpfn, mhp->nr_pages,
4109 					!freelist, 0);
4110 			rcu_read_unlock();
4111 			dma_free_pagelist(freelist);
4112 		}
4113 		break;
4114 	}
4115 
4116 	return NOTIFY_OK;
4117 }
4118 
4119 static struct notifier_block intel_iommu_memory_nb = {
4120 	.notifier_call = intel_iommu_memory_notifier,
4121 	.priority = 0
4122 };
4123 
4124 static void intel_disable_iommus(void)
4125 {
4126 	struct intel_iommu *iommu = NULL;
4127 	struct dmar_drhd_unit *drhd;
4128 
4129 	for_each_iommu(iommu, drhd)
4130 		iommu_disable_translation(iommu);
4131 }
4132 
4133 void intel_iommu_shutdown(void)
4134 {
4135 	struct dmar_drhd_unit *drhd;
4136 	struct intel_iommu *iommu = NULL;
4137 
4138 	if (no_iommu || dmar_disabled)
4139 		return;
4140 
4141 	down_write(&dmar_global_lock);
4142 
4143 	/* Disable PMRs explicitly here. */
4144 	for_each_iommu(iommu, drhd)
4145 		iommu_disable_protect_mem_regions(iommu);
4146 
4147 	/* Make sure the IOMMUs are switched off */
4148 	intel_disable_iommus();
4149 
4150 	up_write(&dmar_global_lock);
4151 }
4152 
4153 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4154 {
4155 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4156 
4157 	return container_of(iommu_dev, struct intel_iommu, iommu);
4158 }
4159 
4160 static ssize_t version_show(struct device *dev,
4161 			    struct device_attribute *attr, char *buf)
4162 {
4163 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4164 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4165 	return sprintf(buf, "%d:%d\n",
4166 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4167 }
4168 static DEVICE_ATTR_RO(version);
4169 
4170 static ssize_t address_show(struct device *dev,
4171 			    struct device_attribute *attr, char *buf)
4172 {
4173 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4174 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4175 }
4176 static DEVICE_ATTR_RO(address);
4177 
4178 static ssize_t cap_show(struct device *dev,
4179 			struct device_attribute *attr, char *buf)
4180 {
4181 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4182 	return sprintf(buf, "%llx\n", iommu->cap);
4183 }
4184 static DEVICE_ATTR_RO(cap);
4185 
4186 static ssize_t ecap_show(struct device *dev,
4187 			 struct device_attribute *attr, char *buf)
4188 {
4189 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4190 	return sprintf(buf, "%llx\n", iommu->ecap);
4191 }
4192 static DEVICE_ATTR_RO(ecap);
4193 
4194 static ssize_t domains_supported_show(struct device *dev,
4195 				      struct device_attribute *attr, char *buf)
4196 {
4197 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4198 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4199 }
4200 static DEVICE_ATTR_RO(domains_supported);
4201 
4202 static ssize_t domains_used_show(struct device *dev,
4203 				 struct device_attribute *attr, char *buf)
4204 {
4205 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4206 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4207 						  cap_ndoms(iommu->cap)));
4208 }
4209 static DEVICE_ATTR_RO(domains_used);
4210 
4211 static struct attribute *intel_iommu_attrs[] = {
4212 	&dev_attr_version.attr,
4213 	&dev_attr_address.attr,
4214 	&dev_attr_cap.attr,
4215 	&dev_attr_ecap.attr,
4216 	&dev_attr_domains_supported.attr,
4217 	&dev_attr_domains_used.attr,
4218 	NULL,
4219 };
4220 
4221 static struct attribute_group intel_iommu_group = {
4222 	.name = "intel-iommu",
4223 	.attrs = intel_iommu_attrs,
4224 };
4225 
4226 const struct attribute_group *intel_iommu_groups[] = {
4227 	&intel_iommu_group,
4228 	NULL,
4229 };
4230 
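/* Return true if any PCI device is marked as external facing. */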
4231 static inline bool has_external_pci(void)
4232 {
4233 	struct pci_dev *pdev = NULL;
4234 
4235 	for_each_pci_dev(pdev)
4236 		if (pdev->external_facing) {
4237 			pci_dev_put(pdev);
4238 			return true;
4239 		}
4240 
4241 	return false;
4242 }
4243 
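/*
 * Force the IOMMU on when the platform opts in through the DMAR table
 * and an external-facing PCI device is present, even if it was disabled
 * on the command line.
 */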
4244 static int __init platform_optin_force_iommu(void)
4245 {
4246 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4247 		return 0;
4248 
4249 	if (no_iommu || dmar_disabled)
4250 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4251 
4252 	/*
4253 	 * If Intel-IOMMU is disabled by default, we will apply identity
4254 	 * map for all devices except those marked as being untrusted.
4255 	 */
4256 	if (dmar_disabled)
4257 		iommu_set_default_passthrough(false);
4258 
4259 	dmar_disabled = 0;
4260 	no_iommu = 0;
4261 
4262 	return 1;
4263 }
4264 
4265 static int __init probe_acpi_namespace_devices(void)
4266 {
4267 	struct dmar_drhd_unit *drhd;
4268 	/* To avoid a -Wunused-but-set-variable warning. */
4269 	struct intel_iommu *iommu __maybe_unused;
4270 	struct device *dev;
4271 	int i, ret = 0;
4272 
4273 	for_each_active_iommu(iommu, drhd) {
4274 		for_each_active_dev_scope(drhd->devices,
4275 					  drhd->devices_cnt, i, dev) {
4276 			struct acpi_device_physical_node *pn;
4277 			struct iommu_group *group;
4278 			struct acpi_device *adev;
4279 
4280 			if (dev->bus != &acpi_bus_type)
4281 				continue;
4282 
4283 			adev = to_acpi_device(dev);
4284 			mutex_lock(&adev->physical_node_lock);
4285 			list_for_each_entry(pn,
4286 					    &adev->physical_node_list, node) {
4287 				group = iommu_group_get(pn->dev);
4288 				if (group) {
4289 					iommu_group_put(group);
4290 					continue;
4291 				}
4292 
4293 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4294 				ret = iommu_probe_device(pn->dev);
4295 				if (ret)
4296 					break;
4297 			}
4298 			mutex_unlock(&adev->physical_node_lock);
4299 
4300 			if (ret)
4301 				return ret;
4302 		}
4303 	}
4304 
4305 	return 0;
4306 }
4307 
4308 int __init intel_iommu_init(void)
4309 {
4310 	int ret = -ENODEV;
4311 	struct dmar_drhd_unit *drhd;
4312 	struct intel_iommu *iommu;
4313 
4314 	/*
4315 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4316 	 * opt in, so enforce that.
4317 	 */
4318 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4319 		    platform_optin_force_iommu();
4320 
4321 	if (iommu_init_mempool()) {
4322 		if (force_on)
4323 			panic("tboot: Failed to initialize iommu memory\n");
4324 		return -ENOMEM;
4325 	}
4326 
4327 	down_write(&dmar_global_lock);
4328 	if (dmar_table_init()) {
4329 		if (force_on)
4330 			panic("tboot: Failed to initialize DMAR table\n");
4331 		goto out_free_dmar;
4332 	}
4333 
4334 	if (dmar_dev_scope_init() < 0) {
4335 		if (force_on)
4336 			panic("tboot: Failed to initialize DMAR device scope\n");
4337 		goto out_free_dmar;
4338 	}
4339 
4340 	up_write(&dmar_global_lock);
4341 
4342 	/*
4343 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4344 	 * complain later when we register it under the lock.
4345 	 */
4346 	dmar_register_bus_notifier();
4347 
4348 	down_write(&dmar_global_lock);
4349 
4350 	if (!no_iommu)
4351 		intel_iommu_debugfs_init();
4352 
4353 	if (no_iommu || dmar_disabled) {
4354 		/*
4355 		 * We exit the function here to ensure IOMMU's remapping and
4356 		 * mempool aren't setup, which means that the IOMMU's PMRs
4357 		 * won't be disabled via the call to init_dmars(). So disable
4358 		 * it explicitly here. The PMRs were setup by tboot prior to
4359 		 * calling SENTER, but the kernel is expected to reset/tear
4360 		 * down the PMRs.
4361 		 */
4362 		if (intel_iommu_tboot_noforce) {
4363 			for_each_iommu(iommu, drhd)
4364 				iommu_disable_protect_mem_regions(iommu);
4365 		}
4366 
4367 		/*
4368 		 * Make sure the IOMMUs are switched off, even when we
4369 		 * boot into a kexec kernel and the previous kernel left
4370 		 * them enabled
4371 		 */
4372 		intel_disable_iommus();
4373 		goto out_free_dmar;
4374 	}
4375 
4376 	if (list_empty(&dmar_rmrr_units))
4377 		pr_info("No RMRR found\n");
4378 
4379 	if (list_empty(&dmar_atsr_units))
4380 		pr_info("No ATSR found\n");
4381 
4382 	if (list_empty(&dmar_satc_units))
4383 		pr_info("No SATC found\n");
4384 
4385 	if (dmar_map_gfx)
4386 		intel_iommu_gfx_mapped = 1;
4387 
4388 	init_no_remapping_devices();
4389 
4390 	ret = init_dmars();
4391 	if (ret) {
4392 		if (force_on)
4393 			panic("tboot: Failed to initialize DMARs\n");
4394 		pr_err("Initialization failed\n");
4395 		goto out_free_dmar;
4396 	}
4397 	up_write(&dmar_global_lock);
4398 
4399 	init_iommu_pm_ops();
4400 
4401 	down_read(&dmar_global_lock);
4402 	for_each_active_iommu(iommu, drhd) {
4403 		/*
4404 		 * The flush queue implementation does not perform
4405 		 * page-selective invalidations that are required for efficient
4406 		 * TLB flushes in virtual environments.  The benefit of batching
4407 		 * is likely to be much lower than the overhead of synchronizing
4408 		 * the virtual and physical IOMMU page-tables.
4409 		 */
4410 		if (cap_caching_mode(iommu->cap) &&
4411 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
4412 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4413 			iommu_set_dma_strict();
4414 		}
4415 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4416 				       intel_iommu_groups,
4417 				       "%s", iommu->name);
4418 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4419 	}
4420 	up_read(&dmar_global_lock);
4421 
4422 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4423 	if (si_domain && !hw_pass_through)
4424 		register_memory_notifier(&intel_iommu_memory_nb);
4425 
4426 	down_read(&dmar_global_lock);
4427 	if (probe_acpi_namespace_devices())
4428 		pr_warn("ACPI name space devices didn't probe correctly\n");
4429 
4430 	/* Finally, we enable the DMA remapping hardware. */
4431 	for_each_iommu(iommu, drhd) {
4432 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4433 			iommu_enable_translation(iommu);
4434 
4435 		iommu_disable_protect_mem_regions(iommu);
4436 	}
4437 	up_read(&dmar_global_lock);
4438 
4439 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4440 
4441 	intel_iommu_enabled = 1;
4442 
4443 	return 0;
4444 
4445 out_free_dmar:
4446 	intel_iommu_free_dmars();
4447 	up_write(&dmar_global_lock);
4448 	iommu_exit_mempool();
4449 	return ret;
4450 }
4451 
4452 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4453 {
4454 	struct device_domain_info *info = opaque;
4455 
4456 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4457 	return 0;
4458 }
4459 
4460 /*
4461  * NB - intel-iommu lacks any sort of reference counting for the users of
4462  * dependent devices.  If multiple endpoints have intersecting dependent
4463  * devices, unbinding the driver from any one of them will possibly leave
4464  * the others unable to operate.
4465  */
4466 static void domain_context_clear(struct device_domain_info *info)
4467 {
4468 	if (!dev_is_pci(info->dev)) {
4469 		domain_context_clear_one(info, info->bus, info->devfn);
		/* Non-PCI devices have no DMA aliases to walk. */
		return;
	}
4470 
4471 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4472 			       &domain_context_clear_one_cb, info);
4473 }
4474 
4475 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4476 {
4477 	struct dmar_domain *domain;
4478 	struct intel_iommu *iommu;
4479 	unsigned long flags;
4480 
4481 	assert_spin_locked(&device_domain_lock);
4482 
4483 	if (WARN_ON(!info))
4484 		return;
4485 
4486 	iommu = info->iommu;
4487 	domain = info->domain;
4488 
4489 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4490 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4491 			intel_pasid_tear_down_entry(iommu, info->dev,
4492 					PASID_RID2PASID, false);
4493 
4494 		iommu_disable_dev_iotlb(info);
4495 		domain_context_clear(info);
4496 		intel_pasid_free_table(info->dev);
4497 	}
4498 
4499 	unlink_domain_info(info);
4500 
4501 	spin_lock_irqsave(&iommu->lock, flags);
4502 	domain_detach_iommu(domain, iommu);
4503 	spin_unlock_irqrestore(&iommu->lock, flags);
4504 
4505 	free_devinfo_mem(info);
4506 }
4507 
4508 static void dmar_remove_one_dev_info(struct device *dev)
4509 {
4510 	struct device_domain_info *info;
4511 	unsigned long flags;
4512 
4513 	spin_lock_irqsave(&device_domain_lock, flags);
4514 	info = get_domain_info(dev);
4515 	if (info)
4516 		__dmar_remove_one_dev_info(info);
4517 	spin_unlock_irqrestore(&device_domain_lock, flags);
4518 }
4519 
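/*
 * Initialize a domain allocated through the IOMMU API: derive the AGAW
 * from the requested guest address width and allocate the top-level
 * page table.
 */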
4520 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4521 {
4522 	int adjust_width;
4523 
4524 	/* calculate AGAW */
4525 	domain->gaw = guest_width;
4526 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4527 	domain->agaw = width_to_agaw(adjust_width);
4528 
4529 	domain->iommu_coherency = false;
4530 	domain->iommu_snooping = false;
4531 	domain->iommu_superpage = 0;
4532 	domain->max_addr = 0;
4533 
4534 	/* always allocate the top pgd */
4535 	domain->pgd = alloc_pgtable_page(domain->nid);
4536 	if (!domain->pgd)
4537 		return -ENOMEM;
4538 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4539 	return 0;
4540 }
4541 
4542 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4543 {
4544 	struct dmar_domain *dmar_domain;
4545 	struct iommu_domain *domain;
4546 
4547 	switch (type) {
4548 	case IOMMU_DOMAIN_DMA:
4549 	case IOMMU_DOMAIN_DMA_FQ:
4550 	case IOMMU_DOMAIN_UNMANAGED:
4551 		dmar_domain = alloc_domain(type);
4552 		if (!dmar_domain) {
4553 			pr_err("Can't allocate dmar_domain\n");
4554 			return NULL;
4555 		}
4556 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4557 			pr_err("Domain initialization failed\n");
4558 			domain_exit(dmar_domain);
4559 			return NULL;
4560 		}
4561 
4562 		domain = &dmar_domain->domain;
4563 		domain->geometry.aperture_start = 0;
4564 		domain->geometry.aperture_end   =
4565 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4566 		domain->geometry.force_aperture = true;
4567 
4568 		return domain;
4569 	case IOMMU_DOMAIN_IDENTITY:
4570 		return &si_domain->domain;
4571 	default:
4572 		return NULL;
4573 	}
4574 
4575 	return NULL;
4576 }
4577 
4578 static void intel_iommu_domain_free(struct iommu_domain *domain)
4579 {
4580 	if (domain != &si_domain->domain)
4581 		domain_exit(to_dmar_domain(domain));
4582 }
4583 
4584 /*
4585  * Check whether a @domain could be attached to the @dev through the
4586  * aux-domain attach/detach APIs.
4587  */
4588 static inline bool
4589 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4590 {
4591 	struct device_domain_info *info = get_domain_info(dev);
4592 
4593 	return info && info->auxd_enabled &&
4594 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4595 }
4596 
4597 static inline struct subdev_domain_info *
4598 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4599 {
4600 	struct subdev_domain_info *sinfo;
4601 
4602 	if (!list_empty(&domain->subdevices)) {
4603 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4604 			if (sinfo->pdev == dev)
4605 				return sinfo;
4606 		}
4607 	}
4608 
4609 	return NULL;
4610 }
4611 
4612 static int auxiliary_link_device(struct dmar_domain *domain,
4613 				 struct device *dev)
4614 {
4615 	struct device_domain_info *info = get_domain_info(dev);
4616 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4617 
4618 	assert_spin_locked(&device_domain_lock);
4619 	if (WARN_ON(!info))
4620 		return -EINVAL;
4621 
4622 	if (!sinfo) {
4623 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4624 		if (!sinfo)
4625 			return -ENOMEM;
4626 		sinfo->domain = domain;
4627 		sinfo->pdev = dev;
4628 		list_add(&sinfo->link_phys, &info->subdevices);
4629 		list_add(&sinfo->link_domain, &domain->subdevices);
4630 	}
4631 
4632 	return ++sinfo->users;
4633 }
4634 
4635 static int auxiliary_unlink_device(struct dmar_domain *domain,
4636 				   struct device *dev)
4637 {
4638 	struct device_domain_info *info = get_domain_info(dev);
4639 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4640 	int ret;
4641 
4642 	assert_spin_locked(&device_domain_lock);
4643 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4644 		return -EINVAL;
4645 
4646 	ret = --sinfo->users;
4647 	if (!ret) {
4648 		list_del(&sinfo->link_phys);
4649 		list_del(&sinfo->link_domain);
4650 		kfree(sinfo);
4651 	}
4652 
4653 	return ret;
4654 }
4655 
4656 static int aux_domain_add_dev(struct dmar_domain *domain,
4657 			      struct device *dev)
4658 {
4659 	int ret;
4660 	unsigned long flags;
4661 	struct intel_iommu *iommu;
4662 
4663 	iommu = device_to_iommu(dev, NULL, NULL);
4664 	if (!iommu)
4665 		return -ENODEV;
4666 
4667 	if (domain->default_pasid <= 0) {
4668 		u32 pasid;
4669 
4670 		/* No private data needed for the default pasid */
4671 		pasid = ioasid_alloc(NULL, PASID_MIN,
4672 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4673 				     NULL);
4674 		if (pasid == INVALID_IOASID) {
4675 			pr_err("Can't allocate default pasid\n");
4676 			return -ENODEV;
4677 		}
4678 		domain->default_pasid = pasid;
4679 	}
4680 
4681 	spin_lock_irqsave(&device_domain_lock, flags);
4682 	ret = auxiliary_link_device(domain, dev);
4683 	if (ret <= 0)
4684 		goto link_failed;
4685 
4686 	/*
4687 	 * Subdevices from the same physical device can be attached to the
4688 	 * same domain. For such cases, only the first subdevice attachment
4689 	 * needs to go through the full steps in this function. So if ret >
4690 	 * 1, just goto out.
4691 	 */
4692 	if (ret > 1)
4693 		goto out;
4694 
4695 	/*
4696 	 * iommu->lock must be held to attach domain to iommu and setup the
4697 	 * pasid entry for second level translation.
4698 	 */
4699 	spin_lock(&iommu->lock);
4700 	ret = domain_attach_iommu(domain, iommu);
4701 	if (ret)
4702 		goto attach_failed;
4703 
4704 	/* Setup the PASID entry for mediated devices: */
4705 	if (domain_use_first_level(domain))
4706 		ret = domain_setup_first_level(iommu, domain, dev,
4707 					       domain->default_pasid);
4708 	else
4709 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4710 						     domain->default_pasid);
4711 	if (ret)
4712 		goto table_failed;
4713 
4714 	spin_unlock(&iommu->lock);
4715 out:
4716 	spin_unlock_irqrestore(&device_domain_lock, flags);
4717 
4718 	return 0;
4719 
4720 table_failed:
4721 	domain_detach_iommu(domain, iommu);
4722 attach_failed:
4723 	spin_unlock(&iommu->lock);
4724 	auxiliary_unlink_device(domain, dev);
4725 link_failed:
4726 	spin_unlock_irqrestore(&device_domain_lock, flags);
4727 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4728 		ioasid_put(domain->default_pasid);
4729 
4730 	return ret;
4731 }
4732 
4733 static void aux_domain_remove_dev(struct dmar_domain *domain,
4734 				  struct device *dev)
4735 {
4736 	struct device_domain_info *info;
4737 	struct intel_iommu *iommu;
4738 	unsigned long flags;
4739 
4740 	if (!is_aux_domain(dev, &domain->domain))
4741 		return;
4742 
4743 	spin_lock_irqsave(&device_domain_lock, flags);
4744 	info = get_domain_info(dev);
4745 	iommu = info->iommu;
4746 
4747 	if (!auxiliary_unlink_device(domain, dev)) {
4748 		spin_lock(&iommu->lock);
4749 		intel_pasid_tear_down_entry(iommu, dev,
4750 					    domain->default_pasid, false);
4751 		domain_detach_iommu(domain, iommu);
4752 		spin_unlock(&iommu->lock);
4753 	}
4754 
4755 	spin_unlock_irqrestore(&device_domain_lock, flags);
4756 
4757 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4758 		ioasid_put(domain->default_pasid);
4759 }
4760 
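/*
 * Verify that the IOMMU serving @dev can host the domain (nesting
 * support, sufficient address width) and drop extra page-table levels
 * if the domain was built wider than the IOMMU's AGAW.
 */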
4761 static int prepare_domain_attach_device(struct iommu_domain *domain,
4762 					struct device *dev)
4763 {
4764 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4765 	struct intel_iommu *iommu;
4766 	int addr_width;
4767 
4768 	iommu = device_to_iommu(dev, NULL, NULL);
4769 	if (!iommu)
4770 		return -ENODEV;
4771 
4772 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4773 	    !ecap_nest(iommu->ecap)) {
4774 		dev_err(dev, "%s: iommu does not support nested translation\n",
4775 			iommu->name);
4776 		return -EINVAL;
4777 	}
4778 
4779 	/* check if this iommu agaw is sufficient for max mapped address */
4780 	addr_width = agaw_to_width(iommu->agaw);
4781 	if (addr_width > cap_mgaw(iommu->cap))
4782 		addr_width = cap_mgaw(iommu->cap);
4783 
4784 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4785 		dev_err(dev, "%s: iommu width (%d) is not "
4786 		        "sufficient for the mapped address (%llx)\n",
4787 		        __func__, addr_width, dmar_domain->max_addr);
4788 		return -EFAULT;
4789 	}
4790 	dmar_domain->gaw = addr_width;
4791 
4792 	/*
4793 	 * Knock out extra levels of page tables if necessary
4794 	 */
4795 	while (iommu->agaw < dmar_domain->agaw) {
4796 		struct dma_pte *pte;
4797 
4798 		pte = dmar_domain->pgd;
4799 		if (dma_pte_present(pte)) {
4800 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4801 			free_pgtable_page(pte);
4802 		}
4803 		dmar_domain->agaw--;
4804 	}
4805 
4806 	return 0;
4807 }
4808 
4809 static int intel_iommu_attach_device(struct iommu_domain *domain,
4810 				     struct device *dev)
4811 {
4812 	int ret;
4813 
4814 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4815 	    device_is_rmrr_locked(dev)) {
4816 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4817 		return -EPERM;
4818 	}
4819 
4820 	if (is_aux_domain(dev, domain))
4821 		return -EPERM;
4822 
4823 	/* normally dev is not mapped */
4824 	if (unlikely(domain_context_mapped(dev))) {
4825 		struct dmar_domain *old_domain;
4826 
4827 		old_domain = find_domain(dev);
4828 		if (old_domain)
4829 			dmar_remove_one_dev_info(dev);
4830 	}
4831 
4832 	ret = prepare_domain_attach_device(domain, dev);
4833 	if (ret)
4834 		return ret;
4835 
4836 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4837 }
4838 
4839 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4840 					 struct device *dev)
4841 {
4842 	int ret;
4843 
4844 	if (!is_aux_domain(dev, domain))
4845 		return -EPERM;
4846 
4847 	ret = prepare_domain_attach_device(domain, dev);
4848 	if (ret)
4849 		return ret;
4850 
4851 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4852 }
4853 
4854 static void intel_iommu_detach_device(struct iommu_domain *domain,
4855 				      struct device *dev)
4856 {
4857 	dmar_remove_one_dev_info(dev);
4858 }
4859 
4860 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4861 					  struct device *dev)
4862 {
4863 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4864 }
4865 
4866 #ifdef CONFIG_INTEL_IOMMU_SVM
4867 /*
4868  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4869  * VT-d granularity. Invalidation is typically included in the unmap operation
4870  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4871  * owns the first level page tables. Invalidations of translation caches in the
4872  * guest are trapped and passed down to the host.
4873  *
4874  * vIOMMU in the guest will only expose first level page tables, therefore
4875  * we do not support IOTLB granularity for requests without a PASID (second level).
4876  *
4877  * For example, to find the VT-d granularity encoding for IOTLB
4878  * type and page selective granularity within PASID:
4879  * X: indexed by iommu cache type
4880  * Y: indexed by enum iommu_inv_granularity
4881  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4882  */
4883 
4884 static const int
4885 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4886 	/*
4887 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4888 	 * page selective (address granularity)
4889 	 */
4890 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4891 	/* PASID based dev TLBs */
4892 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4893 	/* PASID cache */
4894 	{-EINVAL, -EINVAL, -EINVAL}
4895 };
4896 
4897 static inline int to_vtd_granularity(int type, int granu)
4898 {
4899 	return inv_type_granu_table[type][granu];
4900 }
4901 
4902 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4903 {
4904 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4905 
4906 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4907 	 * The IOMMU cache invalidate API passes granu_size in bytes and the
4908 	 * number of granules that are contiguous in memory.
4909 	 */
4910 	return order_base_2(nr_pages);
4911 }
4912 
4913 static int
4914 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4915 			   struct iommu_cache_invalidate_info *inv_info)
4916 {
4917 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4918 	struct device_domain_info *info;
4919 	struct intel_iommu *iommu;
4920 	unsigned long flags;
4921 	int cache_type;
4922 	u8 bus, devfn;
4923 	u16 did, sid;
4924 	int ret = 0;
4925 	u64 size = 0;
4926 
4927 	if (!inv_info || !dmar_domain)
4928 		return -EINVAL;
4929 
4930 	if (!dev || !dev_is_pci(dev))
4931 		return -ENODEV;
4932 
4933 	iommu = device_to_iommu(dev, &bus, &devfn);
4934 	if (!iommu)
4935 		return -ENODEV;
4936 
4937 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4938 		return -EINVAL;
4939 
4940 	spin_lock_irqsave(&device_domain_lock, flags);
4941 	spin_lock(&iommu->lock);
4942 	info = get_domain_info(dev);
4943 	if (!info) {
4944 		ret = -EINVAL;
4945 		goto out_unlock;
4946 	}
4947 	did = dmar_domain->iommu_did[iommu->seq_id];
4948 	sid = PCI_DEVID(bus, devfn);
4949 
4950 	/* Size is only valid in address selective invalidation */
4951 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4952 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4953 				   inv_info->granu.addr_info.nb_granules);
4954 
4955 	for_each_set_bit(cache_type,
4956 			 (unsigned long *)&inv_info->cache,
4957 			 IOMMU_CACHE_INV_TYPE_NR) {
4958 		int granu = 0;
4959 		u64 pasid = 0;
4960 		u64 addr = 0;
4961 
4962 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4963 		if (granu == -EINVAL) {
4964 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4965 					   cache_type, inv_info->granularity);
4966 			break;
4967 		}
4968 
4969 		/*
4970 		 * PASID is stored in different locations based on the
4971 		 * granularity.
4972 		 */
4973 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4974 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4975 			pasid = inv_info->granu.pasid_info.pasid;
4976 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4977 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4978 			pasid = inv_info->granu.addr_info.pasid;
4979 
4980 		switch (BIT(cache_type)) {
4981 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4982 			/* HW will ignore LSB bits based on address mask */
4983 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4984 			    size &&
4985 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4986 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4987 						   inv_info->granu.addr_info.addr, size);
4988 			}
4989 
4990 			/*
4991 			 * If granu is PASID-selective, address is ignored.
4992 			 * We use npages = -1 to indicate that.
4993 			 */
4994 			qi_flush_piotlb(iommu, did, pasid,
4995 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4996 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4997 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4998 
4999 			if (!info->ats_enabled)
5000 				break;
5001 			/*
5002 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5003 			 * in the guest may assume IOTLB flush is inclusive,
5004 			 * which is more efficient.
5005 			 */
5006 			fallthrough;
5007 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5008 			/*
5009 			 * PASID based device TLB invalidation does not support
5010 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5011 			 * IOMMU_INV_GRANU_ADDR.
5012 			 * The equivalent of that is we set the size to be the
5013 			 * entire range of 64 bit. User only provides PASID info
5014 			 * without address info. So we set addr to 0.
5015 			 */
5016 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5017 				size = 64 - VTD_PAGE_SHIFT;
5018 				addr = 0;
5019 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5020 				addr = inv_info->granu.addr_info.addr;
5021 			}
5022 
5023 			if (info->ats_enabled)
5024 				qi_flush_dev_iotlb_pasid(iommu, sid,
5025 						info->pfsid, pasid,
5026 						info->ats_qdep, addr,
5027 						size);
5028 			else
5029 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5030 			break;
5031 		default:
5032 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5033 					    cache_type);
5034 			ret = -EINVAL;
5035 		}
5036 	}
5037 out_unlock:
5038 	spin_unlock(&iommu->lock);
5039 	spin_unlock_irqrestore(&device_domain_lock, flags);
5040 
5041 	return ret;
5042 }
5043 #endif
5044 
5045 static int intel_iommu_map(struct iommu_domain *domain,
5046 			   unsigned long iova, phys_addr_t hpa,
5047 			   size_t size, int iommu_prot, gfp_t gfp)
5048 {
5049 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5050 	u64 max_addr;
5051 	int prot = 0;
5052 
5053 	if (iommu_prot & IOMMU_READ)
5054 		prot |= DMA_PTE_READ;
5055 	if (iommu_prot & IOMMU_WRITE)
5056 		prot |= DMA_PTE_WRITE;
5057 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5058 		prot |= DMA_PTE_SNP;
5059 
5060 	max_addr = iova + size;
5061 	if (dmar_domain->max_addr < max_addr) {
5062 		u64 end;
5063 
5064 		/* check if minimum agaw is sufficient for mapped address */
5065 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5066 		if (end < max_addr) {
5067 			pr_err("%s: iommu width (%d) is not "
5068 			       "sufficient for the mapped address (%llx)\n",
5069 			       __func__, dmar_domain->gaw, max_addr);
5070 			return -EFAULT;
5071 		}
5072 		dmar_domain->max_addr = max_addr;
5073 	}
5074 	/* Round up size to next multiple of PAGE_SIZE, if it and
5075 	   the low bits of hpa would take us onto the next page */
5076 	size = aligned_nrpages(hpa, size);
5077 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5078 				hpa >> VTD_PAGE_SHIFT, size, prot);
5079 }
5080 
5081 static int intel_iommu_map_pages(struct iommu_domain *domain,
5082 				 unsigned long iova, phys_addr_t paddr,
5083 				 size_t pgsize, size_t pgcount,
5084 				 int prot, gfp_t gfp, size_t *mapped)
5085 {
5086 	unsigned long pgshift = __ffs(pgsize);
5087 	size_t size = pgcount << pgshift;
5088 	int ret;
5089 
5090 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5091 		return -EINVAL;
5092 
5093 	if (!IS_ALIGNED(iova | paddr, pgsize))
5094 		return -EINVAL;
5095 
5096 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5097 	if (!ret && mapped)
5098 		*mapped = size;
5099 
5100 	return ret;
5101 }
5102 
5103 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5104 				unsigned long iova, size_t size,
5105 				struct iommu_iotlb_gather *gather)
5106 {
5107 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5108 	unsigned long start_pfn, last_pfn;
5109 	int level = 0;
5110 
5111 	/* Cope with horrid API which requires us to unmap more than the
5112 	   size argument if it happens to be a large-page mapping. */
5113 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5114 
5115 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5116 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5117 
5118 	start_pfn = iova >> VTD_PAGE_SHIFT;
5119 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5120 
5121 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5122 					last_pfn, gather->freelist);
5123 
5124 	if (dmar_domain->max_addr == iova + size)
5125 		dmar_domain->max_addr = iova;
5126 
5127 	/*
5128 	 * We do not use page-selective IOTLB invalidation in flush queue,
5129 	 * so there is no need to track page and sync iotlb.
5130 	 */
5131 	if (!iommu_iotlb_gather_queued(gather))
5132 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
5133 
5134 	return size;
5135 }
5136 
5137 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5138 				      unsigned long iova,
5139 				      size_t pgsize, size_t pgcount,
5140 				      struct iommu_iotlb_gather *gather)
5141 {
5142 	unsigned long pgshift = __ffs(pgsize);
5143 	size_t size = pgcount << pgshift;
5144 
5145 	return intel_iommu_unmap(domain, iova, size, gather);
5146 }
5147 
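/*
 * Flush the IOTLB for the range gathered during unmap on every IOMMU
 * the domain is attached to, then free the collected page-table pages.
 */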
5148 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5149 				 struct iommu_iotlb_gather *gather)
5150 {
5151 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5152 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5153 	size_t size = gather->end - gather->start;
5154 	unsigned long start_pfn;
5155 	unsigned long nrpages;
5156 	int iommu_id;
5157 
5158 	nrpages = aligned_nrpages(gather->start, size);
5159 	start_pfn = mm_to_dma_pfn(iova_pfn);
5160 
5161 	for_each_domain_iommu(iommu_id, dmar_domain)
5162 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5163 				      start_pfn, nrpages, !gather->freelist, 0);
5164 
5165 	dma_free_pagelist(gather->freelist);
5166 }
5167 
5168 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5169 					    dma_addr_t iova)
5170 {
5171 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5172 	struct dma_pte *pte;
5173 	int level = 0;
5174 	u64 phys = 0;
5175 
5176 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5177 	if (pte && dma_pte_present(pte))
5178 		phys = dma_pte_addr(pte) +
5179 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5180 						VTD_PAGE_SHIFT) - 1));
5181 
5182 	return phys;
5183 }
5184 
5185 static bool intel_iommu_capable(enum iommu_cap cap)
5186 {
5187 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5188 		return domain_update_iommu_snooping(NULL);
5189 	if (cap == IOMMU_CAP_INTR_REMAP)
5190 		return irq_remapping_enabled == 1;
5191 
5192 	return false;
5193 }
5194 
5195 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5196 {
5197 	struct intel_iommu *iommu;
5198 
5199 	iommu = device_to_iommu(dev, NULL, NULL);
5200 	if (!iommu)
5201 		return ERR_PTR(-ENODEV);
5202 
5203 	if (translation_pre_enabled(iommu))
5204 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5205 
5206 	return &iommu->iommu;
5207 }
5208 
5209 static void intel_iommu_release_device(struct device *dev)
5210 {
5211 	struct intel_iommu *iommu;
5212 
5213 	iommu = device_to_iommu(dev, NULL, NULL);
5214 	if (!iommu)
5215 		return;
5216 
5217 	dmar_remove_one_dev_info(dev);
5218 
5219 	set_dma_ops(dev, NULL);
5220 }
5221 
5222 static void intel_iommu_probe_finalize(struct device *dev)
5223 {
5224 	set_dma_ops(dev, NULL);
5225 	iommu_setup_dma_ops(dev, 0, U64_MAX);
5226 }
5227 
5228 static void intel_iommu_get_resv_regions(struct device *device,
5229 					 struct list_head *head)
5230 {
5231 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5232 	struct iommu_resv_region *reg;
5233 	struct dmar_rmrr_unit *rmrr;
5234 	struct device *i_dev;
5235 	int i;
5236 
5237 	down_read(&dmar_global_lock);
5238 	for_each_rmrr_units(rmrr) {
5239 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5240 					  i, i_dev) {
5241 			struct iommu_resv_region *resv;
5242 			enum iommu_resv_type type;
5243 			size_t length;
5244 
5245 			if (i_dev != device &&
5246 			    !is_downstream_to_pci_bridge(device, i_dev))
5247 				continue;
5248 
5249 			length = rmrr->end_address - rmrr->base_address + 1;
5250 
5251 			type = device_rmrr_is_relaxable(device) ?
5252 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5253 
5254 			resv = iommu_alloc_resv_region(rmrr->base_address,
5255 						       length, prot, type);
5256 			if (!resv)
5257 				break;
5258 
5259 			list_add_tail(&resv->list, head);
5260 		}
5261 	}
5262 	up_read(&dmar_global_lock);
5263 
5264 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5265 	if (dev_is_pci(device)) {
5266 		struct pci_dev *pdev = to_pci_dev(device);
5267 
5268 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5269 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5270 						   IOMMU_RESV_DIRECT_RELAXABLE);
5271 			if (reg)
5272 				list_add_tail(&reg->list, head);
5273 		}
5274 	}
5275 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5276 
5277 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5278 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5279 				      0, IOMMU_RESV_MSI);
5280 	if (!reg)
5281 		return;
5282 	list_add_tail(&reg->list, head);
5283 }
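
/*
 * Illustrative sketch (not part of the driver): callers consume the
 * regions reported above roughly as follows; handle_range() is a
 * hypothetical stand-in for whatever the caller does with each range:
 *
 *	struct iommu_resv_region *region;
 *	LIST_HEAD(resv_regions);
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	list_for_each_entry(region, &resv_regions, list)
 *		handle_range(region->start, region->length);
 *	iommu_put_resv_regions(dev, &resv_regions);
 */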
5284 
5285 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5286 {
5287 	struct device_domain_info *info;
5288 	struct context_entry *context;
5289 	struct dmar_domain *domain;
5290 	unsigned long flags;
5291 	u64 ctx_lo;
5292 	int ret;
5293 
5294 	domain = find_domain(dev);
5295 	if (!domain)
5296 		return -EINVAL;
5297 
5298 	spin_lock_irqsave(&device_domain_lock, flags);
5299 	spin_lock(&iommu->lock);
5300 
5301 	ret = -EINVAL;
5302 	info = get_domain_info(dev);
5303 	if (!info || !info->pasid_supported)
5304 		goto out;
5305 
5306 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5307 	if (WARN_ON(!context))
5308 		goto out;
5309 
5310 	ctx_lo = context[0].lo;
5311 
5312 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5313 		ctx_lo |= CONTEXT_PASIDE;
5314 		context[0].lo = ctx_lo;
5315 		wmb();
5316 		iommu->flush.flush_context(iommu,
5317 					   domain->iommu_did[iommu->seq_id],
5318 					   PCI_DEVID(info->bus, info->devfn),
5319 					   DMA_CCMD_MASK_NOBIT,
5320 					   DMA_CCMD_DEVICE_INVL);
5321 	}
5322 
5323 	/* Enable PASID support in the device, if it wasn't already */
5324 	if (!info->pasid_enabled)
5325 		iommu_enable_dev_iotlb(info);
5326 
5327 	ret = 0;
5328 
5329  out:
5330 	spin_unlock(&iommu->lock);
5331 	spin_unlock_irqrestore(&device_domain_lock, flags);
5332 
5333 	return ret;
5334 }
5335 
5336 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5337 {
5338 	if (dev_is_pci(dev))
5339 		return pci_device_group(dev);
5340 	return generic_device_group(dev);
5341 }
5342 
5343 static int intel_iommu_enable_auxd(struct device *dev)
5344 {
5345 	struct device_domain_info *info;
5346 	struct intel_iommu *iommu;
5347 	unsigned long flags;
5348 	int ret;
5349 
5350 	iommu = device_to_iommu(dev, NULL, NULL);
5351 	if (!iommu || dmar_disabled)
5352 		return -EINVAL;
5353 
5354 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5355 		return -EINVAL;
5356 
5357 	ret = intel_iommu_enable_pasid(iommu, dev);
5358 	if (ret)
5359 		return -ENODEV;
5360 
5361 	spin_lock_irqsave(&device_domain_lock, flags);
5362 	info = get_domain_info(dev);
5363 	info->auxd_enabled = 1;
5364 	spin_unlock_irqrestore(&device_domain_lock, flags);
5365 
5366 	return 0;
5367 }
5368 
5369 static int intel_iommu_disable_auxd(struct device *dev)
5370 {
5371 	struct device_domain_info *info;
5372 	unsigned long flags;
5373 
5374 	spin_lock_irqsave(&device_domain_lock, flags);
5375 	info = get_domain_info(dev);
5376 	if (!WARN_ON(!info))
5377 		info->auxd_enabled = 0;
5378 	spin_unlock_irqrestore(&device_domain_lock, flags);
5379 
5380 	return 0;
5381 }
5382 
5383 static int intel_iommu_enable_sva(struct device *dev)
5384 {
5385 	struct device_domain_info *info = get_domain_info(dev);
5386 	struct intel_iommu *iommu;
5387 	int ret;
5388 
5389 	if (!info || dmar_disabled)
5390 		return -EINVAL;
5391 
5392 	iommu = info->iommu;
5393 	if (!iommu)
5394 		return -EINVAL;
5395 
5396 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5397 		return -ENODEV;
5398 
5399 	if (intel_iommu_enable_pasid(iommu, dev))
5400 		return -ENODEV;
5401 
5402 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5403 		return -EINVAL;
5404 
5405 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5406 	if (ret)
5407 		return ret;
5408 
5409 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5410 	if (ret)
5411 		iopf_queue_remove_device(iommu->iopf_queue, dev);
5412 
5413 	return ret;
5414 }
5415 
5416 static int intel_iommu_disable_sva(struct device *dev)
5417 {
5418 	struct device_domain_info *info = get_domain_info(dev);
5419 	struct intel_iommu *iommu = info->iommu;
5420 	int ret;
5421 
5422 	ret = iommu_unregister_device_fault_handler(dev);
5423 	if (ret)
5424 		return ret;
5425 
5426 	ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5427 	if (ret)
5428 		iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5429 
5430 	return ret;
5431 }
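
/*
 * Illustrative sketch (not part of the driver): a device driver that wants
 * shared virtual addressing reaches the enable/disable helpers above via
 * the generic feature and bind API, roughly:
 *
 *	struct iommu_sva *handle;
 *	u32 pasid;
 *
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA))
 *		return -ENODEV;
 *	handle = iommu_sva_bind_device(dev, current->mm, NULL);
 *	if (IS_ERR(handle))
 *		goto err_disable_sva;
 *	pasid = iommu_sva_get_pasid(handle);
 *	...
 *	iommu_sva_unbind_device(handle);
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
 */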
5432 
5433 /*
5434  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5435  * defined in Section 3.7 of the Intel Scalable I/O Virtualization technical
5436  * spec so that system software and tools can detect endpoint devices that
5437  * support Intel Scalable I/O Virtualization without a host driver dependency.
5438  *
5439  * Returns the offset of the matching extended capability structure within
5440  * the device's PCI configuration space, or 0 if the device does not
5441  * support it.
5442  */
5443 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5444 {
5445 	int pos;
5446 	u16 vendor, id;
5447 
5448 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5449 	while (pos) {
5450 		pci_read_config_word(pdev, pos + 4, &vendor);
5451 		pci_read_config_word(pdev, pos + 8, &id);
5452 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5453 			return pos;
5454 
5455 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5456 	}
5457 
5458 	return 0;
5459 }
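
/*
 * DVSEC layout that the walker above relies on (offsets relative to the
 * position returned by pci_find_next_ext_capability(), per the PCIe
 * DVSEC definition and the SIOV spec):
 *
 *	pos + 0x0: extended capability header (capability ID 0x23 = DVSEC)
 *	pos + 0x4: DVSEC header 1, bits 15:0 = vendor ID
 *	pos + 0x8: DVSEC header 2, bits 15:0 = DVSEC ID (5 = Scalable IOV)
 */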
5460 
5461 static bool
5462 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5463 {
5464 	struct device_domain_info *info = get_domain_info(dev);
5465 
5466 	if (feat == IOMMU_DEV_FEAT_AUX) {
5467 		int ret;
5468 
5469 		if (!dev_is_pci(dev) || dmar_disabled ||
5470 		    !scalable_mode_support() || !pasid_mode_support())
5471 			return false;
5472 
5473 		ret = pci_pasid_features(to_pci_dev(dev));
5474 		if (ret < 0)
5475 			return false;
5476 
5477 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5478 	}
5479 
5480 	if (feat == IOMMU_DEV_FEAT_IOPF)
5481 		return info && info->pri_supported;
5482 
5483 	if (feat == IOMMU_DEV_FEAT_SVA)
5484 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5485 			info->pasid_supported && info->pri_supported &&
5486 			info->ats_supported;
5487 
5488 	return false;
5489 }
5490 
5491 static int
5492 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5493 {
5494 	switch (feat) {
5495 	case IOMMU_DEV_FEAT_AUX:
5496 		return intel_iommu_enable_auxd(dev);
5497 
5498 	case IOMMU_DEV_FEAT_IOPF:
5499 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5500 
5501 	case IOMMU_DEV_FEAT_SVA:
5502 		return intel_iommu_enable_sva(dev);
5503 
5504 	default:
5505 		return -ENODEV;
5506 	}
5507 }
5508 
5509 static int
5510 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5511 {
5512 	switch (feat) {
5513 	case IOMMU_DEV_FEAT_AUX:
5514 		return intel_iommu_disable_auxd(dev);
5515 
5516 	case IOMMU_DEV_FEAT_IOPF:
5517 		return 0;
5518 
5519 	case IOMMU_DEV_FEAT_SVA:
5520 		return intel_iommu_disable_sva(dev);
5521 
5522 	default:
5523 		return -ENODEV;
5524 	}
5525 }
5526 
5527 static bool
5528 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5529 {
5530 	struct device_domain_info *info = get_domain_info(dev);
5531 
5532 	if (feat == IOMMU_DEV_FEAT_AUX)
5533 		return scalable_mode_support() && info && info->auxd_enabled;
5534 
5535 	return false;
5536 }
5537 
5538 static int
5539 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5540 {
5541 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5542 
5543 	return dmar_domain->default_pasid > 0 ?
5544 			dmar_domain->default_pasid : -EINVAL;
5545 }
5546 
5547 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5548 					   struct device *dev)
5549 {
5550 	return attach_deferred(dev);
5551 }
5552 
5553 static int
5554 intel_iommu_enable_nesting(struct iommu_domain *domain)
5555 {
5556 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5557 	unsigned long flags;
5558 	int ret = -ENODEV;
5559 
5560 	spin_lock_irqsave(&device_domain_lock, flags);
5561 	if (list_empty(&dmar_domain->devices)) {
5562 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5563 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5564 		ret = 0;
5565 	}
5566 	spin_unlock_irqrestore(&device_domain_lock, flags);
5567 
5568 	return ret;
5569 }
5570 
5571 /*
5572  * Check that the device does not live on an external-facing PCI port that is
5573  * marked as untrusted. Such devices should not be allowed to apply quirks,
5574  * and thus should not be able to bypass IOMMU restrictions.
5575  */
5576 static bool risky_device(struct pci_dev *pdev)
5577 {
5578 	if (pdev->untrusted) {
5579 		pci_info(pdev,
5580 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5581 			 pdev->vendor, pdev->device);
5582 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5583 		return true;
5584 	}
5585 	return false;
5586 }
5587 
5588 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5589 				       unsigned long iova, size_t size)
5590 {
5591 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5592 	unsigned long pages = aligned_nrpages(iova, size);
5593 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5594 	struct intel_iommu *iommu;
5595 	int iommu_id;
5596 
5597 	for_each_domain_iommu(iommu_id, dmar_domain) {
5598 		iommu = g_iommus[iommu_id];
5599 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5600 	}
5601 }
5602 
5603 const struct iommu_ops intel_iommu_ops = {
5604 	.capable		= intel_iommu_capable,
5605 	.domain_alloc		= intel_iommu_domain_alloc,
5606 	.domain_free		= intel_iommu_domain_free,
5607 	.enable_nesting		= intel_iommu_enable_nesting,
5608 	.attach_dev		= intel_iommu_attach_device,
5609 	.detach_dev		= intel_iommu_detach_device,
5610 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5611 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5612 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5613 	.map_pages		= intel_iommu_map_pages,
5614 	.unmap_pages		= intel_iommu_unmap_pages,
5615 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5616 	.flush_iotlb_all        = intel_flush_iotlb_all,
5617 	.iotlb_sync		= intel_iommu_tlb_sync,
5618 	.iova_to_phys		= intel_iommu_iova_to_phys,
5619 	.probe_device		= intel_iommu_probe_device,
5620 	.probe_finalize		= intel_iommu_probe_finalize,
5621 	.release_device		= intel_iommu_release_device,
5622 	.get_resv_regions	= intel_iommu_get_resv_regions,
5623 	.put_resv_regions	= generic_iommu_put_resv_regions,
5624 	.device_group		= intel_iommu_device_group,
5625 	.dev_has_feat		= intel_iommu_dev_has_feat,
5626 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5627 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5628 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5629 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5630 	.def_domain_type	= device_def_domain_type,
5631 	.pgsize_bitmap		= SZ_4K,
5632 #ifdef CONFIG_INTEL_IOMMU_SVM
5633 	.cache_invalidate	= intel_iommu_sva_invalidate,
5634 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5635 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5636 	.sva_bind		= intel_svm_bind,
5637 	.sva_unbind		= intel_svm_unbind,
5638 	.sva_get_pasid		= intel_svm_get_pasid,
5639 	.page_response		= intel_svm_page_response,
5640 #endif
5641 };
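
/*
 * Illustrative note (sketch of a call site elsewhere in this file, not a
 * new one): this ops table is registered with the IOMMU core for PCI
 * devices roughly as
 *
 *	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 *
 * after which the core dispatches the per-device and per-domain callbacks
 * listed above into this driver.
 */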
5642 
5643 static void quirk_iommu_igfx(struct pci_dev *dev)
5644 {
5645 	if (risky_device(dev))
5646 		return;
5647 
5648 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5649 	dmar_map_gfx = 0;
5650 }
5651 
5652 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5660 
5661 /* Broadwell igfx malfunctions with dmar */
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5680 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5681 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5682 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5683 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5684 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5685 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5686 
5687 static void quirk_iommu_rwbf(struct pci_dev *dev)
5688 {
5689 	if (risky_device(dev))
5690 		return;
5691 
5692 	/*
5693 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5694 	 * but needs it. Same seems to hold for the desktop versions.
5695 	 */
5696 	pci_info(dev, "Forcing write-buffer flush capability\n");
5697 	rwbf_quirk = 1;
5698 }
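
/*
 * Illustrative sketch (of code elsewhere in this file): rwbf_quirk is
 * consumed by the write-buffer flush path, roughly
 *
 *	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 *		return;
 *
 * and otherwise a DMA_GCMD_WBF write-buffer flush is issued.
 */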
5699 
5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5707 
5708 #define GGC 0x52
5709 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5710 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5711 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5712 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5713 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5714 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5715 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5716 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5717 
5718 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5719 {
5720 	unsigned short ggc;
5721 
5722 	if (risky_device(dev))
5723 		return;
5724 
5725 	if (pci_read_config_word(dev, GGC, &ggc))
5726 		return;
5727 
5728 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5729 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5730 		dmar_map_gfx = 0;
5731 	} else if (dmar_map_gfx) {
5732 		/* we have to ensure the gfx device is idle before we flush */
5733 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5734 		iommu_set_dma_strict();
5735 	}
5736 }
5737 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5738 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5739 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5740 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5741 
5742 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5743 {
5744 	unsigned short ver;
5745 
5746 	if (!IS_GFX_DEVICE(dev))
5747 		return;
5748 
5749 	ver = (dev->device >> 8) & 0xff;
5750 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5751 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5752 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
5753 		return;
5754 
5755 	if (risky_device(dev))
5756 		return;
5757 
5758 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5759 	iommu_skip_te_disable = 1;
5760 }
5761 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5762 
5763 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5764    ISOCH DMAR unit for the Azalia sound device, but not give it any
5765    TLB entries, which causes it to deadlock. Check for that.  We do
5766    this in a function called from init_dmars(), instead of in a PCI
5767    quirk, because we don't want to print the obnoxious "BIOS broken"
5768    message if VT-d is actually disabled.
5769 */
5770 static void __init check_tylersburg_isoch(void)
5771 {
5772 	struct pci_dev *pdev;
5773 	uint32_t vtisochctrl;
5774 
5775 	/* If there's no Azalia in the system anyway, forget it. */
5776 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5777 	if (!pdev)
5778 		return;
5779 
5780 	if (risky_device(pdev)) {
5781 		pci_dev_put(pdev);
5782 		return;
5783 	}
5784 
5785 	pci_dev_put(pdev);
5786 
5787 	/* System Management Registers. Might be hidden, in which case
5788 	   we can't do the sanity check. But that's OK, because the
5789 	   known-broken BIOSes _don't_ actually hide it, so far. */
5790 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5791 	if (!pdev)
5792 		return;
5793 
5794 	if (risky_device(pdev)) {
5795 		pci_dev_put(pdev);
5796 		return;
5797 	}
5798 
5799 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5800 		pci_dev_put(pdev);
5801 		return;
5802 	}
5803 
5804 	pci_dev_put(pdev);
5805 
5806 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5807 	if (vtisochctrl & 1)
5808 		return;
5809 
5810 	/* Drop all bits other than the number of TLB entries */
5811 	vtisochctrl &= 0x1c;
5812 
5813 	/* If we have the recommended number of TLB entries (16), fine. */
5814 	if (vtisochctrl == 0x10)
5815 		return;
5816 
5817 	/* Zero TLB entries? You get to ride the short bus to school. */
5818 	if (!vtisochctrl) {
5819 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5820 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5821 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5822 		     dmi_get_system_info(DMI_BIOS_VERSION),
5823 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5824 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5825 		return;
5826 	}
5827 
5828 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5829 	       vtisochctrl);
5830 }
5831