1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-map-ops.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
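/*
 * Worked example of the clamping above: with the default gaw of 57,
 * __DOMAIN_MAX_PFN(57) = 2^45 - 1, which fits in an unsigned long on a
 * 64-bit kernel, so DOMAIN_MAX_PFN(57) is 2^45 - 1 and DOMAIN_MAX_ADDR(57)
 * is (2^45 - 1) << 12 = 2^57 - 4096. On a 32-bit kernel the min_t() would
 * clamp DOMAIN_MAX_PFN to ULONG_MAX instead.
 */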
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
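/*
 * For example, ~0xFFFUL leaves bits 0-11 clear and every bit from 12 upward
 * set, so the IOMMU core sees support for 4KiB, 8KiB, 16KiB, ... i.e. every
 * power-of-two size that is a multiple of 4KiB, and will split a physically
 * contiguous mapping request into chunks of those sizes.
 */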
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
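/*
 * Worked example for the three helpers above: an agaw of 2 corresponds to
 * agaw_to_level(2) = 4 page-table levels and agaw_to_width(2) =
 * min(30 + 2 * 9, 64) = 48 bits of address width; width_to_agaw(48) =
 * DIV_ROUND_UP(18, 9) = 2 inverts the mapping. Likewise agaw 3 gives
 * 5 levels and a 57-bit width.
 */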
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
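/*
 * Example: at level 2, level_to_offset_bits() is 9, so pfn_level_offset()
 * extracts bits 17:9 of the pfn, level_size() and lvl_to_nr_pages() are both
 * 512 pages (2MiB with 4KiB VT-d pages), level_mask() clears the low 9 bits,
 * and align_to_level(pfn, 2) rounds pfn up to the next multiple of 512.
 */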
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
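/*
 * On x86, PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so the two conversions
 * above are identity operations; the shifts only matter on a (hypothetical)
 * configuration with MM pages larger than 4KiB, where one MM pfn would span
 * several VT-d pfns.
 */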
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
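/*
 * The accessors above imply the context-entry layout used here: lo bit 0 is
 * the present flag, lo bit 1 is fault-processing disable (cleared by
 * context_set_fault_enable()), lo bits 3:2 select the translation type, and
 * lo bits 63:12 hold the address root. In the high word, bits 2:0 encode the
 * address width and bits 23:8 the domain id, so e.g.
 * context_set_domain_id(c, 42) stores 42 in hi bits 23:8 and
 * context_domain_id(c) reads it back.
 */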
289 
290 /*
291  * This domain is a static identity mapping domain.
292  *	1. This domain creates a static 1:1 mapping to all usable memory.
293  * 	2. It maps to each iommu if successful.
294  *	3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322 
323 #define for_each_rmrr_units(rmrr) \
324 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325 
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343 
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349 
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352 
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360 
361 #define IDENTMAP_GFX		2
362 #define IDENTMAP_AZALIA		4
363 
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 	struct device_domain_info *info;
371 
372 	if (!dev)
373 		return NULL;
374 
375 	info = dev_iommu_priv_get(dev);
376 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 		return NULL;
378 
379 	return info;
380 }
381 
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384 
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
386 				to_pci_dev(d)->untrusted)
387 
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 				     void *data), void *data)
394 {
395 	int ret = 0;
396 	unsigned long flags;
397 	struct device_domain_info *info;
398 
399 	spin_lock_irqsave(&device_domain_lock, flags);
400 	list_for_each_entry(info, &device_domain_list, global) {
401 		ret = fn(info, data);
402 		if (ret) {
403 			spin_unlock_irqrestore(&device_domain_lock, flags);
404 			return ret;
405 		}
406 	}
407 	spin_unlock_irqrestore(&device_domain_lock, flags);
408 
409 	return 0;
410 }
411 
412 const struct iommu_ops intel_iommu_ops;
413 
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418 
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423 
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426 	u32 gsts;
427 
428 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 	if (gsts & DMA_GSTS_TES)
430 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432 
433 static int __init intel_iommu_setup(char *str)
434 {
435 	if (!str)
436 		return -EINVAL;
437 	while (*str) {
438 		if (!strncmp(str, "on", 2)) {
439 			dmar_disabled = 0;
440 			pr_info("IOMMU enabled\n");
441 		} else if (!strncmp(str, "off", 3)) {
442 			dmar_disabled = 1;
443 			no_platform_optin = 1;
444 			pr_info("IOMMU disabled\n");
445 		} else if (!strncmp(str, "igfx_off", 8)) {
446 			dmar_map_gfx = 0;
447 			pr_info("Disable GFX device mapping\n");
448 		} else if (!strncmp(str, "forcedac", 8)) {
449 			pr_info("Forcing DAC for PCI devices\n");
450 			dmar_forcedac = 1;
451 		} else if (!strncmp(str, "strict", 6)) {
452 			pr_info("Disable batched IOTLB flush\n");
453 			intel_iommu_strict = 1;
454 		} else if (!strncmp(str, "sp_off", 6)) {
455 			pr_info("Disable supported super page\n");
456 			intel_iommu_superpage = 0;
457 		} else if (!strncmp(str, "sm_on", 5)) {
458 			pr_info("Intel-IOMMU: scalable mode supported\n");
459 			intel_iommu_sm = 1;
460 		} else if (!strncmp(str, "tboot_noforce", 13)) {
461 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 			intel_iommu_tboot_noforce = 1;
463 		} else if (!strncmp(str, "nobounce", 8)) {
464 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 			intel_no_bounce = 1;
466 		}
467 
468 		str += strcspn(str, ",");
469 		while (*str == ',')
470 			str++;
471 	}
472 	return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475 
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478 
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481 	struct dmar_domain **domains;
482 	int idx = did >> 8;
483 
484 	domains = iommu->domains[idx];
485 	if (!domains)
486 		return NULL;
487 
488 	return domains[did & 0xff];
489 }
490 
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 			     struct dmar_domain *domain)
493 {
494 	struct dmar_domain **domains;
495 	int idx = did >> 8;
496 
497 	if (!iommu->domains[idx]) {
498 		size_t size = 256 * sizeof(struct dmar_domain *);
499 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 	}
501 
502 	domains = iommu->domains[idx];
503 	if (WARN_ON(!domains))
504 		return;
505 	else
506 		domains[did & 0xff] = domain;
507 }
508 
509 void *alloc_pgtable_page(int node)
510 {
511 	struct page *page;
512 	void *vaddr = NULL;
513 
514 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 	if (page)
516 		vaddr = page_address(page);
517 	return vaddr;
518 }
519 
520 void free_pgtable_page(void *vaddr)
521 {
522 	free_page((unsigned long)vaddr);
523 }
524 
525 static inline void *alloc_domain_mem(void)
526 {
527 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529 
530 static void free_domain_mem(void *vaddr)
531 {
532 	kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534 
535 static inline void * alloc_devinfo_mem(void)
536 {
537 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539 
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542 	kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544 
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549 
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554 
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 				       unsigned long pfn)
557 {
558 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559 
560 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562 
563 /*
564  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
565  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
566  * the returned SAGAW.
567  */
568 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
569 {
570 	unsigned long fl_sagaw, sl_sagaw;
571 
572 	fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
573 	sl_sagaw = cap_sagaw(iommu->cap);
574 
575 	/* Second level only. */
576 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
577 		return sl_sagaw;
578 
579 	/* First level only. */
580 	if (!ecap_slts(iommu->ecap))
581 		return fl_sagaw;
582 
583 	return fl_sagaw & sl_sagaw;
584 }
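/*
 * Example: first-level translation only supports 4-level (SAGAW bit 2,
 * 48-bit) and, when cap_5lp_support() is set, 5-level (bit 3, 57-bit) tables.
 * On hardware reporting CAP.SAGAW = 0x4 with scalable mode and both
 * translation types available, fl_sagaw and sl_sagaw are both 0x4, so the
 * function returns 0x4 and __iommu_calculate_agaw() below settles on agaw 2,
 * i.e. 48-bit, 4-level page tables.
 */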
585 
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588 	unsigned long sagaw;
589 	int agaw = -1;
590 
591 	sagaw = __iommu_calculate_sagaw(iommu);
592 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
593 		if (test_bit(agaw, &sagaw))
594 			break;
595 	}
596 
597 	return agaw;
598 }
599 
600 /*
601  * Calculate max SAGAW for each iommu.
602  */
603 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
604 {
605 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
606 }
607 
608 /*
609  * calculate agaw for each iommu.
610  * "SAGAW" may be different across iommus, use a default agaw, and
611  * get a supported less agaw for iommus that don't support the default agaw.
612  */
613 int iommu_calculate_agaw(struct intel_iommu *iommu)
614 {
615 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
616 }
617 
618 /* This function only returns a single iommu in a domain */
619 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
620 {
621 	int iommu_id;
622 
623 	/* si_domain and vm domain should not get here. */
624 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
625 		return NULL;
626 
627 	for_each_domain_iommu(iommu_id, domain)
628 		break;
629 
630 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
631 		return NULL;
632 
633 	return g_iommus[iommu_id];
634 }
635 
636 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
637 {
638 	return sm_supported(iommu) ?
639 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
640 }
641 
642 static void domain_update_iommu_coherency(struct dmar_domain *domain)
643 {
644 	struct dmar_drhd_unit *drhd;
645 	struct intel_iommu *iommu;
646 	bool found = false;
647 	int i;
648 
649 	domain->iommu_coherency = 1;
650 
651 	for_each_domain_iommu(i, domain) {
652 		found = true;
653 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
654 			domain->iommu_coherency = 0;
655 			break;
656 		}
657 	}
658 	if (found)
659 		return;
660 
661 	/* No hardware attached; use lowest common denominator */
662 	rcu_read_lock();
663 	for_each_active_iommu(iommu, drhd) {
664 		if (!iommu_paging_structure_coherency(iommu)) {
665 			domain->iommu_coherency = 0;
666 			break;
667 		}
668 	}
669 	rcu_read_unlock();
670 }
671 
672 static int domain_update_iommu_snooping(struct intel_iommu *skip)
673 {
674 	struct dmar_drhd_unit *drhd;
675 	struct intel_iommu *iommu;
676 	int ret = 1;
677 
678 	rcu_read_lock();
679 	for_each_active_iommu(iommu, drhd) {
680 		if (iommu != skip) {
681 			/*
682 			 * If the hardware is operating in the scalable mode,
683 			 * the snooping control is always supported since we
684 			 * always set PASID-table-entry.PGSNP bit if the domain
685 			 * is managed outside (UNMANAGED).
686 			 */
687 			if (!sm_supported(iommu) &&
688 			    !ecap_sc_support(iommu->ecap)) {
689 				ret = 0;
690 				break;
691 			}
692 		}
693 	}
694 	rcu_read_unlock();
695 
696 	return ret;
697 }
698 
699 static int domain_update_iommu_superpage(struct dmar_domain *domain,
700 					 struct intel_iommu *skip)
701 {
702 	struct dmar_drhd_unit *drhd;
703 	struct intel_iommu *iommu;
704 	int mask = 0x3;
705 
706 	if (!intel_iommu_superpage) {
707 		return 0;
708 	}
709 
710 	/* set iommu_superpage to the smallest common denominator */
711 	rcu_read_lock();
712 	for_each_active_iommu(iommu, drhd) {
713 		if (iommu != skip) {
714 			if (domain && domain_use_first_level(domain)) {
715 				if (!cap_fl1gp_support(iommu->cap))
716 					mask = 0x1;
717 			} else {
718 				mask &= cap_super_page_val(iommu->cap);
719 			}
720 
721 			if (!mask)
722 				break;
723 		}
724 	}
725 	rcu_read_unlock();
726 
727 	return fls(mask);
728 }
729 
730 static int domain_update_device_node(struct dmar_domain *domain)
731 {
732 	struct device_domain_info *info;
733 	int nid = NUMA_NO_NODE;
734 
735 	assert_spin_locked(&device_domain_lock);
736 
737 	if (list_empty(&domain->devices))
738 		return NUMA_NO_NODE;
739 
740 	list_for_each_entry(info, &domain->devices, link) {
741 		if (!info->dev)
742 			continue;
743 
744 		/*
745 		 * There may be multiple device NUMA nodes, as devices within
746 		 * the same domain can sit behind different IOMMUs. There is no
747 		 * perfect answer in such a situation, so we use a first-come,
748 		 * first-served policy.
749 		 */
750 		nid = dev_to_node(info->dev);
751 		if (nid != NUMA_NO_NODE)
752 			break;
753 	}
754 
755 	return nid;
756 }
757 
758 /* Some capabilities may be different across iommus */
759 static void domain_update_iommu_cap(struct dmar_domain *domain)
760 {
761 	domain_update_iommu_coherency(domain);
762 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
764 
765 	/*
766 	 * If RHSA is missing, we should default to the device numa domain
767 	 * as fall back.
768 	 */
769 	if (domain->nid == NUMA_NO_NODE)
770 		domain->nid = domain_update_device_node(domain);
771 
772 	/*
773 	 * First-level translation restricts the input-address to a
774 	 * canonical address (i.e., address bits 63:N have the same
775 	 * value as address bit [N-1], where N is 48-bits with 4-level
776 	 * paging and 57-bits with 5-level paging). Hence, skip bit
777 	 * [N-1].
778 	 */
779 	if (domain_use_first_level(domain))
780 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
781 	else
782 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
783 }
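/*
 * Example of the aperture adjustment above: for a first-level domain with
 * gaw = 48, the aperture ends at __DOMAIN_MAX_ADDR(47) = 2^47 - 1, because
 * bit 47 must match the sign-extended upper bits of a canonical address,
 * whereas a second-level domain with the same gaw can use the full
 * 2^48 - 1 range.
 */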
784 
785 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
786 					 u8 devfn, int alloc)
787 {
788 	struct root_entry *root = &iommu->root_entry[bus];
789 	struct context_entry *context;
790 	u64 *entry;
791 
792 	entry = &root->lo;
793 	if (sm_supported(iommu)) {
794 		if (devfn >= 0x80) {
795 			devfn -= 0x80;
796 			entry = &root->hi;
797 		}
798 		devfn *= 2;
799 	}
800 	if (*entry & 1)
801 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
802 	else {
803 		unsigned long phy_addr;
804 		if (!alloc)
805 			return NULL;
806 
807 		context = alloc_pgtable_page(iommu->node);
808 		if (!context)
809 			return NULL;
810 
811 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
812 		phy_addr = virt_to_phys((void *)context);
813 		*entry = phy_addr | 1;
814 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
815 	}
816 	return &context[devfn];
817 }
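/*
 * Scalable-mode example for the lookup above: a scalable-mode context entry
 * spans two legacy-sized slots (hence devfn *= 2), and each root entry is
 * split so that root->lo covers devfn 0x00-0x7f and root->hi covers devfn
 * 0x80-0xff. For devfn 0x85 the code selects root->hi, rebases devfn to
 * 0x05 and returns &context[10] in that upper context table.
 */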
818 
819 static bool attach_deferred(struct device *dev)
820 {
821 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
822 }
823 
824 /**
825  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
826  *				 sub-hierarchy of a candidate PCI-PCI bridge
827  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
828  * @bridge: the candidate PCI-PCI bridge
829  *
830  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
831  */
832 static bool
833 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
834 {
835 	struct pci_dev *pdev, *pbridge;
836 
837 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
838 		return false;
839 
840 	pdev = to_pci_dev(dev);
841 	pbridge = to_pci_dev(bridge);
842 
843 	if (pbridge->subordinate &&
844 	    pbridge->subordinate->number <= pdev->bus->number &&
845 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
846 		return true;
847 
848 	return false;
849 }
850 
851 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
852 {
853 	struct dmar_drhd_unit *drhd;
854 	u32 vtbar;
855 	int rc;
856 
857 	/* We know that this device on this chipset has its own IOMMU.
858 	 * If we find it under a different IOMMU, then the BIOS is lying
859 	 * to us. Hope that the IOMMU for this device is actually
860 	 * disabled, and it needs no translation...
861 	 */
862 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
863 	if (rc) {
864 		/* "can't" happen */
865 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
866 		return false;
867 	}
868 	vtbar &= 0xffff0000;
869 
870 	/* we know that this iommu should be at offset 0xa000 from vtbar */
871 	drhd = dmar_find_matched_drhd_unit(pdev);
872 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
873 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
874 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
875 		return true;
876 	}
877 
878 	return false;
879 }
880 
881 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
882 {
883 	if (!iommu || iommu->drhd->ignored)
884 		return true;
885 
886 	if (dev_is_pci(dev)) {
887 		struct pci_dev *pdev = to_pci_dev(dev);
888 
889 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
890 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
891 		    quirk_ioat_snb_local_iommu(pdev))
892 			return true;
893 	}
894 
895 	return false;
896 }
897 
898 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
899 {
900 	struct dmar_drhd_unit *drhd = NULL;
901 	struct pci_dev *pdev = NULL;
902 	struct intel_iommu *iommu;
903 	struct device *tmp;
904 	u16 segment = 0;
905 	int i;
906 
907 	if (!dev)
908 		return NULL;
909 
910 	if (dev_is_pci(dev)) {
911 		struct pci_dev *pf_pdev;
912 
913 		pdev = pci_real_dma_dev(to_pci_dev(dev));
914 
915 		/* VFs aren't listed in scope tables; we need to look up
916 		 * the PF instead to find the IOMMU. */
917 		pf_pdev = pci_physfn(pdev);
918 		dev = &pf_pdev->dev;
919 		segment = pci_domain_nr(pdev->bus);
920 	} else if (has_acpi_companion(dev))
921 		dev = &ACPI_COMPANION(dev)->dev;
922 
923 	rcu_read_lock();
924 	for_each_iommu(iommu, drhd) {
925 		if (pdev && segment != drhd->segment)
926 			continue;
927 
928 		for_each_active_dev_scope(drhd->devices,
929 					  drhd->devices_cnt, i, tmp) {
930 			if (tmp == dev) {
931 				/* For a VF use its original BDF# not that of the PF
932 				 * which we used for the IOMMU lookup. Strictly speaking
933 				 * we could do this for all PCI devices; we only need to
934 				 * get the BDF# from the scope table for ACPI matches. */
935 				if (pdev && pdev->is_virtfn)
936 					goto got_pdev;
937 
938 				if (bus && devfn) {
939 					*bus = drhd->devices[i].bus;
940 					*devfn = drhd->devices[i].devfn;
941 				}
942 				goto out;
943 			}
944 
945 			if (is_downstream_to_pci_bridge(dev, tmp))
946 				goto got_pdev;
947 		}
948 
949 		if (pdev && drhd->include_all) {
950 		got_pdev:
951 			if (bus && devfn) {
952 				*bus = pdev->bus->number;
953 				*devfn = pdev->devfn;
954 			}
955 			goto out;
956 		}
957 	}
958 	iommu = NULL;
959  out:
960 	if (iommu_is_dummy(iommu, dev))
961 		iommu = NULL;
962 
963 	rcu_read_unlock();
964 
965 	return iommu;
966 }
967 
968 static void domain_flush_cache(struct dmar_domain *domain,
969 			       void *addr, int size)
970 {
971 	if (!domain->iommu_coherency)
972 		clflush_cache_range(addr, size);
973 }
974 
975 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
976 {
977 	struct context_entry *context;
978 	int ret = 0;
979 	unsigned long flags;
980 
981 	spin_lock_irqsave(&iommu->lock, flags);
982 	context = iommu_context_addr(iommu, bus, devfn, 0);
983 	if (context)
984 		ret = context_present(context);
985 	spin_unlock_irqrestore(&iommu->lock, flags);
986 	return ret;
987 }
988 
989 static void free_context_table(struct intel_iommu *iommu)
990 {
991 	int i;
992 	unsigned long flags;
993 	struct context_entry *context;
994 
995 	spin_lock_irqsave(&iommu->lock, flags);
996 	if (!iommu->root_entry) {
997 		goto out;
998 	}
999 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
1000 		context = iommu_context_addr(iommu, i, 0, 0);
1001 		if (context)
1002 			free_pgtable_page(context);
1003 
1004 		if (!sm_supported(iommu))
1005 			continue;
1006 
1007 		context = iommu_context_addr(iommu, i, 0x80, 0);
1008 		if (context)
1009 			free_pgtable_page(context);
1010 
1011 	}
1012 	free_pgtable_page(iommu->root_entry);
1013 	iommu->root_entry = NULL;
1014 out:
1015 	spin_unlock_irqrestore(&iommu->lock, flags);
1016 }
1017 
1018 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1019 				      unsigned long pfn, int *target_level)
1020 {
1021 	struct dma_pte *parent, *pte;
1022 	int level = agaw_to_level(domain->agaw);
1023 	int offset;
1024 
1025 	BUG_ON(!domain->pgd);
1026 
1027 	if (!domain_pfn_supported(domain, pfn))
1028 		/* Address beyond IOMMU's addressing capabilities. */
1029 		return NULL;
1030 
1031 	parent = domain->pgd;
1032 
1033 	while (1) {
1034 		void *tmp_page;
1035 
1036 		offset = pfn_level_offset(pfn, level);
1037 		pte = &parent[offset];
1038 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1039 			break;
1040 		if (level == *target_level)
1041 			break;
1042 
1043 		if (!dma_pte_present(pte)) {
1044 			uint64_t pteval;
1045 
1046 			tmp_page = alloc_pgtable_page(domain->nid);
1047 
1048 			if (!tmp_page)
1049 				return NULL;
1050 
1051 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1052 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1053 			if (domain_use_first_level(domain)) {
1054 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1055 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1056 					pteval |= DMA_FL_PTE_ACCESS;
1057 			}
1058 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1059 				/* Someone else set it while we were thinking; use theirs. */
1060 				free_pgtable_page(tmp_page);
1061 			else
1062 				domain_flush_cache(domain, pte, sizeof(*pte));
1063 		}
1064 		if (level == 1)
1065 			break;
1066 
1067 		parent = phys_to_virt(dma_pte_addr(pte));
1068 		level--;
1069 	}
1070 
1071 	if (!*target_level)
1072 		*target_level = level;
1073 
1074 	return pte;
1075 }
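/*
 * Usage note for the walk above: callers pass *target_level = 0 to stop at
 * the existing leaf (or the first non-present entry), or a specific level
 * such as 2 to build page tables down to the 2MiB level; when 0 was passed
 * in, *target_level is updated to the level where the walk stopped.
 */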
1076 
1077 /* return address's pte at specific level */
1078 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1079 					 unsigned long pfn,
1080 					 int level, int *large_page)
1081 {
1082 	struct dma_pte *parent, *pte;
1083 	int total = agaw_to_level(domain->agaw);
1084 	int offset;
1085 
1086 	parent = domain->pgd;
1087 	while (level <= total) {
1088 		offset = pfn_level_offset(pfn, total);
1089 		pte = &parent[offset];
1090 		if (level == total)
1091 			return pte;
1092 
1093 		if (!dma_pte_present(pte)) {
1094 			*large_page = total;
1095 			break;
1096 		}
1097 
1098 		if (dma_pte_superpage(pte)) {
1099 			*large_page = total;
1100 			return pte;
1101 		}
1102 
1103 		parent = phys_to_virt(dma_pte_addr(pte));
1104 		total--;
1105 	}
1106 	return NULL;
1107 }
1108 
1109 /* clear last level pte, a tlb flush should be followed */
1110 static void dma_pte_clear_range(struct dmar_domain *domain,
1111 				unsigned long start_pfn,
1112 				unsigned long last_pfn)
1113 {
1114 	unsigned int large_page;
1115 	struct dma_pte *first_pte, *pte;
1116 
1117 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1118 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1119 	BUG_ON(start_pfn > last_pfn);
1120 
1121 	/* we don't need lock here; nobody else touches the iova range */
1122 	do {
1123 		large_page = 1;
1124 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1125 		if (!pte) {
1126 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1127 			continue;
1128 		}
1129 		do {
1130 			dma_clear_pte(pte);
1131 			start_pfn += lvl_to_nr_pages(large_page);
1132 			pte++;
1133 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1134 
1135 		domain_flush_cache(domain, first_pte,
1136 				   (void *)pte - (void *)first_pte);
1137 
1138 	} while (start_pfn && start_pfn <= last_pfn);
1139 }
1140 
1141 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1142 			       int retain_level, struct dma_pte *pte,
1143 			       unsigned long pfn, unsigned long start_pfn,
1144 			       unsigned long last_pfn)
1145 {
1146 	pfn = max(start_pfn, pfn);
1147 	pte = &pte[pfn_level_offset(pfn, level)];
1148 
1149 	do {
1150 		unsigned long level_pfn;
1151 		struct dma_pte *level_pte;
1152 
1153 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1154 			goto next;
1155 
1156 		level_pfn = pfn & level_mask(level);
1157 		level_pte = phys_to_virt(dma_pte_addr(pte));
1158 
1159 		if (level > 2) {
1160 			dma_pte_free_level(domain, level - 1, retain_level,
1161 					   level_pte, level_pfn, start_pfn,
1162 					   last_pfn);
1163 		}
1164 
1165 		/*
1166 		 * Free the page table if we're below the level we want to
1167 		 * retain and the range covers the entire table.
1168 		 */
1169 		if (level < retain_level && !(start_pfn > level_pfn ||
1170 		      last_pfn < level_pfn + level_size(level) - 1)) {
1171 			dma_clear_pte(pte);
1172 			domain_flush_cache(domain, pte, sizeof(*pte));
1173 			free_pgtable_page(level_pte);
1174 		}
1175 next:
1176 		pfn += level_size(level);
1177 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1178 }
1179 
1180 /*
1181  * clear last level (leaf) ptes and free page table pages below the
1182  * level we wish to keep intact.
1183  */
1184 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1185 				   unsigned long start_pfn,
1186 				   unsigned long last_pfn,
1187 				   int retain_level)
1188 {
1189 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191 	BUG_ON(start_pfn > last_pfn);
1192 
1193 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1194 
1195 	/* We don't need lock here; nobody else touches the iova range */
1196 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1197 			   domain->pgd, 0, start_pfn, last_pfn);
1198 
1199 	/* free pgd */
1200 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1201 		free_pgtable_page(domain->pgd);
1202 		domain->pgd = NULL;
1203 	}
1204 }
1205 
1206 /* When a page at a given level is being unlinked from its parent, we don't
1207    need to *modify* it at all. All we need to do is make a list of all the
1208    pages which can be freed just as soon as we've flushed the IOTLB and we
1209    know the hardware page-walk will no longer touch them.
1210    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1211    be freed. */
1212 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1213 					    int level, struct dma_pte *pte,
1214 					    struct page *freelist)
1215 {
1216 	struct page *pg;
1217 
1218 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1219 	pg->freelist = freelist;
1220 	freelist = pg;
1221 
1222 	if (level == 1)
1223 		return freelist;
1224 
1225 	pte = page_address(pg);
1226 	do {
1227 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1228 			freelist = dma_pte_list_pagetables(domain, level - 1,
1229 							   pte, freelist);
1230 		pte++;
1231 	} while (!first_pte_in_page(pte));
1232 
1233 	return freelist;
1234 }
1235 
1236 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1237 					struct dma_pte *pte, unsigned long pfn,
1238 					unsigned long start_pfn,
1239 					unsigned long last_pfn,
1240 					struct page *freelist)
1241 {
1242 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1243 
1244 	pfn = max(start_pfn, pfn);
1245 	pte = &pte[pfn_level_offset(pfn, level)];
1246 
1247 	do {
1248 		unsigned long level_pfn;
1249 
1250 		if (!dma_pte_present(pte))
1251 			goto next;
1252 
1253 		level_pfn = pfn & level_mask(level);
1254 
1255 		/* If range covers entire pagetable, free it */
1256 		if (start_pfn <= level_pfn &&
1257 		    last_pfn >= level_pfn + level_size(level) - 1) {
1258 			/* These subordinate page tables are going away entirely. Don't
1259 			   bother to clear them; we're just going to *free* them. */
1260 			if (level > 1 && !dma_pte_superpage(pte))
1261 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1262 
1263 			dma_clear_pte(pte);
1264 			if (!first_pte)
1265 				first_pte = pte;
1266 			last_pte = pte;
1267 		} else if (level > 1) {
1268 			/* Recurse down into a level that isn't *entirely* obsolete */
1269 			freelist = dma_pte_clear_level(domain, level - 1,
1270 						       phys_to_virt(dma_pte_addr(pte)),
1271 						       level_pfn, start_pfn, last_pfn,
1272 						       freelist);
1273 		}
1274 next:
1275 		pfn += level_size(level);
1276 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1277 
1278 	if (first_pte)
1279 		domain_flush_cache(domain, first_pte,
1280 				   (void *)++last_pte - (void *)first_pte);
1281 
1282 	return freelist;
1283 }
1284 
1285 /* We can't just free the pages because the IOMMU may still be walking
1286    the page tables, and may have cached the intermediate levels. The
1287    pages can only be freed after the IOTLB flush has been done. */
1288 static struct page *domain_unmap(struct dmar_domain *domain,
1289 				 unsigned long start_pfn,
1290 				 unsigned long last_pfn)
1291 {
1292 	struct page *freelist;
1293 
1294 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1295 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1296 	BUG_ON(start_pfn > last_pfn);
1297 
1298 	/* we don't need lock here; nobody else touches the iova range */
1299 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1300 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1301 
1302 	/* free pgd */
1303 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1304 		struct page *pgd_page = virt_to_page(domain->pgd);
1305 		pgd_page->freelist = freelist;
1306 		freelist = pgd_page;
1307 
1308 		domain->pgd = NULL;
1309 	}
1310 
1311 	return freelist;
1312 }
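/*
 * The freelist returned above is chained through page->freelist: every page
 * table page that became unreachable is linked onto the list and only handed
 * to dma_free_pagelist() after the IOTLB flush, so the hardware can never
 * walk a page that has already been reused.
 */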
1313 
1314 static void dma_free_pagelist(struct page *freelist)
1315 {
1316 	struct page *pg;
1317 
1318 	while ((pg = freelist)) {
1319 		freelist = pg->freelist;
1320 		free_pgtable_page(page_address(pg));
1321 	}
1322 }
1323 
1324 static void iova_entry_free(unsigned long data)
1325 {
1326 	struct page *freelist = (struct page *)data;
1327 
1328 	dma_free_pagelist(freelist);
1329 }
1330 
1331 /* iommu handling */
1332 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1333 {
1334 	struct root_entry *root;
1335 	unsigned long flags;
1336 
1337 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1338 	if (!root) {
1339 		pr_err("Allocating root entry for %s failed\n",
1340 			iommu->name);
1341 		return -ENOMEM;
1342 	}
1343 
1344 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1345 
1346 	spin_lock_irqsave(&iommu->lock, flags);
1347 	iommu->root_entry = root;
1348 	spin_unlock_irqrestore(&iommu->lock, flags);
1349 
1350 	return 0;
1351 }
1352 
1353 static void iommu_set_root_entry(struct intel_iommu *iommu)
1354 {
1355 	u64 addr;
1356 	u32 sts;
1357 	unsigned long flag;
1358 
1359 	addr = virt_to_phys(iommu->root_entry);
1360 	if (sm_supported(iommu))
1361 		addr |= DMA_RTADDR_SMT;
1362 
1363 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1364 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1365 
1366 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1367 
1368 	/* Make sure hardware complete it */
1369 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1370 		      readl, (sts & DMA_GSTS_RTPS), sts);
1371 
1372 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1373 
1374 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1375 	if (sm_supported(iommu))
1376 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1377 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1378 }
1379 
1380 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1381 {
1382 	u32 val;
1383 	unsigned long flag;
1384 
1385 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1386 		return;
1387 
1388 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1389 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1390 
1391 	/* Make sure hardware complete it */
1392 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1393 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1394 
1395 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396 }
1397 
1398 /* the return value determines if we need a write buffer flush */
1399 static void __iommu_flush_context(struct intel_iommu *iommu,
1400 				  u16 did, u16 source_id, u8 function_mask,
1401 				  u64 type)
1402 {
1403 	u64 val = 0;
1404 	unsigned long flag;
1405 
1406 	switch (type) {
1407 	case DMA_CCMD_GLOBAL_INVL:
1408 		val = DMA_CCMD_GLOBAL_INVL;
1409 		break;
1410 	case DMA_CCMD_DOMAIN_INVL:
1411 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1412 		break;
1413 	case DMA_CCMD_DEVICE_INVL:
1414 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1415 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1416 		break;
1417 	default:
1418 		BUG();
1419 	}
1420 	val |= DMA_CCMD_ICC;
1421 
1422 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1424 
1425 	/* Make sure hardware complete it */
1426 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1427 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1428 
1429 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430 }
1431 
1432 /* the return value determines if we need a write buffer flush */
1433 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1434 				u64 addr, unsigned int size_order, u64 type)
1435 {
1436 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1437 	u64 val = 0, val_iva = 0;
1438 	unsigned long flag;
1439 
1440 	switch (type) {
1441 	case DMA_TLB_GLOBAL_FLUSH:
1442 		/* global flush doesn't need to set IVA_REG */
1443 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1444 		break;
1445 	case DMA_TLB_DSI_FLUSH:
1446 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1447 		break;
1448 	case DMA_TLB_PSI_FLUSH:
1449 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1450 		/* IH bit is passed in as part of address */
1451 		val_iva = size_order | addr;
1452 		break;
1453 	default:
1454 		BUG();
1455 	}
1456 	/* Note: set drain read/write */
1457 #if 0
1458 	/*
1459 	 * This is probably to be super secure.. Looks like we can
1460 	 * ignore it without any impact.
1461 	 */
1462 	if (cap_read_drain(iommu->cap))
1463 		val |= DMA_TLB_READ_DRAIN;
1464 #endif
1465 	if (cap_write_drain(iommu->cap))
1466 		val |= DMA_TLB_WRITE_DRAIN;
1467 
1468 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1469 	/* Note: Only uses first TLB reg currently */
1470 	if (val_iva)
1471 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1472 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1473 
1474 	/* Make sure hardware complete it */
1475 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1476 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1477 
1478 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1479 
1480 	/* check IOTLB invalidation granularity */
1481 	if (DMA_TLB_IAIG(val) == 0)
1482 		pr_err("Flush IOTLB failed\n");
1483 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1484 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1485 			(unsigned long long)DMA_TLB_IIRG(type),
1486 			(unsigned long long)DMA_TLB_IAIG(val));
1487 }
1488 
1489 static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1490 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1491 			 u8 bus, u8 devfn)
1492 {
1493 	struct device_domain_info *info;
1494 
1495 	assert_spin_locked(&device_domain_lock);
1496 
1497 	if (!iommu->qi)
1498 		return NULL;
1499 
1500 	list_for_each_entry(info, &domain->devices, link)
1501 		if (info->iommu == iommu && info->bus == bus &&
1502 		    info->devfn == devfn) {
1503 			if (info->ats_supported && info->dev)
1504 				return info;
1505 			break;
1506 		}
1507 
1508 	return NULL;
1509 }
1510 
1511 static void domain_update_iotlb(struct dmar_domain *domain)
1512 {
1513 	struct device_domain_info *info;
1514 	bool has_iotlb_device = false;
1515 
1516 	assert_spin_locked(&device_domain_lock);
1517 
1518 	list_for_each_entry(info, &domain->devices, link) {
1519 		struct pci_dev *pdev;
1520 
1521 		if (!info->dev || !dev_is_pci(info->dev))
1522 			continue;
1523 
1524 		pdev = to_pci_dev(info->dev);
1525 		if (pdev->ats_enabled) {
1526 			has_iotlb_device = true;
1527 			break;
1528 		}
1529 	}
1530 
1531 	domain->has_iotlb_device = has_iotlb_device;
1532 }
1533 
1534 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1535 {
1536 	struct pci_dev *pdev;
1537 
1538 	assert_spin_locked(&device_domain_lock);
1539 
1540 	if (!info || !dev_is_pci(info->dev))
1541 		return;
1542 
1543 	pdev = to_pci_dev(info->dev);
1544 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1545 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1546 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1547 	 * reserved, which should be set to 0.
1548 	 */
1549 	if (!ecap_dit(info->iommu->ecap))
1550 		info->pfsid = 0;
1551 	else {
1552 		struct pci_dev *pf_pdev;
1553 
1554 		/* pdev will be returned if device is not a vf */
1555 		pf_pdev = pci_physfn(pdev);
1556 		info->pfsid = pci_dev_id(pf_pdev);
1557 	}
1558 
1559 #ifdef CONFIG_INTEL_IOMMU_SVM
1560 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1561 	   the device if you enable PASID support after ATS support is
1562 	   undefined. So always enable PASID support on devices which
1563 	   have it, even if we can't yet know if we're ever going to
1564 	   use it. */
1565 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1566 		info->pasid_enabled = 1;
1567 
1568 	if (info->pri_supported &&
1569 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1570 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1571 		info->pri_enabled = 1;
1572 #endif
1573 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1574 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1575 		info->ats_enabled = 1;
1576 		domain_update_iotlb(info->domain);
1577 		info->ats_qdep = pci_ats_queue_depth(pdev);
1578 	}
1579 }
1580 
1581 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1582 {
1583 	struct pci_dev *pdev;
1584 
1585 	assert_spin_locked(&device_domain_lock);
1586 
1587 	if (!dev_is_pci(info->dev))
1588 		return;
1589 
1590 	pdev = to_pci_dev(info->dev);
1591 
1592 	if (info->ats_enabled) {
1593 		pci_disable_ats(pdev);
1594 		info->ats_enabled = 0;
1595 		domain_update_iotlb(info->domain);
1596 	}
1597 #ifdef CONFIG_INTEL_IOMMU_SVM
1598 	if (info->pri_enabled) {
1599 		pci_disable_pri(pdev);
1600 		info->pri_enabled = 0;
1601 	}
1602 	if (info->pasid_enabled) {
1603 		pci_disable_pasid(pdev);
1604 		info->pasid_enabled = 0;
1605 	}
1606 #endif
1607 }
1608 
1609 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1610 				  u64 addr, unsigned mask)
1611 {
1612 	u16 sid, qdep;
1613 	unsigned long flags;
1614 	struct device_domain_info *info;
1615 
1616 	if (!domain->has_iotlb_device)
1617 		return;
1618 
1619 	spin_lock_irqsave(&device_domain_lock, flags);
1620 	list_for_each_entry(info, &domain->devices, link) {
1621 		if (!info->ats_enabled)
1622 			continue;
1623 
1624 		sid = info->bus << 8 | info->devfn;
1625 		qdep = info->ats_qdep;
1626 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1627 				qdep, addr, mask);
1628 	}
1629 	spin_unlock_irqrestore(&device_domain_lock, flags);
1630 }
1631 
1632 static void domain_flush_piotlb(struct intel_iommu *iommu,
1633 				struct dmar_domain *domain,
1634 				u64 addr, unsigned long npages, bool ih)
1635 {
1636 	u16 did = domain->iommu_did[iommu->seq_id];
1637 
1638 	if (domain->default_pasid)
1639 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1640 				addr, npages, ih);
1641 
1642 	if (!list_empty(&domain->devices))
1643 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1644 }
1645 
1646 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1647 				  struct dmar_domain *domain,
1648 				  unsigned long pfn, unsigned int pages,
1649 				  int ih, int map)
1650 {
1651 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1652 	unsigned int mask = ilog2(aligned_pages);
1653 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1654 	u16 did = domain->iommu_did[iommu->seq_id];
1655 
1656 	BUG_ON(pages == 0);
1657 
1658 	if (ih)
1659 		ih = 1 << 6;
1660 
1661 	if (domain_use_first_level(domain)) {
1662 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1663 	} else {
1664 		unsigned long bitmask = aligned_pages - 1;
1665 
1666 		/*
1667 		 * PSI masks the low order bits of the base address. If the
1668 		 * address isn't aligned to the mask, then compute a mask value
1669 		 * needed to ensure the target range is flushed.
1670 		 */
1671 		if (unlikely(bitmask & pfn)) {
1672 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1673 
1674 			/*
1675 			 * Since end_pfn <= pfn + bitmask, the only way bits
1676 			 * higher than bitmask can differ in pfn and end_pfn is
1677 			 * by carrying. This means after masking out bitmask,
1678 			 * high bits starting with the first set bit in
1679 			 * shared_bits are all equal in both pfn and end_pfn.
1680 			 */
1681 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1682 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1683 		}
1684 
1685 		/*
1686 		 * Fallback to domain selective flush if no PSI support or
1687 		 * the size is too big.
1688 		 */
1689 		if (!cap_pgsel_inv(iommu->cap) ||
1690 		    mask > cap_max_amask_val(iommu->cap))
1691 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1692 							DMA_TLB_DSI_FLUSH);
1693 		else
1694 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1695 							DMA_TLB_PSI_FLUSH);
1696 	}
1697 
1698 	/*
1699 	 * In caching mode, changes of pages from non-present to present require
1700 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1701 	 */
1702 	if (!cap_caching_mode(iommu->cap) || !map)
1703 		iommu_flush_dev_iotlb(domain, addr, mask);
1704 }
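/*
 * Worked example of the unaligned-PSI path above: for pfn = 7 and pages = 2,
 * aligned_pages = 2 and bitmask = 1, but pfn 7 is not 2-page aligned.
 * end_pfn = 8, pfn ^ end_pfn = 0xf, so shared_bits = ~0xf & ~1 and
 * __ffs(shared_bits) = 4. A mask of 4 flushes the 16 pages starting at
 * pfn 0, which covers the requested pfns 7 and 8.
 */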
1705 
1706 /* Notification for newly created mappings */
1707 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1708 					struct dmar_domain *domain,
1709 					unsigned long pfn, unsigned int pages)
1710 {
1711 	/*
1712 	 * It's a non-present to present mapping. Only flush if caching mode
1713 	 * and second level.
1714 	 */
1715 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1716 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1717 	else
1718 		iommu_flush_write_buffer(iommu);
1719 }
1720 
1721 static void iommu_flush_iova(struct iova_domain *iovad)
1722 {
1723 	struct dmar_domain *domain;
1724 	int idx;
1725 
1726 	domain = container_of(iovad, struct dmar_domain, iovad);
1727 
1728 	for_each_domain_iommu(idx, domain) {
1729 		struct intel_iommu *iommu = g_iommus[idx];
1730 		u16 did = domain->iommu_did[iommu->seq_id];
1731 
1732 		if (domain_use_first_level(domain))
1733 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1734 		else
1735 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1736 						 DMA_TLB_DSI_FLUSH);
1737 
1738 		if (!cap_caching_mode(iommu->cap))
1739 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1740 					      0, MAX_AGAW_PFN_WIDTH);
1741 	}
1742 }
1743 
1744 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1745 {
1746 	u32 pmen;
1747 	unsigned long flags;
1748 
1749 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1750 		return;
1751 
1752 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1753 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1754 	pmen &= ~DMA_PMEN_EPM;
1755 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1756 
1757 	/* wait for the protected region status bit to clear */
1758 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1759 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1760 
1761 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1762 }
1763 
1764 static void iommu_enable_translation(struct intel_iommu *iommu)
1765 {
1766 	u32 sts;
1767 	unsigned long flags;
1768 
1769 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1770 	iommu->gcmd |= DMA_GCMD_TE;
1771 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1772 
1773 	/* Make sure hardware completes it */
1774 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1775 		      readl, (sts & DMA_GSTS_TES), sts);
1776 
1777 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1778 }
1779 
1780 static void iommu_disable_translation(struct intel_iommu *iommu)
1781 {
1782 	u32 sts;
1783 	unsigned long flag;
1784 
1785 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1786 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1787 		return;
1788 
1789 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1790 	iommu->gcmd &= ~DMA_GCMD_TE;
1791 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1792 
1793 	/* Make sure hardware completes it */
1794 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1795 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1796 
1797 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1798 }
1799 
1800 static int iommu_init_domains(struct intel_iommu *iommu)
1801 {
1802 	u32 ndomains, nlongs;
1803 	size_t size;
1804 
1805 	ndomains = cap_ndoms(iommu->cap);
1806 	pr_debug("%s: Number of Domains supported <%d>\n",
1807 		 iommu->name, ndomains);
1808 	nlongs = BITS_TO_LONGS(ndomains);
1809 
1810 	spin_lock_init(&iommu->lock);
1811 
1812 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1813 	if (!iommu->domain_ids) {
1814 		pr_err("%s: Allocating domain id array failed\n",
1815 		       iommu->name);
1816 		return -ENOMEM;
1817 	}
1818 
1819 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1820 	iommu->domains = kzalloc(size, GFP_KERNEL);
1821 
1822 	if (iommu->domains) {
1823 		size = 256 * sizeof(struct dmar_domain *);
1824 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1825 	}
1826 
1827 	if (!iommu->domains || !iommu->domains[0]) {
1828 		pr_err("%s: Allocating domain array failed\n",
1829 		       iommu->name);
1830 		kfree(iommu->domain_ids);
1831 		kfree(iommu->domains);
1832 		iommu->domain_ids = NULL;
1833 		iommu->domains    = NULL;
1834 		return -ENOMEM;
1835 	}
1836 
1837 	/*
1838 	 * If Caching mode is set, then invalid translations are tagged
1839 	 * with domain-id 0, hence we need to pre-allocate it. We also
1840 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1841 	 * make sure it is not used for a real domain.
1842 	 */
1843 	set_bit(0, iommu->domain_ids);
1844 
1845 	/*
1846 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1847 	 * entry for first-level or pass-through translation modes should
1848 	 * be programmed with a domain id different from those used for
1849 	 * second-level or nested translation. We reserve a domain id for
1850 	 * this purpose.
1851 	 */
1852 	if (sm_supported(iommu))
1853 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1854 
1855 	return 0;
1856 }
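
/*
 * Sizing sketch for the two-level iommu->domains array set up above, with
 * illustrative numbers: if cap_ndoms() were 65536, the top level would hold
 * ALIGN(65536, 256) >> 8 == 256 chunk pointers, and only chunk 0 (256
 * 'struct dmar_domain *' slots) is allocated eagerly here. The remaining
 * chunk pointers start out NULL (kzalloc) and are presumably populated on
 * demand elsewhere when a domain id in their range is actually used.
 */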
1857 
1858 static void disable_dmar_iommu(struct intel_iommu *iommu)
1859 {
1860 	struct device_domain_info *info, *tmp;
1861 	unsigned long flags;
1862 
1863 	if (!iommu->domains || !iommu->domain_ids)
1864 		return;
1865 
1866 	spin_lock_irqsave(&device_domain_lock, flags);
1867 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1868 		if (info->iommu != iommu)
1869 			continue;
1870 
1871 		if (!info->dev || !info->domain)
1872 			continue;
1873 
1874 		__dmar_remove_one_dev_info(info);
1875 	}
1876 	spin_unlock_irqrestore(&device_domain_lock, flags);
1877 
1878 	if (iommu->gcmd & DMA_GCMD_TE)
1879 		iommu_disable_translation(iommu);
1880 }
1881 
1882 static void free_dmar_iommu(struct intel_iommu *iommu)
1883 {
1884 	if ((iommu->domains) && (iommu->domain_ids)) {
1885 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1886 		int i;
1887 
1888 		for (i = 0; i < elems; i++)
1889 			kfree(iommu->domains[i]);
1890 		kfree(iommu->domains);
1891 		kfree(iommu->domain_ids);
1892 		iommu->domains = NULL;
1893 		iommu->domain_ids = NULL;
1894 	}
1895 
1896 	g_iommus[iommu->seq_id] = NULL;
1897 
1898 	/* free context mapping */
1899 	free_context_table(iommu);
1900 
1901 #ifdef CONFIG_INTEL_IOMMU_SVM
1902 	if (pasid_supported(iommu)) {
1903 		if (ecap_prs(iommu->ecap))
1904 			intel_svm_finish_prq(iommu);
1905 	}
1906 	if (vccap_pasid(iommu->vccap))
1907 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1908 
1909 #endif
1910 }
1911 
1912 /*
1913  * Check and return whether first level is used by default for
1914  * DMA translation.
1915  */
1916 static bool first_level_by_default(void)
1917 {
1918 	struct dmar_drhd_unit *drhd;
1919 	struct intel_iommu *iommu;
1920 	static int first_level_support = -1;
1921 
1922 	if (likely(first_level_support != -1))
1923 		return first_level_support;
1924 
1925 	first_level_support = 1;
1926 
1927 	rcu_read_lock();
1928 	for_each_active_iommu(iommu, drhd) {
1929 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1930 			first_level_support = 0;
1931 			break;
1932 		}
1933 	}
1934 	rcu_read_unlock();
1935 
1936 	return first_level_support;
1937 }
1938 
1939 static struct dmar_domain *alloc_domain(int flags)
1940 {
1941 	struct dmar_domain *domain;
1942 
1943 	domain = alloc_domain_mem();
1944 	if (!domain)
1945 		return NULL;
1946 
1947 	memset(domain, 0, sizeof(*domain));
1948 	domain->nid = NUMA_NO_NODE;
1949 	domain->flags = flags;
1950 	if (first_level_by_default())
1951 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1952 	domain->has_iotlb_device = false;
1953 	INIT_LIST_HEAD(&domain->devices);
1954 
1955 	return domain;
1956 }
1957 
1958 /* Must be called with iommu->lock */
1959 static int domain_attach_iommu(struct dmar_domain *domain,
1960 			       struct intel_iommu *iommu)
1961 {
1962 	unsigned long ndomains;
1963 	int num;
1964 
1965 	assert_spin_locked(&device_domain_lock);
1966 	assert_spin_locked(&iommu->lock);
1967 
1968 	domain->iommu_refcnt[iommu->seq_id] += 1;
1969 	domain->iommu_count += 1;
1970 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1971 		ndomains = cap_ndoms(iommu->cap);
1972 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1973 
1974 		if (num >= ndomains) {
1975 			pr_err("%s: No free domain ids\n", iommu->name);
1976 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1977 			domain->iommu_count -= 1;
1978 			return -ENOSPC;
1979 		}
1980 
1981 		set_bit(num, iommu->domain_ids);
1982 		set_iommu_domain(iommu, num, domain);
1983 
1984 		domain->iommu_did[iommu->seq_id] = num;
1985 		domain->nid			 = iommu->node;
1986 
1987 		domain_update_iommu_cap(domain);
1988 	}
1989 
1990 	return 0;
1991 }
1992 
1993 static int domain_detach_iommu(struct dmar_domain *domain,
1994 			       struct intel_iommu *iommu)
1995 {
1996 	int num, count;
1997 
1998 	assert_spin_locked(&device_domain_lock);
1999 	assert_spin_locked(&iommu->lock);
2000 
2001 	domain->iommu_refcnt[iommu->seq_id] -= 1;
2002 	count = --domain->iommu_count;
2003 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2004 		num = domain->iommu_did[iommu->seq_id];
2005 		clear_bit(num, iommu->domain_ids);
2006 		set_iommu_domain(iommu, num, NULL);
2007 
2008 		domain_update_iommu_cap(domain);
2009 		domain->iommu_did[iommu->seq_id] = 0;
2010 	}
2011 
2012 	return count;
2013 }
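
/*
 * Refcounting sketch, as implemented above: the first domain_attach_iommu()
 * for a given iommu allocates a domain id from iommu->domain_ids and stores
 * it in domain->iommu_did[]; further attaches behind the same iommu only
 * bump iommu_refcnt[] and iommu_count. domain_detach_iommu() returns the
 * remaining domain->iommu_count and releases the domain id once the
 * per-iommu refcount reaches zero.
 */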
2014 
2015 static struct iova_domain reserved_iova_list;
2016 static struct lock_class_key reserved_rbtree_key;
2017 
2018 static int dmar_init_reserved_ranges(void)
2019 {
2020 	struct pci_dev *pdev = NULL;
2021 	struct iova *iova;
2022 	int i;
2023 
2024 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
2025 
2026 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
2027 		&reserved_rbtree_key);
2028 
2029 	/* IOAPIC ranges shouldn't be accessed by DMA */
2030 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
2031 		IOVA_PFN(IOAPIC_RANGE_END));
2032 	if (!iova) {
2033 		pr_err("Reserve IOAPIC range failed\n");
2034 		return -ENODEV;
2035 	}
2036 
2037 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
2038 	for_each_pci_dev(pdev) {
2039 		struct resource *r;
2040 
2041 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
2042 			r = &pdev->resource[i];
2043 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
2044 				continue;
2045 			iova = reserve_iova(&reserved_iova_list,
2046 					    IOVA_PFN(r->start),
2047 					    IOVA_PFN(r->end));
2048 			if (!iova) {
2049 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
2050 				return -ENODEV;
2051 			}
2052 		}
2053 	}
2054 	return 0;
2055 }
2056 
2057 static inline int guestwidth_to_adjustwidth(int gaw)
2058 {
2059 	int agaw;
2060 	int r = (gaw - 12) % 9;
2061 
2062 	if (r == 0)
2063 		agaw = gaw;
2064 	else
2065 		agaw = gaw + 9 - r;
2066 	if (agaw > 64)
2067 		agaw = 64;
2068 	return agaw;
2069 }
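
/*
 * Worked examples for guestwidth_to_adjustwidth(): the adjusted width is
 * gaw rounded up to 12 plus a multiple of the 9-bit page-table stride.
 *   gaw = 48: r = (48 - 12) % 9 = 0, so agaw = 48 (4-level table).
 *   gaw = 50: r = (50 - 12) % 9 = 2, so agaw = 50 + 9 - 2 = 57 (5-level).
 * Results above 64 are clamped to 64.
 */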
2070 
2071 static void domain_exit(struct dmar_domain *domain)
2072 {
2073 
2074 	/* Remove associated devices and clear attached or cached domains */
2075 	domain_remove_dev_info(domain);
2076 
2077 	/* destroy iovas */
2078 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
2079 		put_iova_domain(&domain->iovad);
2080 
2081 	if (domain->pgd) {
2082 		struct page *freelist;
2083 
2084 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2085 		dma_free_pagelist(freelist);
2086 	}
2087 
2088 	free_domain_mem(domain);
2089 }
2090 
2091 /*
2092  * Get the PASID directory size for scalable mode context entry.
2093  * Value of X in the PDTS field of a scalable mode context entry
2094  * indicates PASID directory with 2^(X + 7) entries.
2095  */
2096 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2097 {
2098 	int pds, max_pde;
2099 
2100 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2101 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2102 	if (pds < 7)
2103 		return 0;
2104 
2105 	return pds - 7;
2106 }
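
/*
 * Example of the PDS/PDTS coding above, assuming PASID_PDE_SHIFT is 6
 * (64 PASID-table entries per directory entry) and a power-of-two
 * max_pasid: for table->max_pasid = 1 << 20, max_pde = 1 << 14,
 * find_first_bit() returns 14 and context_get_sm_pds() yields 14 - 7 = 7,
 * i.e. a PASID directory with 2^(7 + 7) = 16384 entries. context_pdts()
 * below then places that value (3 bits, shifted by 9) into the low
 * context-entry qword.
 */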
2107 
2108 /*
2109  * Set the RID_PASID field of a scalable mode context entry. The
2110  * IOMMU hardware will use the PASID value set in this field for
2111  * DMA translations of DMA requests without PASID.
2112  */
2113 static inline void
2114 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2115 {
2116 	context->hi |= pasid & ((1 << 20) - 1);
2117 }
2118 
2119 /*
2120  * Set the DTE (Device-TLB Enable) field of a scalable mode context
2121  * entry.
2122  */
2123 static inline void context_set_sm_dte(struct context_entry *context)
2124 {
2125 	context->lo |= (1 << 2);
2126 }
2127 
2128 /*
2129  * Set the PRE (Page Request Enable) field of a scalable mode context
2130  * entry.
2131  */
2132 static inline void context_set_sm_pre(struct context_entry *context)
2133 {
2134 	context->lo |= (1 << 4);
2135 }
2136 
2137 /* Convert value to context PASID directory size field coding. */
2138 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2139 
2140 static int domain_context_mapping_one(struct dmar_domain *domain,
2141 				      struct intel_iommu *iommu,
2142 				      struct pasid_table *table,
2143 				      u8 bus, u8 devfn)
2144 {
2145 	u16 did = domain->iommu_did[iommu->seq_id];
2146 	int translation = CONTEXT_TT_MULTI_LEVEL;
2147 	struct device_domain_info *info = NULL;
2148 	struct context_entry *context;
2149 	unsigned long flags;
2150 	int ret;
2151 
2152 	WARN_ON(did == 0);
2153 
2154 	if (hw_pass_through && domain_type_is_si(domain))
2155 		translation = CONTEXT_TT_PASS_THROUGH;
2156 
2157 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2158 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2159 
2160 	BUG_ON(!domain->pgd);
2161 
2162 	spin_lock_irqsave(&device_domain_lock, flags);
2163 	spin_lock(&iommu->lock);
2164 
2165 	ret = -ENOMEM;
2166 	context = iommu_context_addr(iommu, bus, devfn, 1);
2167 	if (!context)
2168 		goto out_unlock;
2169 
2170 	ret = 0;
2171 	if (context_present(context))
2172 		goto out_unlock;
2173 
2174 	/*
2175 	 * For kdump cases, old valid entries may be cached due to the
2176 	 * in-flight DMA and copied pgtable, but there is no unmapping
2177 	 * behaviour for them, thus we need an explicit cache flush for
2178 	 * the newly-mapped device. For kdump, at this point, the device
2179 	 * is supposed to finish reset at its driver probe stage, so no
2180 	 * in-flight DMA will exist, and we don't need to worry about it
2181 	 * hereafter.
2182 	 */
2183 	if (context_copied(context)) {
2184 		u16 did_old = context_domain_id(context);
2185 
2186 		if (did_old < cap_ndoms(iommu->cap)) {
2187 			iommu->flush.flush_context(iommu, did_old,
2188 						   (((u16)bus) << 8) | devfn,
2189 						   DMA_CCMD_MASK_NOBIT,
2190 						   DMA_CCMD_DEVICE_INVL);
2191 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2192 						 DMA_TLB_DSI_FLUSH);
2193 		}
2194 	}
2195 
2196 	context_clear_entry(context);
2197 
2198 	if (sm_supported(iommu)) {
2199 		unsigned long pds;
2200 
2201 		WARN_ON(!table);
2202 
2203 		/* Setup the PASID DIR pointer: */
2204 		pds = context_get_sm_pds(table);
2205 		context->lo = (u64)virt_to_phys(table->table) |
2206 				context_pdts(pds);
2207 
2208 		/* Setup the RID_PASID field: */
2209 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2210 
2211 		/*
2212 		 * Setup the Device-TLB enable bit and Page request
2213 		 * Enable bit:
2214 		 */
2215 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2216 		if (info && info->ats_supported)
2217 			context_set_sm_dte(context);
2218 		if (info && info->pri_supported)
2219 			context_set_sm_pre(context);
2220 	} else {
2221 		struct dma_pte *pgd = domain->pgd;
2222 		int agaw;
2223 
2224 		context_set_domain_id(context, did);
2225 
2226 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2227 			/*
2228 			 * Skip top levels of page tables for an iommu which has
2229 			 * less agaw than the default. Unnecessary for PT mode.
2230 			 */
2231 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2232 				ret = -ENOMEM;
2233 				pgd = phys_to_virt(dma_pte_addr(pgd));
2234 				if (!dma_pte_present(pgd))
2235 					goto out_unlock;
2236 			}
2237 
2238 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2239 			if (info && info->ats_supported)
2240 				translation = CONTEXT_TT_DEV_IOTLB;
2241 			else
2242 				translation = CONTEXT_TT_MULTI_LEVEL;
2243 
2244 			context_set_address_root(context, virt_to_phys(pgd));
2245 			context_set_address_width(context, agaw);
2246 		} else {
2247 			/*
2248 			 * In pass through mode, AW must be programmed to
2249 			 * indicate the largest AGAW value supported by
2250 			 * hardware. And ASR is ignored by hardware.
2251 			 */
2252 			context_set_address_width(context, iommu->msagaw);
2253 		}
2254 
2255 		context_set_translation_type(context, translation);
2256 	}
2257 
2258 	context_set_fault_enable(context);
2259 	context_set_present(context);
2260 	if (!ecap_coherent(iommu->ecap))
2261 		clflush_cache_range(context, sizeof(*context));
2262 
2263 	/*
2264 	 * It's a non-present to present mapping. If hardware doesn't cache
2265 	 * non-present entries we only need to flush the write-buffer. If it
2266 	 * _does_ cache non-present entries, then it does so in the special
2267 	 * domain #0, which we have to flush:
2268 	 */
2269 	if (cap_caching_mode(iommu->cap)) {
2270 		iommu->flush.flush_context(iommu, 0,
2271 					   (((u16)bus) << 8) | devfn,
2272 					   DMA_CCMD_MASK_NOBIT,
2273 					   DMA_CCMD_DEVICE_INVL);
2274 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2275 	} else {
2276 		iommu_flush_write_buffer(iommu);
2277 	}
2278 	iommu_enable_dev_iotlb(info);
2279 
2280 	ret = 0;
2281 
2282 out_unlock:
2283 	spin_unlock(&iommu->lock);
2284 	spin_unlock_irqrestore(&device_domain_lock, flags);
2285 
2286 	return ret;
2287 }
2288 
2289 struct domain_context_mapping_data {
2290 	struct dmar_domain *domain;
2291 	struct intel_iommu *iommu;
2292 	struct pasid_table *table;
2293 };
2294 
2295 static int domain_context_mapping_cb(struct pci_dev *pdev,
2296 				     u16 alias, void *opaque)
2297 {
2298 	struct domain_context_mapping_data *data = opaque;
2299 
2300 	return domain_context_mapping_one(data->domain, data->iommu,
2301 					  data->table, PCI_BUS_NUM(alias),
2302 					  alias & 0xff);
2303 }
2304 
2305 static int
2306 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2307 {
2308 	struct domain_context_mapping_data data;
2309 	struct pasid_table *table;
2310 	struct intel_iommu *iommu;
2311 	u8 bus, devfn;
2312 
2313 	iommu = device_to_iommu(dev, &bus, &devfn);
2314 	if (!iommu)
2315 		return -ENODEV;
2316 
2317 	table = intel_pasid_get_table(dev);
2318 
2319 	if (!dev_is_pci(dev))
2320 		return domain_context_mapping_one(domain, iommu, table,
2321 						  bus, devfn);
2322 
2323 	data.domain = domain;
2324 	data.iommu = iommu;
2325 	data.table = table;
2326 
2327 	return pci_for_each_dma_alias(to_pci_dev(dev),
2328 				      &domain_context_mapping_cb, &data);
2329 }
2330 
2331 static int domain_context_mapped_cb(struct pci_dev *pdev,
2332 				    u16 alias, void *opaque)
2333 {
2334 	struct intel_iommu *iommu = opaque;
2335 
2336 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2337 }
2338 
2339 static int domain_context_mapped(struct device *dev)
2340 {
2341 	struct intel_iommu *iommu;
2342 	u8 bus, devfn;
2343 
2344 	iommu = device_to_iommu(dev, &bus, &devfn);
2345 	if (!iommu)
2346 		return -ENODEV;
2347 
2348 	if (!dev_is_pci(dev))
2349 		return device_context_mapped(iommu, bus, devfn);
2350 
2351 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2352 				       domain_context_mapped_cb, iommu);
2353 }
2354 
2355 /* Returns a number of VTD pages, but aligned to MM page size */
2356 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2357 					    size_t size)
2358 {
2359 	host_addr &= ~PAGE_MASK;
2360 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2361 }
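
/*
 * Example of the rounding above, assuming 4KiB MM pages: a buffer at page
 * offset 0xc00 with size 0x800 gives
 * PAGE_ALIGN(0xc00 + 0x800) >> VTD_PAGE_SHIFT = 0x2000 >> 12 = 2, i.e.
 * two VT-d pages are needed even though the length is only 2KiB, because
 * the buffer straddles a page boundary.
 */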
2362 
2363 /* Return largest possible superpage level for a given mapping */
2364 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2365 					  unsigned long iov_pfn,
2366 					  unsigned long phy_pfn,
2367 					  unsigned long pages)
2368 {
2369 	int support, level = 1;
2370 	unsigned long pfnmerge;
2371 
2372 	support = domain->iommu_superpage;
2373 
2374 	/* To use a large page, the virtual *and* physical addresses
2375 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2376 	   of them will mean we have to use smaller pages. So just
2377 	   merge them and check both at once. */
2378 	pfnmerge = iov_pfn | phy_pfn;
2379 
2380 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2381 		pages >>= VTD_STRIDE_SHIFT;
2382 		if (!pages)
2383 			break;
2384 		pfnmerge >>= VTD_STRIDE_SHIFT;
2385 		level++;
2386 		support--;
2387 	}
2388 	return level;
2389 }
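
/*
 * Example for the check above, assuming a 9-bit VTD_STRIDE_SHIFT and
 * domain->iommu_superpage >= 1: with iov_pfn = 0x200, phy_pfn = 0x400 and
 * pages = 512, both PFNs have their low 9 bits clear and a full 2MiB
 * stride of pages is available, so the loop returns level 2 (2MiB pages).
 * If either PFN had any of its low 9 bits set, the merged value would fail
 * the alignment test and level 1 (4KiB) would be used instead.
 */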
2390 
2391 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2392 			    struct scatterlist *sg, unsigned long phys_pfn,
2393 			    unsigned long nr_pages, int prot)
2394 {
2395 	struct dma_pte *first_pte = NULL, *pte = NULL;
2396 	phys_addr_t pteval;
2397 	unsigned long sg_res = 0;
2398 	unsigned int largepage_lvl = 0;
2399 	unsigned long lvl_pages = 0;
2400 	u64 attr;
2401 
2402 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2403 
2404 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2405 		return -EINVAL;
2406 
2407 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2408 	attr |= DMA_FL_PTE_PRESENT;
2409 	if (domain_use_first_level(domain)) {
2410 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2411 
2412 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2413 			attr |= DMA_FL_PTE_ACCESS;
2414 			if (prot & DMA_PTE_WRITE)
2415 				attr |= DMA_FL_PTE_DIRTY;
2416 		}
2417 	}
2418 
2419 	if (!sg) {
2420 		sg_res = nr_pages;
2421 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2422 	}
2423 
2424 	while (nr_pages > 0) {
2425 		uint64_t tmp;
2426 
2427 		if (!sg_res) {
2428 			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2429 
2430 			sg_res = aligned_nrpages(sg->offset, sg->length);
2431 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2432 			sg->dma_length = sg->length;
2433 			pteval = (sg_phys(sg) - pgoff) | attr;
2434 			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2435 		}
2436 
2437 		if (!pte) {
2438 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2439 
2440 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2441 			if (!pte)
2442 				return -ENOMEM;
2443 			/* It is a large page */
2444 			if (largepage_lvl > 1) {
2445 				unsigned long nr_superpages, end_pfn;
2446 
2447 				pteval |= DMA_PTE_LARGE_PAGE;
2448 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2449 
2450 				nr_superpages = sg_res / lvl_pages;
2451 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2452 
2453 				/*
2454 				 * Ensure that old small page tables are
2455 				 * removed to make room for superpage(s).
2456 				 * We're adding new large pages, so make sure
2457 				 * we don't remove their parent tables.
2458 				 */
2459 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2460 						       largepage_lvl + 1);
2461 			} else {
2462 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2463 			}
2464 
2465 		}
2466 		/* We don't need a lock here, nobody else
2467 		 * touches the iova range
2468 		 */
2469 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2470 		if (tmp) {
2471 			static int dumps = 5;
2472 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2473 				iov_pfn, tmp, (unsigned long long)pteval);
2474 			if (dumps) {
2475 				dumps--;
2476 				debug_dma_dump_mappings(NULL);
2477 			}
2478 			WARN_ON(1);
2479 		}
2480 
2481 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2482 
2483 		BUG_ON(nr_pages < lvl_pages);
2484 		BUG_ON(sg_res < lvl_pages);
2485 
2486 		nr_pages -= lvl_pages;
2487 		iov_pfn += lvl_pages;
2488 		phys_pfn += lvl_pages;
2489 		pteval += lvl_pages * VTD_PAGE_SIZE;
2490 		sg_res -= lvl_pages;
2491 
2492 		/* If the next PTE would be the first in a new page, then we
2493 		   need to flush the cache on the entries we've just written.
2494 		   And then we'll need to recalculate 'pte', so clear it and
2495 		   let it get set again in the if (!pte) block above.
2496 
2497 		   If we're done (!nr_pages) we need to flush the cache too.
2498 
2499 		   Also if we've been setting superpages, we may need to
2500 		   recalculate 'pte' and switch back to smaller pages for the
2501 		   end of the mapping, if the trailing size is not enough to
2502 		   use another superpage (i.e. sg_res < lvl_pages). */
2503 		pte++;
2504 		if (!nr_pages || first_pte_in_page(pte) ||
2505 		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2506 			domain_flush_cache(domain, first_pte,
2507 					   (void *)pte - (void *)first_pte);
2508 			pte = NULL;
2509 		}
2510 
2511 		if (!sg_res && nr_pages)
2512 			sg = sg_next(sg);
2513 	}
2514 	return 0;
2515 }
2516 
2517 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2518 			  struct scatterlist *sg, unsigned long phys_pfn,
2519 			  unsigned long nr_pages, int prot)
2520 {
2521 	int iommu_id, ret;
2522 	struct intel_iommu *iommu;
2523 
2524 	/* Do the real mapping first */
2525 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2526 	if (ret)
2527 		return ret;
2528 
2529 	for_each_domain_iommu(iommu_id, domain) {
2530 		iommu = g_iommus[iommu_id];
2531 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2532 	}
2533 
2534 	return 0;
2535 }
2536 
2537 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2538 				    struct scatterlist *sg, unsigned long nr_pages,
2539 				    int prot)
2540 {
2541 	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2542 }
2543 
2544 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2545 				     unsigned long phys_pfn, unsigned long nr_pages,
2546 				     int prot)
2547 {
2548 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2549 }
2550 
2551 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2552 {
2553 	unsigned long flags;
2554 	struct context_entry *context;
2555 	u16 did_old;
2556 
2557 	if (!iommu)
2558 		return;
2559 
2560 	spin_lock_irqsave(&iommu->lock, flags);
2561 	context = iommu_context_addr(iommu, bus, devfn, 0);
2562 	if (!context) {
2563 		spin_unlock_irqrestore(&iommu->lock, flags);
2564 		return;
2565 	}
2566 	did_old = context_domain_id(context);
2567 	context_clear_entry(context);
2568 	__iommu_flush_cache(iommu, context, sizeof(*context));
2569 	spin_unlock_irqrestore(&iommu->lock, flags);
2570 	iommu->flush.flush_context(iommu,
2571 				   did_old,
2572 				   (((u16)bus) << 8) | devfn,
2573 				   DMA_CCMD_MASK_NOBIT,
2574 				   DMA_CCMD_DEVICE_INVL);
2575 
2576 	if (sm_supported(iommu))
2577 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2578 
2579 	iommu->flush.flush_iotlb(iommu,
2580 				 did_old,
2581 				 0,
2582 				 0,
2583 				 DMA_TLB_DSI_FLUSH);
2584 }
2585 
2586 static inline void unlink_domain_info(struct device_domain_info *info)
2587 {
2588 	assert_spin_locked(&device_domain_lock);
2589 	list_del(&info->link);
2590 	list_del(&info->global);
2591 	if (info->dev)
2592 		dev_iommu_priv_set(info->dev, NULL);
2593 }
2594 
2595 static void domain_remove_dev_info(struct dmar_domain *domain)
2596 {
2597 	struct device_domain_info *info, *tmp;
2598 	unsigned long flags;
2599 
2600 	spin_lock_irqsave(&device_domain_lock, flags);
2601 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2602 		__dmar_remove_one_dev_info(info);
2603 	spin_unlock_irqrestore(&device_domain_lock, flags);
2604 }
2605 
2606 struct dmar_domain *find_domain(struct device *dev)
2607 {
2608 	struct device_domain_info *info;
2609 
2610 	if (unlikely(!dev || !dev->iommu))
2611 		return NULL;
2612 
2613 	if (unlikely(attach_deferred(dev)))
2614 		return NULL;
2615 
2616 	/* No lock here, assumes no domain exit in normal case */
2617 	info = get_domain_info(dev);
2618 	if (likely(info))
2619 		return info->domain;
2620 
2621 	return NULL;
2622 }
2623 
2624 static void do_deferred_attach(struct device *dev)
2625 {
2626 	struct iommu_domain *domain;
2627 
2628 	dev_iommu_priv_set(dev, NULL);
2629 	domain = iommu_get_domain_for_dev(dev);
2630 	if (domain)
2631 		intel_iommu_attach_device(domain, dev);
2632 }
2633 
2634 static inline struct device_domain_info *
2635 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2636 {
2637 	struct device_domain_info *info;
2638 
2639 	list_for_each_entry(info, &device_domain_list, global)
2640 		if (info->segment == segment && info->bus == bus &&
2641 		    info->devfn == devfn)
2642 			return info;
2643 
2644 	return NULL;
2645 }
2646 
2647 static int domain_setup_first_level(struct intel_iommu *iommu,
2648 				    struct dmar_domain *domain,
2649 				    struct device *dev,
2650 				    u32 pasid)
2651 {
2652 	struct dma_pte *pgd = domain->pgd;
2653 	int agaw, level;
2654 	int flags = 0;
2655 
2656 	/*
2657 	 * Skip top levels of page tables for an iommu which has
2658 	 * less agaw than the default. Unnecessary for PT mode.
2659 	 */
2660 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2661 		pgd = phys_to_virt(dma_pte_addr(pgd));
2662 		if (!dma_pte_present(pgd))
2663 			return -ENOMEM;
2664 	}
2665 
2666 	level = agaw_to_level(agaw);
2667 	if (level != 4 && level != 5)
2668 		return -EINVAL;
2669 
2670 	if (pasid != PASID_RID2PASID)
2671 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2672 	if (level == 5)
2673 		flags |= PASID_FLAG_FL5LP;
2674 
2675 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2676 		flags |= PASID_FLAG_PAGE_SNOOP;
2677 
2678 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2679 					     domain->iommu_did[iommu->seq_id],
2680 					     flags);
2681 }
2682 
2683 static bool dev_is_real_dma_subdevice(struct device *dev)
2684 {
2685 	return dev && dev_is_pci(dev) &&
2686 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2687 }
2688 
2689 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2690 						    int bus, int devfn,
2691 						    struct device *dev,
2692 						    struct dmar_domain *domain)
2693 {
2694 	struct dmar_domain *found = NULL;
2695 	struct device_domain_info *info;
2696 	unsigned long flags;
2697 	int ret;
2698 
2699 	info = alloc_devinfo_mem();
2700 	if (!info)
2701 		return NULL;
2702 
2703 	if (!dev_is_real_dma_subdevice(dev)) {
2704 		info->bus = bus;
2705 		info->devfn = devfn;
2706 		info->segment = iommu->segment;
2707 	} else {
2708 		struct pci_dev *pdev = to_pci_dev(dev);
2709 
2710 		info->bus = pdev->bus->number;
2711 		info->devfn = pdev->devfn;
2712 		info->segment = pci_domain_nr(pdev->bus);
2713 	}
2714 
2715 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2716 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2717 	info->ats_qdep = 0;
2718 	info->dev = dev;
2719 	info->domain = domain;
2720 	info->iommu = iommu;
2721 	info->pasid_table = NULL;
2722 	info->auxd_enabled = 0;
2723 	INIT_LIST_HEAD(&info->auxiliary_domains);
2724 
2725 	if (dev && dev_is_pci(dev)) {
2726 		struct pci_dev *pdev = to_pci_dev(info->dev);
2727 
2728 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2729 		    pci_ats_supported(pdev) &&
2730 		    dmar_find_matched_atsr_unit(pdev))
2731 			info->ats_supported = 1;
2732 
2733 		if (sm_supported(iommu)) {
2734 			if (pasid_supported(iommu)) {
2735 				int features = pci_pasid_features(pdev);
2736 				if (features >= 0)
2737 					info->pasid_supported = features | 1;
2738 			}
2739 
2740 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2741 			    pci_pri_supported(pdev))
2742 				info->pri_supported = 1;
2743 		}
2744 	}
2745 
2746 	spin_lock_irqsave(&device_domain_lock, flags);
2747 	if (dev)
2748 		found = find_domain(dev);
2749 
2750 	if (!found) {
2751 		struct device_domain_info *info2;
2752 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2753 						       info->devfn);
2754 		if (info2) {
2755 			found      = info2->domain;
2756 			info2->dev = dev;
2757 		}
2758 	}
2759 
2760 	if (found) {
2761 		spin_unlock_irqrestore(&device_domain_lock, flags);
2762 		free_devinfo_mem(info);
2763 		/* Caller must free the original domain */
2764 		return found;
2765 	}
2766 
2767 	spin_lock(&iommu->lock);
2768 	ret = domain_attach_iommu(domain, iommu);
2769 	spin_unlock(&iommu->lock);
2770 
2771 	if (ret) {
2772 		spin_unlock_irqrestore(&device_domain_lock, flags);
2773 		free_devinfo_mem(info);
2774 		return NULL;
2775 	}
2776 
2777 	list_add(&info->link, &domain->devices);
2778 	list_add(&info->global, &device_domain_list);
2779 	if (dev)
2780 		dev_iommu_priv_set(dev, info);
2781 	spin_unlock_irqrestore(&device_domain_lock, flags);
2782 
2783 	/* PASID table is mandatory for a PCI device in scalable mode. */
2784 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2785 		ret = intel_pasid_alloc_table(dev);
2786 		if (ret) {
2787 			dev_err(dev, "PASID table allocation failed\n");
2788 			dmar_remove_one_dev_info(dev);
2789 			return NULL;
2790 		}
2791 
2792 		/* Setup the PASID entry for requests without PASID: */
2793 		spin_lock_irqsave(&iommu->lock, flags);
2794 		if (hw_pass_through && domain_type_is_si(domain))
2795 			ret = intel_pasid_setup_pass_through(iommu, domain,
2796 					dev, PASID_RID2PASID);
2797 		else if (domain_use_first_level(domain))
2798 			ret = domain_setup_first_level(iommu, domain, dev,
2799 					PASID_RID2PASID);
2800 		else
2801 			ret = intel_pasid_setup_second_level(iommu, domain,
2802 					dev, PASID_RID2PASID);
2803 		spin_unlock_irqrestore(&iommu->lock, flags);
2804 		if (ret) {
2805 			dev_err(dev, "Setup RID2PASID failed\n");
2806 			dmar_remove_one_dev_info(dev);
2807 			return NULL;
2808 		}
2809 	}
2810 
2811 	if (dev && domain_context_mapping(domain, dev)) {
2812 		dev_err(dev, "Domain context map failed\n");
2813 		dmar_remove_one_dev_info(dev);
2814 		return NULL;
2815 	}
2816 
2817 	return domain;
2818 }
2819 
2820 static int iommu_domain_identity_map(struct dmar_domain *domain,
2821 				     unsigned long first_vpfn,
2822 				     unsigned long last_vpfn)
2823 {
2824 	/*
2825 	 * The RMRR range might overlap with the physical memory range;
2826 	 * clear it first.
2827 	 */
2828 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2829 
2830 	return __domain_mapping(domain, first_vpfn, NULL,
2831 				first_vpfn, last_vpfn - first_vpfn + 1,
2832 				DMA_PTE_READ|DMA_PTE_WRITE);
2833 }
2834 
2835 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2836 
2837 static int __init si_domain_init(int hw)
2838 {
2839 	struct dmar_rmrr_unit *rmrr;
2840 	struct device *dev;
2841 	int i, nid, ret;
2842 
2843 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2844 	if (!si_domain)
2845 		return -EFAULT;
2846 
2847 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2848 		domain_exit(si_domain);
2849 		si_domain = NULL;
2850 		return -EFAULT;
2851 	}
2852 
2853 	if (hw)
2854 		return 0;
2855 
2856 	for_each_online_node(nid) {
2857 		unsigned long start_pfn, end_pfn;
2858 		int i;
2859 
2860 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2861 			ret = iommu_domain_identity_map(si_domain,
2862 					mm_to_dma_pfn(start_pfn),
2863 					mm_to_dma_pfn(end_pfn));
2864 			if (ret)
2865 				return ret;
2866 		}
2867 	}
2868 
2869 	/*
2870 	 * Identity map the RMRRs so that devices with RMRRs can also use
2871 	 * the si_domain.
2872 	 */
2873 	for_each_rmrr_units(rmrr) {
2874 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2875 					  i, dev) {
2876 			unsigned long long start = rmrr->base_address;
2877 			unsigned long long end = rmrr->end_address;
2878 
2879 			if (WARN_ON(end < start ||
2880 				    end >> agaw_to_width(si_domain->agaw)))
2881 				continue;
2882 
2883 			ret = iommu_domain_identity_map(si_domain,
2884 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2885 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2886 			if (ret)
2887 				return ret;
2888 		}
2889 	}
2890 
2891 	return 0;
2892 }
2893 
2894 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2895 {
2896 	struct dmar_domain *ndomain;
2897 	struct intel_iommu *iommu;
2898 	u8 bus, devfn;
2899 
2900 	iommu = device_to_iommu(dev, &bus, &devfn);
2901 	if (!iommu)
2902 		return -ENODEV;
2903 
2904 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2905 	if (ndomain != domain)
2906 		return -EBUSY;
2907 
2908 	return 0;
2909 }
2910 
2911 static bool device_has_rmrr(struct device *dev)
2912 {
2913 	struct dmar_rmrr_unit *rmrr;
2914 	struct device *tmp;
2915 	int i;
2916 
2917 	rcu_read_lock();
2918 	for_each_rmrr_units(rmrr) {
2919 		/*
2920 		 * Return TRUE if this RMRR contains the device that
2921 		 * is passed in.
2922 		 */
2923 		for_each_active_dev_scope(rmrr->devices,
2924 					  rmrr->devices_cnt, i, tmp)
2925 			if (tmp == dev ||
2926 			    is_downstream_to_pci_bridge(dev, tmp)) {
2927 				rcu_read_unlock();
2928 				return true;
2929 			}
2930 	}
2931 	rcu_read_unlock();
2932 	return false;
2933 }
2934 
2935 /**
2936  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2937  * is relaxable (i.e. it is allowed not to be enforced under some conditions)
2938  * @dev: device handle
2939  *
2940  * We assume that PCI USB devices with RMRRs have them largely
2941  * for historical reasons and that the RMRR space is not actively used post
2942  * boot.  This exclusion may change if vendors begin to abuse it.
2943  *
2944  * The same exception is made for graphics devices, with the requirement that
2945  * any use of the RMRR regions will be torn down before assigning the device
2946  * to a guest.
2947  *
2948  * Return: true if the RMRR is relaxable, false otherwise
2949  */
2950 static bool device_rmrr_is_relaxable(struct device *dev)
2951 {
2952 	struct pci_dev *pdev;
2953 
2954 	if (!dev_is_pci(dev))
2955 		return false;
2956 
2957 	pdev = to_pci_dev(dev);
2958 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2959 		return true;
2960 	else
2961 		return false;
2962 }
2963 
2964 /*
2965  * There are a couple of cases where we need to restrict the functionality of
2966  * devices associated with RMRRs.  The first is when evaluating a device for
2967  * identity mapping because problems exist when devices are moved in and out
2968  * of domains and their respective RMRR information is lost.  This means that
2969  * a device with associated RMRRs will never be in a "passthrough" domain.
2970  * The second is use of the device through the IOMMU API.  This interface
2971  * expects to have full control of the IOVA space for the device.  We cannot
2972  * satisfy both the requirement that RMRR access is maintained and have an
2973  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2974  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2975  * We therefore prevent devices associated with an RMRR from participating in
2976  * the IOMMU API, which eliminates them from device assignment.
2977  *
2978  * In both cases, devices which have relaxable RMRRs are not concerned by this
2979  * restriction. See device_rmrr_is_relaxable comment.
2980  */
2981 static bool device_is_rmrr_locked(struct device *dev)
2982 {
2983 	if (!device_has_rmrr(dev))
2984 		return false;
2985 
2986 	if (device_rmrr_is_relaxable(dev))
2987 		return false;
2988 
2989 	return true;
2990 }
2991 
2992 /*
2993  * Return the required default domain type for a specific device.
2994  *
2995  * @dev: the device in query
2997  *
2998  * Returns:
2999  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3000  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
3001  *  - 0: both identity and dynamic domains work for this device
3002  */
3003 static int device_def_domain_type(struct device *dev)
3004 {
3005 	if (dev_is_pci(dev)) {
3006 		struct pci_dev *pdev = to_pci_dev(dev);
3007 
3008 		/*
3009 		 * Prevent any device marked as untrusted from getting
3010 		 * placed into the statically identity mapping domain.
3011 		 */
3012 		if (pdev->untrusted)
3013 			return IOMMU_DOMAIN_DMA;
3014 
3015 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3016 			return IOMMU_DOMAIN_IDENTITY;
3017 
3018 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3019 			return IOMMU_DOMAIN_IDENTITY;
3020 	}
3021 
3022 	return 0;
3023 }
3024 
3025 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3026 {
3027 	/*
3028 	 * Start from a sane iommu hardware state.
3029 	 * If the queued invalidation was already initialized by us
3030 	 * (for example, while enabling interrupt-remapping) then
3031 	 * things are already rolling from a sane state.
3032 	 */
3033 	if (!iommu->qi) {
3034 		/*
3035 		 * Clear any previous faults.
3036 		 */
3037 		dmar_fault(-1, iommu);
3038 		/*
3039 		 * Disable queued invalidation if supported and already enabled
3040 		 * before OS handover.
3041 		 */
3042 		dmar_disable_qi(iommu);
3043 	}
3044 
3045 	if (dmar_enable_qi(iommu)) {
3046 		/*
3047 		 * Queued Invalidate not enabled, use Register Based Invalidate
3048 		 */
3049 		iommu->flush.flush_context = __iommu_flush_context;
3050 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051 		pr_info("%s: Using Register based invalidation\n",
3052 			iommu->name);
3053 	} else {
3054 		iommu->flush.flush_context = qi_flush_context;
3055 		iommu->flush.flush_iotlb = qi_flush_iotlb;
3056 		pr_info("%s: Using Queued invalidation\n", iommu->name);
3057 	}
3058 }
3059 
3060 static int copy_context_table(struct intel_iommu *iommu,
3061 			      struct root_entry *old_re,
3062 			      struct context_entry **tbl,
3063 			      int bus, bool ext)
3064 {
3065 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066 	struct context_entry *new_ce = NULL, ce;
3067 	struct context_entry *old_ce = NULL;
3068 	struct root_entry re;
3069 	phys_addr_t old_ce_phys;
3070 
3071 	tbl_idx = ext ? bus * 2 : bus;
3072 	memcpy(&re, old_re, sizeof(re));
3073 
3074 	for (devfn = 0; devfn < 256; devfn++) {
3075 		/* First calculate the correct index */
3076 		idx = (ext ? devfn * 2 : devfn) % 256;
3077 
3078 		if (idx == 0) {
3079 			/* First save what we may have and clean up */
3080 			if (new_ce) {
3081 				tbl[tbl_idx] = new_ce;
3082 				__iommu_flush_cache(iommu, new_ce,
3083 						    VTD_PAGE_SIZE);
3084 				pos = 1;
3085 			}
3086 
3087 			if (old_ce)
3088 				memunmap(old_ce);
3089 
3090 			ret = 0;
3091 			if (devfn < 0x80)
3092 				old_ce_phys = root_entry_lctp(&re);
3093 			else
3094 				old_ce_phys = root_entry_uctp(&re);
3095 
3096 			if (!old_ce_phys) {
3097 				if (ext && devfn == 0) {
3098 					/* No LCTP, try UCTP */
3099 					devfn = 0x7f;
3100 					continue;
3101 				} else {
3102 					goto out;
3103 				}
3104 			}
3105 
3106 			ret = -ENOMEM;
3107 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3108 					MEMREMAP_WB);
3109 			if (!old_ce)
3110 				goto out;
3111 
3112 			new_ce = alloc_pgtable_page(iommu->node);
3113 			if (!new_ce)
3114 				goto out_unmap;
3115 
3116 			ret = 0;
3117 		}
3118 
3119 		/* Now copy the context entry */
3120 		memcpy(&ce, old_ce + idx, sizeof(ce));
3121 
3122 		if (!__context_present(&ce))
3123 			continue;
3124 
3125 		did = context_domain_id(&ce);
3126 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3127 			set_bit(did, iommu->domain_ids);
3128 
3129 		/*
3130 		 * We need a marker for copied context entries. This
3131 		 * marker needs to work for the old format as well as
3132 		 * for extended context entries.
3133 		 *
3134 		 * Bit 67 of the context entry is used. In the old
3135 		 * format this bit is available to software, in the
3136 		 * extended format it is the PGE bit, but PGE is ignored
3137 		 * by HW if PASIDs are disabled (and thus still
3138 		 * available).
3139 		 *
3140 		 * So disable PASIDs first and then mark the entry
3141 		 * copied. This means that we don't copy PASID
3142 		 * translations from the old kernel, but this is fine as
3143 		 * faults there are not fatal.
3144 		 */
3145 		context_clear_pasid_enable(&ce);
3146 		context_set_copied(&ce);
3147 
3148 		new_ce[idx] = ce;
3149 	}
3150 
3151 	tbl[tbl_idx + pos] = new_ce;
3152 
3153 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3154 
3155 out_unmap:
3156 	memunmap(old_ce);
3157 
3158 out:
3159 	return ret;
3160 }
3161 
3162 static int copy_translation_tables(struct intel_iommu *iommu)
3163 {
3164 	struct context_entry **ctxt_tbls;
3165 	struct root_entry *old_rt;
3166 	phys_addr_t old_rt_phys;
3167 	int ctxt_table_entries;
3168 	unsigned long flags;
3169 	u64 rtaddr_reg;
3170 	int bus, ret;
3171 	bool new_ext, ext;
3172 
3173 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175 	new_ext    = !!ecap_ecs(iommu->ecap);
3176 
3177 	/*
3178 	 * The RTT bit can only be changed when translation is disabled,
3179 	 * but disabling translation means opening a window for data
3180 	 * corruption. So bail out and don't copy anything if we would
3181 	 * have to change the bit.
3182 	 */
3183 	if (new_ext != ext)
3184 		return -EINVAL;
3185 
3186 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3187 	if (!old_rt_phys)
3188 		return -EINVAL;
3189 
3190 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3191 	if (!old_rt)
3192 		return -ENOMEM;
3193 
3194 	/* This is too big for the stack - allocate it from slab */
3195 	ctxt_table_entries = ext ? 512 : 256;
3196 	ret = -ENOMEM;
3197 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3198 	if (!ctxt_tbls)
3199 		goto out_unmap;
3200 
3201 	for (bus = 0; bus < 256; bus++) {
3202 		ret = copy_context_table(iommu, &old_rt[bus],
3203 					 ctxt_tbls, bus, ext);
3204 		if (ret) {
3205 			pr_err("%s: Failed to copy context table for bus %d\n",
3206 				iommu->name, bus);
3207 			continue;
3208 		}
3209 	}
3210 
3211 	spin_lock_irqsave(&iommu->lock, flags);
3212 
3213 	/* Context tables are copied, now write them to the root_entry table */
3214 	for (bus = 0; bus < 256; bus++) {
3215 		int idx = ext ? bus * 2 : bus;
3216 		u64 val;
3217 
3218 		if (ctxt_tbls[idx]) {
3219 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220 			iommu->root_entry[bus].lo = val;
3221 		}
3222 
3223 		if (!ext || !ctxt_tbls[idx + 1])
3224 			continue;
3225 
3226 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227 		iommu->root_entry[bus].hi = val;
3228 	}
3229 
3230 	spin_unlock_irqrestore(&iommu->lock, flags);
3231 
3232 	kfree(ctxt_tbls);
3233 
3234 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3235 
3236 	ret = 0;
3237 
3238 out_unmap:
3239 	memunmap(old_rt);
3240 
3241 	return ret;
3242 }
3243 
3244 #ifdef CONFIG_INTEL_IOMMU_SVM
3245 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3246 {
3247 	struct intel_iommu *iommu = data;
3248 	ioasid_t ioasid;
3249 
3250 	if (!iommu)
3251 		return INVALID_IOASID;
3252 	/*
3253 	 * VT-d virtual command interface always uses the full 20 bit
3254 	 * PASID range. Host can partition guest PASID range based on
3255 	 * policies but it is out of guest's control.
3256 	 */
3257 	if (min < PASID_MIN || max > intel_pasid_max_id)
3258 		return INVALID_IOASID;
3259 
3260 	if (vcmd_alloc_pasid(iommu, &ioasid))
3261 		return INVALID_IOASID;
3262 
3263 	return ioasid;
3264 }
3265 
3266 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3267 {
3268 	struct intel_iommu *iommu = data;
3269 
3270 	if (!iommu)
3271 		return;
3272 	/*
3273 	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3274 	 * We can only free the PASID when all the devices are unbound.
3275 	 */
3276 	if (ioasid_find(NULL, ioasid, NULL)) {
3277 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3278 		return;
3279 	}
3280 	vcmd_free_pasid(iommu, ioasid);
3281 }
3282 
3283 static void register_pasid_allocator(struct intel_iommu *iommu)
3284 {
3285 	/*
3286 	 * If we are running in the host, there is no need for a custom
3287 	 * allocator, since PASIDs are allocated from the host system-wide.
3288 	 */
3289 	if (!cap_caching_mode(iommu->cap))
3290 		return;
3291 
3292 	if (!sm_supported(iommu)) {
3293 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3294 		return;
3295 	}
3296 
3297 	/*
3298 	 * Register a custom PASID allocator if we are running in a guest;
3299 	 * guest PASIDs must be obtained via the virtual command interface.
3300 	 * There can be multiple vIOMMUs in each guest but only one allocator
3301 	 * is active. All vIOMMU allocators will eventually be calling the same
3302 	 * host allocator.
3303 	 */
3304 	if (!vccap_pasid(iommu->vccap))
3305 		return;
3306 
3307 	pr_info("Register custom PASID allocator\n");
3308 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3309 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3310 	iommu->pasid_allocator.pdata = (void *)iommu;
3311 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3312 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3313 		/*
3314 		 * Disable scalable mode on this IOMMU if there
3315 		 * is no custom allocator. Mixing SM-capable vIOMMUs
3316 		 * and non-SM vIOMMUs is not supported.
3317 		 */
3318 		intel_iommu_sm = 0;
3319 	}
3320 }
3321 #endif
3322 
3323 static int __init init_dmars(void)
3324 {
3325 	struct dmar_drhd_unit *drhd;
3326 	struct intel_iommu *iommu;
3327 	int ret;
3328 
3329 	/*
3330 	 * for each drhd
3331 	 *    allocate root
3332 	 *    initialize and program root entry to not present
3333 	 * endfor
3334 	 */
3335 	for_each_drhd_unit(drhd) {
3336 		/*
3337 		 * Lock not needed as this is only incremented in the
3338 		 * single-threaded kernel __init code path; all other
3339 		 * accesses are read-only.
3340 		 */
3341 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3342 			g_num_of_iommus++;
3343 			continue;
3344 		}
3345 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3346 	}
3347 
3348 	/* Preallocate enough resources for IOMMU hot-addition */
3349 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3350 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3351 
3352 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3353 			GFP_KERNEL);
3354 	if (!g_iommus) {
3355 		pr_err("Allocating global iommu array failed\n");
3356 		ret = -ENOMEM;
3357 		goto error;
3358 	}
3359 
3360 	for_each_iommu(iommu, drhd) {
3361 		if (drhd->ignored) {
3362 			iommu_disable_translation(iommu);
3363 			continue;
3364 		}
3365 
3366 		/*
3367 		 * Find the max pasid size of all IOMMUs in the system.
3368 		 * We need to ensure the system pasid table is no bigger
3369 		 * than the smallest supported.
3370 		 */
3371 		if (pasid_supported(iommu)) {
3372 			u32 temp = 2 << ecap_pss(iommu->ecap);
3373 
3374 			intel_pasid_max_id = min_t(u32, temp,
3375 						   intel_pasid_max_id);
3376 		}
3377 
3378 		g_iommus[iommu->seq_id] = iommu;
3379 
3380 		intel_iommu_init_qi(iommu);
3381 
3382 		ret = iommu_init_domains(iommu);
3383 		if (ret)
3384 			goto free_iommu;
3385 
3386 		init_translation_status(iommu);
3387 
3388 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3389 			iommu_disable_translation(iommu);
3390 			clear_translation_pre_enabled(iommu);
3391 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3392 				iommu->name);
3393 		}
3394 
3395 		/*
3396 		 * TBD:
3397 		 * we could share the same root & context tables
3398 		 * among all IOMMUs. Need to split it later.
3399 		 */
3400 		ret = iommu_alloc_root_entry(iommu);
3401 		if (ret)
3402 			goto free_iommu;
3403 
3404 		if (translation_pre_enabled(iommu)) {
3405 			pr_info("Translation already enabled - trying to copy translation structures\n");
3406 
3407 			ret = copy_translation_tables(iommu);
3408 			if (ret) {
3409 				/*
3410 				 * We found the IOMMU with translation
3411 				 * enabled - but failed to copy over the
3412 				 * old root-entry table. Try to proceed
3413 				 * by disabling translation now and
3414 				 * allocating a clean root-entry table.
3415 				 * This might cause DMAR faults, but
3416 				 * probably the dump will still succeed.
3417 				 */
3418 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3419 				       iommu->name);
3420 				iommu_disable_translation(iommu);
3421 				clear_translation_pre_enabled(iommu);
3422 			} else {
3423 				pr_info("Copied translation tables from previous kernel for %s\n",
3424 					iommu->name);
3425 			}
3426 		}
3427 
3428 		if (!ecap_pass_through(iommu->ecap))
3429 			hw_pass_through = 0;
3430 
3431 		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3432 			pr_warn("Disable batched IOTLB flush due to virtualization\n");
3433 			intel_iommu_strict = 1;
3434 		}
3435 		intel_svm_check(iommu);
3436 	}
3437 
3438 	/*
3439 	 * Now that qi is enabled on all iommus, set the root entry and flush
3440 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3441 	 * flush_context function will loop forever and the boot hangs.
3442 	 */
3443 	for_each_active_iommu(iommu, drhd) {
3444 		iommu_flush_write_buffer(iommu);
3445 #ifdef CONFIG_INTEL_IOMMU_SVM
3446 		register_pasid_allocator(iommu);
3447 #endif
3448 		iommu_set_root_entry(iommu);
3449 	}
3450 
3451 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3452 	dmar_map_gfx = 0;
3453 #endif
3454 
3455 	if (!dmar_map_gfx)
3456 		iommu_identity_mapping |= IDENTMAP_GFX;
3457 
3458 	check_tylersburg_isoch();
3459 
3460 	ret = si_domain_init(hw_pass_through);
3461 	if (ret)
3462 		goto free_iommu;
3463 
3464 	/*
3465 	 * for each drhd
3466 	 *   enable fault log
3467 	 *   global invalidate context cache
3468 	 *   global invalidate iotlb
3469 	 *   enable translation
3470 	 */
3471 	for_each_iommu(iommu, drhd) {
3472 		if (drhd->ignored) {
3473 			/*
3474 			 * we always have to disable PMRs or DMA may fail on
3475 			 * this device
3476 			 */
3477 			if (force_on)
3478 				iommu_disable_protect_mem_regions(iommu);
3479 			continue;
3480 		}
3481 
3482 		iommu_flush_write_buffer(iommu);
3483 
3484 #ifdef CONFIG_INTEL_IOMMU_SVM
3485 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3486 			/*
3487 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3488 			 * could cause a lock race condition, so drop the lock here.
3489 			 */
3490 			up_write(&dmar_global_lock);
3491 			ret = intel_svm_enable_prq(iommu);
3492 			down_write(&dmar_global_lock);
3493 			if (ret)
3494 				goto free_iommu;
3495 		}
3496 #endif
3497 		ret = dmar_set_interrupt(iommu);
3498 		if (ret)
3499 			goto free_iommu;
3500 	}
3501 
3502 	return 0;
3503 
3504 free_iommu:
3505 	for_each_active_iommu(iommu, drhd) {
3506 		disable_dmar_iommu(iommu);
3507 		free_dmar_iommu(iommu);
3508 	}
3509 	if (si_domain) {
3510 		domain_exit(si_domain);
3511 		si_domain = NULL;
3512 	}
3513 
3514 	kfree(g_iommus);
3515 
3516 error:
3517 	return ret;
3518 }
3519 
3520 /* This takes a number of _MM_ pages, not VTD pages */
3521 static unsigned long intel_alloc_iova(struct device *dev,
3522 				     struct dmar_domain *domain,
3523 				     unsigned long nrpages, uint64_t dma_mask)
3524 {
3525 	unsigned long iova_pfn;
3526 
3527 	/*
3528 	 * Restrict dma_mask to the width that the iommu can handle.
3529 	 * First-level translation restricts the input-address to a
3530 	 * canonical address (i.e., address bits 63:N have the same
3531 	 * value as address bit [N-1], where N is 48-bits with 4-level
3532 	 * paging and 57-bits with 5-level paging). Hence, skip bit
3533 	 * [N-1].
3534 	 */
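	/*
	 * Illustration: with 4-level paging the input address is 48 bits,
	 * so the usable IOVA space is clamped to DOMAIN_MAX_ADDR(gaw - 1),
	 * i.e. 47 bits, leaving bit 47 for canonical sign-extension.
	 */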
3535 	if (domain_use_first_level(domain))
3536 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3537 				 dma_mask);
3538 	else
3539 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3540 				 dma_mask);
3541 
3542 	/* Ensure we reserve the whole size-aligned region */
3543 	nrpages = __roundup_pow_of_two(nrpages);
3544 
3545 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3546 		/*
3547 		 * First try to allocate an io virtual address in
3548 		 * DMA_BIT_MASK(32) and if that fails then try allocating
3549 		 * from higher range
3550 		 */
3551 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3552 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3553 		if (iova_pfn)
3554 			return iova_pfn;
3555 	}
3556 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3557 				   IOVA_PFN(dma_mask), true);
3558 	if (unlikely(!iova_pfn)) {
3559 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3560 			     nrpages);
3561 		return 0;
3562 	}
3563 
3564 	return iova_pfn;
3565 }
3566 
3567 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3568 				     size_t size, int dir, u64 dma_mask)
3569 {
3570 	struct dmar_domain *domain;
3571 	phys_addr_t start_paddr;
3572 	unsigned long iova_pfn;
3573 	int prot = 0;
3574 	int ret;
3575 	struct intel_iommu *iommu;
3576 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3577 
3578 	BUG_ON(dir == DMA_NONE);
3579 
3580 	if (unlikely(attach_deferred(dev)))
3581 		do_deferred_attach(dev);
3582 
3583 	domain = find_domain(dev);
3584 	if (!domain)
3585 		return DMA_MAPPING_ERROR;
3586 
3587 	iommu = domain_get_iommu(domain);
3588 	size = aligned_nrpages(paddr, size);
3589 
3590 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3591 	if (!iova_pfn)
3592 		goto error;
3593 
3594 	/*
3595 	 * Check if DMAR supports zero-length reads on write-only
3596 	 * mappings.
3597 	 */
3598 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3599 			!cap_zlr(iommu->cap))
3600 		prot |= DMA_PTE_READ;
3601 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3602 		prot |= DMA_PTE_WRITE;
3603 	/*
3604 	 * paddr to (paddr + size) might cover a partial page, so we map the
3605 	 * whole page.  Note: if two parts of one page are mapped separately,
3606 	 * we might end up with two guest addresses mapping to the same host
3607 	 * paddr, but this is not a big problem.
3608 	 */
3609 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3610 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3611 	if (ret)
3612 		goto error;
3613 
3614 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3615 	start_paddr += paddr & ~PAGE_MASK;
3616 
3617 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3618 
3619 	return start_paddr;
3620 
3621 error:
3622 	if (iova_pfn)
3623 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3624 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3625 		size, (unsigned long long)paddr, dir);
3626 	return DMA_MAPPING_ERROR;
3627 }
3628 
3629 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3630 				 unsigned long offset, size_t size,
3631 				 enum dma_data_direction dir,
3632 				 unsigned long attrs)
3633 {
3634 	return __intel_map_single(dev, page_to_phys(page) + offset,
3635 				  size, dir, *dev->dma_mask);
3636 }
3637 
3638 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3639 				     size_t size, enum dma_data_direction dir,
3640 				     unsigned long attrs)
3641 {
3642 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3643 }
3644 
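/*
 * Common unmap path for the page/resource unmap callbacks: in strict mode,
 * for untrusted PCI devices, or when the domain has no IOVA flush queue,
 * the IOTLB is flushed synchronously and the IOVA freed immediately;
 * otherwise the flush and the IOVA release are deferred to the flush queue.
 */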
3645 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3646 {
3647 	struct dmar_domain *domain;
3648 	unsigned long start_pfn, last_pfn;
3649 	unsigned long nrpages;
3650 	unsigned long iova_pfn;
3651 	struct intel_iommu *iommu;
3652 	struct page *freelist;
3653 	struct pci_dev *pdev = NULL;
3654 
3655 	domain = find_domain(dev);
3656 	BUG_ON(!domain);
3657 
3658 	iommu = domain_get_iommu(domain);
3659 
3660 	iova_pfn = IOVA_PFN(dev_addr);
3661 
3662 	nrpages = aligned_nrpages(dev_addr, size);
3663 	start_pfn = mm_to_dma_pfn(iova_pfn);
3664 	last_pfn = start_pfn + nrpages - 1;
3665 
3666 	if (dev_is_pci(dev))
3667 		pdev = to_pci_dev(dev);
3668 
3669 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3670 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3671 			!has_iova_flush_queue(&domain->iovad)) {
3672 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3673 				      nrpages, !freelist, 0);
3674 		/* free iova */
3675 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3676 		dma_free_pagelist(freelist);
3677 	} else {
3678 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3679 			   (unsigned long)freelist);
3680 		/*
3681 		 * Queue up the release of the unmap to save the roughly 1/6th
3682 		 * of the CPU time used up by the IOTLB flush operation.
3683 		 */
3684 	}
3685 
3686 	trace_unmap_single(dev, dev_addr, size);
3687 }
3688 
3689 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3690 			     size_t size, enum dma_data_direction dir,
3691 			     unsigned long attrs)
3692 {
3693 	intel_unmap(dev, dev_addr, size);
3694 }
3695 
3696 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3697 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3698 {
3699 	intel_unmap(dev, dev_addr, size);
3700 }
3701 
3702 static void *intel_alloc_coherent(struct device *dev, size_t size,
3703 				  dma_addr_t *dma_handle, gfp_t flags,
3704 				  unsigned long attrs)
3705 {
3706 	struct page *page = NULL;
3707 	int order;
3708 
3709 	if (unlikely(attach_deferred(dev)))
3710 		do_deferred_attach(dev);
3711 
3712 	size = PAGE_ALIGN(size);
3713 	order = get_order(size);
3714 
3715 	if (gfpflags_allow_blocking(flags)) {
3716 		unsigned int count = size >> PAGE_SHIFT;
3717 
3718 		page = dma_alloc_from_contiguous(dev, count, order,
3719 						 flags & __GFP_NOWARN);
3720 	}
3721 
3722 	if (!page)
3723 		page = alloc_pages(flags, order);
3724 	if (!page)
3725 		return NULL;
3726 	memset(page_address(page), 0, size);
3727 
3728 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3729 					 DMA_BIDIRECTIONAL,
3730 					 dev->coherent_dma_mask);
3731 	if (*dma_handle != DMA_MAPPING_ERROR)
3732 		return page_address(page);
3733 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3734 		__free_pages(page, order);
3735 
3736 	return NULL;
3737 }
3738 
3739 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3740 				dma_addr_t dma_handle, unsigned long attrs)
3741 {
3742 	int order;
3743 	struct page *page = virt_to_page(vaddr);
3744 
3745 	size = PAGE_ALIGN(size);
3746 	order = get_order(size);
3747 
3748 	intel_unmap(dev, dma_handle, size);
3749 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3750 		__free_pages(page, order);
3751 }
3752 
3753 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3754 			   int nelems, enum dma_data_direction dir,
3755 			   unsigned long attrs)
3756 {
3757 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3758 	unsigned long nrpages = 0;
3759 	struct scatterlist *sg;
3760 	int i;
3761 
3762 	for_each_sg(sglist, sg, nelems, i) {
3763 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3764 	}
3765 
3766 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3767 
3768 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3769 }
3770 
3771 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3772 			enum dma_data_direction dir, unsigned long attrs)
3773 {
3774 	int i;
3775 	struct dmar_domain *domain;
3776 	size_t size = 0;
3777 	int prot = 0;
3778 	unsigned long iova_pfn;
3779 	int ret;
3780 	struct scatterlist *sg;
3781 	unsigned long start_vpfn;
3782 	struct intel_iommu *iommu;
3783 
3784 	BUG_ON(dir == DMA_NONE);
3785 
3786 	if (unlikely(attach_deferred(dev)))
3787 		do_deferred_attach(dev);
3788 
3789 	domain = find_domain(dev);
3790 	if (!domain)
3791 		return 0;
3792 
3793 	iommu = domain_get_iommu(domain);
3794 
3795 	for_each_sg(sglist, sg, nelems, i)
3796 		size += aligned_nrpages(sg->offset, sg->length);
3797 
3798 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3799 				*dev->dma_mask);
3800 	if (!iova_pfn) {
3801 		sglist->dma_length = 0;
3802 		return 0;
3803 	}
3804 
3805 	/*
3806 	 * Check if DMAR supports zero-length reads on write-only
3807 	 * mappings.
3808 	 */
3809 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3810 			!cap_zlr(iommu->cap))
3811 		prot |= DMA_PTE_READ;
3812 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813 		prot |= DMA_PTE_WRITE;
3814 
3815 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3816 
3817 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818 	if (unlikely(ret)) {
3819 		dma_pte_free_pagetable(domain, start_vpfn,
3820 				       start_vpfn + size - 1,
3821 				       agaw_to_level(domain->agaw) + 1);
3822 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3823 		return 0;
3824 	}
3825 
3826 	for_each_sg(sglist, sg, nelems, i)
3827 		trace_map_sg(dev, i + 1, nelems, sg);
3828 
3829 	return nelems;
3830 }
3831 
3832 static u64 intel_get_required_mask(struct device *dev)
3833 {
3834 	return DMA_BIT_MASK(32);
3835 }
3836 
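/*
 * DMA API callbacks backed by IOVA allocation and IOMMU page-table
 * mappings; devices whose DMA must be bounced through swiotlb are
 * expected to use bounce_dma_ops further below instead.
 */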
3837 static const struct dma_map_ops intel_dma_ops = {
3838 	.alloc = intel_alloc_coherent,
3839 	.free = intel_free_coherent,
3840 	.map_sg = intel_map_sg,
3841 	.unmap_sg = intel_unmap_sg,
3842 	.map_page = intel_map_page,
3843 	.unmap_page = intel_unmap_page,
3844 	.map_resource = intel_map_resource,
3845 	.unmap_resource = intel_unmap_resource,
3846 	.dma_supported = dma_direct_supported,
3847 	.mmap = dma_common_mmap,
3848 	.get_sgtable = dma_common_get_sgtable,
3849 	.alloc_pages = dma_common_alloc_pages,
3850 	.free_pages = dma_common_free_pages,
3851 	.get_required_mask = intel_get_required_mask,
3852 };
3853 
3854 static void
3855 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3856 		   enum dma_data_direction dir, enum dma_sync_target target)
3857 {
3858 	struct dmar_domain *domain;
3859 	phys_addr_t tlb_addr;
3860 
3861 	domain = find_domain(dev);
3862 	if (WARN_ON(!domain))
3863 		return;
3864 
3865 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3866 	if (is_swiotlb_buffer(tlb_addr))
3867 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3868 }
3869 
3870 static dma_addr_t
3871 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3872 		  enum dma_data_direction dir, unsigned long attrs,
3873 		  u64 dma_mask)
3874 {
3875 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3876 	struct dmar_domain *domain;
3877 	struct intel_iommu *iommu;
3878 	unsigned long iova_pfn;
3879 	unsigned long nrpages;
3880 	phys_addr_t tlb_addr;
3881 	int prot = 0;
3882 	int ret;
3883 
3884 	if (unlikely(attach_deferred(dev)))
3885 		do_deferred_attach(dev);
3886 
3887 	domain = find_domain(dev);
3888 
3889 	if (WARN_ON(dir == DMA_NONE || !domain))
3890 		return DMA_MAPPING_ERROR;
3891 
3892 	iommu = domain_get_iommu(domain);
3893 	if (WARN_ON(!iommu))
3894 		return DMA_MAPPING_ERROR;
3895 
3896 	nrpages = aligned_nrpages(0, size);
3897 	iova_pfn = intel_alloc_iova(dev, domain,
3898 				    dma_to_mm_pfn(nrpages), dma_mask);
3899 	if (!iova_pfn)
3900 		return DMA_MAPPING_ERROR;
3901 
3902 	/*
3903 	 * Check if DMAR supports zero-length reads on write-only
3904 	 * mappings.
3905 	 */
3906 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3907 			!cap_zlr(iommu->cap))
3908 		prot |= DMA_PTE_READ;
3909 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3910 		prot |= DMA_PTE_WRITE;
3911 
3912 	/*
3913 	 * If both the physical buffer start address and size are
3914 	 * page aligned, we don't need to use a bounce page.
3915 	 */
3916 	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3917 		tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3918 				aligned_size, dir, attrs);
3919 		if (tlb_addr == DMA_MAPPING_ERROR) {
3920 			goto swiotlb_error;
3921 		} else {
3922 			/* Cleanup the padding area. */
3923 			void *padding_start = phys_to_virt(tlb_addr);
3924 			size_t padding_size = aligned_size;
3925 
3926 			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3927 			    (dir == DMA_TO_DEVICE ||
3928 			     dir == DMA_BIDIRECTIONAL)) {
3929 				padding_start += size;
3930 				padding_size -= size;
3931 			}
3932 
3933 			memset(padding_start, 0, padding_size);
3934 		}
3935 	} else {
3936 		tlb_addr = paddr;
3937 	}
3938 
3939 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3940 				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3941 	if (ret)
3942 		goto mapping_error;
3943 
3944 	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3945 
3946 	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3947 
3948 mapping_error:
3949 	if (is_swiotlb_buffer(tlb_addr))
3950 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3951 					 aligned_size, dir, attrs);
3952 swiotlb_error:
3953 	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3954 	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3955 		size, (unsigned long long)paddr, dir);
3956 
3957 	return DMA_MAPPING_ERROR;
3958 }
3959 
3960 static void
3961 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3962 		    enum dma_data_direction dir, unsigned long attrs)
3963 {
3964 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3965 	struct dmar_domain *domain;
3966 	phys_addr_t tlb_addr;
3967 
3968 	domain = find_domain(dev);
3969 	if (WARN_ON(!domain))
3970 		return;
3971 
3972 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3973 	if (WARN_ON(!tlb_addr))
3974 		return;
3975 
3976 	intel_unmap(dev, dev_addr, size);
3977 	if (is_swiotlb_buffer(tlb_addr))
3978 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3979 					 aligned_size, dir, attrs);
3980 
3981 	trace_bounce_unmap_single(dev, dev_addr, size);
3982 }
3983 
3984 static dma_addr_t
3985 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3986 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3987 {
3988 	return bounce_map_single(dev, page_to_phys(page) + offset,
3989 				 size, dir, attrs, *dev->dma_mask);
3990 }
3991 
3992 static dma_addr_t
3993 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3994 		    enum dma_data_direction dir, unsigned long attrs)
3995 {
3996 	return bounce_map_single(dev, phys_addr, size,
3997 				 dir, attrs, *dev->dma_mask);
3998 }
3999 
4000 static void
4001 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4002 		  enum dma_data_direction dir, unsigned long attrs)
4003 {
4004 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4005 }
4006 
4007 static void
4008 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4009 		      enum dma_data_direction dir, unsigned long attrs)
4010 {
4011 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4012 }
4013 
4014 static void
4015 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4016 		enum dma_data_direction dir, unsigned long attrs)
4017 {
4018 	struct scatterlist *sg;
4019 	int i;
4020 
4021 	for_each_sg(sglist, sg, nelems, i)
4022 		bounce_unmap_page(dev, sg->dma_address,
4023 				  sg_dma_len(sg), dir, attrs);
4024 }
4025 
4026 static int
4027 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4028 	      enum dma_data_direction dir, unsigned long attrs)
4029 {
4030 	int i;
4031 	struct scatterlist *sg;
4032 
4033 	for_each_sg(sglist, sg, nelems, i) {
4034 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
4035 						  sg->offset, sg->length,
4036 						  dir, attrs);
4037 		if (sg->dma_address == DMA_MAPPING_ERROR)
4038 			goto out_unmap;
4039 		sg_dma_len(sg) = sg->length;
4040 	}
4041 
4042 	for_each_sg(sglist, sg, nelems, i)
4043 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
4044 
4045 	return nelems;
4046 
4047 out_unmap:
4048 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4049 	return 0;
4050 }
4051 
4052 static void
4053 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4054 			   size_t size, enum dma_data_direction dir)
4055 {
4056 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4057 }
4058 
4059 static void
4060 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4061 			      size_t size, enum dma_data_direction dir)
4062 {
4063 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4064 }
4065 
4066 static void
4067 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4068 		       int nelems, enum dma_data_direction dir)
4069 {
4070 	struct scatterlist *sg;
4071 	int i;
4072 
4073 	for_each_sg(sglist, sg, nelems, i)
4074 		bounce_sync_single(dev, sg_dma_address(sg),
4075 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
4076 }
4077 
4078 static void
4079 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4080 			  int nelems, enum dma_data_direction dir)
4081 {
4082 	struct scatterlist *sg;
4083 	int i;
4084 
4085 	for_each_sg(sglist, sg, nelems, i)
4086 		bounce_sync_single(dev, sg_dma_address(sg),
4087 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4088 }
4089 
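/*
 * DMA API callbacks that route buffers which are not VTD_PAGE_SIZE
 * aligned through swiotlb bounce buffers (see bounce_map_single()), so
 * that only whole IOMMU pages are ever exposed to the device.
 */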
4090 static const struct dma_map_ops bounce_dma_ops = {
4091 	.alloc			= intel_alloc_coherent,
4092 	.free			= intel_free_coherent,
4093 	.map_sg			= bounce_map_sg,
4094 	.unmap_sg		= bounce_unmap_sg,
4095 	.map_page		= bounce_map_page,
4096 	.unmap_page		= bounce_unmap_page,
4097 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
4098 	.sync_single_for_device	= bounce_sync_single_for_device,
4099 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
4100 	.sync_sg_for_device	= bounce_sync_sg_for_device,
4101 	.map_resource		= bounce_map_resource,
4102 	.unmap_resource		= bounce_unmap_resource,
4103 	.alloc_pages		= dma_common_alloc_pages,
4104 	.free_pages		= dma_common_free_pages,
4105 	.dma_supported		= dma_direct_supported,
4106 };
4107 
4108 static inline int iommu_domain_cache_init(void)
4109 {
4110 	int ret = 0;
4111 
4112 	iommu_domain_cache = kmem_cache_create("iommu_domain",
4113 					 sizeof(struct dmar_domain),
4114 					 0,
4115 					 SLAB_HWCACHE_ALIGN,
4116 
4117 					 NULL);
4118 	if (!iommu_domain_cache) {
4119 		pr_err("Couldn't create iommu_domain cache\n");
4120 		ret = -ENOMEM;
4121 	}
4122 
4123 	return ret;
4124 }
4125 
4126 static inline int iommu_devinfo_cache_init(void)
4127 {
4128 	int ret = 0;
4129 
4130 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4131 					 sizeof(struct device_domain_info),
4132 					 0,
4133 					 SLAB_HWCACHE_ALIGN,
4134 					 NULL);
4135 	if (!iommu_devinfo_cache) {
4136 		pr_err("Couldn't create devinfo cache\n");
4137 		ret = -ENOMEM;
4138 	}
4139 
4140 	return ret;
4141 }
4142 
4143 static int __init iommu_init_mempool(void)
4144 {
4145 	int ret;
4146 	ret = iova_cache_get();
4147 	if (ret)
4148 		return ret;
4149 
4150 	ret = iommu_domain_cache_init();
4151 	if (ret)
4152 		goto domain_error;
4153 
4154 	ret = iommu_devinfo_cache_init();
4155 	if (!ret)
4156 		return ret;
4157 
4158 	kmem_cache_destroy(iommu_domain_cache);
4159 domain_error:
4160 	iova_cache_put();
4161 
4162 	return -ENOMEM;
4163 }
4164 
4165 static void __init iommu_exit_mempool(void)
4166 {
4167 	kmem_cache_destroy(iommu_devinfo_cache);
4168 	kmem_cache_destroy(iommu_domain_cache);
4169 	iova_cache_put();
4170 }
4171 
4172 static void __init init_no_remapping_devices(void)
4173 {
4174 	struct dmar_drhd_unit *drhd;
4175 	struct device *dev;
4176 	int i;
4177 
4178 	for_each_drhd_unit(drhd) {
4179 		if (!drhd->include_all) {
4180 			for_each_active_dev_scope(drhd->devices,
4181 						  drhd->devices_cnt, i, dev)
4182 				break;
4183 			/* ignore DMAR unit if no devices exist */
4184 			if (i == drhd->devices_cnt)
4185 				drhd->ignored = 1;
4186 		}
4187 	}
4188 
4189 	for_each_active_drhd_unit(drhd) {
4190 		if (drhd->include_all)
4191 			continue;
4192 
4193 		for_each_active_dev_scope(drhd->devices,
4194 					  drhd->devices_cnt, i, dev)
4195 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4196 				break;
4197 		if (i < drhd->devices_cnt)
4198 			continue;
4199 
4200 		/* This IOMMU has *only* gfx devices. Either bypass it or
4201 		   set the gfx_dedicated flag, as appropriate */
4202 		drhd->gfx_dedicated = 1;
4203 		if (!dmar_map_gfx)
4204 			drhd->ignored = 1;
4205 	}
4206 }
4207 
4208 #ifdef CONFIG_SUSPEND
4209 static int init_iommu_hw(void)
4210 {
4211 	struct dmar_drhd_unit *drhd;
4212 	struct intel_iommu *iommu = NULL;
4213 
4214 	for_each_active_iommu(iommu, drhd)
4215 		if (iommu->qi)
4216 			dmar_reenable_qi(iommu);
4217 
4218 	for_each_iommu(iommu, drhd) {
4219 		if (drhd->ignored) {
4220 			/*
4221 			 * we always have to disable PMRs or DMA may fail on
4222 			 * this device
4223 			 */
4224 			if (force_on)
4225 				iommu_disable_protect_mem_regions(iommu);
4226 			continue;
4227 		}
4228 
4229 		iommu_flush_write_buffer(iommu);
4230 		iommu_set_root_entry(iommu);
4231 		iommu_enable_translation(iommu);
4232 		iommu_disable_protect_mem_regions(iommu);
4233 	}
4234 
4235 	return 0;
4236 }
4237 
4238 static void iommu_flush_all(void)
4239 {
4240 	struct dmar_drhd_unit *drhd;
4241 	struct intel_iommu *iommu;
4242 
4243 	for_each_active_iommu(iommu, drhd) {
4244 		iommu->flush.flush_context(iommu, 0, 0, 0,
4245 					   DMA_CCMD_GLOBAL_INVL);
4246 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4247 					 DMA_TLB_GLOBAL_FLUSH);
4248 	}
4249 }
4250 
4251 static int iommu_suspend(void)
4252 {
4253 	struct dmar_drhd_unit *drhd;
4254 	struct intel_iommu *iommu = NULL;
4255 	unsigned long flag;
4256 
4257 	for_each_active_iommu(iommu, drhd) {
4258 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4259 						 GFP_ATOMIC);
4260 		if (!iommu->iommu_state)
4261 			goto nomem;
4262 	}
4263 
4264 	iommu_flush_all();
4265 
4266 	for_each_active_iommu(iommu, drhd) {
4267 		iommu_disable_translation(iommu);
4268 
4269 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4270 
4271 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4272 			readl(iommu->reg + DMAR_FECTL_REG);
4273 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4274 			readl(iommu->reg + DMAR_FEDATA_REG);
4275 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4276 			readl(iommu->reg + DMAR_FEADDR_REG);
4277 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4278 			readl(iommu->reg + DMAR_FEUADDR_REG);
4279 
4280 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4281 	}
4282 	return 0;
4283 
4284 nomem:
4285 	for_each_active_iommu(iommu, drhd)
4286 		kfree(iommu->iommu_state);
4287 
4288 	return -ENOMEM;
4289 }
4290 
4291 static void iommu_resume(void)
4292 {
4293 	struct dmar_drhd_unit *drhd;
4294 	struct intel_iommu *iommu = NULL;
4295 	unsigned long flag;
4296 
4297 	if (init_iommu_hw()) {
4298 		if (force_on)
4299 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4300 		else
4301 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4302 		return;
4303 	}
4304 
4305 	for_each_active_iommu(iommu, drhd) {
4306 
4307 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4308 
4309 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4310 			iommu->reg + DMAR_FECTL_REG);
4311 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4312 			iommu->reg + DMAR_FEDATA_REG);
4313 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4314 			iommu->reg + DMAR_FEADDR_REG);
4315 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4316 			iommu->reg + DMAR_FEUADDR_REG);
4317 
4318 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4319 	}
4320 
4321 	for_each_active_iommu(iommu, drhd)
4322 		kfree(iommu->iommu_state);
4323 }
4324 
4325 static struct syscore_ops iommu_syscore_ops = {
4326 	.resume		= iommu_resume,
4327 	.suspend	= iommu_suspend,
4328 };
4329 
4330 static void __init init_iommu_pm_ops(void)
4331 {
4332 	register_syscore_ops(&iommu_syscore_ops);
4333 }
4334 
4335 #else
4336 static inline void init_iommu_pm_ops(void) {}
4337 #endif	/* CONFIG_SUSPEND */
4338 
4339 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4340 {
4341 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4342 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4343 	    rmrr->end_address <= rmrr->base_address ||
4344 	    arch_rmrr_sanity_check(rmrr))
4345 		return -EINVAL;
4346 
4347 	return 0;
4348 }
4349 
4350 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4351 {
4352 	struct acpi_dmar_reserved_memory *rmrr;
4353 	struct dmar_rmrr_unit *rmrru;
4354 
4355 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4356 	if (rmrr_sanity_check(rmrr)) {
4357 		pr_warn(FW_BUG
4358 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4359 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4360 			   rmrr->base_address, rmrr->end_address,
4361 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4362 			   dmi_get_system_info(DMI_BIOS_VERSION),
4363 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4364 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4365 	}
4366 
4367 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4368 	if (!rmrru)
4369 		goto out;
4370 
4371 	rmrru->hdr = header;
4372 
4373 	rmrru->base_address = rmrr->base_address;
4374 	rmrru->end_address = rmrr->end_address;
4375 
4376 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4377 				((void *)rmrr) + rmrr->header.length,
4378 				&rmrru->devices_cnt);
4379 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4380 		goto free_rmrru;
4381 
4382 	list_add(&rmrru->list, &dmar_rmrr_units);
4383 
4384 	return 0;
4385 free_rmrru:
4386 	kfree(rmrru);
4387 out:
4388 	return -ENOMEM;
4389 }
4390 
4391 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4392 {
4393 	struct dmar_atsr_unit *atsru;
4394 	struct acpi_dmar_atsr *tmp;
4395 
4396 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4397 				dmar_rcu_check()) {
4398 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4399 		if (atsr->segment != tmp->segment)
4400 			continue;
4401 		if (atsr->header.length != tmp->header.length)
4402 			continue;
4403 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4404 			return atsru;
4405 	}
4406 
4407 	return NULL;
4408 }
4409 
4410 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4411 {
4412 	struct acpi_dmar_atsr *atsr;
4413 	struct dmar_atsr_unit *atsru;
4414 
4415 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4416 		return 0;
4417 
4418 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4419 	atsru = dmar_find_atsr(atsr);
4420 	if (atsru)
4421 		return 0;
4422 
4423 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4424 	if (!atsru)
4425 		return -ENOMEM;
4426 
4427 	/*
4428 	 * If memory is allocated from slab by the ACPI _DSM method, we need to
4429 	 * copy the memory content because the memory buffer will be freed
4430 	 * on return.
4431 	 */
4432 	atsru->hdr = (void *)(atsru + 1);
4433 	memcpy(atsru->hdr, hdr, hdr->length);
4434 	atsru->include_all = atsr->flags & 0x1;
4435 	if (!atsru->include_all) {
4436 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4437 				(void *)atsr + atsr->header.length,
4438 				&atsru->devices_cnt);
4439 		if (atsru->devices_cnt && atsru->devices == NULL) {
4440 			kfree(atsru);
4441 			return -ENOMEM;
4442 		}
4443 	}
4444 
4445 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4446 
4447 	return 0;
4448 }
4449 
4450 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4451 {
4452 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4453 	kfree(atsru);
4454 }
4455 
4456 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4457 {
4458 	struct acpi_dmar_atsr *atsr;
4459 	struct dmar_atsr_unit *atsru;
4460 
4461 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4462 	atsru = dmar_find_atsr(atsr);
4463 	if (atsru) {
4464 		list_del_rcu(&atsru->list);
4465 		synchronize_rcu();
4466 		intel_iommu_free_atsr(atsru);
4467 	}
4468 
4469 	return 0;
4470 }
4471 
4472 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4473 {
4474 	int i;
4475 	struct device *dev;
4476 	struct acpi_dmar_atsr *atsr;
4477 	struct dmar_atsr_unit *atsru;
4478 
4479 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4480 	atsru = dmar_find_atsr(atsr);
4481 	if (!atsru)
4482 		return 0;
4483 
4484 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4485 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4486 					  i, dev)
4487 			return -EBUSY;
4488 	}
4489 
4490 	return 0;
4491 }
4492 
4493 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4494 {
4495 	int sp, ret;
4496 	struct intel_iommu *iommu = dmaru->iommu;
4497 
4498 	if (g_iommus[iommu->seq_id])
4499 		return 0;
4500 
4501 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4502 		pr_warn("%s: Doesn't support hardware pass through.\n",
4503 			iommu->name);
4504 		return -ENXIO;
4505 	}
4506 	if (!ecap_sc_support(iommu->ecap) &&
4507 	    domain_update_iommu_snooping(iommu)) {
4508 		pr_warn("%s: Doesn't support snooping.\n",
4509 			iommu->name);
4510 		return -ENXIO;
4511 	}
4512 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4513 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4514 		pr_warn("%s: Doesn't support large page.\n",
4515 			iommu->name);
4516 		return -ENXIO;
4517 	}
4518 
4519 	/*
4520 	 * Disable translation if already enabled prior to OS handover.
4521 	 */
4522 	if (iommu->gcmd & DMA_GCMD_TE)
4523 		iommu_disable_translation(iommu);
4524 
4525 	g_iommus[iommu->seq_id] = iommu;
4526 	ret = iommu_init_domains(iommu);
4527 	if (ret == 0)
4528 		ret = iommu_alloc_root_entry(iommu);
4529 	if (ret)
4530 		goto out;
4531 
4532 	intel_svm_check(iommu);
4533 
4534 	if (dmaru->ignored) {
4535 		/*
4536 		 * we always have to disable PMRs or DMA may fail on this device
4537 		 */
4538 		if (force_on)
4539 			iommu_disable_protect_mem_regions(iommu);
4540 		return 0;
4541 	}
4542 
4543 	intel_iommu_init_qi(iommu);
4544 	iommu_flush_write_buffer(iommu);
4545 
4546 #ifdef CONFIG_INTEL_IOMMU_SVM
4547 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4548 		ret = intel_svm_enable_prq(iommu);
4549 		if (ret)
4550 			goto disable_iommu;
4551 	}
4552 #endif
4553 	ret = dmar_set_interrupt(iommu);
4554 	if (ret)
4555 		goto disable_iommu;
4556 
4557 	iommu_set_root_entry(iommu);
4558 	iommu_enable_translation(iommu);
4559 
4560 	iommu_disable_protect_mem_regions(iommu);
4561 	return 0;
4562 
4563 disable_iommu:
4564 	disable_dmar_iommu(iommu);
4565 out:
4566 	free_dmar_iommu(iommu);
4567 	return ret;
4568 }
4569 
4570 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4571 {
4572 	int ret = 0;
4573 	struct intel_iommu *iommu = dmaru->iommu;
4574 
4575 	if (!intel_iommu_enabled)
4576 		return 0;
4577 	if (iommu == NULL)
4578 		return -EINVAL;
4579 
4580 	if (insert) {
4581 		ret = intel_iommu_add(dmaru);
4582 	} else {
4583 		disable_dmar_iommu(iommu);
4584 		free_dmar_iommu(iommu);
4585 	}
4586 
4587 	return ret;
4588 }
4589 
4590 static void intel_iommu_free_dmars(void)
4591 {
4592 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4593 	struct dmar_atsr_unit *atsru, *atsr_n;
4594 
4595 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4596 		list_del(&rmrru->list);
4597 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4598 		kfree(rmrru);
4599 	}
4600 
4601 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4602 		list_del(&atsru->list);
4603 		intel_iommu_free_atsr(atsru);
4604 	}
4605 }
4606 
4607 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4608 {
4609 	int i, ret = 1;
4610 	struct pci_bus *bus;
4611 	struct pci_dev *bridge = NULL;
4612 	struct device *tmp;
4613 	struct acpi_dmar_atsr *atsr;
4614 	struct dmar_atsr_unit *atsru;
4615 
4616 	dev = pci_physfn(dev);
4617 	for (bus = dev->bus; bus; bus = bus->parent) {
4618 		bridge = bus->self;
4619 		/* If it's an integrated device, allow ATS */
4620 		if (!bridge)
4621 			return 1;
4622 		/* Connected via non-PCIe: no ATS */
4623 		if (!pci_is_pcie(bridge) ||
4624 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4625 			return 0;
4626 		/* If we found the root port, look it up in the ATSR */
4627 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4628 			break;
4629 	}
4630 
4631 	rcu_read_lock();
4632 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4633 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4634 		if (atsr->segment != pci_domain_nr(dev->bus))
4635 			continue;
4636 
4637 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4638 			if (tmp == &bridge->dev)
4639 				goto out;
4640 
4641 		if (atsru->include_all)
4642 			goto out;
4643 	}
4644 	ret = 0;
4645 out:
4646 	rcu_read_unlock();
4647 
4648 	return ret;
4649 }
4650 
4651 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4652 {
4653 	int ret;
4654 	struct dmar_rmrr_unit *rmrru;
4655 	struct dmar_atsr_unit *atsru;
4656 	struct acpi_dmar_atsr *atsr;
4657 	struct acpi_dmar_reserved_memory *rmrr;
4658 
4659 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4660 		return 0;
4661 
4662 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4663 		rmrr = container_of(rmrru->hdr,
4664 				    struct acpi_dmar_reserved_memory, header);
4665 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4666 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4667 				((void *)rmrr) + rmrr->header.length,
4668 				rmrr->segment, rmrru->devices,
4669 				rmrru->devices_cnt);
4670 			if (ret < 0)
4671 				return ret;
4672 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4673 			dmar_remove_dev_scope(info, rmrr->segment,
4674 				rmrru->devices, rmrru->devices_cnt);
4675 		}
4676 	}
4677 
4678 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4679 		if (atsru->include_all)
4680 			continue;
4681 
4682 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4683 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4684 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4685 					(void *)atsr + atsr->header.length,
4686 					atsr->segment, atsru->devices,
4687 					atsru->devices_cnt);
4688 			if (ret > 0)
4689 				break;
4690 			else if (ret < 0)
4691 				return ret;
4692 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4693 			if (dmar_remove_dev_scope(info, atsr->segment,
4694 					atsru->devices, atsru->devices_cnt))
4695 				break;
4696 		}
4697 	}
4698 
4699 	return 0;
4700 }
4701 
4702 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4703 				       unsigned long val, void *v)
4704 {
4705 	struct memory_notify *mhp = v;
4706 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4707 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4708 			mhp->nr_pages - 1);
4709 
4710 	switch (val) {
4711 	case MEM_GOING_ONLINE:
4712 		if (iommu_domain_identity_map(si_domain,
4713 					      start_vpfn, last_vpfn)) {
4714 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4715 				start_vpfn, last_vpfn);
4716 			return NOTIFY_BAD;
4717 		}
4718 		break;
4719 
4720 	case MEM_OFFLINE:
4721 	case MEM_CANCEL_ONLINE:
4722 		{
4723 			struct dmar_drhd_unit *drhd;
4724 			struct intel_iommu *iommu;
4725 			struct page *freelist;
4726 
4727 			freelist = domain_unmap(si_domain,
4728 						start_vpfn, last_vpfn);
4729 
4730 			rcu_read_lock();
4731 			for_each_active_iommu(iommu, drhd)
4732 				iommu_flush_iotlb_psi(iommu, si_domain,
4733 					start_vpfn, mhp->nr_pages,
4734 					!freelist, 0);
4735 			rcu_read_unlock();
4736 			dma_free_pagelist(freelist);
4737 		}
4738 		break;
4739 	}
4740 
4741 	return NOTIFY_OK;
4742 }
4743 
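/*
 * Memory hotplug notifier keeping the si_domain identity map in sync
 * with memory going online/offline; it is registered from
 * intel_iommu_init() only when si_domain is in use without hardware
 * pass-through.
 */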
4744 static struct notifier_block intel_iommu_memory_nb = {
4745 	.notifier_call = intel_iommu_memory_notifier,
4746 	.priority = 0
4747 };
4748 
4749 static void free_all_cpu_cached_iovas(unsigned int cpu)
4750 {
4751 	int i;
4752 
4753 	for (i = 0; i < g_num_of_iommus; i++) {
4754 		struct intel_iommu *iommu = g_iommus[i];
4755 		struct dmar_domain *domain;
4756 		int did;
4757 
4758 		if (!iommu)
4759 			continue;
4760 
4761 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4762 			domain = get_iommu_domain(iommu, (u16)did);
4763 
4764 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4765 				continue;
4766 
4767 			free_cpu_cached_iovas(cpu, &domain->iovad);
4768 		}
4769 	}
4770 }
4771 
4772 static int intel_iommu_cpu_dead(unsigned int cpu)
4773 {
4774 	free_all_cpu_cached_iovas(cpu);
4775 	return 0;
4776 }
4777 
4778 static void intel_disable_iommus(void)
4779 {
4780 	struct intel_iommu *iommu = NULL;
4781 	struct dmar_drhd_unit *drhd;
4782 
4783 	for_each_iommu(iommu, drhd)
4784 		iommu_disable_translation(iommu);
4785 }
4786 
4787 void intel_iommu_shutdown(void)
4788 {
4789 	struct dmar_drhd_unit *drhd;
4790 	struct intel_iommu *iommu = NULL;
4791 
4792 	if (no_iommu || dmar_disabled)
4793 		return;
4794 
4795 	down_write(&dmar_global_lock);
4796 
4797 	/* Disable PMRs explicitly here. */
4798 	for_each_iommu(iommu, drhd)
4799 		iommu_disable_protect_mem_regions(iommu);
4800 
4801 	/* Make sure the IOMMUs are switched off */
4802 	intel_disable_iommus();
4803 
4804 	up_write(&dmar_global_lock);
4805 }
4806 
4807 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4808 {
4809 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4810 
4811 	return container_of(iommu_dev, struct intel_iommu, iommu);
4812 }
4813 
4814 static ssize_t intel_iommu_show_version(struct device *dev,
4815 					struct device_attribute *attr,
4816 					char *buf)
4817 {
4818 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4819 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4820 	return sprintf(buf, "%d:%d\n",
4821 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4822 }
4823 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4824 
4825 static ssize_t intel_iommu_show_address(struct device *dev,
4826 					struct device_attribute *attr,
4827 					char *buf)
4828 {
4829 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4830 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4831 }
4832 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4833 
4834 static ssize_t intel_iommu_show_cap(struct device *dev,
4835 				    struct device_attribute *attr,
4836 				    char *buf)
4837 {
4838 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4839 	return sprintf(buf, "%llx\n", iommu->cap);
4840 }
4841 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4842 
4843 static ssize_t intel_iommu_show_ecap(struct device *dev,
4844 				    struct device_attribute *attr,
4845 				    char *buf)
4846 {
4847 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4848 	return sprintf(buf, "%llx\n", iommu->ecap);
4849 }
4850 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4851 
4852 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4853 				      struct device_attribute *attr,
4854 				      char *buf)
4855 {
4856 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4857 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4858 }
4859 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4860 
4861 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4862 					   struct device_attribute *attr,
4863 					   char *buf)
4864 {
4865 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4866 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4867 						  cap_ndoms(iommu->cap)));
4868 }
4869 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4870 
4871 static struct attribute *intel_iommu_attrs[] = {
4872 	&dev_attr_version.attr,
4873 	&dev_attr_address.attr,
4874 	&dev_attr_cap.attr,
4875 	&dev_attr_ecap.attr,
4876 	&dev_attr_domains_supported.attr,
4877 	&dev_attr_domains_used.attr,
4878 	NULL,
4879 };
4880 
4881 static struct attribute_group intel_iommu_group = {
4882 	.name = "intel-iommu",
4883 	.attrs = intel_iommu_attrs,
4884 };
4885 
4886 const struct attribute_group *intel_iommu_groups[] = {
4887 	&intel_iommu_group,
4888 	NULL,
4889 };
4890 
4891 static inline bool has_external_pci(void)
4892 {
4893 	struct pci_dev *pdev = NULL;
4894 
4895 	for_each_pci_dev(pdev)
4896 		if (pdev->external_facing) {
4897 			pci_dev_put(pdev);
4898 			return true;
4899 		}
4900 
4901 	return false;
4902 }
4903 
4904 static int __init platform_optin_force_iommu(void)
4905 {
4906 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4907 		return 0;
4908 
4909 	if (no_iommu || dmar_disabled)
4910 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4911 
4912 	/*
4913 	 * If Intel-IOMMU is disabled by default, we will apply the identity
4914 	 * map to all devices except those marked as untrusted.
4915 	 */
4916 	if (dmar_disabled)
4917 		iommu_set_default_passthrough(false);
4918 
4919 	dmar_disabled = 0;
4920 	no_iommu = 0;
4921 
4922 	return 1;
4923 }
4924 
4925 static int __init probe_acpi_namespace_devices(void)
4926 {
4927 	struct dmar_drhd_unit *drhd;
4928 	/* To avoid a -Wunused-but-set-variable warning. */
4929 	struct intel_iommu *iommu __maybe_unused;
4930 	struct device *dev;
4931 	int i, ret = 0;
4932 
4933 	for_each_active_iommu(iommu, drhd) {
4934 		for_each_active_dev_scope(drhd->devices,
4935 					  drhd->devices_cnt, i, dev) {
4936 			struct acpi_device_physical_node *pn;
4937 			struct iommu_group *group;
4938 			struct acpi_device *adev;
4939 
4940 			if (dev->bus != &acpi_bus_type)
4941 				continue;
4942 
4943 			adev = to_acpi_device(dev);
4944 			mutex_lock(&adev->physical_node_lock);
4945 			list_for_each_entry(pn,
4946 					    &adev->physical_node_list, node) {
4947 				group = iommu_group_get(pn->dev);
4948 				if (group) {
4949 					iommu_group_put(group);
4950 					continue;
4951 				}
4952 
4953 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4954 				ret = iommu_probe_device(pn->dev);
4955 				if (ret)
4956 					break;
4957 			}
4958 			mutex_unlock(&adev->physical_node_lock);
4959 
4960 			if (ret)
4961 				return ret;
4962 		}
4963 	}
4964 
4965 	return 0;
4966 }
4967 
4968 int __init intel_iommu_init(void)
4969 {
4970 	int ret = -ENODEV;
4971 	struct dmar_drhd_unit *drhd;
4972 	struct intel_iommu *iommu;
4973 
4974 	/*
4975 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4976 	 * opt in, so enforce that.
4977 	 */
4978 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4979 		    platform_optin_force_iommu();
4980 
4981 	if (iommu_init_mempool()) {
4982 		if (force_on)
4983 			panic("tboot: Failed to initialize iommu memory\n");
4984 		return -ENOMEM;
4985 	}
4986 
4987 	down_write(&dmar_global_lock);
4988 	if (dmar_table_init()) {
4989 		if (force_on)
4990 			panic("tboot: Failed to initialize DMAR table\n");
4991 		goto out_free_dmar;
4992 	}
4993 
4994 	if (dmar_dev_scope_init() < 0) {
4995 		if (force_on)
4996 			panic("tboot: Failed to initialize DMAR device scope\n");
4997 		goto out_free_dmar;
4998 	}
4999 
5000 	up_write(&dmar_global_lock);
5001 
5002 	/*
5003 	 * The bus notifier takes the dmar_global_lock, so lockdep will
5004 	 * complain later when we register it under the lock.
5005 	 */
5006 	dmar_register_bus_notifier();
5007 
5008 	down_write(&dmar_global_lock);
5009 
5010 	if (!no_iommu)
5011 		intel_iommu_debugfs_init();
5012 
5013 	if (no_iommu || dmar_disabled) {
5014 		/*
5015 		 * We exit the function here to ensure the IOMMU's remapping and
5016 		 * mempool aren't set up, which means that the IOMMU's PMRs
5017 		 * won't be disabled via the call to init_dmars(). So disable
5018 		 * them explicitly here. The PMRs were set up by tboot prior to
5019 		 * calling SENTER, but the kernel is expected to reset/tear
5020 		 * down the PMRs.
5021 		 */
5022 		if (intel_iommu_tboot_noforce) {
5023 			for_each_iommu(iommu, drhd)
5024 				iommu_disable_protect_mem_regions(iommu);
5025 		}
5026 
5027 		/*
5028 		 * Make sure the IOMMUs are switched off, even when we
5029 		 * boot into a kexec kernel and the previous kernel left
5030 		 * them enabled
5031 		 */
5032 		intel_disable_iommus();
5033 		goto out_free_dmar;
5034 	}
5035 
5036 	if (list_empty(&dmar_rmrr_units))
5037 		pr_info("No RMRR found\n");
5038 
5039 	if (list_empty(&dmar_atsr_units))
5040 		pr_info("No ATSR found\n");
5041 
5042 	if (dmar_init_reserved_ranges()) {
5043 		if (force_on)
5044 			panic("tboot: Failed to reserve iommu ranges\n");
5045 		goto out_free_reserved_range;
5046 	}
5047 
5048 	if (dmar_map_gfx)
5049 		intel_iommu_gfx_mapped = 1;
5050 
5051 	init_no_remapping_devices();
5052 
5053 	ret = init_dmars();
5054 	if (ret) {
5055 		if (force_on)
5056 			panic("tboot: Failed to initialize DMARs\n");
5057 		pr_err("Initialization failed\n");
5058 		goto out_free_reserved_range;
5059 	}
5060 	up_write(&dmar_global_lock);
5061 
5062 	init_iommu_pm_ops();
5063 
5064 	down_read(&dmar_global_lock);
5065 	for_each_active_iommu(iommu, drhd) {
5066 		iommu_device_sysfs_add(&iommu->iommu, NULL,
5067 				       intel_iommu_groups,
5068 				       "%s", iommu->name);
5069 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5070 		iommu_device_register(&iommu->iommu);
5071 	}
5072 	up_read(&dmar_global_lock);
5073 
5074 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5075 	if (si_domain && !hw_pass_through)
5076 		register_memory_notifier(&intel_iommu_memory_nb);
5077 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5078 			  intel_iommu_cpu_dead);
5079 
5080 	down_read(&dmar_global_lock);
5081 	if (probe_acpi_namespace_devices())
5082 		pr_warn("ACPI name space devices didn't probe correctly\n");
5083 
5084 	/* Finally, we enable the DMA remapping hardware. */
5085 	for_each_iommu(iommu, drhd) {
5086 		if (!drhd->ignored && !translation_pre_enabled(iommu))
5087 			iommu_enable_translation(iommu);
5088 
5089 		iommu_disable_protect_mem_regions(iommu);
5090 	}
5091 	up_read(&dmar_global_lock);
5092 
5093 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5094 
5095 	intel_iommu_enabled = 1;
5096 
5097 	return 0;
5098 
5099 out_free_reserved_range:
5100 	put_iova_domain(&reserved_iova_list);
5101 out_free_dmar:
5102 	intel_iommu_free_dmars();
5103 	up_write(&dmar_global_lock);
5104 	iommu_exit_mempool();
5105 	return ret;
5106 }
5107 
5108 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5109 {
5110 	struct intel_iommu *iommu = opaque;
5111 
5112 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5113 	return 0;
5114 }
5115 
5116 /*
5117  * NB - intel-iommu lacks any sort of reference counting for the users of
5118  * dependent devices.  If multiple endpoints have intersecting dependent
5119  * devices, unbinding the driver from any one of them will possibly leave
5120  * the others unable to operate.
5121  */
5122 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5123 {
5124 	if (!iommu || !dev || !dev_is_pci(dev))
5125 		return;
5126 
5127 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5128 }
5129 
5130 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5131 {
5132 	struct dmar_domain *domain;
5133 	struct intel_iommu *iommu;
5134 	unsigned long flags;
5135 
5136 	assert_spin_locked(&device_domain_lock);
5137 
5138 	if (WARN_ON(!info))
5139 		return;
5140 
5141 	iommu = info->iommu;
5142 	domain = info->domain;
5143 
5144 	if (info->dev) {
5145 		if (dev_is_pci(info->dev) && sm_supported(iommu))
5146 			intel_pasid_tear_down_entry(iommu, info->dev,
5147 					PASID_RID2PASID, false);
5148 
5149 		iommu_disable_dev_iotlb(info);
5150 		if (!dev_is_real_dma_subdevice(info->dev))
5151 			domain_context_clear(iommu, info->dev);
5152 		intel_pasid_free_table(info->dev);
5153 	}
5154 
5155 	unlink_domain_info(info);
5156 
5157 	spin_lock_irqsave(&iommu->lock, flags);
5158 	domain_detach_iommu(domain, iommu);
5159 	spin_unlock_irqrestore(&iommu->lock, flags);
5160 
5161 	free_devinfo_mem(info);
5162 }
5163 
5164 static void dmar_remove_one_dev_info(struct device *dev)
5165 {
5166 	struct device_domain_info *info;
5167 	unsigned long flags;
5168 
5169 	spin_lock_irqsave(&device_domain_lock, flags);
5170 	info = get_domain_info(dev);
5171 	if (info)
5172 		__dmar_remove_one_dev_info(info);
5173 	spin_unlock_irqrestore(&device_domain_lock, flags);
5174 }
5175 
5176 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5177 {
5178 	int adjust_width;
5179 
5180 	/* calculate AGAW */
5181 	domain->gaw = guest_width;
5182 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5183 	domain->agaw = width_to_agaw(adjust_width);
5184 
5185 	domain->iommu_coherency = 0;
5186 	domain->iommu_snooping = 0;
5187 	domain->iommu_superpage = 0;
5188 	domain->max_addr = 0;
5189 
5190 	/* always allocate the top pgd */
5191 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5192 	if (!domain->pgd)
5193 		return -ENOMEM;
5194 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5195 	return 0;
5196 }
5197 
5198 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5199 {
5200 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5201 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5202 
5203 	if (!intel_iommu_strict &&
5204 	    init_iova_flush_queue(&dmar_domain->iovad,
5205 				  iommu_flush_iova, iova_entry_free))
5206 		pr_info("iova flush queue initialization failed\n");
5207 }
5208 
5209 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5210 {
5211 	struct dmar_domain *dmar_domain;
5212 	struct iommu_domain *domain;
5213 
5214 	switch (type) {
5215 	case IOMMU_DOMAIN_DMA:
5216 	case IOMMU_DOMAIN_UNMANAGED:
5217 		dmar_domain = alloc_domain(0);
5218 		if (!dmar_domain) {
5219 			pr_err("Can't allocate dmar_domain\n");
5220 			return NULL;
5221 		}
5222 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5223 			pr_err("Domain initialization failed\n");
5224 			domain_exit(dmar_domain);
5225 			return NULL;
5226 		}
5227 
5228 		if (type == IOMMU_DOMAIN_DMA)
5229 			intel_init_iova_domain(dmar_domain);
5230 
5231 		domain = &dmar_domain->domain;
5232 		domain->geometry.aperture_start = 0;
5233 		domain->geometry.aperture_end   =
5234 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5235 		domain->geometry.force_aperture = true;
5236 
5237 		return domain;
5238 	case IOMMU_DOMAIN_IDENTITY:
5239 		return &si_domain->domain;
5240 	default:
5241 		return NULL;
5242 	}
5243 
5244 	return NULL;
5245 }
5246 
5247 static void intel_iommu_domain_free(struct iommu_domain *domain)
5248 {
5249 	if (domain != &si_domain->domain)
5250 		domain_exit(to_dmar_domain(domain));
5251 }
5252 
5253 /*
5254  * Check whether a @domain could be attached to the @dev through the
5255  * aux-domain attach/detach APIs.
5256  */
5257 static inline bool
5258 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5259 {
5260 	struct device_domain_info *info = get_domain_info(dev);
5261 
5262 	return info && info->auxd_enabled &&
5263 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5264 }
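/*
 * Editor-added sketch, not part of intel-iommu.c: roughly how a consumer
 * (for example a mediated-device parent driver) of this kernel generation
 * might exercise the aux-domain path that is_aux_domain() gates above. The
 * function name and flow here are illustrative; the iommu_* wrappers are the
 * core-IOMMU entry points that reach the aux_* callbacks in intel_iommu_ops
 * further below. Error handling is reduced to the bare minimum.
 */
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/iommu.h>

static int example_attach_aux_domain(struct device *dev)
{
	struct iommu_domain *dom;
	int pasid;

	/* Ask the vendor IOMMU driver to enable aux-domain support */
	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX))
		return -ENODEV;

	/* An UNMANAGED domain, as required by is_aux_domain() */
	dom = iommu_domain_alloc(dev->bus);
	if (!dom)
		return -ENOMEM;

	if (iommu_aux_attach_device(dom, dev)) {
		iommu_domain_free(dom);
		return -ENODEV;
	}

	/* The default PASID that tags this domain's DMA on the device */
	pasid = iommu_aux_get_pasid(dom, dev);
	dev_info(dev, "aux domain attached, default pasid %d\n", pasid);

	return 0;
}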
5265 
5266 static void auxiliary_link_device(struct dmar_domain *domain,
5267 				  struct device *dev)
5268 {
5269 	struct device_domain_info *info = get_domain_info(dev);
5270 
5271 	assert_spin_locked(&device_domain_lock);
5272 	if (WARN_ON(!info))
5273 		return;
5274 
5275 	domain->auxd_refcnt++;
5276 	list_add(&domain->auxd, &info->auxiliary_domains);
5277 }
5278 
5279 static void auxiliary_unlink_device(struct dmar_domain *domain,
5280 				    struct device *dev)
5281 {
5282 	struct device_domain_info *info = get_domain_info(dev);
5283 
5284 	assert_spin_locked(&device_domain_lock);
5285 	if (WARN_ON(!info))
5286 		return;
5287 
5288 	list_del(&domain->auxd);
5289 	domain->auxd_refcnt--;
5290 
5291 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5292 		ioasid_free(domain->default_pasid);
5293 }
5294 
5295 static int aux_domain_add_dev(struct dmar_domain *domain,
5296 			      struct device *dev)
5297 {
5298 	int ret;
5299 	unsigned long flags;
5300 	struct intel_iommu *iommu;
5301 
5302 	iommu = device_to_iommu(dev, NULL, NULL);
5303 	if (!iommu)
5304 		return -ENODEV;
5305 
5306 	if (domain->default_pasid <= 0) {
5307 		u32 pasid;
5308 
5309 		/* No private data needed for the default pasid */
5310 		pasid = ioasid_alloc(NULL, PASID_MIN,
5311 				     pci_max_pasids(to_pci_dev(dev)) - 1,
5312 				     NULL);
5313 		if (pasid == INVALID_IOASID) {
5314 			pr_err("Can't allocate default pasid\n");
5315 			return -ENODEV;
5316 		}
5317 		domain->default_pasid = pasid;
5318 	}
5319 
5320 	spin_lock_irqsave(&device_domain_lock, flags);
5321 	/*
5322 	 * iommu->lock must be held to attach domain to iommu and setup the
5323 	 * pasid entry for second level translation.
5324 	 */
5325 	spin_lock(&iommu->lock);
5326 	ret = domain_attach_iommu(domain, iommu);
5327 	if (ret)
5328 		goto attach_failed;
5329 
5330 	/* Setup the PASID entry for mediated devices: */
5331 	if (domain_use_first_level(domain))
5332 		ret = domain_setup_first_level(iommu, domain, dev,
5333 					       domain->default_pasid);
5334 	else
5335 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5336 						     domain->default_pasid);
5337 	if (ret)
5338 		goto table_failed;
5339 	spin_unlock(&iommu->lock);
5340 
5341 	auxiliary_link_device(domain, dev);
5342 
5343 	spin_unlock_irqrestore(&device_domain_lock, flags);
5344 
5345 	return 0;
5346 
5347 table_failed:
5348 	domain_detach_iommu(domain, iommu);
5349 attach_failed:
5350 	spin_unlock(&iommu->lock);
5351 	spin_unlock_irqrestore(&device_domain_lock, flags);
5352 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5353 		ioasid_free(domain->default_pasid);
5354 
5355 	return ret;
5356 }
5357 
5358 static void aux_domain_remove_dev(struct dmar_domain *domain,
5359 				  struct device *dev)
5360 {
5361 	struct device_domain_info *info;
5362 	struct intel_iommu *iommu;
5363 	unsigned long flags;
5364 
5365 	if (!is_aux_domain(dev, &domain->domain))
5366 		return;
5367 
5368 	spin_lock_irqsave(&device_domain_lock, flags);
5369 	info = get_domain_info(dev);
5370 	iommu = info->iommu;
5371 
5372 	auxiliary_unlink_device(domain, dev);
5373 
5374 	spin_lock(&iommu->lock);
5375 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5376 	domain_detach_iommu(domain, iommu);
5377 	spin_unlock(&iommu->lock);
5378 
5379 	spin_unlock_irqrestore(&device_domain_lock, flags);
5380 }
5381 
5382 static int prepare_domain_attach_device(struct iommu_domain *domain,
5383 					struct device *dev)
5384 {
5385 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5386 	struct intel_iommu *iommu;
5387 	int addr_width;
5388 
5389 	iommu = device_to_iommu(dev, NULL, NULL);
5390 	if (!iommu)
5391 		return -ENODEV;
5392 
5393 	/* check if this iommu agaw is sufficient for max mapped address */
5394 	addr_width = agaw_to_width(iommu->agaw);
5395 	if (addr_width > cap_mgaw(iommu->cap))
5396 		addr_width = cap_mgaw(iommu->cap);
5397 
5398 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5399 		dev_err(dev, "%s: iommu width (%d) is not "
5400 		        "sufficient for the mapped address (%llx)\n",
5401 		        __func__, addr_width, dmar_domain->max_addr);
5402 		return -EFAULT;
5403 	}
5404 	dmar_domain->gaw = addr_width;
5405 
5406 	/*
5407 	 * Knock out extra levels of page tables if necessary
5408 	 */
5409 	while (iommu->agaw < dmar_domain->agaw) {
5410 		struct dma_pte *pte;
5411 
5412 		pte = dmar_domain->pgd;
5413 		if (dma_pte_present(pte)) {
5414 			dmar_domain->pgd = (struct dma_pte *)
5415 				phys_to_virt(dma_pte_addr(pte));
5416 			free_pgtable_page(pte);
5417 		}
5418 		dmar_domain->agaw--;
5419 	}
5420 
5421 	return 0;
5422 }
5423 
5424 static int intel_iommu_attach_device(struct iommu_domain *domain,
5425 				     struct device *dev)
5426 {
5427 	int ret;
5428 
5429 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5430 	    device_is_rmrr_locked(dev)) {
5431 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5432 		return -EPERM;
5433 	}
5434 
5435 	if (is_aux_domain(dev, domain))
5436 		return -EPERM;
5437 
5438 	/* normally dev is not mapped */
5439 	if (unlikely(domain_context_mapped(dev))) {
5440 		struct dmar_domain *old_domain;
5441 
5442 		old_domain = find_domain(dev);
5443 		if (old_domain)
5444 			dmar_remove_one_dev_info(dev);
5445 	}
5446 
5447 	ret = prepare_domain_attach_device(domain, dev);
5448 	if (ret)
5449 		return ret;
5450 
5451 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5452 }
5453 
5454 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5455 					 struct device *dev)
5456 {
5457 	int ret;
5458 
5459 	if (!is_aux_domain(dev, domain))
5460 		return -EPERM;
5461 
5462 	ret = prepare_domain_attach_device(domain, dev);
5463 	if (ret)
5464 		return ret;
5465 
5466 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5467 }
5468 
5469 static void intel_iommu_detach_device(struct iommu_domain *domain,
5470 				      struct device *dev)
5471 {
5472 	dmar_remove_one_dev_info(dev);
5473 }
5474 
5475 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5476 					  struct device *dev)
5477 {
5478 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5479 }
5480 
5481 #ifdef CONFIG_INTEL_IOMMU_SVM
5482 /*
5483  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5484  * VT-d granularity. Invalidation is typically included in the unmap operation
5485  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5486  * owns the first level page tables. Invalidations of translation caches in the
5487  * guest are trapped and passed down to the host.
5488  *
5489  * vIOMMU in the guest will only expose first-level page tables, therefore
5490  * we do not support IOTLB granularity for requests without PASID (second level).
5491  *
5492  * For example, to find the VT-d granularity encoding for IOTLB
5493  * type and page selective granularity within PASID:
5494  * X: indexed by iommu cache type
5495  * Y: indexed by enum iommu_inv_granularity
5496  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5497  */
5498 
5499 static const int
5500 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5501 	/*
5502 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5503 	 * page selective (address granularity)
5504 	 */
5505 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5506 	/* PASID based dev TLBs */
5507 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5508 	/* PASID cache */
5509 	{-EINVAL, -EINVAL, -EINVAL}
5510 };
5511 
5512 static inline int to_vtd_granularity(int type, int granu)
5513 {
5514 	return inv_type_granu_table[type][granu];
5515 }
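/*
 * Editor-added illustration, not part of intel-iommu.c: a self-contained
 * model of the lookup that to_vtd_granularity() performs on the table above.
 * The EX_* enumerators and the non-negative "granularity codes" are
 * stand-ins invented for this example; only the indexing scheme
 * (row = cache type, column = generic granularity, -EINVAL = unsupported
 * combination) mirrors the real table.
 */
#include <errno.h>
#include <stdio.h>

enum ex_cache_type { EX_INV_IOTLB, EX_INV_DEV_IOTLB, EX_INV_PASID, EX_INV_NR };
enum ex_granu { EX_GRANU_DOMAIN, EX_GRANU_PASID, EX_GRANU_ADDR, EX_GRANU_NR };

static const int ex_inv_table[EX_INV_NR][EX_GRANU_NR] = {
	{ -EINVAL,  1,       2       },	/* IOTLB: PASID-sel., addr-sel.     */
	{ -EINVAL, -EINVAL,  3       },	/* dev-TLB: addr-selective only     */
	{ -EINVAL, -EINVAL, -EINVAL  },	/* PASID cache: passdown unsupported */
};

int main(void)
{
	/* [IOTLB][ADDR] is a valid combination, [PASID cache][ADDR] is not */
	printf("IOTLB/ADDR -> %d\n", ex_inv_table[EX_INV_IOTLB][EX_GRANU_ADDR]);
	printf("PASID/ADDR -> %d\n", ex_inv_table[EX_INV_PASID][EX_GRANU_ADDR]);
	return 0;
}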
5516 
5517 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5518 {
5519 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5520 
5521 	/* VT-d encodes the size as 2^size 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
5522 	 * The IOMMU cache invalidate API passes granu_size in bytes plus the
5523 	 * number of contiguous granules of that size; worked example below.
5524 	 */
5525 	return order_base_2(nr_pages);
5526 }
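/*
 * Editor-added illustration, not part of intel-iommu.c: the same arithmetic
 * as to_vtd_size() in standalone form, assuming a 4KiB VT-d page. A 4KiB
 * granule repeated 512 times covers 2MiB, which encodes as order 9; a single
 * 4KiB granule encodes as order 0. ex_order_base_2() is a plain stand-in
 * for the kernel's order_base_2().
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ex_order_base_2(uint64_t n)	/* smallest o with 2^o >= n */
{
	uint64_t o = 0;

	while ((1ULL << o) < n)
		o++;
	return o;
}

static uint64_t ex_to_vtd_size(uint64_t granu_size, uint64_t nr_granules)
{
	return ex_order_base_2((granu_size * nr_granules) >> 12);
}

int main(void)
{
	printf("%llu\n", (unsigned long long)ex_to_vtd_size(4096, 1));   /* 0: 4KiB */
	printf("%llu\n", (unsigned long long)ex_to_vtd_size(4096, 512)); /* 9: 2MiB */
	return 0;
}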
5527 
5528 static int
5529 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5530 			   struct iommu_cache_invalidate_info *inv_info)
5531 {
5532 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5533 	struct device_domain_info *info;
5534 	struct intel_iommu *iommu;
5535 	unsigned long flags;
5536 	int cache_type;
5537 	u8 bus, devfn;
5538 	u16 did, sid;
5539 	int ret = 0;
5540 	u64 size = 0;
5541 
5542 	if (!inv_info || !dmar_domain)
5543 		return -EINVAL;
5544 
5545 	if (!dev || !dev_is_pci(dev))
5546 		return -ENODEV;
5547 
5548 	iommu = device_to_iommu(dev, &bus, &devfn);
5549 	if (!iommu)
5550 		return -ENODEV;
5551 
5552 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5553 		return -EINVAL;
5554 
5555 	spin_lock_irqsave(&device_domain_lock, flags);
5556 	spin_lock(&iommu->lock);
5557 	info = get_domain_info(dev);
5558 	if (!info) {
5559 		ret = -EINVAL;
5560 		goto out_unlock;
5561 	}
5562 	did = dmar_domain->iommu_did[iommu->seq_id];
5563 	sid = PCI_DEVID(bus, devfn);
5564 
5565 	/* Size is only valid in address selective invalidation */
5566 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5567 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5568 				   inv_info->granu.addr_info.nb_granules);
5569 
5570 	for_each_set_bit(cache_type,
5571 			 (unsigned long *)&inv_info->cache,
5572 			 IOMMU_CACHE_INV_TYPE_NR) {
5573 		int granu = 0;
5574 		u64 pasid = 0;
5575 		u64 addr = 0;
5576 
5577 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5578 		if (granu == -EINVAL) {
5579 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5580 					   cache_type, inv_info->granularity);
5581 			break;
5582 		}
5583 
5584 		/*
5585 		 * PASID is stored in different locations based on the
5586 		 * granularity.
5587 		 */
5588 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5589 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5590 			pasid = inv_info->granu.pasid_info.pasid;
5591 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5592 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5593 			pasid = inv_info->granu.addr_info.pasid;
5594 
5595 		switch (BIT(cache_type)) {
5596 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5597 			/* HW will ignore LSB bits based on address mask */
5598 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5599 			    size &&
5600 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5601 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5602 						   inv_info->granu.addr_info.addr, size);
5603 			}
5604 
5605 			/*
5606 			 * If granu is PASID-selective, address is ignored.
5607 			 * We use npages = -1 to indicate that.
5608 			 */
5609 			qi_flush_piotlb(iommu, did, pasid,
5610 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5611 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5612 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5613 
5614 			if (!info->ats_enabled)
5615 				break;
5616 			/*
5617 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5618 			 * in the guest may assume IOTLB flush is inclusive,
5619 			 * which is more efficient.
5620 			 */
5621 			fallthrough;
5622 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5623 			/*
5624 			 * PASID based device TLB invalidation does not support
5625 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5626 			 * IOMMU_INV_GRANU_ADDR.
5627 			 * The equivalent of that is to set the size to the
5628 			 * entire 64-bit range. The user only provides PASID info
5629 			 * without address info, so we set addr to 0.
5630 			 */
5631 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5632 				size = 64 - VTD_PAGE_SHIFT;
5633 				addr = 0;
5634 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5635 				addr = inv_info->granu.addr_info.addr;
5636 			}
5637 
5638 			if (info->ats_enabled)
5639 				qi_flush_dev_iotlb_pasid(iommu, sid,
5640 						info->pfsid, pasid,
5641 						info->ats_qdep, addr,
5642 						size);
5643 			else
5644 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5645 			break;
5646 		default:
5647 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5648 					    cache_type);
5649 			ret = -EINVAL;
5650 		}
5651 	}
5652 out_unlock:
5653 	spin_unlock(&iommu->lock);
5654 	spin_unlock_irqrestore(&device_domain_lock, flags);
5655 
5656 	return ret;
5657 }
5658 #endif
5659 
5660 static int intel_iommu_map(struct iommu_domain *domain,
5661 			   unsigned long iova, phys_addr_t hpa,
5662 			   size_t size, int iommu_prot, gfp_t gfp)
5663 {
5664 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5665 	u64 max_addr;
5666 	int prot = 0;
5667 	int ret;
5668 
5669 	if (iommu_prot & IOMMU_READ)
5670 		prot |= DMA_PTE_READ;
5671 	if (iommu_prot & IOMMU_WRITE)
5672 		prot |= DMA_PTE_WRITE;
5673 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5674 		prot |= DMA_PTE_SNP;
5675 
5676 	max_addr = iova + size;
5677 	if (dmar_domain->max_addr < max_addr) {
5678 		u64 end;
5679 
5680 		/* check if minimum agaw is sufficient for mapped address */
5681 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5682 		if (end < max_addr) {
5683 			pr_err("%s: iommu width (%d) is not "
5684 			       "sufficient for the mapped address (%llx)\n",
5685 			       __func__, dmar_domain->gaw, max_addr);
5686 			return -EFAULT;
5687 		}
5688 		dmar_domain->max_addr = max_addr;
5689 	}
5690 	/* Round up size to the next multiple of PAGE_SIZE if it and the low bits
5691 	   of hpa would take us onto the next page; worked example after this function. */
5692 	size = aligned_nrpages(hpa, size);
5693 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5694 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5695 	return ret;
5696 }
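/*
 * Editor-added illustration, not part of intel-iommu.c: the page rounding
 * described in the comment above, in standalone form with 4KiB pages. An
 * hpa of 0x1800 with size 0x1000 straddles two 4KiB pages, so two page
 * frames must be mapped even though size itself is exactly one page.
 * ex_aligned_nrpages() is an illustrative stand-in, not the driver helper.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

static unsigned long ex_aligned_nrpages(unsigned long hpa, unsigned long size)
{
	unsigned long off = hpa & ~EX_PAGE_MASK;	/* offset into first page */

	return ((off + size + EX_PAGE_SIZE - 1) & EX_PAGE_MASK) >> EX_PAGE_SHIFT;
}

int main(void)
{
	printf("%lu\n", ex_aligned_nrpages(0x1000, 0x1000)); /* 1 page  */
	printf("%lu\n", ex_aligned_nrpages(0x1800, 0x1000)); /* 2 pages */
	return 0;
}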
5697 
5698 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5699 				unsigned long iova, size_t size,
5700 				struct iommu_iotlb_gather *gather)
5701 {
5702 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5703 	struct page *freelist = NULL;
5704 	unsigned long start_pfn, last_pfn;
5705 	unsigned int npages;
5706 	int iommu_id, level = 0;
5707 
5708 	/* Cope with horrid API which requires us to unmap more than the
5709 	   size argument if it happens to be a large-page mapping. */
5710 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5711 
5712 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5713 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5714 
5715 	start_pfn = iova >> VTD_PAGE_SHIFT;
5716 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5717 
5718 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5719 
5720 	npages = last_pfn - start_pfn + 1;
5721 
5722 	for_each_domain_iommu(iommu_id, dmar_domain)
5723 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5724 				      start_pfn, npages, !freelist, 0);
5725 
5726 	dma_free_pagelist(freelist);
5727 
5728 	if (dmar_domain->max_addr == iova + size)
5729 		dmar_domain->max_addr = iova;
5730 
5731 	return size;
5732 }
5733 
5734 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5735 					    dma_addr_t iova)
5736 {
5737 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5738 	struct dma_pte *pte;
5739 	int level = 0;
5740 	u64 phys = 0;
5741 
5742 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5743 	if (pte && dma_pte_present(pte))
5744 		phys = dma_pte_addr(pte) +
5745 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5746 						VTD_PAGE_SHIFT) - 1));
5747 
5748 	return phys;
5749 }
5750 
5751 static inline bool scalable_mode_support(void)
5752 {
5753 	struct dmar_drhd_unit *drhd;
5754 	struct intel_iommu *iommu;
5755 	bool ret = true;
5756 
5757 	rcu_read_lock();
5758 	for_each_active_iommu(iommu, drhd) {
5759 		if (!sm_supported(iommu)) {
5760 			ret = false;
5761 			break;
5762 		}
5763 	}
5764 	rcu_read_unlock();
5765 
5766 	return ret;
5767 }
5768 
5769 static inline bool iommu_pasid_support(void)
5770 {
5771 	struct dmar_drhd_unit *drhd;
5772 	struct intel_iommu *iommu;
5773 	bool ret = true;
5774 
5775 	rcu_read_lock();
5776 	for_each_active_iommu(iommu, drhd) {
5777 		if (!pasid_supported(iommu)) {
5778 			ret = false;
5779 			break;
5780 		}
5781 	}
5782 	rcu_read_unlock();
5783 
5784 	return ret;
5785 }
5786 
5787 static inline bool nested_mode_support(void)
5788 {
5789 	struct dmar_drhd_unit *drhd;
5790 	struct intel_iommu *iommu;
5791 	bool ret = true;
5792 
5793 	rcu_read_lock();
5794 	for_each_active_iommu(iommu, drhd) {
5795 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5796 			ret = false;
5797 			break;
5798 		}
5799 	}
5800 	rcu_read_unlock();
5801 
5802 	return ret;
5803 }
5804 
5805 static bool intel_iommu_capable(enum iommu_cap cap)
5806 {
5807 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5808 		return domain_update_iommu_snooping(NULL) == 1;
5809 	if (cap == IOMMU_CAP_INTR_REMAP)
5810 		return irq_remapping_enabled == 1;
5811 
5812 	return false;
5813 }
5814 
5815 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5816 {
5817 	struct intel_iommu *iommu;
5818 
5819 	iommu = device_to_iommu(dev, NULL, NULL);
5820 	if (!iommu)
5821 		return ERR_PTR(-ENODEV);
5822 
5823 	if (translation_pre_enabled(iommu))
5824 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5825 
5826 	return &iommu->iommu;
5827 }
5828 
5829 static void intel_iommu_release_device(struct device *dev)
5830 {
5831 	struct intel_iommu *iommu;
5832 
5833 	iommu = device_to_iommu(dev, NULL, NULL);
5834 	if (!iommu)
5835 		return;
5836 
5837 	dmar_remove_one_dev_info(dev);
5838 
5839 	set_dma_ops(dev, NULL);
5840 }
5841 
5842 static void intel_iommu_probe_finalize(struct device *dev)
5843 {
5844 	struct iommu_domain *domain;
5845 
5846 	domain = iommu_get_domain_for_dev(dev);
5847 	if (device_needs_bounce(dev))
5848 		set_dma_ops(dev, &bounce_dma_ops);
5849 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5850 		set_dma_ops(dev, &intel_dma_ops);
5851 	else
5852 		set_dma_ops(dev, NULL);
5853 }
5854 
5855 static void intel_iommu_get_resv_regions(struct device *device,
5856 					 struct list_head *head)
5857 {
5858 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5859 	struct iommu_resv_region *reg;
5860 	struct dmar_rmrr_unit *rmrr;
5861 	struct device *i_dev;
5862 	int i;
5863 
5864 	down_read(&dmar_global_lock);
5865 	for_each_rmrr_units(rmrr) {
5866 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5867 					  i, i_dev) {
5868 			struct iommu_resv_region *resv;
5869 			enum iommu_resv_type type;
5870 			size_t length;
5871 
5872 			if (i_dev != device &&
5873 			    !is_downstream_to_pci_bridge(device, i_dev))
5874 				continue;
5875 
5876 			length = rmrr->end_address - rmrr->base_address + 1;
5877 
5878 			type = device_rmrr_is_relaxable(device) ?
5879 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5880 
5881 			resv = iommu_alloc_resv_region(rmrr->base_address,
5882 						       length, prot, type);
5883 			if (!resv)
5884 				break;
5885 
5886 			list_add_tail(&resv->list, head);
5887 		}
5888 	}
5889 	up_read(&dmar_global_lock);
5890 
5891 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5892 	if (dev_is_pci(device)) {
5893 		struct pci_dev *pdev = to_pci_dev(device);
5894 
5895 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5896 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5897 						   IOMMU_RESV_DIRECT_RELAXABLE);
5898 			if (reg)
5899 				list_add_tail(&reg->list, head);
5900 		}
5901 	}
5902 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5903 
5904 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5905 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5906 				      0, IOMMU_RESV_MSI);
5907 	if (!reg)
5908 		return;
5909 	list_add_tail(&reg->list, head);
5910 }
5911 
5912 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5913 {
5914 	struct device_domain_info *info;
5915 	struct context_entry *context;
5916 	struct dmar_domain *domain;
5917 	unsigned long flags;
5918 	u64 ctx_lo;
5919 	int ret;
5920 
5921 	domain = find_domain(dev);
5922 	if (!domain)
5923 		return -EINVAL;
5924 
5925 	spin_lock_irqsave(&device_domain_lock, flags);
5926 	spin_lock(&iommu->lock);
5927 
5928 	ret = -EINVAL;
5929 	info = get_domain_info(dev);
5930 	if (!info || !info->pasid_supported)
5931 		goto out;
5932 
5933 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5934 	if (WARN_ON(!context))
5935 		goto out;
5936 
5937 	ctx_lo = context[0].lo;
5938 
5939 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5940 		ctx_lo |= CONTEXT_PASIDE;
5941 		context[0].lo = ctx_lo;
5942 		wmb();
5943 		iommu->flush.flush_context(iommu,
5944 					   domain->iommu_did[iommu->seq_id],
5945 					   PCI_DEVID(info->bus, info->devfn),
5946 					   DMA_CCMD_MASK_NOBIT,
5947 					   DMA_CCMD_DEVICE_INVL);
5948 	}
5949 
5950 	/* Enable PASID support in the device, if it wasn't already */
5951 	if (!info->pasid_enabled)
5952 		iommu_enable_dev_iotlb(info);
5953 
5954 	ret = 0;
5955 
5956  out:
5957 	spin_unlock(&iommu->lock);
5958 	spin_unlock_irqrestore(&device_domain_lock, flags);
5959 
5960 	return ret;
5961 }
5962 
5963 static void intel_iommu_apply_resv_region(struct device *dev,
5964 					  struct iommu_domain *domain,
5965 					  struct iommu_resv_region *region)
5966 {
5967 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5968 	unsigned long start, end;
5969 
5970 	start = IOVA_PFN(region->start);
5971 	end   = IOVA_PFN(region->start + region->length - 1);
5972 
5973 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5974 }
5975 
5976 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5977 {
5978 	if (dev_is_pci(dev))
5979 		return pci_device_group(dev);
5980 	return generic_device_group(dev);
5981 }
5982 
5983 static int intel_iommu_enable_auxd(struct device *dev)
5984 {
5985 	struct device_domain_info *info;
5986 	struct intel_iommu *iommu;
5987 	unsigned long flags;
5988 	int ret;
5989 
5990 	iommu = device_to_iommu(dev, NULL, NULL);
5991 	if (!iommu || dmar_disabled)
5992 		return -EINVAL;
5993 
5994 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5995 		return -EINVAL;
5996 
5997 	ret = intel_iommu_enable_pasid(iommu, dev);
5998 	if (ret)
5999 		return -ENODEV;
6000 
6001 	spin_lock_irqsave(&device_domain_lock, flags);
6002 	info = get_domain_info(dev);
6003 	info->auxd_enabled = 1;
6004 	spin_unlock_irqrestore(&device_domain_lock, flags);
6005 
6006 	return 0;
6007 }
6008 
6009 static int intel_iommu_disable_auxd(struct device *dev)
6010 {
6011 	struct device_domain_info *info;
6012 	unsigned long flags;
6013 
6014 	spin_lock_irqsave(&device_domain_lock, flags);
6015 	info = get_domain_info(dev);
6016 	if (!WARN_ON(!info))
6017 		info->auxd_enabled = 0;
6018 	spin_unlock_irqrestore(&device_domain_lock, flags);
6019 
6020 	return 0;
6021 }
6022 
6023 /*
6024  * A PCI Express Designated Vendor-Specific Extended Capability is defined
6025  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
6026  * so that system software and tools can detect endpoint devices supporting
6027  * Intel Scalable I/O Virtualization without a host driver dependency.
6028  *
6029  * Returns the address of the matching extended capability structure within
6030  * the device's PCI configuration space or 0 if the device does not support
6031  * it.
6032  */
6033 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6034 {
6035 	int pos;
6036 	u16 vendor, id;
6037 
6038 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6039 	while (pos) {
6040 		pci_read_config_word(pdev, pos + 4, &vendor);
6041 		pci_read_config_word(pdev, pos + 8, &id);
6042 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6043 			return pos;
6044 
6045 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6046 	}
6047 
6048 	return 0;
6049 }
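/*
 * Editor-added illustration, not part of intel-iommu.c: the DVSEC layout the
 * loop above relies on. A DVSEC extended capability (ID 0x23) carries the
 * designated vendor at byte offset 4 of the capability and the vendor-defined
 * DVSEC ID at byte offset 8; the scan matches vendor 0x8086 with DVSEC ID 5.
 * The capability bytes below are fabricated for the example, with the
 * revision/length fields left unfilled.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t ex_read16(const uint8_t *cfg, unsigned int off)
{
	/* PCI configuration space is little-endian */
	return (uint16_t)cfg[off] | ((uint16_t)cfg[off + 1] << 8);
}

int main(void)
{
	uint8_t dvsec[12] = { 0x23, 0x00, 0x01, 0x00,	/* ext cap ID 0x23, ver 1 */
			      0x86, 0x80, 0x00, 0x00,	/* DVSEC vendor 0x8086    */
			      0x05, 0x00, 0x00, 0x00 };	/* DVSEC ID 5             */

	if (ex_read16(dvsec, 4) == 0x8086 && ex_read16(dvsec, 8) == 5)
		printf("Intel Scalable IOV DVSEC present\n");
	return 0;
}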
6050 
6051 static bool
6052 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6053 {
6054 	if (feat == IOMMU_DEV_FEAT_AUX) {
6055 		int ret;
6056 
6057 		if (!dev_is_pci(dev) || dmar_disabled ||
6058 		    !scalable_mode_support() || !iommu_pasid_support())
6059 			return false;
6060 
6061 		ret = pci_pasid_features(to_pci_dev(dev));
6062 		if (ret < 0)
6063 			return false;
6064 
6065 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
6066 	}
6067 
6068 	if (feat == IOMMU_DEV_FEAT_SVA) {
6069 		struct device_domain_info *info = get_domain_info(dev);
6070 
6071 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
6072 			info->pasid_supported && info->pri_supported &&
6073 			info->ats_supported;
6074 	}
6075 
6076 	return false;
6077 }
6078 
6079 static int
6080 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6081 {
6082 	if (feat == IOMMU_DEV_FEAT_AUX)
6083 		return intel_iommu_enable_auxd(dev);
6084 
6085 	if (feat == IOMMU_DEV_FEAT_SVA) {
6086 		struct device_domain_info *info = get_domain_info(dev);
6087 
6088 		if (!info)
6089 			return -EINVAL;
6090 
6091 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6092 			return 0;
6093 	}
6094 
6095 	return -ENODEV;
6096 }
6097 
6098 static int
6099 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6100 {
6101 	if (feat == IOMMU_DEV_FEAT_AUX)
6102 		return intel_iommu_disable_auxd(dev);
6103 
6104 	return -ENODEV;
6105 }
6106 
6107 static bool
6108 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6109 {
6110 	struct device_domain_info *info = get_domain_info(dev);
6111 
6112 	if (feat == IOMMU_DEV_FEAT_AUX)
6113 		return scalable_mode_support() && info && info->auxd_enabled;
6114 
6115 	return false;
6116 }
6117 
6118 static int
6119 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6120 {
6121 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6122 
6123 	return dmar_domain->default_pasid > 0 ?
6124 			dmar_domain->default_pasid : -EINVAL;
6125 }
6126 
6127 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6128 					   struct device *dev)
6129 {
6130 	return attach_deferred(dev);
6131 }
6132 
6133 static int
6134 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6135 			    enum iommu_attr attr, void *data)
6136 {
6137 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6138 	unsigned long flags;
6139 	int ret = 0;
6140 
6141 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6142 		return -EINVAL;
6143 
6144 	switch (attr) {
6145 	case DOMAIN_ATTR_NESTING:
6146 		spin_lock_irqsave(&device_domain_lock, flags);
6147 		if (nested_mode_support() &&
6148 		    list_empty(&dmar_domain->devices)) {
6149 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6150 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6151 		} else {
6152 			ret = -ENODEV;
6153 		}
6154 		spin_unlock_irqrestore(&device_domain_lock, flags);
6155 		break;
6156 	default:
6157 		ret = -EINVAL;
6158 		break;
6159 	}
6160 
6161 	return ret;
6162 }
6163 
6164 /*
6165  * Check that the device does not live on an external facing PCI port that is
6166  * marked as untrusted. Such devices should not be able to apply quirks and
6167  * thus not be able to bypass the IOMMU restrictions.
6168  */
6169 static bool risky_device(struct pci_dev *pdev)
6170 {
6171 	if (pdev->untrusted) {
6172 		pci_info(pdev,
6173 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6174 			 pdev->vendor, pdev->device);
6175 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6176 		return true;
6177 	}
6178 	return false;
6179 }
6180 
6181 const struct iommu_ops intel_iommu_ops = {
6182 	.capable		= intel_iommu_capable,
6183 	.domain_alloc		= intel_iommu_domain_alloc,
6184 	.domain_free		= intel_iommu_domain_free,
6185 	.domain_set_attr	= intel_iommu_domain_set_attr,
6186 	.attach_dev		= intel_iommu_attach_device,
6187 	.detach_dev		= intel_iommu_detach_device,
6188 	.aux_attach_dev		= intel_iommu_aux_attach_device,
6189 	.aux_detach_dev		= intel_iommu_aux_detach_device,
6190 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6191 	.map			= intel_iommu_map,
6192 	.unmap			= intel_iommu_unmap,
6193 	.iova_to_phys		= intel_iommu_iova_to_phys,
6194 	.probe_device		= intel_iommu_probe_device,
6195 	.probe_finalize		= intel_iommu_probe_finalize,
6196 	.release_device		= intel_iommu_release_device,
6197 	.get_resv_regions	= intel_iommu_get_resv_regions,
6198 	.put_resv_regions	= generic_iommu_put_resv_regions,
6199 	.apply_resv_region	= intel_iommu_apply_resv_region,
6200 	.device_group		= intel_iommu_device_group,
6201 	.dev_has_feat		= intel_iommu_dev_has_feat,
6202 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6203 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6204 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6205 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6206 	.def_domain_type	= device_def_domain_type,
6207 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6208 #ifdef CONFIG_INTEL_IOMMU_SVM
6209 	.cache_invalidate	= intel_iommu_sva_invalidate,
6210 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6211 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6212 	.sva_bind		= intel_svm_bind,
6213 	.sva_unbind		= intel_svm_unbind,
6214 	.sva_get_pasid		= intel_svm_get_pasid,
6215 	.page_response		= intel_svm_page_response,
6216 #endif
6217 };
6218 
6219 static void quirk_iommu_igfx(struct pci_dev *dev)
6220 {
6221 	if (risky_device(dev))
6222 		return;
6223 
6224 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6225 	dmar_map_gfx = 0;
6226 }
6227 
6228 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6236 
6237 /* Broadwell igfx malfunctions with dmar */
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6260 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6261 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6262 
6263 static void quirk_iommu_rwbf(struct pci_dev *dev)
6264 {
6265 	if (risky_device(dev))
6266 		return;
6267 
6268 	/*
6269 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6270 	 * but needs it. Same seems to hold for the desktop versions.
6271 	 */
6272 	pci_info(dev, "Forcing write-buffer flush capability\n");
6273 	rwbf_quirk = 1;
6274 }
6275 
6276 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6278 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6279 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6280 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6281 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6282 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6283 
6284 #define GGC 0x52
6285 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6286 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6287 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6288 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6289 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6290 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6291 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6292 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6293 
6294 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6295 {
6296 	unsigned short ggc;
6297 
6298 	if (risky_device(dev))
6299 		return;
6300 
6301 	if (pci_read_config_word(dev, GGC, &ggc))
6302 		return;
6303 
6304 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6305 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6306 		dmar_map_gfx = 0;
6307 	} else if (dmar_map_gfx) {
6308 		/* we have to ensure the gfx device is idle before we flush */
6309 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6310 		intel_iommu_strict = 1;
6311 	}
6312 }
6313 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6314 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6315 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6316 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6317 
6318 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6319 {
6320 	unsigned short ver;
6321 
6322 	if (!IS_GFX_DEVICE(dev))
6323 		return;
6324 
6325 	ver = (dev->device >> 8) & 0xff;
6326 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6327 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6328 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
6329 		return;
6330 
6331 	if (risky_device(dev))
6332 		return;
6333 
6334 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6335 	iommu_skip_te_disable = 1;
6336 }
6337 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6338 
6339 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6340    ISOCH DMAR unit for the Azalia sound device, but not give it any
6341    TLB entries, which causes it to deadlock. Check for that.  We do
6342    this in a function called from init_dmars(), instead of in a PCI
6343    quirk, because we don't want to print the obnoxious "BIOS broken"
6344    message if VT-d is actually disabled.
6345 */
6346 static void __init check_tylersburg_isoch(void)
6347 {
6348 	struct pci_dev *pdev;
6349 	uint32_t vtisochctrl;
6350 
6351 	/* If there's no Azalia in the system anyway, forget it. */
6352 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6353 	if (!pdev)
6354 		return;
6355 
6356 	if (risky_device(pdev)) {
6357 		pci_dev_put(pdev);
6358 		return;
6359 	}
6360 
6361 	pci_dev_put(pdev);
6362 
6363 	/* System Management Registers. Might be hidden, in which case
6364 	   we can't do the sanity check. But that's OK, because the
6365 	   known-broken BIOSes _don't_ actually hide it, so far. */
6366 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6367 	if (!pdev)
6368 		return;
6369 
6370 	if (risky_device(pdev)) {
6371 		pci_dev_put(pdev);
6372 		return;
6373 	}
6374 
6375 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6376 		pci_dev_put(pdev);
6377 		return;
6378 	}
6379 
6380 	pci_dev_put(pdev);
6381 
6382 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6383 	if (vtisochctrl & 1)
6384 		return;
6385 
6386 	/* Drop all bits other than the number of TLB entries */
6387 	vtisochctrl &= 0x1c;
6388 
6389 	/* If we have the recommended number of TLB entries (16), fine. */
6390 	if (vtisochctrl == 0x10)
6391 		return;
6392 
6393 	/* Zero TLB entries? You get to ride the short bus to school. */
6394 	if (!vtisochctrl) {
6395 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6396 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6397 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6398 		     dmi_get_system_info(DMI_BIOS_VERSION),
6399 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6400 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6401 		return;
6402 	}
6403 
6404 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6405 	       vtisochctrl);
6406 }
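/*
 * Editor-added illustration, not part of intel-iommu.c: how the VTISOCHCTRL
 * value read above is interpreted, inferred from the checks in
 * check_tylersburg_isoch(). Bit 0 set means Azalia DMA is routed to the
 * non-isoch DMAR unit; otherwise the field masked by 0x1c gives the TLB
 * entries granted to the isoch unit, where 0x10 (16) is the recommended
 * allocation and 0 is the known-broken "no TLB space" case.
 */
#include <stdint.h>
#include <stdio.h>

static const char *ex_decode_vtisochctrl(uint32_t v)
{
	if (v & 1)
		return "isoch DMA routed to non-isoch DMAR unit: fine";
	v &= 0x1c;				/* keep only the TLB-entry field */
	if (v == 0x10)
		return "16 TLB entries for isoch unit: fine";
	if (!v)
		return "no TLB entries: force Azalia identity map";
	return "unexpected TLB allocation: warn";
}

int main(void)
{
	printf("%s\n", ex_decode_vtisochctrl(0x00));	/* broken BIOS case */
	printf("%s\n", ex_decode_vtisochctrl(0x10));	/* recommended case */
	return 0;
}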
6407