1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-map-ops.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
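/*
 * For example, with a 48-bit guest address width (gaw == 48) and
 * VTD_PAGE_SHIFT == 12, __DOMAIN_MAX_PFN(48) is 2^36 - 1. On a 64-bit
 * kernel DOMAIN_MAX_PFN(48) keeps that value; on 32-bit it is clamped
 * to ULONG_MAX so that PFNs still fit in an unsigned long.
 */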
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
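/*
 * ~0xFFFUL leaves every bit above bit 11 set, so each power-of-two size
 * of at least 4KiB (bit 12 = 4KiB, bit 13 = 8KiB, ..., bit 21 = 2MiB,
 * bit 30 = 1GiB) is advertised, matching the note above.
 */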
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
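/*
 * With LEVEL_STRIDE == 9, the helpers above give, for example:
 * agaw 1 -> 39-bit width, 3 page-table levels; agaw 2 -> 48-bit,
 * 4 levels; agaw 3 -> 57-bit, 5 levels (capped at MAX_AGAW_WIDTH).
 */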
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
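/*
 * Each level consumes 9 bits of the PFN: level 1 is indexed by bits 8:0,
 * level 2 by bits 17:9, and so on. Thus level_size(2) is 512 pages
 * (2MiB of IOVA with 4KiB pages) and lvl_to_nr_pages(2) is also 512.
 */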
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
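/*
 * VTD_PAGE_SHIFT is 12, so with 4KiB MM pages the two PFN spaces are
 * identical and the conversions above shift by zero; with larger MM
 * pages (e.g. a hypothetical 64KiB configuration) one MM PFN would
 * correspond to several DMA (VT-d) PFNs.
 */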
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
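/*
 * A root_entry is two u64s (lo/hi, 16 bytes), so one 4KiB root table
 * holds ROOT_ENTRY_NR == 256 entries, one per PCI bus number.
 */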
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
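/*
 * Taken together, the helpers above encode the legacy context entry:
 * in the low u64, bit 0 is Present, bit 1 Fault Processing Disable,
 * bits 3:2 the Translation Type and bits 63:12 the page-table root
 * address; in the high u64, bits 2:0 hold the Address Width (AGAW) and
 * bits 23:8 the 16-bit Domain ID. Bit 11 of the low u64 and bit 3 of
 * the high u64 are used by this driver to track PASID enablement and
 * the "copied" state of tables inherited from a previous kernel.
 */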
289 
290 /*
291  * This domain is a statically identity mapping domain.
292  *	1. This domain creates a static 1:1 mapping to all usable memory.
293  * 	2. It maps to each iommu if successful.
294  *	3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322 
323 #define for_each_rmrr_units(rmrr) \
324 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325 
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343 
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349 
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352 
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360 
361 #define IDENTMAP_GFX		2
362 #define IDENTMAP_AZALIA		4
363 
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 	struct device_domain_info *info;
371 
372 	if (!dev)
373 		return NULL;
374 
375 	info = dev_iommu_priv_get(dev);
376 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 		return NULL;
378 
379 	return info;
380 }
381 
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384 
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
386 				to_pci_dev(d)->untrusted)
387 
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 				     void *data), void *data)
394 {
395 	int ret = 0;
396 	unsigned long flags;
397 	struct device_domain_info *info;
398 
399 	spin_lock_irqsave(&device_domain_lock, flags);
400 	list_for_each_entry(info, &device_domain_list, global) {
401 		ret = fn(info, data);
402 		if (ret) {
403 			spin_unlock_irqrestore(&device_domain_lock, flags);
404 			return ret;
405 		}
406 	}
407 	spin_unlock_irqrestore(&device_domain_lock, flags);
408 
409 	return 0;
410 }
411 
412 const struct iommu_ops intel_iommu_ops;
413 
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418 
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423 
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426 	u32 gsts;
427 
428 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 	if (gsts & DMA_GSTS_TES)
430 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432 
433 static int __init intel_iommu_setup(char *str)
434 {
435 	if (!str)
436 		return -EINVAL;
437 	while (*str) {
438 		if (!strncmp(str, "on", 2)) {
439 			dmar_disabled = 0;
440 			pr_info("IOMMU enabled\n");
441 		} else if (!strncmp(str, "off", 3)) {
442 			dmar_disabled = 1;
443 			no_platform_optin = 1;
444 			pr_info("IOMMU disabled\n");
445 		} else if (!strncmp(str, "igfx_off", 8)) {
446 			dmar_map_gfx = 0;
447 			pr_info("Disable GFX device mapping\n");
448 		} else if (!strncmp(str, "forcedac", 8)) {
449 			pr_info("Forcing DAC for PCI devices\n");
450 			dmar_forcedac = 1;
451 		} else if (!strncmp(str, "strict", 6)) {
452 			pr_info("Disable batched IOTLB flush\n");
453 			intel_iommu_strict = 1;
454 		} else if (!strncmp(str, "sp_off", 6)) {
455 			pr_info("Disable supported super page\n");
456 			intel_iommu_superpage = 0;
457 		} else if (!strncmp(str, "sm_on", 5)) {
458 			pr_info("Intel-IOMMU: scalable mode supported\n");
459 			intel_iommu_sm = 1;
460 		} else if (!strncmp(str, "tboot_noforce", 13)) {
461 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 			intel_iommu_tboot_noforce = 1;
463 		} else if (!strncmp(str, "nobounce", 8)) {
464 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 			intel_no_bounce = 1;
466 		}
467 
468 		str += strcspn(str, ",");
469 		while (*str == ',')
470 			str++;
471 	}
472 	return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475 
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478 
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481 	struct dmar_domain **domains;
482 	int idx = did >> 8;
483 
484 	domains = iommu->domains[idx];
485 	if (!domains)
486 		return NULL;
487 
488 	return domains[did & 0xff];
489 }
490 
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 			     struct dmar_domain *domain)
493 {
494 	struct dmar_domain **domains;
495 	int idx = did >> 8;
496 
497 	if (!iommu->domains[idx]) {
498 		size_t size = 256 * sizeof(struct dmar_domain *);
499 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 	}
501 
502 	domains = iommu->domains[idx];
503 	if (WARN_ON(!domains))
504 		return;
505 	else
506 		domains[did & 0xff] = domain;
507 }
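/*
 * iommu->domains is a two-level array keyed by domain id: the top level
 * is indexed by did >> 8, and each 256-pointer second-level chunk
 * (indexed by did & 0xff) is only allocated once a domain id in that
 * range is actually used.
 */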
508 
509 void *alloc_pgtable_page(int node)
510 {
511 	struct page *page;
512 	void *vaddr = NULL;
513 
514 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 	if (page)
516 		vaddr = page_address(page);
517 	return vaddr;
518 }
519 
520 void free_pgtable_page(void *vaddr)
521 {
522 	free_page((unsigned long)vaddr);
523 }
524 
525 static inline void *alloc_domain_mem(void)
526 {
527 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529 
530 static void free_domain_mem(void *vaddr)
531 {
532 	kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534 
535 static inline void * alloc_devinfo_mem(void)
536 {
537 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539 
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542 	kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544 
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549 
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554 
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 				       unsigned long pfn)
557 {
558 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559 
560 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562 
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565 	unsigned long sagaw;
566 	int agaw = -1;
567 
568 	sagaw = cap_sagaw(iommu->cap);
569 	for (agaw = width_to_agaw(max_gaw);
570 	     agaw >= 0; agaw--) {
571 		if (test_bit(agaw, &sagaw))
572 			break;
573 	}
574 
575 	return agaw;
576 }
577 
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585 
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * fall back to a smaller supported agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595 
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599 	int iommu_id;
600 
601 	/* si_domain and vm domain should not get here. */
602 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 		return NULL;
604 
605 	for_each_domain_iommu(iommu_id, domain)
606 		break;
607 
608 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 		return NULL;
610 
611 	return g_iommus[iommu_id];
612 }
613 
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616 	return sm_supported(iommu) ?
617 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619 
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622 	struct dmar_drhd_unit *drhd;
623 	struct intel_iommu *iommu;
624 	bool found = false;
625 	int i;
626 
627 	domain->iommu_coherency = 1;
628 
629 	for_each_domain_iommu(i, domain) {
630 		found = true;
631 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 			domain->iommu_coherency = 0;
633 			break;
634 		}
635 	}
636 	if (found)
637 		return;
638 
639 	/* No hardware attached; use lowest common denominator */
640 	rcu_read_lock();
641 	for_each_active_iommu(iommu, drhd) {
642 		if (!iommu_paging_structure_coherency(iommu)) {
643 			domain->iommu_coherency = 0;
644 			break;
645 		}
646 	}
647 	rcu_read_unlock();
648 }
649 
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652 	struct dmar_drhd_unit *drhd;
653 	struct intel_iommu *iommu;
654 	int ret = 1;
655 
656 	rcu_read_lock();
657 	for_each_active_iommu(iommu, drhd) {
658 		if (iommu != skip) {
659 			/*
660 			 * If the hardware is operating in the scalable mode,
661 			 * the snooping control is always supported since we
662 			 * always set PASID-table-entry.PGSNP bit if the domain
663 			 * is managed outside (UNMANAGED).
664 			 */
665 			if (!sm_supported(iommu) &&
666 			    !ecap_sc_support(iommu->ecap)) {
667 				ret = 0;
668 				break;
669 			}
670 		}
671 	}
672 	rcu_read_unlock();
673 
674 	return ret;
675 }
676 
677 static int domain_update_iommu_superpage(struct dmar_domain *domain,
678 					 struct intel_iommu *skip)
679 {
680 	struct dmar_drhd_unit *drhd;
681 	struct intel_iommu *iommu;
682 	int mask = 0x3;
683 
684 	if (!intel_iommu_superpage) {
685 		return 0;
686 	}
687 
688 	/* set iommu_superpage to the smallest common denominator */
689 	rcu_read_lock();
690 	for_each_active_iommu(iommu, drhd) {
691 		if (iommu != skip) {
692 			if (domain && domain_use_first_level(domain)) {
693 				if (!cap_fl1gp_support(iommu->cap))
694 					mask = 0x1;
695 			} else {
696 				mask &= cap_super_page_val(iommu->cap);
697 			}
698 
699 			if (!mask)
700 				break;
701 		}
702 	}
703 	rcu_read_unlock();
704 
705 	return fls(mask);
706 }
707 
708 static int domain_update_device_node(struct dmar_domain *domain)
709 {
710 	struct device_domain_info *info;
711 	int nid = NUMA_NO_NODE;
712 
713 	assert_spin_locked(&device_domain_lock);
714 
715 	if (list_empty(&domain->devices))
716 		return NUMA_NO_NODE;
717 
718 	list_for_each_entry(info, &domain->devices, link) {
719 		if (!info->dev)
720 			continue;
721 
722 		/*
723 		 * There could be multiple device NUMA nodes as devices
724 		 * within the same domain may sit behind different IOMMUs. There
725 		 * is no perfect answer in such a situation, so we pick the
726 		 * first-come, first-served policy.
727 		 */
728 		nid = dev_to_node(info->dev);
729 		if (nid != NUMA_NO_NODE)
730 			break;
731 	}
732 
733 	return nid;
734 }
735 
736 /* Some capabilities may be different across iommus */
737 static void domain_update_iommu_cap(struct dmar_domain *domain)
738 {
739 	domain_update_iommu_coherency(domain);
740 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
741 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
742 
743 	/*
744 	 * If RHSA is missing, we should default to the device numa domain
745 	 * as fall back.
746 	 */
747 	if (domain->nid == NUMA_NO_NODE)
748 		domain->nid = domain_update_device_node(domain);
749 
750 	/*
751 	 * First-level translation restricts the input-address to a
752 	 * canonical address (i.e., address bits 63:N have the same
753 	 * value as address bit [N-1], where N is 48-bits with 4-level
754 	 * paging and 57-bits with 5-level paging). Hence, skip bit
755 	 * [N-1].
756 	 */
757 	if (domain_use_first_level(domain))
758 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
759 	else
760 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
761 }
762 
763 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
764 					 u8 devfn, int alloc)
765 {
766 	struct root_entry *root = &iommu->root_entry[bus];
767 	struct context_entry *context;
768 	u64 *entry;
769 
770 	entry = &root->lo;
771 	if (sm_supported(iommu)) {
772 		if (devfn >= 0x80) {
773 			devfn -= 0x80;
774 			entry = &root->hi;
775 		}
776 		devfn *= 2;
777 	}
778 	if (*entry & 1)
779 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
780 	else {
781 		unsigned long phy_addr;
782 		if (!alloc)
783 			return NULL;
784 
785 		context = alloc_pgtable_page(iommu->node);
786 		if (!context)
787 			return NULL;
788 
789 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
790 		phy_addr = virt_to_phys((void *)context);
791 		*entry = phy_addr | 1;
792 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
793 	}
794 	return &context[devfn];
795 }
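/*
 * In legacy mode the root entry's low u64 points to a single context
 * table with one entry per devfn. In scalable mode the low and high
 * halves point to two tables covering devfn 0x00-0x7f and 0x80-0xff
 * respectively, and each device occupies two consecutive context_entry
 * slots (hence the "devfn *= 2" above).
 */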
796 
797 static bool attach_deferred(struct device *dev)
798 {
799 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
800 }
801 
802 /**
803  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
804  *				 sub-hierarchy of a candidate PCI-PCI bridge
805  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
806  * @bridge: the candidate PCI-PCI bridge
807  *
808  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
809  */
810 static bool
811 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
812 {
813 	struct pci_dev *pdev, *pbridge;
814 
815 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
816 		return false;
817 
818 	pdev = to_pci_dev(dev);
819 	pbridge = to_pci_dev(bridge);
820 
821 	if (pbridge->subordinate &&
822 	    pbridge->subordinate->number <= pdev->bus->number &&
823 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
824 		return true;
825 
826 	return false;
827 }
828 
829 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
830 {
831 	struct dmar_drhd_unit *drhd;
832 	u32 vtbar;
833 	int rc;
834 
835 	/* We know that this device on this chipset has its own IOMMU.
836 	 * If we find it under a different IOMMU, then the BIOS is lying
837 	 * to us. Hope that the IOMMU for this device is actually
838 	 * disabled, and it needs no translation...
839 	 */
840 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
841 	if (rc) {
842 		/* "can't" happen */
843 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
844 		return false;
845 	}
846 	vtbar &= 0xffff0000;
847 
848 	/* we know that this iommu should be at offset 0xa000 from vtbar */
849 	drhd = dmar_find_matched_drhd_unit(pdev);
850 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
851 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
852 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
853 		return true;
854 	}
855 
856 	return false;
857 }
858 
859 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
860 {
861 	if (!iommu || iommu->drhd->ignored)
862 		return true;
863 
864 	if (dev_is_pci(dev)) {
865 		struct pci_dev *pdev = to_pci_dev(dev);
866 
867 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
868 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
869 		    quirk_ioat_snb_local_iommu(pdev))
870 			return true;
871 	}
872 
873 	return false;
874 }
875 
876 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
877 {
878 	struct dmar_drhd_unit *drhd = NULL;
879 	struct pci_dev *pdev = NULL;
880 	struct intel_iommu *iommu;
881 	struct device *tmp;
882 	u16 segment = 0;
883 	int i;
884 
885 	if (!dev)
886 		return NULL;
887 
888 	if (dev_is_pci(dev)) {
889 		struct pci_dev *pf_pdev;
890 
891 		pdev = pci_real_dma_dev(to_pci_dev(dev));
892 
893 		/* VFs aren't listed in scope tables; we need to look up
894 		 * the PF instead to find the IOMMU. */
895 		pf_pdev = pci_physfn(pdev);
896 		dev = &pf_pdev->dev;
897 		segment = pci_domain_nr(pdev->bus);
898 	} else if (has_acpi_companion(dev))
899 		dev = &ACPI_COMPANION(dev)->dev;
900 
901 	rcu_read_lock();
902 	for_each_iommu(iommu, drhd) {
903 		if (pdev && segment != drhd->segment)
904 			continue;
905 
906 		for_each_active_dev_scope(drhd->devices,
907 					  drhd->devices_cnt, i, tmp) {
908 			if (tmp == dev) {
909 				/* For a VF use its original BDF# not that of the PF
910 				 * which we used for the IOMMU lookup. Strictly speaking
911 				 * we could do this for all PCI devices; we only need to
912 				 * get the BDF# from the scope table for ACPI matches. */
913 				if (pdev && pdev->is_virtfn)
914 					goto got_pdev;
915 
916 				if (bus && devfn) {
917 					*bus = drhd->devices[i].bus;
918 					*devfn = drhd->devices[i].devfn;
919 				}
920 				goto out;
921 			}
922 
923 			if (is_downstream_to_pci_bridge(dev, tmp))
924 				goto got_pdev;
925 		}
926 
927 		if (pdev && drhd->include_all) {
928 		got_pdev:
929 			if (bus && devfn) {
930 				*bus = pdev->bus->number;
931 				*devfn = pdev->devfn;
932 			}
933 			goto out;
934 		}
935 	}
936 	iommu = NULL;
937  out:
938 	if (iommu_is_dummy(iommu, dev))
939 		iommu = NULL;
940 
941 	rcu_read_unlock();
942 
943 	return iommu;
944 }
945 
946 static void domain_flush_cache(struct dmar_domain *domain,
947 			       void *addr, int size)
948 {
949 	if (!domain->iommu_coherency)
950 		clflush_cache_range(addr, size);
951 }
952 
953 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
954 {
955 	struct context_entry *context;
956 	int ret = 0;
957 	unsigned long flags;
958 
959 	spin_lock_irqsave(&iommu->lock, flags);
960 	context = iommu_context_addr(iommu, bus, devfn, 0);
961 	if (context)
962 		ret = context_present(context);
963 	spin_unlock_irqrestore(&iommu->lock, flags);
964 	return ret;
965 }
966 
967 static void free_context_table(struct intel_iommu *iommu)
968 {
969 	int i;
970 	unsigned long flags;
971 	struct context_entry *context;
972 
973 	spin_lock_irqsave(&iommu->lock, flags);
974 	if (!iommu->root_entry) {
975 		goto out;
976 	}
977 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
978 		context = iommu_context_addr(iommu, i, 0, 0);
979 		if (context)
980 			free_pgtable_page(context);
981 
982 		if (!sm_supported(iommu))
983 			continue;
984 
985 		context = iommu_context_addr(iommu, i, 0x80, 0);
986 		if (context)
987 			free_pgtable_page(context);
988 
989 	}
990 	free_pgtable_page(iommu->root_entry);
991 	iommu->root_entry = NULL;
992 out:
993 	spin_unlock_irqrestore(&iommu->lock, flags);
994 }
995 
996 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
997 				      unsigned long pfn, int *target_level)
998 {
999 	struct dma_pte *parent, *pte;
1000 	int level = agaw_to_level(domain->agaw);
1001 	int offset;
1002 
1003 	BUG_ON(!domain->pgd);
1004 
1005 	if (!domain_pfn_supported(domain, pfn))
1006 		/* Address beyond IOMMU's addressing capabilities. */
1007 		return NULL;
1008 
1009 	parent = domain->pgd;
1010 
1011 	while (1) {
1012 		void *tmp_page;
1013 
1014 		offset = pfn_level_offset(pfn, level);
1015 		pte = &parent[offset];
1016 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1017 			break;
1018 		if (level == *target_level)
1019 			break;
1020 
1021 		if (!dma_pte_present(pte)) {
1022 			uint64_t pteval;
1023 
1024 			tmp_page = alloc_pgtable_page(domain->nid);
1025 
1026 			if (!tmp_page)
1027 				return NULL;
1028 
1029 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1030 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1031 			if (domain_use_first_level(domain)) {
1032 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1033 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1034 					pteval |= DMA_FL_PTE_ACCESS;
1035 			}
1036 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1037 				/* Someone else set it while we were thinking; use theirs. */
1038 				free_pgtable_page(tmp_page);
1039 			else
1040 				domain_flush_cache(domain, pte, sizeof(*pte));
1041 		}
1042 		if (level == 1)
1043 			break;
1044 
1045 		parent = phys_to_virt(dma_pte_addr(pte));
1046 		level--;
1047 	}
1048 
1049 	if (!*target_level)
1050 		*target_level = level;
1051 
1052 	return pte;
1053 }
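/*
 * The walk above starts at the domain's top level (agaw_to_level()) and
 * descends one level per iteration, allocating missing intermediate
 * tables with cmpxchg64() so that two mappers racing on the same slot
 * both end up using whichever table was installed first.
 */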
1054 
1055 /* return address's pte at specific level */
1056 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1057 					 unsigned long pfn,
1058 					 int level, int *large_page)
1059 {
1060 	struct dma_pte *parent, *pte;
1061 	int total = agaw_to_level(domain->agaw);
1062 	int offset;
1063 
1064 	parent = domain->pgd;
1065 	while (level <= total) {
1066 		offset = pfn_level_offset(pfn, total);
1067 		pte = &parent[offset];
1068 		if (level == total)
1069 			return pte;
1070 
1071 		if (!dma_pte_present(pte)) {
1072 			*large_page = total;
1073 			break;
1074 		}
1075 
1076 		if (dma_pte_superpage(pte)) {
1077 			*large_page = total;
1078 			return pte;
1079 		}
1080 
1081 		parent = phys_to_virt(dma_pte_addr(pte));
1082 		total--;
1083 	}
1084 	return NULL;
1085 }
1086 
1087 /* clear last level pte, a tlb flush should be followed */
1088 static void dma_pte_clear_range(struct dmar_domain *domain,
1089 				unsigned long start_pfn,
1090 				unsigned long last_pfn)
1091 {
1092 	unsigned int large_page;
1093 	struct dma_pte *first_pte, *pte;
1094 
1095 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1096 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1097 	BUG_ON(start_pfn > last_pfn);
1098 
1099 	/* we don't need lock here; nobody else touches the iova range */
1100 	do {
1101 		large_page = 1;
1102 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1103 		if (!pte) {
1104 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1105 			continue;
1106 		}
1107 		do {
1108 			dma_clear_pte(pte);
1109 			start_pfn += lvl_to_nr_pages(large_page);
1110 			pte++;
1111 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1112 
1113 		domain_flush_cache(domain, first_pte,
1114 				   (void *)pte - (void *)first_pte);
1115 
1116 	} while (start_pfn && start_pfn <= last_pfn);
1117 }
1118 
1119 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1120 			       int retain_level, struct dma_pte *pte,
1121 			       unsigned long pfn, unsigned long start_pfn,
1122 			       unsigned long last_pfn)
1123 {
1124 	pfn = max(start_pfn, pfn);
1125 	pte = &pte[pfn_level_offset(pfn, level)];
1126 
1127 	do {
1128 		unsigned long level_pfn;
1129 		struct dma_pte *level_pte;
1130 
1131 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1132 			goto next;
1133 
1134 		level_pfn = pfn & level_mask(level);
1135 		level_pte = phys_to_virt(dma_pte_addr(pte));
1136 
1137 		if (level > 2) {
1138 			dma_pte_free_level(domain, level - 1, retain_level,
1139 					   level_pte, level_pfn, start_pfn,
1140 					   last_pfn);
1141 		}
1142 
1143 		/*
1144 		 * Free the page table if we're below the level we want to
1145 		 * retain and the range covers the entire table.
1146 		 */
1147 		if (level < retain_level && !(start_pfn > level_pfn ||
1148 		      last_pfn < level_pfn + level_size(level) - 1)) {
1149 			dma_clear_pte(pte);
1150 			domain_flush_cache(domain, pte, sizeof(*pte));
1151 			free_pgtable_page(level_pte);
1152 		}
1153 next:
1154 		pfn += level_size(level);
1155 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156 }
1157 
1158 /*
1159  * clear last level (leaf) ptes and free page table pages below the
1160  * level we wish to keep intact.
1161  */
1162 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1163 				   unsigned long start_pfn,
1164 				   unsigned long last_pfn,
1165 				   int retain_level)
1166 {
1167 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1168 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1169 	BUG_ON(start_pfn > last_pfn);
1170 
1171 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1172 
1173 	/* We don't need lock here; nobody else touches the iova range */
1174 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1175 			   domain->pgd, 0, start_pfn, last_pfn);
1176 
1177 	/* free pgd */
1178 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1179 		free_pgtable_page(domain->pgd);
1180 		domain->pgd = NULL;
1181 	}
1182 }
1183 
1184 /* When a page at a given level is being unlinked from its parent, we don't
1185    need to *modify* it at all. All we need to do is make a list of all the
1186    pages which can be freed just as soon as we've flushed the IOTLB and we
1187    know the hardware page-walk will no longer touch them.
1188    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1189    be freed. */
1190 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1191 					    int level, struct dma_pte *pte,
1192 					    struct page *freelist)
1193 {
1194 	struct page *pg;
1195 
1196 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1197 	pg->freelist = freelist;
1198 	freelist = pg;
1199 
1200 	if (level == 1)
1201 		return freelist;
1202 
1203 	pte = page_address(pg);
1204 	do {
1205 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1206 			freelist = dma_pte_list_pagetables(domain, level - 1,
1207 							   pte, freelist);
1208 		pte++;
1209 	} while (!first_pte_in_page(pte));
1210 
1211 	return freelist;
1212 }
1213 
1214 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1215 					struct dma_pte *pte, unsigned long pfn,
1216 					unsigned long start_pfn,
1217 					unsigned long last_pfn,
1218 					struct page *freelist)
1219 {
1220 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1221 
1222 	pfn = max(start_pfn, pfn);
1223 	pte = &pte[pfn_level_offset(pfn, level)];
1224 
1225 	do {
1226 		unsigned long level_pfn;
1227 
1228 		if (!dma_pte_present(pte))
1229 			goto next;
1230 
1231 		level_pfn = pfn & level_mask(level);
1232 
1233 		/* If range covers entire pagetable, free it */
1234 		if (start_pfn <= level_pfn &&
1235 		    last_pfn >= level_pfn + level_size(level) - 1) {
1236 			/* These subordinate page tables are going away entirely. Don't
1237 			   bother to clear them; we're just going to *free* them. */
1238 			if (level > 1 && !dma_pte_superpage(pte))
1239 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1240 
1241 			dma_clear_pte(pte);
1242 			if (!first_pte)
1243 				first_pte = pte;
1244 			last_pte = pte;
1245 		} else if (level > 1) {
1246 			/* Recurse down into a level that isn't *entirely* obsolete */
1247 			freelist = dma_pte_clear_level(domain, level - 1,
1248 						       phys_to_virt(dma_pte_addr(pte)),
1249 						       level_pfn, start_pfn, last_pfn,
1250 						       freelist);
1251 		}
1252 next:
1253 		pfn += level_size(level);
1254 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1255 
1256 	if (first_pte)
1257 		domain_flush_cache(domain, first_pte,
1258 				   (void *)++last_pte - (void *)first_pte);
1259 
1260 	return freelist;
1261 }
1262 
1263 /* We can't just free the pages because the IOMMU may still be walking
1264    the page tables, and may have cached the intermediate levels. The
1265    pages can only be freed after the IOTLB flush has been done. */
1266 static struct page *domain_unmap(struct dmar_domain *domain,
1267 				 unsigned long start_pfn,
1268 				 unsigned long last_pfn)
1269 {
1270 	struct page *freelist;
1271 
1272 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1273 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1274 	BUG_ON(start_pfn > last_pfn);
1275 
1276 	/* we don't need lock here; nobody else touches the iova range */
1277 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1278 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1279 
1280 	/* free pgd */
1281 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1282 		struct page *pgd_page = virt_to_page(domain->pgd);
1283 		pgd_page->freelist = freelist;
1284 		freelist = pgd_page;
1285 
1286 		domain->pgd = NULL;
1287 	}
1288 
1289 	return freelist;
1290 }
1291 
1292 static void dma_free_pagelist(struct page *freelist)
1293 {
1294 	struct page *pg;
1295 
1296 	while ((pg = freelist)) {
1297 		freelist = pg->freelist;
1298 		free_pgtable_page(page_address(pg));
1299 	}
1300 }
1301 
1302 static void iova_entry_free(unsigned long data)
1303 {
1304 	struct page *freelist = (struct page *)data;
1305 
1306 	dma_free_pagelist(freelist);
1307 }
1308 
1309 /* iommu handling */
1310 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1311 {
1312 	struct root_entry *root;
1313 	unsigned long flags;
1314 
1315 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1316 	if (!root) {
1317 		pr_err("Allocating root entry for %s failed\n",
1318 			iommu->name);
1319 		return -ENOMEM;
1320 	}
1321 
1322 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1323 
1324 	spin_lock_irqsave(&iommu->lock, flags);
1325 	iommu->root_entry = root;
1326 	spin_unlock_irqrestore(&iommu->lock, flags);
1327 
1328 	return 0;
1329 }
1330 
1331 static void iommu_set_root_entry(struct intel_iommu *iommu)
1332 {
1333 	u64 addr;
1334 	u32 sts;
1335 	unsigned long flag;
1336 
1337 	addr = virt_to_phys(iommu->root_entry);
1338 	if (sm_supported(iommu))
1339 		addr |= DMA_RTADDR_SMT;
1340 
1341 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1342 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1343 
1344 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1345 
1346 	/* Make sure hardware complete it */
1347 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1348 		      readl, (sts & DMA_GSTS_RTPS), sts);
1349 
1350 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1351 
1352 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1353 	if (sm_supported(iommu))
1354 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1355 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1356 }
1357 
1358 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1359 {
1360 	u32 val;
1361 	unsigned long flag;
1362 
1363 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1364 		return;
1365 
1366 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1367 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1368 
1369 	/* Make sure hardware complete it */
1370 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1371 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1372 
1373 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1374 }
1375 
1376 /* return value determines if we need a write buffer flush */
1377 static void __iommu_flush_context(struct intel_iommu *iommu,
1378 				  u16 did, u16 source_id, u8 function_mask,
1379 				  u64 type)
1380 {
1381 	u64 val = 0;
1382 	unsigned long flag;
1383 
1384 	switch (type) {
1385 	case DMA_CCMD_GLOBAL_INVL:
1386 		val = DMA_CCMD_GLOBAL_INVL;
1387 		break;
1388 	case DMA_CCMD_DOMAIN_INVL:
1389 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1390 		break;
1391 	case DMA_CCMD_DEVICE_INVL:
1392 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1393 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1394 		break;
1395 	default:
1396 		BUG();
1397 	}
1398 	val |= DMA_CCMD_ICC;
1399 
1400 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1401 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1402 
1403 	/* Make sure hardware complete it */
1404 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1405 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1406 
1407 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1408 }
1409 
1410 /* return value determines if we need a write buffer flush */
1411 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1412 				u64 addr, unsigned int size_order, u64 type)
1413 {
1414 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1415 	u64 val = 0, val_iva = 0;
1416 	unsigned long flag;
1417 
1418 	switch (type) {
1419 	case DMA_TLB_GLOBAL_FLUSH:
1420 		/* global flush doesn't need to set IVA_REG */
1421 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1422 		break;
1423 	case DMA_TLB_DSI_FLUSH:
1424 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1425 		break;
1426 	case DMA_TLB_PSI_FLUSH:
1427 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1428 		/* IH bit is passed in as part of address */
1429 		val_iva = size_order | addr;
1430 		break;
1431 	default:
1432 		BUG();
1433 	}
1434 	/* Note: set drain read/write */
1435 #if 0
1436 	/*
1437 	 * This is probably to be super secure.. Looks like we can
1438 	 * ignore it without any impact.
1439 	 */
1440 	if (cap_read_drain(iommu->cap))
1441 		val |= DMA_TLB_READ_DRAIN;
1442 #endif
1443 	if (cap_write_drain(iommu->cap))
1444 		val |= DMA_TLB_WRITE_DRAIN;
1445 
1446 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1447 	/* Note: Only uses first TLB reg currently */
1448 	if (val_iva)
1449 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1450 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1451 
1452 	/* Make sure hardware complete it */
1453 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1454 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1455 
1456 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1457 
1458 	/* check IOTLB invalidation granularity */
1459 	if (DMA_TLB_IAIG(val) == 0)
1460 		pr_err("Flush IOTLB failed\n");
1461 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1462 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1463 			(unsigned long long)DMA_TLB_IIRG(type),
1464 			(unsigned long long)DMA_TLB_IAIG(val));
1465 }
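/*
 * For this register-based invalidation the address mask (size_order)
 * encodes 2^size_order naturally aligned 4KiB pages: size_order 0
 * flushes one page, size_order 9 a 2MiB-aligned 2MiB range. The IAIG
 * field read back above reports the granularity actually performed.
 */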
1466 
1467 static struct device_domain_info *
1468 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1469 			 u8 bus, u8 devfn)
1470 {
1471 	struct device_domain_info *info;
1472 
1473 	assert_spin_locked(&device_domain_lock);
1474 
1475 	if (!iommu->qi)
1476 		return NULL;
1477 
1478 	list_for_each_entry(info, &domain->devices, link)
1479 		if (info->iommu == iommu && info->bus == bus &&
1480 		    info->devfn == devfn) {
1481 			if (info->ats_supported && info->dev)
1482 				return info;
1483 			break;
1484 		}
1485 
1486 	return NULL;
1487 }
1488 
1489 static void domain_update_iotlb(struct dmar_domain *domain)
1490 {
1491 	struct device_domain_info *info;
1492 	bool has_iotlb_device = false;
1493 
1494 	assert_spin_locked(&device_domain_lock);
1495 
1496 	list_for_each_entry(info, &domain->devices, link) {
1497 		struct pci_dev *pdev;
1498 
1499 		if (!info->dev || !dev_is_pci(info->dev))
1500 			continue;
1501 
1502 		pdev = to_pci_dev(info->dev);
1503 		if (pdev->ats_enabled) {
1504 			has_iotlb_device = true;
1505 			break;
1506 		}
1507 	}
1508 
1509 	domain->has_iotlb_device = has_iotlb_device;
1510 }
1511 
1512 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1513 {
1514 	struct pci_dev *pdev;
1515 
1516 	assert_spin_locked(&device_domain_lock);
1517 
1518 	if (!info || !dev_is_pci(info->dev))
1519 		return;
1520 
1521 	pdev = to_pci_dev(info->dev);
1522 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1523 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1524 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1525 	 * reserved, which should be set to 0.
1526 	 */
1527 	if (!ecap_dit(info->iommu->ecap))
1528 		info->pfsid = 0;
1529 	else {
1530 		struct pci_dev *pf_pdev;
1531 
1532 		/* pdev will be returned if device is not a vf */
1533 		pf_pdev = pci_physfn(pdev);
1534 		info->pfsid = pci_dev_id(pf_pdev);
1535 	}
1536 
1537 #ifdef CONFIG_INTEL_IOMMU_SVM
1538 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1539 	   the device if you enable PASID support after ATS support is
1540 	   undefined. So always enable PASID support on devices which
1541 	   have it, even if we can't yet know if we're ever going to
1542 	   use it. */
1543 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1544 		info->pasid_enabled = 1;
1545 
1546 	if (info->pri_supported &&
1547 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1548 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1549 		info->pri_enabled = 1;
1550 #endif
1551 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1552 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1553 		info->ats_enabled = 1;
1554 		domain_update_iotlb(info->domain);
1555 		info->ats_qdep = pci_ats_queue_depth(pdev);
1556 	}
1557 }
1558 
1559 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1560 {
1561 	struct pci_dev *pdev;
1562 
1563 	assert_spin_locked(&device_domain_lock);
1564 
1565 	if (!dev_is_pci(info->dev))
1566 		return;
1567 
1568 	pdev = to_pci_dev(info->dev);
1569 
1570 	if (info->ats_enabled) {
1571 		pci_disable_ats(pdev);
1572 		info->ats_enabled = 0;
1573 		domain_update_iotlb(info->domain);
1574 	}
1575 #ifdef CONFIG_INTEL_IOMMU_SVM
1576 	if (info->pri_enabled) {
1577 		pci_disable_pri(pdev);
1578 		info->pri_enabled = 0;
1579 	}
1580 	if (info->pasid_enabled) {
1581 		pci_disable_pasid(pdev);
1582 		info->pasid_enabled = 0;
1583 	}
1584 #endif
1585 }
1586 
1587 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1588 				  u64 addr, unsigned mask)
1589 {
1590 	u16 sid, qdep;
1591 	unsigned long flags;
1592 	struct device_domain_info *info;
1593 
1594 	if (!domain->has_iotlb_device)
1595 		return;
1596 
1597 	spin_lock_irqsave(&device_domain_lock, flags);
1598 	list_for_each_entry(info, &domain->devices, link) {
1599 		if (!info->ats_enabled)
1600 			continue;
1601 
1602 		sid = info->bus << 8 | info->devfn;
1603 		qdep = info->ats_qdep;
1604 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1605 				qdep, addr, mask);
1606 	}
1607 	spin_unlock_irqrestore(&device_domain_lock, flags);
1608 }
1609 
1610 static void domain_flush_piotlb(struct intel_iommu *iommu,
1611 				struct dmar_domain *domain,
1612 				u64 addr, unsigned long npages, bool ih)
1613 {
1614 	u16 did = domain->iommu_did[iommu->seq_id];
1615 
1616 	if (domain->default_pasid)
1617 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1618 				addr, npages, ih);
1619 
1620 	if (!list_empty(&domain->devices))
1621 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1622 }
1623 
1624 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1625 				  struct dmar_domain *domain,
1626 				  unsigned long pfn, unsigned int pages,
1627 				  int ih, int map)
1628 {
1629 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1630 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1631 	u16 did = domain->iommu_did[iommu->seq_id];
1632 
1633 	BUG_ON(pages == 0);
1634 
1635 	if (ih)
1636 		ih = 1 << 6;
1637 
1638 	if (domain_use_first_level(domain)) {
1639 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1640 	} else {
1641 		/*
1642 		 * Fallback to domain selective flush if no PSI support or
1643 		 * the size is too big. PSI requires page size to be 2 ^ x,
1644 		 * and the base address is naturally aligned to the size.
1645 		 */
1646 		if (!cap_pgsel_inv(iommu->cap) ||
1647 		    mask > cap_max_amask_val(iommu->cap))
1648 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1649 							DMA_TLB_DSI_FLUSH);
1650 		else
1651 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1652 							DMA_TLB_PSI_FLUSH);
1653 	}
1654 
1655 	/*
1656 	 * In caching mode, changes of pages from non-present to present require
1657 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1658 	 */
1659 	if (!cap_caching_mode(iommu->cap) || !map)
1660 		iommu_flush_dev_iotlb(domain, addr, mask);
1661 }
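/*
 * The mask above rounds the request up to a power-of-two page count,
 * so flushing e.g. 9 pages uses mask == 4 (16 pages). On the
 * second-level path, if the mask exceeds cap_max_amask_val() or PSI is
 * not supported, the code falls back to a domain-selective flush.
 */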
1662 
1663 /* Notification for newly created mappings */
1664 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1665 					struct dmar_domain *domain,
1666 					unsigned long pfn, unsigned int pages)
1667 {
1668 	/*
1669 	 * It's a non-present to present mapping. Only flush if caching mode
1670 	 * and second level.
1671 	 */
1672 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1673 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1674 	else
1675 		iommu_flush_write_buffer(iommu);
1676 }
1677 
1678 static void iommu_flush_iova(struct iova_domain *iovad)
1679 {
1680 	struct dmar_domain *domain;
1681 	int idx;
1682 
1683 	domain = container_of(iovad, struct dmar_domain, iovad);
1684 
1685 	for_each_domain_iommu(idx, domain) {
1686 		struct intel_iommu *iommu = g_iommus[idx];
1687 		u16 did = domain->iommu_did[iommu->seq_id];
1688 
1689 		if (domain_use_first_level(domain))
1690 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1691 		else
1692 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1693 						 DMA_TLB_DSI_FLUSH);
1694 
1695 		if (!cap_caching_mode(iommu->cap))
1696 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1697 					      0, MAX_AGAW_PFN_WIDTH);
1698 	}
1699 }
1700 
1701 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1702 {
1703 	u32 pmen;
1704 	unsigned long flags;
1705 
1706 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1707 		return;
1708 
1709 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1710 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1711 	pmen &= ~DMA_PMEN_EPM;
1712 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1713 
1714 	/* wait for the protected region status bit to clear */
1715 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1716 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1717 
1718 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1719 }
1720 
1721 static void iommu_enable_translation(struct intel_iommu *iommu)
1722 {
1723 	u32 sts;
1724 	unsigned long flags;
1725 
1726 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1727 	iommu->gcmd |= DMA_GCMD_TE;
1728 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1729 
1730 	/* Make sure hardware completes it */
1731 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1732 		      readl, (sts & DMA_GSTS_TES), sts);
1733 
1734 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1735 }
1736 
1737 static void iommu_disable_translation(struct intel_iommu *iommu)
1738 {
1739 	u32 sts;
1740 	unsigned long flag;
1741 
1742 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1743 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1744 		return;
1745 
1746 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1747 	iommu->gcmd &= ~DMA_GCMD_TE;
1748 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1749 
1750 	/* Make sure hardware completes it */
1751 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1752 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1753 
1754 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1755 }
1756 
1757 static int iommu_init_domains(struct intel_iommu *iommu)
1758 {
1759 	u32 ndomains, nlongs;
1760 	size_t size;
1761 
1762 	ndomains = cap_ndoms(iommu->cap);
1763 	pr_debug("%s: Number of Domains supported <%d>\n",
1764 		 iommu->name, ndomains);
1765 	nlongs = BITS_TO_LONGS(ndomains);
1766 
1767 	spin_lock_init(&iommu->lock);
1768 
1769 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1770 	if (!iommu->domain_ids) {
1771 		pr_err("%s: Allocating domain id array failed\n",
1772 		       iommu->name);
1773 		return -ENOMEM;
1774 	}
1775 
1776 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1777 	iommu->domains = kzalloc(size, GFP_KERNEL);
1778 
1779 	if (iommu->domains) {
1780 		size = 256 * sizeof(struct dmar_domain *);
1781 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1782 	}
1783 
1784 	if (!iommu->domains || !iommu->domains[0]) {
1785 		pr_err("%s: Allocating domain array failed\n",
1786 		       iommu->name);
1787 		kfree(iommu->domain_ids);
1788 		kfree(iommu->domains);
1789 		iommu->domain_ids = NULL;
1790 		iommu->domains    = NULL;
1791 		return -ENOMEM;
1792 	}
1793 
1794 	/*
1795 	 * If Caching mode is set, then invalid translations are tagged
1796 	 * with domain-id 0, hence we need to pre-allocate it. We also
1797 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1798 	 * make sure it is not used for a real domain.
1799 	 */
1800 	set_bit(0, iommu->domain_ids);
1801 
1802 	/*
1803 	 * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1804 	 * entry for first-level or pass-through translation modes should
1805 	 * be programmed with a domain id different from those used for
1806 	 * second-level or nested translation. We reserve a domain id for
1807 	 * this purpose.
1808 	 */
1809 	if (sm_supported(iommu))
1810 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1811 
1812 	return 0;
1813 }
1814 
1815 static void disable_dmar_iommu(struct intel_iommu *iommu)
1816 {
1817 	struct device_domain_info *info, *tmp;
1818 	unsigned long flags;
1819 
1820 	if (!iommu->domains || !iommu->domain_ids)
1821 		return;
1822 
1823 	spin_lock_irqsave(&device_domain_lock, flags);
1824 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1825 		if (info->iommu != iommu)
1826 			continue;
1827 
1828 		if (!info->dev || !info->domain)
1829 			continue;
1830 
1831 		__dmar_remove_one_dev_info(info);
1832 	}
1833 	spin_unlock_irqrestore(&device_domain_lock, flags);
1834 
1835 	if (iommu->gcmd & DMA_GCMD_TE)
1836 		iommu_disable_translation(iommu);
1837 }
1838 
1839 static void free_dmar_iommu(struct intel_iommu *iommu)
1840 {
1841 	if ((iommu->domains) && (iommu->domain_ids)) {
1842 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1843 		int i;
1844 
1845 		for (i = 0; i < elems; i++)
1846 			kfree(iommu->domains[i]);
1847 		kfree(iommu->domains);
1848 		kfree(iommu->domain_ids);
1849 		iommu->domains = NULL;
1850 		iommu->domain_ids = NULL;
1851 	}
1852 
1853 	g_iommus[iommu->seq_id] = NULL;
1854 
1855 	/* free context mapping */
1856 	free_context_table(iommu);
1857 
1858 #ifdef CONFIG_INTEL_IOMMU_SVM
1859 	if (pasid_supported(iommu)) {
1860 		if (ecap_prs(iommu->ecap))
1861 			intel_svm_finish_prq(iommu);
1862 	}
1863 	if (vccap_pasid(iommu->vccap))
1864 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1865 
1866 #endif
1867 }
1868 
1869 /*
1870  * Check and return whether first level is used by default for
1871  * DMA translation.
1872  */
1873 static bool first_level_by_default(void)
1874 {
1875 	struct dmar_drhd_unit *drhd;
1876 	struct intel_iommu *iommu;
1877 	static int first_level_support = -1;
1878 
1879 	if (likely(first_level_support != -1))
1880 		return first_level_support;
1881 
1882 	first_level_support = 1;
1883 
1884 	rcu_read_lock();
1885 	for_each_active_iommu(iommu, drhd) {
1886 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1887 			first_level_support = 0;
1888 			break;
1889 		}
1890 	}
1891 	rcu_read_unlock();
1892 
1893 	return first_level_support;
1894 }
1895 
1896 static struct dmar_domain *alloc_domain(int flags)
1897 {
1898 	struct dmar_domain *domain;
1899 
1900 	domain = alloc_domain_mem();
1901 	if (!domain)
1902 		return NULL;
1903 
1904 	memset(domain, 0, sizeof(*domain));
1905 	domain->nid = NUMA_NO_NODE;
1906 	domain->flags = flags;
1907 	if (first_level_by_default())
1908 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1909 	domain->has_iotlb_device = false;
1910 	INIT_LIST_HEAD(&domain->devices);
1911 
1912 	return domain;
1913 }
1914 
1915 /* Must be called with iommu->lock */
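/*
 * The first attach of a domain to a given IOMMU allocates a per-IOMMU
 * domain id from that unit's domain_ids bitmap and records it in
 * domain->iommu_did[]; subsequent attaches only bump the reference
 * count. The id is released again in domain_detach_iommu() once the
 * last device behind that IOMMU is detached.
 */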
1916 static int domain_attach_iommu(struct dmar_domain *domain,
1917 			       struct intel_iommu *iommu)
1918 {
1919 	unsigned long ndomains;
1920 	int num;
1921 
1922 	assert_spin_locked(&device_domain_lock);
1923 	assert_spin_locked(&iommu->lock);
1924 
1925 	domain->iommu_refcnt[iommu->seq_id] += 1;
1926 	domain->iommu_count += 1;
1927 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1928 		ndomains = cap_ndoms(iommu->cap);
1929 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1930 
1931 		if (num >= ndomains) {
1932 			pr_err("%s: No free domain ids\n", iommu->name);
1933 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1934 			domain->iommu_count -= 1;
1935 			return -ENOSPC;
1936 		}
1937 
1938 		set_bit(num, iommu->domain_ids);
1939 		set_iommu_domain(iommu, num, domain);
1940 
1941 		domain->iommu_did[iommu->seq_id] = num;
1942 		domain->nid			 = iommu->node;
1943 
1944 		domain_update_iommu_cap(domain);
1945 	}
1946 
1947 	return 0;
1948 }
1949 
1950 static int domain_detach_iommu(struct dmar_domain *domain,
1951 			       struct intel_iommu *iommu)
1952 {
1953 	int num, count;
1954 
1955 	assert_spin_locked(&device_domain_lock);
1956 	assert_spin_locked(&iommu->lock);
1957 
1958 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1959 	count = --domain->iommu_count;
1960 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1961 		num = domain->iommu_did[iommu->seq_id];
1962 		clear_bit(num, iommu->domain_ids);
1963 		set_iommu_domain(iommu, num, NULL);
1964 
1965 		domain_update_iommu_cap(domain);
1966 		domain->iommu_did[iommu->seq_id] = 0;
1967 	}
1968 
1969 	return count;
1970 }
1971 
1972 static struct iova_domain reserved_iova_list;
1973 static struct lock_class_key reserved_rbtree_key;
1974 
1975 static int dmar_init_reserved_ranges(void)
1976 {
1977 	struct pci_dev *pdev = NULL;
1978 	struct iova *iova;
1979 	int i;
1980 
1981 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1982 
1983 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1984 		&reserved_rbtree_key);
1985 
1986 	/* IOAPIC ranges shouldn't be accessed by DMA */
1987 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1988 		IOVA_PFN(IOAPIC_RANGE_END));
1989 	if (!iova) {
1990 		pr_err("Reserve IOAPIC range failed\n");
1991 		return -ENODEV;
1992 	}
1993 
1994 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1995 	for_each_pci_dev(pdev) {
1996 		struct resource *r;
1997 
1998 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1999 			r = &pdev->resource[i];
2000 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
2001 				continue;
2002 			iova = reserve_iova(&reserved_iova_list,
2003 					    IOVA_PFN(r->start),
2004 					    IOVA_PFN(r->end));
2005 			if (!iova) {
2006 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
2007 				return -ENODEV;
2008 			}
2009 		}
2010 	}
2011 	return 0;
2012 }
2013 
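/*
 * Round the guest address width up to the nearest width the page-table
 * walk can express: 12 bits of page offset plus a multiple of the 9-bit
 * stride, capped at 64. For example, gaw = 39 or 48 is returned
 * unchanged, while gaw = 42 becomes 48.
 */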
2014 static inline int guestwidth_to_adjustwidth(int gaw)
2015 {
2016 	int agaw;
2017 	int r = (gaw - 12) % 9;
2018 
2019 	if (r == 0)
2020 		agaw = gaw;
2021 	else
2022 		agaw = gaw + 9 - r;
2023 	if (agaw > 64)
2024 		agaw = 64;
2025 	return agaw;
2026 }
2027 
2028 static void domain_exit(struct dmar_domain *domain)
2029 {
2030 
2031 	/* Remove associated devices and clear attached or cached domains */
2032 	domain_remove_dev_info(domain);
2033 
2034 	/* destroy iovas */
2035 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
2036 		put_iova_domain(&domain->iovad);
2037 
2038 	if (domain->pgd) {
2039 		struct page *freelist;
2040 
2041 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2042 		dma_free_pagelist(freelist);
2043 	}
2044 
2045 	free_domain_mem(domain);
2046 }
2047 
2048 /*
2049  * Get the PASID directory size for scalable mode context entry.
2050  * Value of X in the PDTS field of a scalable mode context entry
2051  * indicates a PASID directory with 2^(X + 7) entries.
2052  */
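/*
 * Worked example, assuming PASID_PDE_SHIFT is 6 (64 PASIDs per directory
 * entry): max_pasid = 0x100000 gives max_pde = 0x4000, find_first_bit()
 * returns 14, and the function returns 7, i.e. a 2^(7 + 7) = 16384-entry
 * PASID directory.
 */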
2053 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2054 {
2055 	int pds, max_pde;
2056 
2057 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2058 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2059 	if (pds < 7)
2060 		return 0;
2061 
2062 	return pds - 7;
2063 }
2064 
2065 /*
2066  * Set the RID_PASID field of a scalable mode context entry. The
2067  * IOMMU hardware will use the PASID value set in this field for
2068  * DMA translations of DMA requests without PASID.
2069  */
2070 static inline void
2071 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2072 {
2073 	context->hi |= pasid & ((1 << 20) - 1);
2074 }
2075 
2076 /*
2077  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2078  * entry.
2079  */
2080 static inline void context_set_sm_dte(struct context_entry *context)
2081 {
2082 	context->lo |= (1 << 2);
2083 }
2084 
2085 /*
2086  * Set the PRE(Page Request Enable) field of a scalable mode context
2087  * entry.
2088  */
2089 static inline void context_set_sm_pre(struct context_entry *context)
2090 {
2091 	context->lo |= (1 << 4);
2092 }
2093 
2094 /* Convert value to context PASID directory size field coding. */
2095 #define context_pdts(pds)	(((pds) & 0x7) << 9)
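/* For instance, pds = 2 sets bits 11:9 of the low qword to 0b010,
 * advertising a 2^(2 + 7) = 512-entry PASID directory. */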
2096 
2097 static int domain_context_mapping_one(struct dmar_domain *domain,
2098 				      struct intel_iommu *iommu,
2099 				      struct pasid_table *table,
2100 				      u8 bus, u8 devfn)
2101 {
2102 	u16 did = domain->iommu_did[iommu->seq_id];
2103 	int translation = CONTEXT_TT_MULTI_LEVEL;
2104 	struct device_domain_info *info = NULL;
2105 	struct context_entry *context;
2106 	unsigned long flags;
2107 	int ret;
2108 
2109 	WARN_ON(did == 0);
2110 
2111 	if (hw_pass_through && domain_type_is_si(domain))
2112 		translation = CONTEXT_TT_PASS_THROUGH;
2113 
2114 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2115 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2116 
2117 	BUG_ON(!domain->pgd);
2118 
2119 	spin_lock_irqsave(&device_domain_lock, flags);
2120 	spin_lock(&iommu->lock);
2121 
2122 	ret = -ENOMEM;
2123 	context = iommu_context_addr(iommu, bus, devfn, 1);
2124 	if (!context)
2125 		goto out_unlock;
2126 
2127 	ret = 0;
2128 	if (context_present(context))
2129 		goto out_unlock;
2130 
2131 	/*
2132 	 * For kdump cases, old valid entries may be cached due to the
2133 	 * in-flight DMA and copied pgtable, but there is no unmapping
2134 	 * behaviour for them, thus we need an explicit cache flush for
2135 	 * the newly-mapped device. For kdump, at this point, the device
2136 	 * is supposed to finish reset at its driver probe stage, so no
2137 	 * in-flight DMA will exist, and we don't need to worry about it
2138 	 * hereafter.
2139 	 */
2140 	if (context_copied(context)) {
2141 		u16 did_old = context_domain_id(context);
2142 
2143 		if (did_old < cap_ndoms(iommu->cap)) {
2144 			iommu->flush.flush_context(iommu, did_old,
2145 						   (((u16)bus) << 8) | devfn,
2146 						   DMA_CCMD_MASK_NOBIT,
2147 						   DMA_CCMD_DEVICE_INVL);
2148 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2149 						 DMA_TLB_DSI_FLUSH);
2150 		}
2151 	}
2152 
2153 	context_clear_entry(context);
2154 
2155 	if (sm_supported(iommu)) {
2156 		unsigned long pds;
2157 
2158 		WARN_ON(!table);
2159 
2160 		/* Setup the PASID DIR pointer: */
2161 		pds = context_get_sm_pds(table);
2162 		context->lo = (u64)virt_to_phys(table->table) |
2163 				context_pdts(pds);
2164 
2165 		/* Setup the RID_PASID field: */
2166 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2167 
2168 		/*
2169 		 * Setup the Device-TLB enable bit and Page request
2170 		 * Enable bit:
2171 		 */
2172 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2173 		if (info && info->ats_supported)
2174 			context_set_sm_dte(context);
2175 		if (info && info->pri_supported)
2176 			context_set_sm_pre(context);
2177 	} else {
2178 		struct dma_pte *pgd = domain->pgd;
2179 		int agaw;
2180 
2181 		context_set_domain_id(context, did);
2182 
2183 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2184 			/*
2185 			 * Skip top levels of page tables for an iommu which has
2186 			 * a smaller agaw than the default. Unnecessary for PT mode.
2187 			 */
2188 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2189 				ret = -ENOMEM;
2190 				pgd = phys_to_virt(dma_pte_addr(pgd));
2191 				if (!dma_pte_present(pgd))
2192 					goto out_unlock;
2193 			}
2194 
2195 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2196 			if (info && info->ats_supported)
2197 				translation = CONTEXT_TT_DEV_IOTLB;
2198 			else
2199 				translation = CONTEXT_TT_MULTI_LEVEL;
2200 
2201 			context_set_address_root(context, virt_to_phys(pgd));
2202 			context_set_address_width(context, agaw);
2203 		} else {
2204 			/*
2205 			 * In pass through mode, AW must be programmed to
2206 			 * indicate the largest AGAW value supported by
2207 			 * hardware. And ASR is ignored by hardware.
2208 			 */
2209 			context_set_address_width(context, iommu->msagaw);
2210 		}
2211 
2212 		context_set_translation_type(context, translation);
2213 	}
2214 
2215 	context_set_fault_enable(context);
2216 	context_set_present(context);
2217 	if (!ecap_coherent(iommu->ecap))
2218 		clflush_cache_range(context, sizeof(*context));
2219 
2220 	/*
2221 	 * It's a non-present to present mapping. If hardware doesn't cache
2222 	 * non-present entries we only need to flush the write-buffer. If it
2223 	 * _does_ cache non-present entries, then it does so in the special
2224 	 * domain #0, which we have to flush:
2225 	 */
2226 	if (cap_caching_mode(iommu->cap)) {
2227 		iommu->flush.flush_context(iommu, 0,
2228 					   (((u16)bus) << 8) | devfn,
2229 					   DMA_CCMD_MASK_NOBIT,
2230 					   DMA_CCMD_DEVICE_INVL);
2231 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2232 	} else {
2233 		iommu_flush_write_buffer(iommu);
2234 	}
2235 	iommu_enable_dev_iotlb(info);
2236 
2237 	ret = 0;
2238 
2239 out_unlock:
2240 	spin_unlock(&iommu->lock);
2241 	spin_unlock_irqrestore(&device_domain_lock, flags);
2242 
2243 	return ret;
2244 }
2245 
2246 struct domain_context_mapping_data {
2247 	struct dmar_domain *domain;
2248 	struct intel_iommu *iommu;
2249 	struct pasid_table *table;
2250 };
2251 
2252 static int domain_context_mapping_cb(struct pci_dev *pdev,
2253 				     u16 alias, void *opaque)
2254 {
2255 	struct domain_context_mapping_data *data = opaque;
2256 
2257 	return domain_context_mapping_one(data->domain, data->iommu,
2258 					  data->table, PCI_BUS_NUM(alias),
2259 					  alias & 0xff);
2260 }
2261 
2262 static int
2263 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2264 {
2265 	struct domain_context_mapping_data data;
2266 	struct pasid_table *table;
2267 	struct intel_iommu *iommu;
2268 	u8 bus, devfn;
2269 
2270 	iommu = device_to_iommu(dev, &bus, &devfn);
2271 	if (!iommu)
2272 		return -ENODEV;
2273 
2274 	table = intel_pasid_get_table(dev);
2275 
2276 	if (!dev_is_pci(dev))
2277 		return domain_context_mapping_one(domain, iommu, table,
2278 						  bus, devfn);
2279 
2280 	data.domain = domain;
2281 	data.iommu = iommu;
2282 	data.table = table;
2283 
2284 	return pci_for_each_dma_alias(to_pci_dev(dev),
2285 				      &domain_context_mapping_cb, &data);
2286 }
2287 
2288 static int domain_context_mapped_cb(struct pci_dev *pdev,
2289 				    u16 alias, void *opaque)
2290 {
2291 	struct intel_iommu *iommu = opaque;
2292 
2293 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2294 }
2295 
2296 static int domain_context_mapped(struct device *dev)
2297 {
2298 	struct intel_iommu *iommu;
2299 	u8 bus, devfn;
2300 
2301 	iommu = device_to_iommu(dev, &bus, &devfn);
2302 	if (!iommu)
2303 		return -ENODEV;
2304 
2305 	if (!dev_is_pci(dev))
2306 		return device_context_mapped(iommu, bus, devfn);
2307 
2308 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2309 				       domain_context_mapped_cb, iommu);
2310 }
2311 
2312 /* Returns a number of VTD pages, but aligned to MM page size */
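/*
 * For example, assuming 4KiB MM pages (so MM and VT-d page sizes
 * coincide): a buffer at page offset 0xe00 with size 0x400 spans two
 * pages, so this returns 2.
 */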
2313 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2314 					    size_t size)
2315 {
2316 	host_addr &= ~PAGE_MASK;
2317 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2318 }
2319 
2320 /* Return largest possible superpage level for a given mapping */
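/*
 * Level 1 means 4KiB pages only; level 2 (2MiB) is possible when both
 * pfns have their low 9 bits clear and at least 512 pages remain;
 * level 3 (1GiB) needs 18 clear low bits and 262144 pages, and so on,
 * bounded by domain->iommu_superpage.
 */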
2321 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2322 					  unsigned long iov_pfn,
2323 					  unsigned long phy_pfn,
2324 					  unsigned long pages)
2325 {
2326 	int support, level = 1;
2327 	unsigned long pfnmerge;
2328 
2329 	support = domain->iommu_superpage;
2330 
2331 	/* To use a large page, the virtual *and* physical addresses
2332 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2333 	   of them will mean we have to use smaller pages. So just
2334 	   merge them and check both at once. */
2335 	pfnmerge = iov_pfn | phy_pfn;
2336 
2337 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2338 		pages >>= VTD_STRIDE_SHIFT;
2339 		if (!pages)
2340 			break;
2341 		pfnmerge >>= VTD_STRIDE_SHIFT;
2342 		level++;
2343 		support--;
2344 	}
2345 	return level;
2346 }
2347 
2348 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2349 			    struct scatterlist *sg, unsigned long phys_pfn,
2350 			    unsigned long nr_pages, int prot)
2351 {
2352 	struct dma_pte *first_pte = NULL, *pte = NULL;
2353 	phys_addr_t pteval;
2354 	unsigned long sg_res = 0;
2355 	unsigned int largepage_lvl = 0;
2356 	unsigned long lvl_pages = 0;
2357 	u64 attr;
2358 
2359 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2360 
2361 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2362 		return -EINVAL;
2363 
2364 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2365 	attr |= DMA_FL_PTE_PRESENT;
2366 	if (domain_use_first_level(domain)) {
2367 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2368 
2369 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2370 			attr |= DMA_FL_PTE_ACCESS;
2371 			if (prot & DMA_PTE_WRITE)
2372 				attr |= DMA_FL_PTE_DIRTY;
2373 		}
2374 	}
2375 
2376 	if (!sg) {
2377 		sg_res = nr_pages;
2378 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2379 	}
2380 
2381 	while (nr_pages > 0) {
2382 		uint64_t tmp;
2383 
2384 		if (!sg_res) {
2385 			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2386 
2387 			sg_res = aligned_nrpages(sg->offset, sg->length);
2388 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2389 			sg->dma_length = sg->length;
2390 			pteval = (sg_phys(sg) - pgoff) | attr;
2391 			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2392 		}
2393 
2394 		if (!pte) {
2395 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2396 
2397 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2398 			if (!pte)
2399 				return -ENOMEM;
2400 			/* It is a large page */
2401 			if (largepage_lvl > 1) {
2402 				unsigned long nr_superpages, end_pfn;
2403 
2404 				pteval |= DMA_PTE_LARGE_PAGE;
2405 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2406 
2407 				nr_superpages = sg_res / lvl_pages;
2408 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2409 
2410 				/*
2411 				 * Ensure that old small page tables are
2412 				 * removed to make room for superpage(s).
2413 				 * We're adding new large pages, so make sure
2414 				 * we don't remove their parent tables.
2415 				 */
2416 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2417 						       largepage_lvl + 1);
2418 			} else {
2419 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2420 			}
2421 
2422 		}
2423 		/* We don't need a lock here; nobody else
2424 		 * touches the iova range
2425 		 */
2426 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2427 		if (tmp) {
2428 			static int dumps = 5;
2429 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2430 				iov_pfn, tmp, (unsigned long long)pteval);
2431 			if (dumps) {
2432 				dumps--;
2433 				debug_dma_dump_mappings(NULL);
2434 			}
2435 			WARN_ON(1);
2436 		}
2437 
2438 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2439 
2440 		BUG_ON(nr_pages < lvl_pages);
2441 		BUG_ON(sg_res < lvl_pages);
2442 
2443 		nr_pages -= lvl_pages;
2444 		iov_pfn += lvl_pages;
2445 		phys_pfn += lvl_pages;
2446 		pteval += lvl_pages * VTD_PAGE_SIZE;
2447 		sg_res -= lvl_pages;
2448 
2449 		/* If the next PTE would be the first in a new page, then we
2450 		   need to flush the cache on the entries we've just written.
2451 		   And then we'll need to recalculate 'pte', so clear it and
2452 		   let it get set again in the if (!pte) block above.
2453 
2454 		   If we're done (!nr_pages) we need to flush the cache too.
2455 
2456 		   Also if we've been setting superpages, we may need to
2457 		   recalculate 'pte' and switch back to smaller pages for the
2458 		   end of the mapping, if the trailing size is not enough to
2459 		   use another superpage (i.e. sg_res < lvl_pages). */
2460 		pte++;
2461 		if (!nr_pages || first_pte_in_page(pte) ||
2462 		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2463 			domain_flush_cache(domain, first_pte,
2464 					   (void *)pte - (void *)first_pte);
2465 			pte = NULL;
2466 		}
2467 
2468 		if (!sg_res && nr_pages)
2469 			sg = sg_next(sg);
2470 	}
2471 	return 0;
2472 }
2473 
2474 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2475 			  struct scatterlist *sg, unsigned long phys_pfn,
2476 			  unsigned long nr_pages, int prot)
2477 {
2478 	int iommu_id, ret;
2479 	struct intel_iommu *iommu;
2480 
2481 	/* Do the real mapping first */
2482 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2483 	if (ret)
2484 		return ret;
2485 
2486 	for_each_domain_iommu(iommu_id, domain) {
2487 		iommu = g_iommus[iommu_id];
2488 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2489 	}
2490 
2491 	return 0;
2492 }
2493 
2494 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2495 				    struct scatterlist *sg, unsigned long nr_pages,
2496 				    int prot)
2497 {
2498 	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2499 }
2500 
2501 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2502 				     unsigned long phys_pfn, unsigned long nr_pages,
2503 				     int prot)
2504 {
2505 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2506 }
2507 
2508 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2509 {
2510 	unsigned long flags;
2511 	struct context_entry *context;
2512 	u16 did_old;
2513 
2514 	if (!iommu)
2515 		return;
2516 
2517 	spin_lock_irqsave(&iommu->lock, flags);
2518 	context = iommu_context_addr(iommu, bus, devfn, 0);
2519 	if (!context) {
2520 		spin_unlock_irqrestore(&iommu->lock, flags);
2521 		return;
2522 	}
2523 	did_old = context_domain_id(context);
2524 	context_clear_entry(context);
2525 	__iommu_flush_cache(iommu, context, sizeof(*context));
2526 	spin_unlock_irqrestore(&iommu->lock, flags);
2527 	iommu->flush.flush_context(iommu,
2528 				   did_old,
2529 				   (((u16)bus) << 8) | devfn,
2530 				   DMA_CCMD_MASK_NOBIT,
2531 				   DMA_CCMD_DEVICE_INVL);
2532 
2533 	if (sm_supported(iommu))
2534 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2535 
2536 	iommu->flush.flush_iotlb(iommu,
2537 				 did_old,
2538 				 0,
2539 				 0,
2540 				 DMA_TLB_DSI_FLUSH);
2541 }
2542 
2543 static inline void unlink_domain_info(struct device_domain_info *info)
2544 {
2545 	assert_spin_locked(&device_domain_lock);
2546 	list_del(&info->link);
2547 	list_del(&info->global);
2548 	if (info->dev)
2549 		dev_iommu_priv_set(info->dev, NULL);
2550 }
2551 
2552 static void domain_remove_dev_info(struct dmar_domain *domain)
2553 {
2554 	struct device_domain_info *info, *tmp;
2555 	unsigned long flags;
2556 
2557 	spin_lock_irqsave(&device_domain_lock, flags);
2558 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2559 		__dmar_remove_one_dev_info(info);
2560 	spin_unlock_irqrestore(&device_domain_lock, flags);
2561 }
2562 
2563 struct dmar_domain *find_domain(struct device *dev)
2564 {
2565 	struct device_domain_info *info;
2566 
2567 	if (unlikely(!dev || !dev->iommu))
2568 		return NULL;
2569 
2570 	if (unlikely(attach_deferred(dev)))
2571 		return NULL;
2572 
2573 	/* No lock here, assumes no domain exit in normal case */
2574 	info = get_domain_info(dev);
2575 	if (likely(info))
2576 		return info->domain;
2577 
2578 	return NULL;
2579 }
2580 
2581 static void do_deferred_attach(struct device *dev)
2582 {
2583 	struct iommu_domain *domain;
2584 
2585 	dev_iommu_priv_set(dev, NULL);
2586 	domain = iommu_get_domain_for_dev(dev);
2587 	if (domain)
2588 		intel_iommu_attach_device(domain, dev);
2589 }
2590 
2591 static inline struct device_domain_info *
2592 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2593 {
2594 	struct device_domain_info *info;
2595 
2596 	list_for_each_entry(info, &device_domain_list, global)
2597 		if (info->segment == segment && info->bus == bus &&
2598 		    info->devfn == devfn)
2599 			return info;
2600 
2601 	return NULL;
2602 }
2603 
2604 static int domain_setup_first_level(struct intel_iommu *iommu,
2605 				    struct dmar_domain *domain,
2606 				    struct device *dev,
2607 				    u32 pasid)
2608 {
2609 	struct dma_pte *pgd = domain->pgd;
2610 	int agaw, level;
2611 	int flags = 0;
2612 
2613 	/*
2614 	 * Skip top levels of page tables for an iommu which has
2615 	 * a smaller agaw than the default. Unnecessary for PT mode.
2616 	 */
2617 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2618 		pgd = phys_to_virt(dma_pte_addr(pgd));
2619 		if (!dma_pte_present(pgd))
2620 			return -ENOMEM;
2621 	}
2622 
2623 	level = agaw_to_level(agaw);
2624 	if (level != 4 && level != 5)
2625 		return -EINVAL;
2626 
2627 	if (pasid != PASID_RID2PASID)
2628 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2629 	if (level == 5)
2630 		flags |= PASID_FLAG_FL5LP;
2631 
2632 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2633 		flags |= PASID_FLAG_PAGE_SNOOP;
2634 
2635 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2636 					     domain->iommu_did[iommu->seq_id],
2637 					     flags);
2638 }
2639 
2640 static bool dev_is_real_dma_subdevice(struct device *dev)
2641 {
2642 	return dev && dev_is_pci(dev) &&
2643 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2644 }
2645 
2646 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2647 						    int bus, int devfn,
2648 						    struct device *dev,
2649 						    struct dmar_domain *domain)
2650 {
2651 	struct dmar_domain *found = NULL;
2652 	struct device_domain_info *info;
2653 	unsigned long flags;
2654 	int ret;
2655 
2656 	info = alloc_devinfo_mem();
2657 	if (!info)
2658 		return NULL;
2659 
2660 	if (!dev_is_real_dma_subdevice(dev)) {
2661 		info->bus = bus;
2662 		info->devfn = devfn;
2663 		info->segment = iommu->segment;
2664 	} else {
2665 		struct pci_dev *pdev = to_pci_dev(dev);
2666 
2667 		info->bus = pdev->bus->number;
2668 		info->devfn = pdev->devfn;
2669 		info->segment = pci_domain_nr(pdev->bus);
2670 	}
2671 
2672 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2673 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2674 	info->ats_qdep = 0;
2675 	info->dev = dev;
2676 	info->domain = domain;
2677 	info->iommu = iommu;
2678 	info->pasid_table = NULL;
2679 	info->auxd_enabled = 0;
2680 	INIT_LIST_HEAD(&info->auxiliary_domains);
2681 
2682 	if (dev && dev_is_pci(dev)) {
2683 		struct pci_dev *pdev = to_pci_dev(info->dev);
2684 
2685 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2686 		    pci_ats_supported(pdev) &&
2687 		    dmar_find_matched_atsr_unit(pdev))
2688 			info->ats_supported = 1;
2689 
2690 		if (sm_supported(iommu)) {
2691 			if (pasid_supported(iommu)) {
2692 				int features = pci_pasid_features(pdev);
2693 				if (features >= 0)
2694 					info->pasid_supported = features | 1;
2695 			}
2696 
2697 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2698 			    pci_pri_supported(pdev))
2699 				info->pri_supported = 1;
2700 		}
2701 	}
2702 
2703 	spin_lock_irqsave(&device_domain_lock, flags);
2704 	if (dev)
2705 		found = find_domain(dev);
2706 
2707 	if (!found) {
2708 		struct device_domain_info *info2;
2709 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2710 						       info->devfn);
2711 		if (info2) {
2712 			found      = info2->domain;
2713 			info2->dev = dev;
2714 		}
2715 	}
2716 
2717 	if (found) {
2718 		spin_unlock_irqrestore(&device_domain_lock, flags);
2719 		free_devinfo_mem(info);
2720 		/* Caller must free the original domain */
2721 		return found;
2722 	}
2723 
2724 	spin_lock(&iommu->lock);
2725 	ret = domain_attach_iommu(domain, iommu);
2726 	spin_unlock(&iommu->lock);
2727 
2728 	if (ret) {
2729 		spin_unlock_irqrestore(&device_domain_lock, flags);
2730 		free_devinfo_mem(info);
2731 		return NULL;
2732 	}
2733 
2734 	list_add(&info->link, &domain->devices);
2735 	list_add(&info->global, &device_domain_list);
2736 	if (dev)
2737 		dev_iommu_priv_set(dev, info);
2738 	spin_unlock_irqrestore(&device_domain_lock, flags);
2739 
2740 	/* PASID table is mandatory for a PCI device in scalable mode. */
2741 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2742 		ret = intel_pasid_alloc_table(dev);
2743 		if (ret) {
2744 			dev_err(dev, "PASID table allocation failed\n");
2745 			dmar_remove_one_dev_info(dev);
2746 			return NULL;
2747 		}
2748 
2749 		/* Setup the PASID entry for requests without PASID: */
2750 		spin_lock_irqsave(&iommu->lock, flags);
2751 		if (hw_pass_through && domain_type_is_si(domain))
2752 			ret = intel_pasid_setup_pass_through(iommu, domain,
2753 					dev, PASID_RID2PASID);
2754 		else if (domain_use_first_level(domain))
2755 			ret = domain_setup_first_level(iommu, domain, dev,
2756 					PASID_RID2PASID);
2757 		else
2758 			ret = intel_pasid_setup_second_level(iommu, domain,
2759 					dev, PASID_RID2PASID);
2760 		spin_unlock_irqrestore(&iommu->lock, flags);
2761 		if (ret) {
2762 			dev_err(dev, "Setup RID2PASID failed\n");
2763 			dmar_remove_one_dev_info(dev);
2764 			return NULL;
2765 		}
2766 	}
2767 
2768 	if (dev && domain_context_mapping(domain, dev)) {
2769 		dev_err(dev, "Domain context map failed\n");
2770 		dmar_remove_one_dev_info(dev);
2771 		return NULL;
2772 	}
2773 
2774 	return domain;
2775 }
2776 
2777 static int iommu_domain_identity_map(struct dmar_domain *domain,
2778 				     unsigned long first_vpfn,
2779 				     unsigned long last_vpfn)
2780 {
2781 	/*
2782 	 * RMRR range might have overlap with physical memory range,
2783 	 * clear it first
2784 	 */
2785 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2786 
2787 	return __domain_mapping(domain, first_vpfn, NULL,
2788 				first_vpfn, last_vpfn - first_vpfn + 1,
2789 				DMA_PTE_READ|DMA_PTE_WRITE);
2790 }
2791 
2792 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2793 
2794 static int __init si_domain_init(int hw)
2795 {
2796 	struct dmar_rmrr_unit *rmrr;
2797 	struct device *dev;
2798 	int i, nid, ret;
2799 
2800 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2801 	if (!si_domain)
2802 		return -EFAULT;
2803 
2804 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2805 		domain_exit(si_domain);
2806 		return -EFAULT;
2807 	}
2808 
2809 	if (hw)
2810 		return 0;
2811 
2812 	for_each_online_node(nid) {
2813 		unsigned long start_pfn, end_pfn;
2814 		int i;
2815 
2816 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2817 			ret = iommu_domain_identity_map(si_domain,
2818 					mm_to_dma_pfn(start_pfn),
2819 					mm_to_dma_pfn(end_pfn));
2820 			if (ret)
2821 				return ret;
2822 		}
2823 	}
2824 
2825 	/*
2826 	 * Identity map the RMRRs so that devices with RMRRs can also use
2827 	 * the si_domain.
2828 	 */
2829 	for_each_rmrr_units(rmrr) {
2830 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2831 					  i, dev) {
2832 			unsigned long long start = rmrr->base_address;
2833 			unsigned long long end = rmrr->end_address;
2834 
2835 			if (WARN_ON(end < start ||
2836 				    end >> agaw_to_width(si_domain->agaw)))
2837 				continue;
2838 
2839 			ret = iommu_domain_identity_map(si_domain,
2840 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2841 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2842 			if (ret)
2843 				return ret;
2844 		}
2845 	}
2846 
2847 	return 0;
2848 }
2849 
2850 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2851 {
2852 	struct dmar_domain *ndomain;
2853 	struct intel_iommu *iommu;
2854 	u8 bus, devfn;
2855 
2856 	iommu = device_to_iommu(dev, &bus, &devfn);
2857 	if (!iommu)
2858 		return -ENODEV;
2859 
2860 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2861 	if (ndomain != domain)
2862 		return -EBUSY;
2863 
2864 	return 0;
2865 }
2866 
2867 static bool device_has_rmrr(struct device *dev)
2868 {
2869 	struct dmar_rmrr_unit *rmrr;
2870 	struct device *tmp;
2871 	int i;
2872 
2873 	rcu_read_lock();
2874 	for_each_rmrr_units(rmrr) {
2875 		/*
2876 		 * Return TRUE if this RMRR contains the device that
2877 		 * is passed in.
2878 		 */
2879 		for_each_active_dev_scope(rmrr->devices,
2880 					  rmrr->devices_cnt, i, tmp)
2881 			if (tmp == dev ||
2882 			    is_downstream_to_pci_bridge(dev, tmp)) {
2883 				rcu_read_unlock();
2884 				return true;
2885 			}
2886 	}
2887 	rcu_read_unlock();
2888 	return false;
2889 }
2890 
2891 /**
2892  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2893  * is relaxable (i.e. is allowed to be not enforced under some conditions)
2894  * @dev: device handle
2895  *
2896  * We assume that PCI USB devices with RMRRs have them largely
2897  * for historical reasons and that the RMRR space is not actively used post
2898  * boot.  This exclusion may change if vendors begin to abuse it.
2899  *
2900  * The same exception is made for graphics devices, with the requirement that
2901  * any use of the RMRR regions will be torn down before assigning the device
2902  * to a guest.
2903  *
2904  * Return: true if the RMRR is relaxable, false otherwise
2905  */
2906 static bool device_rmrr_is_relaxable(struct device *dev)
2907 {
2908 	struct pci_dev *pdev;
2909 
2910 	if (!dev_is_pci(dev))
2911 		return false;
2912 
2913 	pdev = to_pci_dev(dev);
2914 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2915 		return true;
2916 	else
2917 		return false;
2918 }
2919 
2920 /*
2921  * There are a couple of cases where we need to restrict the functionality of
2922  * devices associated with RMRRs.  The first is when evaluating a device for
2923  * identity mapping because problems exist when devices are moved in and out
2924  * of domains and their respective RMRR information is lost.  This means that
2925  * a device with associated RMRRs will never be in a "passthrough" domain.
2926  * The second is use of the device through the IOMMU API.  This interface
2927  * expects to have full control of the IOVA space for the device.  We cannot
2928  * satisfy both the requirement that RMRR access is maintained and have an
2929  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2930  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2931  * We therefore prevent devices associated with an RMRR from participating in
2932  * the IOMMU API, which eliminates them from device assignment.
2933  *
2934  * In both cases, devices which have relaxable RMRRs are not concerned by this
2935  * restriction. See device_rmrr_is_relaxable comment.
2936  */
2937 static bool device_is_rmrr_locked(struct device *dev)
2938 {
2939 	if (!device_has_rmrr(dev))
2940 		return false;
2941 
2942 	if (device_rmrr_is_relaxable(dev))
2943 		return false;
2944 
2945 	return true;
2946 }
2947 
2948 /*
2949  * Return the required default domain type for a specific device.
2950  *
2951  * @dev: the device in question
2952  *
2953  *
2954  * Returns:
2955  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2956  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2957  *  - 0: both identity and dynamic domains work for this device
2958  */
2959 static int device_def_domain_type(struct device *dev)
2960 {
2961 	if (dev_is_pci(dev)) {
2962 		struct pci_dev *pdev = to_pci_dev(dev);
2963 
2964 		/*
2965 		 * Prevent any device marked as untrusted from getting
2966 		 * placed into the statically identity mapping domain.
2967 		 */
2968 		if (pdev->untrusted)
2969 			return IOMMU_DOMAIN_DMA;
2970 
2971 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2972 			return IOMMU_DOMAIN_IDENTITY;
2973 
2974 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2975 			return IOMMU_DOMAIN_IDENTITY;
2976 	}
2977 
2978 	return 0;
2979 }
2980 
2981 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2982 {
2983 	/*
2984 	 * Start from a sane iommu hardware state.
2985 	 * If the queued invalidation is already initialized by us
2986 	 * (for example, while enabling interrupt-remapping) then
2987 	 * things are already rolling from a sane state.
2988 	 */
2989 	if (!iommu->qi) {
2990 		/*
2991 		 * Clear any previous faults.
2992 		 */
2993 		dmar_fault(-1, iommu);
2994 		/*
2995 		 * Disable queued invalidation if supported and already enabled
2996 		 * before OS handover.
2997 		 */
2998 		dmar_disable_qi(iommu);
2999 	}
3000 
3001 	if (dmar_enable_qi(iommu)) {
3002 		/*
3003 		 * Queued Invalidate not enabled, use Register Based Invalidate
3004 		 */
3005 		iommu->flush.flush_context = __iommu_flush_context;
3006 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3007 		pr_info("%s: Using Register based invalidation\n",
3008 			iommu->name);
3009 	} else {
3010 		iommu->flush.flush_context = qi_flush_context;
3011 		iommu->flush.flush_iotlb = qi_flush_iotlb;
3012 		pr_info("%s: Using Queued invalidation\n", iommu->name);
3013 	}
3014 }
3015 
3016 static int copy_context_table(struct intel_iommu *iommu,
3017 			      struct root_entry *old_re,
3018 			      struct context_entry **tbl,
3019 			      int bus, bool ext)
3020 {
3021 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3022 	struct context_entry *new_ce = NULL, ce;
3023 	struct context_entry *old_ce = NULL;
3024 	struct root_entry re;
3025 	phys_addr_t old_ce_phys;
3026 
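	/*
	 * With the extended root-table format each root entry carries two
	 * context-table pointers (lower devfn 0x00-0x7f and upper devfn
	 * 0x80-0xff), so two table slots are reserved per bus.
	 */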
3027 	tbl_idx = ext ? bus * 2 : bus;
3028 	memcpy(&re, old_re, sizeof(re));
3029 
3030 	for (devfn = 0; devfn < 256; devfn++) {
3031 		/* First calculate the correct index */
3032 		idx = (ext ? devfn * 2 : devfn) % 256;
3033 
3034 		if (idx == 0) {
3035 			/* First save what we may have and clean up */
3036 			if (new_ce) {
3037 				tbl[tbl_idx] = new_ce;
3038 				__iommu_flush_cache(iommu, new_ce,
3039 						    VTD_PAGE_SIZE);
3040 				pos = 1;
3041 			}
3042 
3043 			if (old_ce)
3044 				memunmap(old_ce);
3045 
3046 			ret = 0;
3047 			if (devfn < 0x80)
3048 				old_ce_phys = root_entry_lctp(&re);
3049 			else
3050 				old_ce_phys = root_entry_uctp(&re);
3051 
3052 			if (!old_ce_phys) {
3053 				if (ext && devfn == 0) {
3054 					/* No LCTP, try UCTP */
3055 					devfn = 0x7f;
3056 					continue;
3057 				} else {
3058 					goto out;
3059 				}
3060 			}
3061 
3062 			ret = -ENOMEM;
3063 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3064 					MEMREMAP_WB);
3065 			if (!old_ce)
3066 				goto out;
3067 
3068 			new_ce = alloc_pgtable_page(iommu->node);
3069 			if (!new_ce)
3070 				goto out_unmap;
3071 
3072 			ret = 0;
3073 		}
3074 
3075 		/* Now copy the context entry */
3076 		memcpy(&ce, old_ce + idx, sizeof(ce));
3077 
3078 		if (!__context_present(&ce))
3079 			continue;
3080 
3081 		did = context_domain_id(&ce);
3082 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3083 			set_bit(did, iommu->domain_ids);
3084 
3085 		/*
3086 		 * We need a marker for copied context entries. This
3087 		 * marker needs to work for the old format as well as
3088 		 * for extended context entries.
3089 		 *
3090 		 * Bit 67 of the context entry is used. In the old
3091 		 * format this bit is available to software, in the
3092 		 * extended format it is the PGE bit, but PGE is ignored
3093 		 * by HW if PASIDs are disabled (and thus still
3094 		 * available).
3095 		 *
3096 		 * So disable PASIDs first and then mark the entry
3097 		 * copied. This means that we don't copy PASID
3098 		 * translations from the old kernel, but this is fine as
3099 		 * faults there are not fatal.
3100 		 */
3101 		context_clear_pasid_enable(&ce);
3102 		context_set_copied(&ce);
3103 
3104 		new_ce[idx] = ce;
3105 	}
3106 
3107 	tbl[tbl_idx + pos] = new_ce;
3108 
3109 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3110 
3111 out_unmap:
3112 	memunmap(old_ce);
3113 
3114 out:
3115 	return ret;
3116 }
3117 
3118 static int copy_translation_tables(struct intel_iommu *iommu)
3119 {
3120 	struct context_entry **ctxt_tbls;
3121 	struct root_entry *old_rt;
3122 	phys_addr_t old_rt_phys;
3123 	int ctxt_table_entries;
3124 	unsigned long flags;
3125 	u64 rtaddr_reg;
3126 	int bus, ret;
3127 	bool new_ext, ext;
3128 
3129 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3130 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3131 	new_ext    = !!ecap_ecs(iommu->ecap);
3132 
3133 	/*
3134 	 * The RTT bit can only be changed when translation is disabled,
3135 	 * but disabling translation means opening a window for data
3136 	 * corruption. So bail out and don't copy anything if we would
3137 	 * have to change the bit.
3138 	 */
3139 	if (new_ext != ext)
3140 		return -EINVAL;
3141 
3142 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3143 	if (!old_rt_phys)
3144 		return -EINVAL;
3145 
3146 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3147 	if (!old_rt)
3148 		return -ENOMEM;
3149 
3150 	/* This is too big for the stack - allocate it from slab */
3151 	ctxt_table_entries = ext ? 512 : 256;
3152 	ret = -ENOMEM;
3153 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3154 	if (!ctxt_tbls)
3155 		goto out_unmap;
3156 
3157 	for (bus = 0; bus < 256; bus++) {
3158 		ret = copy_context_table(iommu, &old_rt[bus],
3159 					 ctxt_tbls, bus, ext);
3160 		if (ret) {
3161 			pr_err("%s: Failed to copy context table for bus %d\n",
3162 				iommu->name, bus);
3163 			continue;
3164 		}
3165 	}
3166 
3167 	spin_lock_irqsave(&iommu->lock, flags);
3168 
3169 	/* Context tables are copied, now write them to the root_entry table */
3170 	for (bus = 0; bus < 256; bus++) {
3171 		int idx = ext ? bus * 2 : bus;
3172 		u64 val;
3173 
3174 		if (ctxt_tbls[idx]) {
3175 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3176 			iommu->root_entry[bus].lo = val;
3177 		}
3178 
3179 		if (!ext || !ctxt_tbls[idx + 1])
3180 			continue;
3181 
3182 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3183 		iommu->root_entry[bus].hi = val;
3184 	}
3185 
3186 	spin_unlock_irqrestore(&iommu->lock, flags);
3187 
3188 	kfree(ctxt_tbls);
3189 
3190 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3191 
3192 	ret = 0;
3193 
3194 out_unmap:
3195 	memunmap(old_rt);
3196 
3197 	return ret;
3198 }
3199 
3200 #ifdef CONFIG_INTEL_IOMMU_SVM
3201 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3202 {
3203 	struct intel_iommu *iommu = data;
3204 	ioasid_t ioasid;
3205 
3206 	if (!iommu)
3207 		return INVALID_IOASID;
3208 	/*
3209 	 * The VT-d virtual command interface always uses the full 20-bit
3210 	 * PASID range. The host can partition the guest PASID range based
3211 	 * on policies, but this is out of the guest's control.
3212 	 */
3213 	if (min < PASID_MIN || max > intel_pasid_max_id)
3214 		return INVALID_IOASID;
3215 
3216 	if (vcmd_alloc_pasid(iommu, &ioasid))
3217 		return INVALID_IOASID;
3218 
3219 	return ioasid;
3220 }
3221 
3222 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3223 {
3224 	struct intel_iommu *iommu = data;
3225 
3226 	if (!iommu)
3227 		return;
3228 	/*
3229 	 * Sanity-checking of the ioasid owner is done at the upper layer,
3230 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3231 	 */
3232 	if (ioasid_find(NULL, ioasid, NULL)) {
3233 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3234 		return;
3235 	}
3236 	vcmd_free_pasid(iommu, ioasid);
3237 }
3238 
3239 static void register_pasid_allocator(struct intel_iommu *iommu)
3240 {
3241 	/*
3242 	 * If we are running in the host, there is no need for a custom
3243 	 * allocator because PASIDs are allocated system-wide by the host.
3244 	 */
3245 	if (!cap_caching_mode(iommu->cap))
3246 		return;
3247 
3248 	if (!sm_supported(iommu)) {
3249 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3250 		return;
3251 	}
3252 
3253 	/*
3254 	 * Register a custom PASID allocator if we are running in a guest;
3255 	 * guest PASIDs must be obtained via the virtual command interface.
3256 	 * There can be multiple vIOMMUs in each guest but only one allocator
3257 	 * is active. All vIOMMU allocators will eventually be calling the same
3258 	 * host allocator.
3259 	 */
3260 	if (!vccap_pasid(iommu->vccap))
3261 		return;
3262 
3263 	pr_info("Register custom PASID allocator\n");
3264 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3265 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3266 	iommu->pasid_allocator.pdata = (void *)iommu;
3267 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3268 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3269 		/*
3270 		 * Disable scalable mode on this IOMMU if there
3271 		 * is no custom allocator. Mixing SM-capable and
3272 		 * non-SM vIOMMUs is not supported.
3273 		 */
3274 		intel_iommu_sm = 0;
3275 	}
3276 }
3277 #endif
3278 
3279 static int __init init_dmars(void)
3280 {
3281 	struct dmar_drhd_unit *drhd;
3282 	struct intel_iommu *iommu;
3283 	int ret;
3284 
3285 	/*
3286 	 * for each drhd
3287 	 *    allocate root
3288 	 *    initialize and program root entry to not present
3289 	 * endfor
3290 	 */
3291 	for_each_drhd_unit(drhd) {
3292 		/*
3293 		 * Lock not needed as this is only incremented in the
3294 		 * single-threaded kernel __init code path; all other
3295 		 * accesses are read-only.
3296 		 */
3297 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3298 			g_num_of_iommus++;
3299 			continue;
3300 		}
3301 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3302 	}
3303 
3304 	/* Preallocate enough resources for IOMMU hot-addition */
3305 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3306 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3307 
3308 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3309 			GFP_KERNEL);
3310 	if (!g_iommus) {
3311 		pr_err("Allocating global iommu array failed\n");
3312 		ret = -ENOMEM;
3313 		goto error;
3314 	}
3315 
3316 	for_each_iommu(iommu, drhd) {
3317 		if (drhd->ignored) {
3318 			iommu_disable_translation(iommu);
3319 			continue;
3320 		}
3321 
3322 		/*
3323 		 * Find the max PASID size of all IOMMUs in the system.
3324 		 * We need to ensure the system PASID table is no bigger
3325 		 * than the smallest supported size.
3326 		 */
3327 		if (pasid_supported(iommu)) {
3328 			u32 temp = 2 << ecap_pss(iommu->ecap);
3329 
3330 			intel_pasid_max_id = min_t(u32, temp,
3331 						   intel_pasid_max_id);
3332 		}
3333 
3334 		g_iommus[iommu->seq_id] = iommu;
3335 
3336 		intel_iommu_init_qi(iommu);
3337 
3338 		ret = iommu_init_domains(iommu);
3339 		if (ret)
3340 			goto free_iommu;
3341 
3342 		init_translation_status(iommu);
3343 
3344 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3345 			iommu_disable_translation(iommu);
3346 			clear_translation_pre_enabled(iommu);
3347 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3348 				iommu->name);
3349 		}
3350 
3351 		/*
3352 		 * TBD:
3353 		 * we could share the same root & context tables
3354 		 * among all IOMMUs. Need to split it later.
3355 		 */
3356 		ret = iommu_alloc_root_entry(iommu);
3357 		if (ret)
3358 			goto free_iommu;
3359 
3360 		if (translation_pre_enabled(iommu)) {
3361 			pr_info("Translation already enabled - trying to copy translation structures\n");
3362 
3363 			ret = copy_translation_tables(iommu);
3364 			if (ret) {
3365 				/*
3366 				 * We found the IOMMU with translation
3367 				 * enabled - but failed to copy over the
3368 				 * old root-entry table. Try to proceed
3369 				 * by disabling translation now and
3370 				 * allocating a clean root-entry table.
3371 				 * This might cause DMAR faults, but
3372 				 * probably the dump will still succeed.
3373 				 */
3374 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3375 				       iommu->name);
3376 				iommu_disable_translation(iommu);
3377 				clear_translation_pre_enabled(iommu);
3378 			} else {
3379 				pr_info("Copied translation tables from previous kernel for %s\n",
3380 					iommu->name);
3381 			}
3382 		}
3383 
3384 		if (!ecap_pass_through(iommu->ecap))
3385 			hw_pass_through = 0;
3386 
3387 		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3388 			pr_warn("Disable batched IOTLB flush due to virtualization\n");
3389 			intel_iommu_strict = 1;
3390 		}
3391 		intel_svm_check(iommu);
3392 	}
3393 
3394 	/*
3395 	 * Now that qi is enabled on all iommus, set the root entry and flush
3396 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3397 	 * flush_context function will loop forever and the boot hangs.
3398 	 */
3399 	for_each_active_iommu(iommu, drhd) {
3400 		iommu_flush_write_buffer(iommu);
3401 #ifdef CONFIG_INTEL_IOMMU_SVM
3402 		register_pasid_allocator(iommu);
3403 #endif
3404 		iommu_set_root_entry(iommu);
3405 	}
3406 
3407 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3408 	dmar_map_gfx = 0;
3409 #endif
3410 
3411 	if (!dmar_map_gfx)
3412 		iommu_identity_mapping |= IDENTMAP_GFX;
3413 
3414 	check_tylersburg_isoch();
3415 
3416 	ret = si_domain_init(hw_pass_through);
3417 	if (ret)
3418 		goto free_iommu;
3419 
3420 	/*
3421 	 * for each drhd
3422 	 *   enable fault log
3423 	 *   global invalidate context cache
3424 	 *   global invalidate iotlb
3425 	 *   enable translation
3426 	 */
3427 	for_each_iommu(iommu, drhd) {
3428 		if (drhd->ignored) {
3429 			/*
3430 			 * we always have to disable PMRs or DMA may fail on
3431 			 * this device
3432 			 */
3433 			if (force_on)
3434 				iommu_disable_protect_mem_regions(iommu);
3435 			continue;
3436 		}
3437 
3438 		iommu_flush_write_buffer(iommu);
3439 
3440 #ifdef CONFIG_INTEL_IOMMU_SVM
3441 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3442 			/*
3443 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3444 			 * could cause a lock race condition.
3445 			 */
3446 			up_write(&dmar_global_lock);
3447 			ret = intel_svm_enable_prq(iommu);
3448 			down_write(&dmar_global_lock);
3449 			if (ret)
3450 				goto free_iommu;
3451 		}
3452 #endif
3453 		ret = dmar_set_interrupt(iommu);
3454 		if (ret)
3455 			goto free_iommu;
3456 	}
3457 
3458 	return 0;
3459 
3460 free_iommu:
3461 	for_each_active_iommu(iommu, drhd) {
3462 		disable_dmar_iommu(iommu);
3463 		free_dmar_iommu(iommu);
3464 	}
3465 
3466 	kfree(g_iommus);
3467 
3468 error:
3469 	return ret;
3470 }
3471 
3472 /* This takes a number of _MM_ pages, not VTD pages */
3473 static unsigned long intel_alloc_iova(struct device *dev,
3474 				     struct dmar_domain *domain,
3475 				     unsigned long nrpages, uint64_t dma_mask)
3476 {
3477 	unsigned long iova_pfn;
3478 
3479 	/*
3480 	 * Restrict dma_mask to the width that the iommu can handle.
3481 	 * First-level translation restricts the input-address to a
3482 	 * canonical address (i.e., address bits 63:N have the same
3483 	 * value as address bit [N-1], where N is 48-bits with 4-level
3484 	 * value as address bit [N-1], where N is 48 bits with 4-level
3485 	 * paging and 57 bits with 5-level paging). Hence, skip bit
3486 	 */
3487 	if (domain_use_first_level(domain))
3488 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3489 				 dma_mask);
3490 	else
3491 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3492 				 dma_mask);
3493 
3494 	/* Ensure we reserve the whole size-aligned region */
3495 	nrpages = __roundup_pow_of_two(nrpages);
3496 
3497 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3498 		/*
3499 		 * First try to allocate an IO virtual address below
3500 		 * DMA_BIT_MASK(32), and if that fails then try allocating
3501 		 * from the higher range.
3502 		 */
3503 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3504 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3505 		if (iova_pfn)
3506 			return iova_pfn;
3507 	}
3508 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3509 				   IOVA_PFN(dma_mask), true);
3510 	if (unlikely(!iova_pfn)) {
3511 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3512 			     nrpages);
3513 		return 0;
3514 	}
3515 
3516 	return iova_pfn;
3517 }
3518 
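/*
 * Map a single physically contiguous buffer of @size bytes at @paddr into
 * the device's DMA domain: allocate an IOVA range, derive the read/write
 * permissions from @dir and install the page-table entries. Returns the
 * DMA address, or DMA_MAPPING_ERROR on failure.
 */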
3519 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3520 				     size_t size, int dir, u64 dma_mask)
3521 {
3522 	struct dmar_domain *domain;
3523 	phys_addr_t start_paddr;
3524 	unsigned long iova_pfn;
3525 	int prot = 0;
3526 	int ret;
3527 	struct intel_iommu *iommu;
3528 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3529 
3530 	BUG_ON(dir == DMA_NONE);
3531 
3532 	if (unlikely(attach_deferred(dev)))
3533 		do_deferred_attach(dev);
3534 
3535 	domain = find_domain(dev);
3536 	if (!domain)
3537 		return DMA_MAPPING_ERROR;
3538 
3539 	iommu = domain_get_iommu(domain);
3540 	size = aligned_nrpages(paddr, size);
3541 
3542 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3543 	if (!iova_pfn)
3544 		goto error;
3545 
3546 	/*
3547 	 * Check if DMAR supports zero-length reads on write-only
3548 	 * mappings.
3549 	 */
3550 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3551 			!cap_zlr(iommu->cap))
3552 		prot |= DMA_PTE_READ;
3553 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3554 		prot |= DMA_PTE_WRITE;
3555 	/*
3556 	 * paddr to (paddr + size) might span a partial page, so we should map
3557 	 * the whole page.  Note: if two parts of one page are mapped
3558 	 * separately, we might end up with two guest addresses mapping to the
3559 	 * same host paddr, but this is not a big problem
3560 	 */
3561 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3562 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3563 	if (ret)
3564 		goto error;
3565 
3566 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3567 	start_paddr += paddr & ~PAGE_MASK;
3568 
3569 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3570 
3571 	return start_paddr;
3572 
3573 error:
3574 	if (iova_pfn)
3575 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3576 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3577 		size, (unsigned long long)paddr, dir);
3578 	return DMA_MAPPING_ERROR;
3579 }
3580 
3581 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3582 				 unsigned long offset, size_t size,
3583 				 enum dma_data_direction dir,
3584 				 unsigned long attrs)
3585 {
3586 	return __intel_map_single(dev, page_to_phys(page) + offset,
3587 				  size, dir, *dev->dma_mask);
3588 }
3589 
3590 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3591 				     size_t size, enum dma_data_direction dir,
3592 				     unsigned long attrs)
3593 {
3594 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3595 }
3596 
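/*
 * Tear down the mapping for [dev_addr, dev_addr + size): unmap the page
 * tables and either flush the IOTLB and free the IOVA immediately (strict
 * mode, untrusted devices, or no flush queue) or defer both to the IOVA
 * flush queue.
 */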
3597 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3598 {
3599 	struct dmar_domain *domain;
3600 	unsigned long start_pfn, last_pfn;
3601 	unsigned long nrpages;
3602 	unsigned long iova_pfn;
3603 	struct intel_iommu *iommu;
3604 	struct page *freelist;
3605 	struct pci_dev *pdev = NULL;
3606 
3607 	domain = find_domain(dev);
3608 	BUG_ON(!domain);
3609 
3610 	iommu = domain_get_iommu(domain);
3611 
3612 	iova_pfn = IOVA_PFN(dev_addr);
3613 
3614 	nrpages = aligned_nrpages(dev_addr, size);
3615 	start_pfn = mm_to_dma_pfn(iova_pfn);
3616 	last_pfn = start_pfn + nrpages - 1;
3617 
3618 	if (dev_is_pci(dev))
3619 		pdev = to_pci_dev(dev);
3620 
3621 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3622 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3623 			!has_iova_flush_queue(&domain->iovad)) {
3624 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3625 				      nrpages, !freelist, 0);
3626 		/* free iova */
3627 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3628 		dma_free_pagelist(freelist);
3629 	} else {
3630 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3631 			   (unsigned long)freelist);
3632 		/*
3633 		 * Queue up the release of the unmap to save the ~1/6th of the
3634 		 * CPU time used up by the IOTLB flush operation...
3635 		 */
3636 	}
3637 
3638 	trace_unmap_single(dev, dev_addr, size);
3639 }
3640 
3641 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3642 			     size_t size, enum dma_data_direction dir,
3643 			     unsigned long attrs)
3644 {
3645 	intel_unmap(dev, dev_addr, size);
3646 }
3647 
3648 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3649 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3650 {
3651 	intel_unmap(dev, dev_addr, size);
3652 }
3653 
3654 static void *intel_alloc_coherent(struct device *dev, size_t size,
3655 				  dma_addr_t *dma_handle, gfp_t flags,
3656 				  unsigned long attrs)
3657 {
3658 	struct page *page = NULL;
3659 	int order;
3660 
3661 	if (unlikely(attach_deferred(dev)))
3662 		do_deferred_attach(dev);
3663 
3664 	size = PAGE_ALIGN(size);
3665 	order = get_order(size);
3666 
3667 	if (gfpflags_allow_blocking(flags)) {
3668 		unsigned int count = size >> PAGE_SHIFT;
3669 
3670 		page = dma_alloc_from_contiguous(dev, count, order,
3671 						 flags & __GFP_NOWARN);
3672 	}
3673 
3674 	if (!page)
3675 		page = alloc_pages(flags, order);
3676 	if (!page)
3677 		return NULL;
3678 	memset(page_address(page), 0, size);
3679 
3680 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3681 					 DMA_BIDIRECTIONAL,
3682 					 dev->coherent_dma_mask);
3683 	if (*dma_handle != DMA_MAPPING_ERROR)
3684 		return page_address(page);
3685 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3686 		__free_pages(page, order);
3687 
3688 	return NULL;
3689 }
3690 
3691 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3692 				dma_addr_t dma_handle, unsigned long attrs)
3693 {
3694 	int order;
3695 	struct page *page = virt_to_page(vaddr);
3696 
3697 	size = PAGE_ALIGN(size);
3698 	order = get_order(size);
3699 
3700 	intel_unmap(dev, dma_handle, size);
3701 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3702 		__free_pages(page, order);
3703 }
3704 
3705 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3706 			   int nelems, enum dma_data_direction dir,
3707 			   unsigned long attrs)
3708 {
3709 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3710 	unsigned long nrpages = 0;
3711 	struct scatterlist *sg;
3712 	int i;
3713 
3714 	for_each_sg(sglist, sg, nelems, i) {
3715 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3716 	}
3717 
3718 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3719 
3720 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3721 }
3722 
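/*
 * Map a scatter-gather list with a single IOVA allocation covering the
 * total (page-aligned) length of all segments, then populate the page
 * tables segment by segment. Returns the number of mapped entries, or 0
 * on failure.
 */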
3723 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3724 			enum dma_data_direction dir, unsigned long attrs)
3725 {
3726 	int i;
3727 	struct dmar_domain *domain;
3728 	size_t size = 0;
3729 	int prot = 0;
3730 	unsigned long iova_pfn;
3731 	int ret;
3732 	struct scatterlist *sg;
3733 	unsigned long start_vpfn;
3734 	struct intel_iommu *iommu;
3735 
3736 	BUG_ON(dir == DMA_NONE);
3737 
3738 	if (unlikely(attach_deferred(dev)))
3739 		do_deferred_attach(dev);
3740 
3741 	domain = find_domain(dev);
3742 	if (!domain)
3743 		return 0;
3744 
3745 	iommu = domain_get_iommu(domain);
3746 
3747 	for_each_sg(sglist, sg, nelems, i)
3748 		size += aligned_nrpages(sg->offset, sg->length);
3749 
3750 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3751 				*dev->dma_mask);
3752 	if (!iova_pfn) {
3753 		sglist->dma_length = 0;
3754 		return 0;
3755 	}
3756 
3757 	/*
3758 	 * Check if DMAR supports zero-length reads on write-only
3759 	 * mappings.
3760 	 */
3761 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3762 			!cap_zlr(iommu->cap))
3763 		prot |= DMA_PTE_READ;
3764 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3765 		prot |= DMA_PTE_WRITE;
3766 
3767 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3768 
3769 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3770 	if (unlikely(ret)) {
3771 		dma_pte_free_pagetable(domain, start_vpfn,
3772 				       start_vpfn + size - 1,
3773 				       agaw_to_level(domain->agaw) + 1);
3774 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3775 		return 0;
3776 	}
3777 
3778 	for_each_sg(sglist, sg, nelems, i)
3779 		trace_map_sg(dev, i + 1, nelems, sg);
3780 
3781 	return nelems;
3782 }
3783 
3784 static u64 intel_get_required_mask(struct device *dev)
3785 {
3786 	return DMA_BIT_MASK(32);
3787 }
3788 
3789 static const struct dma_map_ops intel_dma_ops = {
3790 	.alloc = intel_alloc_coherent,
3791 	.free = intel_free_coherent,
3792 	.map_sg = intel_map_sg,
3793 	.unmap_sg = intel_unmap_sg,
3794 	.map_page = intel_map_page,
3795 	.unmap_page = intel_unmap_page,
3796 	.map_resource = intel_map_resource,
3797 	.unmap_resource = intel_unmap_resource,
3798 	.dma_supported = dma_direct_supported,
3799 	.mmap = dma_common_mmap,
3800 	.get_sgtable = dma_common_get_sgtable,
3801 	.alloc_pages = dma_common_alloc_pages,
3802 	.free_pages = dma_common_free_pages,
3803 	.get_required_mask = intel_get_required_mask,
3804 };
3805 
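/*
 * The bounce_* DMA ops below route buffers that are not VTD-page aligned
 * through swiotlb, so that an untrusted device can never DMA beyond the
 * data it was handed; page-aligned buffers are mapped in place.
 */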
3806 static void
3807 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3808 		   enum dma_data_direction dir, enum dma_sync_target target)
3809 {
3810 	struct dmar_domain *domain;
3811 	phys_addr_t tlb_addr;
3812 
3813 	domain = find_domain(dev);
3814 	if (WARN_ON(!domain))
3815 		return;
3816 
3817 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3818 	if (is_swiotlb_buffer(tlb_addr))
3819 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3820 }
3821 
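/*
 * Map @size bytes at @paddr for an untrusted device. If the buffer is not
 * VTD-page aligned it is copied into a swiotlb slot (with the unused
 * padding zeroed) and the IOMMU mapping points at the bounce buffer
 * instead of the original pages.
 */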
3822 static dma_addr_t
3823 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3824 		  enum dma_data_direction dir, unsigned long attrs,
3825 		  u64 dma_mask)
3826 {
3827 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3828 	struct dmar_domain *domain;
3829 	struct intel_iommu *iommu;
3830 	unsigned long iova_pfn;
3831 	unsigned long nrpages;
3832 	phys_addr_t tlb_addr;
3833 	int prot = 0;
3834 	int ret;
3835 
3836 	if (unlikely(attach_deferred(dev)))
3837 		do_deferred_attach(dev);
3838 
3839 	domain = find_domain(dev);
3840 
3841 	if (WARN_ON(dir == DMA_NONE || !domain))
3842 		return DMA_MAPPING_ERROR;
3843 
3844 	iommu = domain_get_iommu(domain);
3845 	if (WARN_ON(!iommu))
3846 		return DMA_MAPPING_ERROR;
3847 
3848 	nrpages = aligned_nrpages(0, size);
3849 	iova_pfn = intel_alloc_iova(dev, domain,
3850 				    dma_to_mm_pfn(nrpages), dma_mask);
3851 	if (!iova_pfn)
3852 		return DMA_MAPPING_ERROR;
3853 
3854 	/*
3855 	 * Check if DMAR supports zero-length reads on write-only
3856 	 * mappings.
3857 	 */
3858 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3859 			!cap_zlr(iommu->cap))
3860 		prot |= DMA_PTE_READ;
3861 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3862 		prot |= DMA_PTE_WRITE;
3863 
3864 	/*
3865 	 * If both the physical buffer start address and size are
3866 	 * page aligned, we don't need to use a bounce page.
3867 	 */
3868 	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3869 		tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3870 				aligned_size, dir, attrs);
3871 		if (tlb_addr == DMA_MAPPING_ERROR) {
3872 			goto swiotlb_error;
3873 		} else {
3874 			/* Cleanup the padding area. */
3875 			void *padding_start = phys_to_virt(tlb_addr);
3876 			size_t padding_size = aligned_size;
3877 
3878 			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3879 			    (dir == DMA_TO_DEVICE ||
3880 			     dir == DMA_BIDIRECTIONAL)) {
3881 				padding_start += size;
3882 				padding_size -= size;
3883 			}
3884 
3885 			memset(padding_start, 0, padding_size);
3886 		}
3887 	} else {
3888 		tlb_addr = paddr;
3889 	}
3890 
3891 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3892 				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3893 	if (ret)
3894 		goto mapping_error;
3895 
3896 	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3897 
3898 	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3899 
3900 mapping_error:
3901 	if (is_swiotlb_buffer(tlb_addr))
3902 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3903 					 aligned_size, dir, attrs);
3904 swiotlb_error:
3905 	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3906 	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3907 		size, (unsigned long long)paddr, dir);
3908 
3909 	return DMA_MAPPING_ERROR;
3910 }
3911 
3912 static void
3913 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3914 		    enum dma_data_direction dir, unsigned long attrs)
3915 {
3916 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3917 	struct dmar_domain *domain;
3918 	phys_addr_t tlb_addr;
3919 
3920 	domain = find_domain(dev);
3921 	if (WARN_ON(!domain))
3922 		return;
3923 
3924 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3925 	if (WARN_ON(!tlb_addr))
3926 		return;
3927 
3928 	intel_unmap(dev, dev_addr, size);
3929 	if (is_swiotlb_buffer(tlb_addr))
3930 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3931 					 aligned_size, dir, attrs);
3932 
3933 	trace_bounce_unmap_single(dev, dev_addr, size);
3934 }
3935 
3936 static dma_addr_t
3937 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3938 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3939 {
3940 	return bounce_map_single(dev, page_to_phys(page) + offset,
3941 				 size, dir, attrs, *dev->dma_mask);
3942 }
3943 
3944 static dma_addr_t
3945 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3946 		    enum dma_data_direction dir, unsigned long attrs)
3947 {
3948 	return bounce_map_single(dev, phys_addr, size,
3949 				 dir, attrs, *dev->dma_mask);
3950 }
3951 
3952 static void
3953 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3954 		  enum dma_data_direction dir, unsigned long attrs)
3955 {
3956 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3957 }
3958 
3959 static void
3960 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3961 		      enum dma_data_direction dir, unsigned long attrs)
3962 {
3963 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3964 }
3965 
3966 static void
3967 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3968 		enum dma_data_direction dir, unsigned long attrs)
3969 {
3970 	struct scatterlist *sg;
3971 	int i;
3972 
3973 	for_each_sg(sglist, sg, nelems, i)
3974 		bounce_unmap_page(dev, sg->dma_address,
3975 				  sg_dma_len(sg), dir, attrs);
3976 }
3977 
3978 static int
3979 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3980 	      enum dma_data_direction dir, unsigned long attrs)
3981 {
3982 	int i;
3983 	struct scatterlist *sg;
3984 
3985 	for_each_sg(sglist, sg, nelems, i) {
3986 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3987 						  sg->offset, sg->length,
3988 						  dir, attrs);
3989 		if (sg->dma_address == DMA_MAPPING_ERROR)
3990 			goto out_unmap;
3991 		sg_dma_len(sg) = sg->length;
3992 	}
3993 
3994 	for_each_sg(sglist, sg, nelems, i)
3995 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3996 
3997 	return nelems;
3998 
3999 out_unmap:
4000 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4001 	return 0;
4002 }
4003 
4004 static void
4005 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4006 			   size_t size, enum dma_data_direction dir)
4007 {
4008 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4009 }
4010 
4011 static void
4012 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4013 			      size_t size, enum dma_data_direction dir)
4014 {
4015 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4016 }
4017 
4018 static void
4019 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4020 		       int nelems, enum dma_data_direction dir)
4021 {
4022 	struct scatterlist *sg;
4023 	int i;
4024 
4025 	for_each_sg(sglist, sg, nelems, i)
4026 		bounce_sync_single(dev, sg_dma_address(sg),
4027 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
4028 }
4029 
4030 static void
4031 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4032 			  int nelems, enum dma_data_direction dir)
4033 {
4034 	struct scatterlist *sg;
4035 	int i;
4036 
4037 	for_each_sg(sglist, sg, nelems, i)
4038 		bounce_sync_single(dev, sg_dma_address(sg),
4039 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4040 }
4041 
4042 static const struct dma_map_ops bounce_dma_ops = {
4043 	.alloc			= intel_alloc_coherent,
4044 	.free			= intel_free_coherent,
4045 	.map_sg			= bounce_map_sg,
4046 	.unmap_sg		= bounce_unmap_sg,
4047 	.map_page		= bounce_map_page,
4048 	.unmap_page		= bounce_unmap_page,
4049 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
4050 	.sync_single_for_device	= bounce_sync_single_for_device,
4051 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
4052 	.sync_sg_for_device	= bounce_sync_sg_for_device,
4053 	.map_resource		= bounce_map_resource,
4054 	.unmap_resource		= bounce_unmap_resource,
4055 	.alloc_pages		= dma_common_alloc_pages,
4056 	.free_pages		= dma_common_free_pages,
4057 	.dma_supported		= dma_direct_supported,
4058 };
4059 
4060 static inline int iommu_domain_cache_init(void)
4061 {
4062 	int ret = 0;
4063 
4064 	iommu_domain_cache = kmem_cache_create("iommu_domain",
4065 					 sizeof(struct dmar_domain),
4066 					 0,
4067 					 SLAB_HWCACHE_ALIGN,
4068 
4069 					 NULL);
4070 	if (!iommu_domain_cache) {
4071 		pr_err("Couldn't create iommu_domain cache\n");
4072 		ret = -ENOMEM;
4073 	}
4074 
4075 	return ret;
4076 }
4077 
4078 static inline int iommu_devinfo_cache_init(void)
4079 {
4080 	int ret = 0;
4081 
4082 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4083 					 sizeof(struct device_domain_info),
4084 					 0,
4085 					 SLAB_HWCACHE_ALIGN,
4086 					 NULL);
4087 	if (!iommu_devinfo_cache) {
4088 		pr_err("Couldn't create devinfo cache\n");
4089 		ret = -ENOMEM;
4090 	}
4091 
4092 	return ret;
4093 }
4094 
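/*
 * Set up the caches used by this driver: the shared IOVA cache plus the
 * slab caches for dmar_domain and device_domain_info allocations.
 */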
4095 static int __init iommu_init_mempool(void)
4096 {
4097 	int ret;
4098 	ret = iova_cache_get();
4099 	if (ret)
4100 		return ret;
4101 
4102 	ret = iommu_domain_cache_init();
4103 	if (ret)
4104 		goto domain_error;
4105 
4106 	ret = iommu_devinfo_cache_init();
4107 	if (!ret)
4108 		return ret;
4109 
4110 	kmem_cache_destroy(iommu_domain_cache);
4111 domain_error:
4112 	iova_cache_put();
4113 
4114 	return -ENOMEM;
4115 }
4116 
4117 static void __init iommu_exit_mempool(void)
4118 {
4119 	kmem_cache_destroy(iommu_devinfo_cache);
4120 	kmem_cache_destroy(iommu_domain_cache);
4121 	iova_cache_put();
4122 }
4123 
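/*
 * Mark DRHD units that can be ignored: units whose device scope contains
 * no devices at all, and graphics-only units when GFX devices are not
 * being remapped (dmar_map_gfx == 0).
 */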
4124 static void __init init_no_remapping_devices(void)
4125 {
4126 	struct dmar_drhd_unit *drhd;
4127 	struct device *dev;
4128 	int i;
4129 
4130 	for_each_drhd_unit(drhd) {
4131 		if (!drhd->include_all) {
4132 			for_each_active_dev_scope(drhd->devices,
4133 						  drhd->devices_cnt, i, dev)
4134 				break;
4135 			/* ignore DMAR unit if no devices exist */
4136 			if (i == drhd->devices_cnt)
4137 				drhd->ignored = 1;
4138 		}
4139 	}
4140 
4141 	for_each_active_drhd_unit(drhd) {
4142 		if (drhd->include_all)
4143 			continue;
4144 
4145 		for_each_active_dev_scope(drhd->devices,
4146 					  drhd->devices_cnt, i, dev)
4147 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4148 				break;
4149 		if (i < drhd->devices_cnt)
4150 			continue;
4151 
4152 		/* This IOMMU has *only* gfx devices. Either bypass it or
4153 		   mark it as graphics-dedicated, as appropriate */
4154 		drhd->gfx_dedicated = 1;
4155 		if (!dmar_map_gfx)
4156 			drhd->ignored = 1;
4157 	}
4158 }
4159 
4160 #ifdef CONFIG_SUSPEND
4161 static int init_iommu_hw(void)
4162 {
4163 	struct dmar_drhd_unit *drhd;
4164 	struct intel_iommu *iommu = NULL;
4165 
4166 	for_each_active_iommu(iommu, drhd)
4167 		if (iommu->qi)
4168 			dmar_reenable_qi(iommu);
4169 
4170 	for_each_iommu(iommu, drhd) {
4171 		if (drhd->ignored) {
4172 			/*
4173 			 * we always have to disable PMRs or DMA may fail on
4174 			 * this device
4175 			 */
4176 			if (force_on)
4177 				iommu_disable_protect_mem_regions(iommu);
4178 			continue;
4179 		}
4180 
4181 		iommu_flush_write_buffer(iommu);
4182 		iommu_set_root_entry(iommu);
4183 		iommu_enable_translation(iommu);
4184 		iommu_disable_protect_mem_regions(iommu);
4185 	}
4186 
4187 	return 0;
4188 }
4189 
4190 static void iommu_flush_all(void)
4191 {
4192 	struct dmar_drhd_unit *drhd;
4193 	struct intel_iommu *iommu;
4194 
4195 	for_each_active_iommu(iommu, drhd) {
4196 		iommu->flush.flush_context(iommu, 0, 0, 0,
4197 					   DMA_CCMD_GLOBAL_INVL);
4198 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4199 					 DMA_TLB_GLOBAL_FLUSH);
4200 	}
4201 }
4202 
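/*
 * Save the fault-event registers and disable translation on every active
 * IOMMU before suspend; iommu_resume() restores the registers after
 * init_iommu_hw() has re-enabled translation.
 */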
4203 static int iommu_suspend(void)
4204 {
4205 	struct dmar_drhd_unit *drhd;
4206 	struct intel_iommu *iommu = NULL;
4207 	unsigned long flag;
4208 
4209 	for_each_active_iommu(iommu, drhd) {
4210 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4211 						 GFP_ATOMIC);
4212 		if (!iommu->iommu_state)
4213 			goto nomem;
4214 	}
4215 
4216 	iommu_flush_all();
4217 
4218 	for_each_active_iommu(iommu, drhd) {
4219 		iommu_disable_translation(iommu);
4220 
4221 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4222 
4223 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4224 			readl(iommu->reg + DMAR_FECTL_REG);
4225 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4226 			readl(iommu->reg + DMAR_FEDATA_REG);
4227 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4228 			readl(iommu->reg + DMAR_FEADDR_REG);
4229 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4230 			readl(iommu->reg + DMAR_FEUADDR_REG);
4231 
4232 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4233 	}
4234 	return 0;
4235 
4236 nomem:
4237 	for_each_active_iommu(iommu, drhd)
4238 		kfree(iommu->iommu_state);
4239 
4240 	return -ENOMEM;
4241 }
4242 
4243 static void iommu_resume(void)
4244 {
4245 	struct dmar_drhd_unit *drhd;
4246 	struct intel_iommu *iommu = NULL;
4247 	unsigned long flag;
4248 
4249 	if (init_iommu_hw()) {
4250 		if (force_on)
4251 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4252 		else
4253 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4254 		return;
4255 	}
4256 
4257 	for_each_active_iommu(iommu, drhd) {
4258 
4259 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4260 
4261 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4262 			iommu->reg + DMAR_FECTL_REG);
4263 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4264 			iommu->reg + DMAR_FEDATA_REG);
4265 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4266 			iommu->reg + DMAR_FEADDR_REG);
4267 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4268 			iommu->reg + DMAR_FEUADDR_REG);
4269 
4270 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4271 	}
4272 
4273 	for_each_active_iommu(iommu, drhd)
4274 		kfree(iommu->iommu_state);
4275 }
4276 
4277 static struct syscore_ops iommu_syscore_ops = {
4278 	.resume		= iommu_resume,
4279 	.suspend	= iommu_suspend,
4280 };
4281 
4282 static void __init init_iommu_pm_ops(void)
4283 {
4284 	register_syscore_ops(&iommu_syscore_ops);
4285 }
4286 
4287 #else
4288 static inline void init_iommu_pm_ops(void) {}
4289 #endif	/* CONFIG_SUSPEND */
4290 
4291 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4292 {
4293 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4294 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4295 	    rmrr->end_address <= rmrr->base_address ||
4296 	    arch_rmrr_sanity_check(rmrr))
4297 		return -EINVAL;
4298 
4299 	return 0;
4300 }
4301 
4302 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4303 {
4304 	struct acpi_dmar_reserved_memory *rmrr;
4305 	struct dmar_rmrr_unit *rmrru;
4306 
4307 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4308 	if (rmrr_sanity_check(rmrr)) {
4309 		pr_warn(FW_BUG
4310 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4311 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4312 			   rmrr->base_address, rmrr->end_address,
4313 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4314 			   dmi_get_system_info(DMI_BIOS_VERSION),
4315 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4316 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4317 	}
4318 
4319 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4320 	if (!rmrru)
4321 		goto out;
4322 
4323 	rmrru->hdr = header;
4324 
4325 	rmrru->base_address = rmrr->base_address;
4326 	rmrru->end_address = rmrr->end_address;
4327 
4328 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4329 				((void *)rmrr) + rmrr->header.length,
4330 				&rmrru->devices_cnt);
4331 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4332 		goto free_rmrru;
4333 
4334 	list_add(&rmrru->list, &dmar_rmrr_units);
4335 
4336 	return 0;
4337 free_rmrru:
4338 	kfree(rmrru);
4339 out:
4340 	return -ENOMEM;
4341 }
4342 
4343 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4344 {
4345 	struct dmar_atsr_unit *atsru;
4346 	struct acpi_dmar_atsr *tmp;
4347 
4348 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4349 				dmar_rcu_check()) {
4350 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4351 		if (atsr->segment != tmp->segment)
4352 			continue;
4353 		if (atsr->header.length != tmp->header.length)
4354 			continue;
4355 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4356 			return atsru;
4357 	}
4358 
4359 	return NULL;
4360 }
4361 
4362 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4363 {
4364 	struct acpi_dmar_atsr *atsr;
4365 	struct dmar_atsr_unit *atsru;
4366 
4367 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4368 		return 0;
4369 
4370 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4371 	atsru = dmar_find_atsr(atsr);
4372 	if (atsru)
4373 		return 0;
4374 
4375 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4376 	if (!atsru)
4377 		return -ENOMEM;
4378 
4379 	/*
4380 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4381 	 * copy the memory content because the memory buffer will be freed
4382 	 * on return.
4383 	 */
4384 	atsru->hdr = (void *)(atsru + 1);
4385 	memcpy(atsru->hdr, hdr, hdr->length);
4386 	atsru->include_all = atsr->flags & 0x1;
4387 	if (!atsru->include_all) {
4388 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4389 				(void *)atsr + atsr->header.length,
4390 				&atsru->devices_cnt);
4391 		if (atsru->devices_cnt && atsru->devices == NULL) {
4392 			kfree(atsru);
4393 			return -ENOMEM;
4394 		}
4395 	}
4396 
4397 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4398 
4399 	return 0;
4400 }
4401 
4402 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4403 {
4404 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4405 	kfree(atsru);
4406 }
4407 
4408 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4409 {
4410 	struct acpi_dmar_atsr *atsr;
4411 	struct dmar_atsr_unit *atsru;
4412 
4413 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4414 	atsru = dmar_find_atsr(atsr);
4415 	if (atsru) {
4416 		list_del_rcu(&atsru->list);
4417 		synchronize_rcu();
4418 		intel_iommu_free_atsr(atsru);
4419 	}
4420 
4421 	return 0;
4422 }
4423 
4424 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4425 {
4426 	int i;
4427 	struct device *dev;
4428 	struct acpi_dmar_atsr *atsr;
4429 	struct dmar_atsr_unit *atsru;
4430 
4431 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4432 	atsru = dmar_find_atsr(atsr);
4433 	if (!atsru)
4434 		return 0;
4435 
4436 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4437 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4438 					  i, dev)
4439 			return -EBUSY;
4440 	}
4441 
4442 	return 0;
4443 }
4444 
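/*
 * Bring up a hot-added DMAR unit: sanity-check its capabilities against
 * the current configuration, allocate domains and a root entry, enable
 * queued invalidation, fault reporting and (optionally) the page request
 * queue, and finally turn translation on.
 */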
4445 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4446 {
4447 	int sp, ret;
4448 	struct intel_iommu *iommu = dmaru->iommu;
4449 
4450 	if (g_iommus[iommu->seq_id])
4451 		return 0;
4452 
4453 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4454 		pr_warn("%s: Doesn't support hardware pass through.\n",
4455 			iommu->name);
4456 		return -ENXIO;
4457 	}
4458 	if (!ecap_sc_support(iommu->ecap) &&
4459 	    domain_update_iommu_snooping(iommu)) {
4460 		pr_warn("%s: Doesn't support snooping.\n",
4461 			iommu->name);
4462 		return -ENXIO;
4463 	}
4464 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4465 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4466 		pr_warn("%s: Doesn't support large page.\n",
4467 			iommu->name);
4468 		return -ENXIO;
4469 	}
4470 
4471 	/*
4472 	 * Disable translation if already enabled prior to OS handover.
4473 	 */
4474 	if (iommu->gcmd & DMA_GCMD_TE)
4475 		iommu_disable_translation(iommu);
4476 
4477 	g_iommus[iommu->seq_id] = iommu;
4478 	ret = iommu_init_domains(iommu);
4479 	if (ret == 0)
4480 		ret = iommu_alloc_root_entry(iommu);
4481 	if (ret)
4482 		goto out;
4483 
4484 	intel_svm_check(iommu);
4485 
4486 	if (dmaru->ignored) {
4487 		/*
4488 		 * we always have to disable PMRs or DMA may fail on this device
4489 		 */
4490 		if (force_on)
4491 			iommu_disable_protect_mem_regions(iommu);
4492 		return 0;
4493 	}
4494 
4495 	intel_iommu_init_qi(iommu);
4496 	iommu_flush_write_buffer(iommu);
4497 
4498 #ifdef CONFIG_INTEL_IOMMU_SVM
4499 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4500 		ret = intel_svm_enable_prq(iommu);
4501 		if (ret)
4502 			goto disable_iommu;
4503 	}
4504 #endif
4505 	ret = dmar_set_interrupt(iommu);
4506 	if (ret)
4507 		goto disable_iommu;
4508 
4509 	iommu_set_root_entry(iommu);
4510 	iommu_enable_translation(iommu);
4511 
4512 	iommu_disable_protect_mem_regions(iommu);
4513 	return 0;
4514 
4515 disable_iommu:
4516 	disable_dmar_iommu(iommu);
4517 out:
4518 	free_dmar_iommu(iommu);
4519 	return ret;
4520 }
4521 
4522 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4523 {
4524 	int ret = 0;
4525 	struct intel_iommu *iommu = dmaru->iommu;
4526 
4527 	if (!intel_iommu_enabled)
4528 		return 0;
4529 	if (iommu == NULL)
4530 		return -EINVAL;
4531 
4532 	if (insert) {
4533 		ret = intel_iommu_add(dmaru);
4534 	} else {
4535 		disable_dmar_iommu(iommu);
4536 		free_dmar_iommu(iommu);
4537 	}
4538 
4539 	return ret;
4540 }
4541 
4542 static void intel_iommu_free_dmars(void)
4543 {
4544 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4545 	struct dmar_atsr_unit *atsru, *atsr_n;
4546 
4547 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4548 		list_del(&rmrru->list);
4549 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4550 		kfree(rmrru);
4551 	}
4552 
4553 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4554 		list_del(&atsru->list);
4555 		intel_iommu_free_atsr(atsru);
4556 	}
4557 }
4558 
4559 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4560 {
4561 	int i, ret = 1;
4562 	struct pci_bus *bus;
4563 	struct pci_dev *bridge = NULL;
4564 	struct device *tmp;
4565 	struct acpi_dmar_atsr *atsr;
4566 	struct dmar_atsr_unit *atsru;
4567 
4568 	dev = pci_physfn(dev);
4569 	for (bus = dev->bus; bus; bus = bus->parent) {
4570 		bridge = bus->self;
4571 		/* If it's an integrated device, allow ATS */
4572 		if (!bridge)
4573 			return 1;
4574 		/* Connected via non-PCIe: no ATS */
4575 		if (!pci_is_pcie(bridge) ||
4576 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4577 			return 0;
4578 		/* If we found the root port, look it up in the ATSR */
4579 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4580 			break;
4581 	}
4582 
4583 	rcu_read_lock();
4584 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4585 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4586 		if (atsr->segment != pci_domain_nr(dev->bus))
4587 			continue;
4588 
4589 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4590 			if (tmp == &bridge->dev)
4591 				goto out;
4592 
4593 		if (atsru->include_all)
4594 			goto out;
4595 	}
4596 	ret = 0;
4597 out:
4598 	rcu_read_unlock();
4599 
4600 	return ret;
4601 }
4602 
4603 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4604 {
4605 	int ret;
4606 	struct dmar_rmrr_unit *rmrru;
4607 	struct dmar_atsr_unit *atsru;
4608 	struct acpi_dmar_atsr *atsr;
4609 	struct acpi_dmar_reserved_memory *rmrr;
4610 
4611 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4612 		return 0;
4613 
4614 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4615 		rmrr = container_of(rmrru->hdr,
4616 				    struct acpi_dmar_reserved_memory, header);
4617 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4618 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4619 				((void *)rmrr) + rmrr->header.length,
4620 				rmrr->segment, rmrru->devices,
4621 				rmrru->devices_cnt);
4622 			if (ret < 0)
4623 				return ret;
4624 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4625 			dmar_remove_dev_scope(info, rmrr->segment,
4626 				rmrru->devices, rmrru->devices_cnt);
4627 		}
4628 	}
4629 
4630 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4631 		if (atsru->include_all)
4632 			continue;
4633 
4634 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4635 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4636 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4637 					(void *)atsr + atsr->header.length,
4638 					atsr->segment, atsru->devices,
4639 					atsru->devices_cnt);
4640 			if (ret > 0)
4641 				break;
4642 			else if (ret < 0)
4643 				return ret;
4644 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4645 			if (dmar_remove_dev_scope(info, atsr->segment,
4646 					atsru->devices, atsru->devices_cnt))
4647 				break;
4648 		}
4649 	}
4650 
4651 	return 0;
4652 }
4653 
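/*
 * Keep the static identity (si) domain in sync with memory hotplug:
 * extend the identity map when a range goes online and unmap, flush and
 * free it again when the range goes offline.
 */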
4654 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4655 				       unsigned long val, void *v)
4656 {
4657 	struct memory_notify *mhp = v;
4658 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4659 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4660 			mhp->nr_pages - 1);
4661 
4662 	switch (val) {
4663 	case MEM_GOING_ONLINE:
4664 		if (iommu_domain_identity_map(si_domain,
4665 					      start_vpfn, last_vpfn)) {
4666 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4667 				start_vpfn, last_vpfn);
4668 			return NOTIFY_BAD;
4669 		}
4670 		break;
4671 
4672 	case MEM_OFFLINE:
4673 	case MEM_CANCEL_ONLINE:
4674 		{
4675 			struct dmar_drhd_unit *drhd;
4676 			struct intel_iommu *iommu;
4677 			struct page *freelist;
4678 
4679 			freelist = domain_unmap(si_domain,
4680 						start_vpfn, last_vpfn);
4681 
4682 			rcu_read_lock();
4683 			for_each_active_iommu(iommu, drhd)
4684 				iommu_flush_iotlb_psi(iommu, si_domain,
4685 					start_vpfn, mhp->nr_pages,
4686 					!freelist, 0);
4687 			rcu_read_unlock();
4688 			dma_free_pagelist(freelist);
4689 		}
4690 		break;
4691 	}
4692 
4693 	return NOTIFY_OK;
4694 }
4695 
4696 static struct notifier_block intel_iommu_memory_nb = {
4697 	.notifier_call = intel_iommu_memory_notifier,
4698 	.priority = 0
4699 };
4700 
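/*
 * Drop the per-CPU IOVA caches of every DMA-API domain when a CPU goes
 * offline, so the cached ranges are returned to the IOVA allocator.
 */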
4701 static void free_all_cpu_cached_iovas(unsigned int cpu)
4702 {
4703 	int i;
4704 
4705 	for (i = 0; i < g_num_of_iommus; i++) {
4706 		struct intel_iommu *iommu = g_iommus[i];
4707 		struct dmar_domain *domain;
4708 		int did;
4709 
4710 		if (!iommu)
4711 			continue;
4712 
4713 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4714 			domain = get_iommu_domain(iommu, (u16)did);
4715 
4716 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4717 				continue;
4718 
4719 			free_cpu_cached_iovas(cpu, &domain->iovad);
4720 		}
4721 	}
4722 }
4723 
4724 static int intel_iommu_cpu_dead(unsigned int cpu)
4725 {
4726 	free_all_cpu_cached_iovas(cpu);
4727 	return 0;
4728 }
4729 
4730 static void intel_disable_iommus(void)
4731 {
4732 	struct intel_iommu *iommu = NULL;
4733 	struct dmar_drhd_unit *drhd;
4734 
4735 	for_each_iommu(iommu, drhd)
4736 		iommu_disable_translation(iommu);
4737 }
4738 
4739 void intel_iommu_shutdown(void)
4740 {
4741 	struct dmar_drhd_unit *drhd;
4742 	struct intel_iommu *iommu = NULL;
4743 
4744 	if (no_iommu || dmar_disabled)
4745 		return;
4746 
4747 	down_write(&dmar_global_lock);
4748 
4749 	/* Disable PMRs explicitly here. */
4750 	for_each_iommu(iommu, drhd)
4751 		iommu_disable_protect_mem_regions(iommu);
4752 
4753 	/* Make sure the IOMMUs are switched off */
4754 	intel_disable_iommus();
4755 
4756 	up_write(&dmar_global_lock);
4757 }
4758 
4759 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4760 {
4761 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4762 
4763 	return container_of(iommu_dev, struct intel_iommu, iommu);
4764 }
4765 
4766 static ssize_t intel_iommu_show_version(struct device *dev,
4767 					struct device_attribute *attr,
4768 					char *buf)
4769 {
4770 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4771 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4772 	return sprintf(buf, "%d:%d\n",
4773 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4774 }
4775 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4776 
4777 static ssize_t intel_iommu_show_address(struct device *dev,
4778 					struct device_attribute *attr,
4779 					char *buf)
4780 {
4781 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4782 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4783 }
4784 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4785 
4786 static ssize_t intel_iommu_show_cap(struct device *dev,
4787 				    struct device_attribute *attr,
4788 				    char *buf)
4789 {
4790 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4791 	return sprintf(buf, "%llx\n", iommu->cap);
4792 }
4793 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4794 
4795 static ssize_t intel_iommu_show_ecap(struct device *dev,
4796 				    struct device_attribute *attr,
4797 				    char *buf)
4798 {
4799 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4800 	return sprintf(buf, "%llx\n", iommu->ecap);
4801 }
4802 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4803 
4804 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4805 				      struct device_attribute *attr,
4806 				      char *buf)
4807 {
4808 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4809 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4810 }
4811 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4812 
4813 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4814 					   struct device_attribute *attr,
4815 					   char *buf)
4816 {
4817 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4818 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4819 						  cap_ndoms(iommu->cap)));
4820 }
4821 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4822 
4823 static struct attribute *intel_iommu_attrs[] = {
4824 	&dev_attr_version.attr,
4825 	&dev_attr_address.attr,
4826 	&dev_attr_cap.attr,
4827 	&dev_attr_ecap.attr,
4828 	&dev_attr_domains_supported.attr,
4829 	&dev_attr_domains_used.attr,
4830 	NULL,
4831 };
4832 
4833 static struct attribute_group intel_iommu_group = {
4834 	.name = "intel-iommu",
4835 	.attrs = intel_iommu_attrs,
4836 };
4837 
4838 const struct attribute_group *intel_iommu_groups[] = {
4839 	&intel_iommu_group,
4840 	NULL,
4841 };
4842 
4843 static inline bool has_external_pci(void)
4844 {
4845 	struct pci_dev *pdev = NULL;
4846 
4847 	for_each_pci_dev(pdev)
4848 		if (pdev->external_facing)
4849 			return true;
4850 
4851 	return false;
4852 }
4853 
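/*
 * Honour the DMAR platform opt-in flag: if the firmware requests DMA
 * protection and an external-facing PCI device is present, force the
 * IOMMU on even when it was disabled on the command line, switching the
 * default domain type to pass-through in that case.
 */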
4854 static int __init platform_optin_force_iommu(void)
4855 {
4856 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4857 		return 0;
4858 
4859 	if (no_iommu || dmar_disabled)
4860 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4861 
4862 	/*
4863 	 * If Intel-IOMMU is disabled by default, we will apply identity
4864 	 * map for all devices except those marked as being untrusted.
4865 	 */
4866 	if (dmar_disabled)
4867 		iommu_set_default_passthrough(false);
4868 
4869 	dmar_disabled = 0;
4870 	no_iommu = 0;
4871 
4872 	return 1;
4873 }
4874 
4875 static int __init probe_acpi_namespace_devices(void)
4876 {
4877 	struct dmar_drhd_unit *drhd;
4878 	/* To avoid a -Wunused-but-set-variable warning. */
4879 	struct intel_iommu *iommu __maybe_unused;
4880 	struct device *dev;
4881 	int i, ret = 0;
4882 
4883 	for_each_active_iommu(iommu, drhd) {
4884 		for_each_active_dev_scope(drhd->devices,
4885 					  drhd->devices_cnt, i, dev) {
4886 			struct acpi_device_physical_node *pn;
4887 			struct iommu_group *group;
4888 			struct acpi_device *adev;
4889 
4890 			if (dev->bus != &acpi_bus_type)
4891 				continue;
4892 
4893 			adev = to_acpi_device(dev);
4894 			mutex_lock(&adev->physical_node_lock);
4895 			list_for_each_entry(pn,
4896 					    &adev->physical_node_list, node) {
4897 				group = iommu_group_get(pn->dev);
4898 				if (group) {
4899 					iommu_group_put(group);
4900 					continue;
4901 				}
4902 
4903 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4904 				ret = iommu_probe_device(pn->dev);
4905 				if (ret)
4906 					break;
4907 			}
4908 			mutex_unlock(&adev->physical_node_lock);
4909 
4910 			if (ret)
4911 				return ret;
4912 		}
4913 	}
4914 
4915 	return 0;
4916 }
4917 
4918 int __init intel_iommu_init(void)
4919 {
4920 	int ret = -ENODEV;
4921 	struct dmar_drhd_unit *drhd;
4922 	struct intel_iommu *iommu;
4923 
4924 	/*
4925 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4926 	 * opt in, so enforce that.
4927 	 */
4928 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4929 		    platform_optin_force_iommu();
4930 
4931 	if (iommu_init_mempool()) {
4932 		if (force_on)
4933 			panic("tboot: Failed to initialize iommu memory\n");
4934 		return -ENOMEM;
4935 	}
4936 
4937 	down_write(&dmar_global_lock);
4938 	if (dmar_table_init()) {
4939 		if (force_on)
4940 			panic("tboot: Failed to initialize DMAR table\n");
4941 		goto out_free_dmar;
4942 	}
4943 
4944 	if (dmar_dev_scope_init() < 0) {
4945 		if (force_on)
4946 			panic("tboot: Failed to initialize DMAR device scope\n");
4947 		goto out_free_dmar;
4948 	}
4949 
4950 	up_write(&dmar_global_lock);
4951 
4952 	/*
4953 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4954 	 * complain later when we register it under the lock.
4955 	 */
4956 	dmar_register_bus_notifier();
4957 
4958 	down_write(&dmar_global_lock);
4959 
4960 	if (!no_iommu)
4961 		intel_iommu_debugfs_init();
4962 
4963 	if (no_iommu || dmar_disabled) {
4964 		/*
4965 		 * We exit the function here to ensure the IOMMU's remapping and
4966 		 * mempool aren't set up, which means that the IOMMU's PMRs
4967 		 * won't be disabled via the call to init_dmars(). So disable
4968 		 * them explicitly here. The PMRs were set up by tboot prior to
4969 		 * calling SENTER, but the kernel is expected to reset/tear
4970 		 * down the PMRs.
4971 		 */
4972 		if (intel_iommu_tboot_noforce) {
4973 			for_each_iommu(iommu, drhd)
4974 				iommu_disable_protect_mem_regions(iommu);
4975 		}
4976 
4977 		/*
4978 		 * Make sure the IOMMUs are switched off, even when we
4979 		 * boot into a kexec kernel and the previous kernel left
4980 		 * them enabled
4981 		 */
4982 		intel_disable_iommus();
4983 		goto out_free_dmar;
4984 	}
4985 
4986 	if (list_empty(&dmar_rmrr_units))
4987 		pr_info("No RMRR found\n");
4988 
4989 	if (list_empty(&dmar_atsr_units))
4990 		pr_info("No ATSR found\n");
4991 
4992 	if (dmar_init_reserved_ranges()) {
4993 		if (force_on)
4994 			panic("tboot: Failed to reserve iommu ranges\n");
4995 		goto out_free_reserved_range;
4996 	}
4997 
4998 	if (dmar_map_gfx)
4999 		intel_iommu_gfx_mapped = 1;
5000 
5001 	init_no_remapping_devices();
5002 
5003 	ret = init_dmars();
5004 	if (ret) {
5005 		if (force_on)
5006 			panic("tboot: Failed to initialize DMARs\n");
5007 		pr_err("Initialization failed\n");
5008 		goto out_free_reserved_range;
5009 	}
5010 	up_write(&dmar_global_lock);
5011 
5012 	init_iommu_pm_ops();
5013 
5014 	down_read(&dmar_global_lock);
5015 	for_each_active_iommu(iommu, drhd) {
5016 		iommu_device_sysfs_add(&iommu->iommu, NULL,
5017 				       intel_iommu_groups,
5018 				       "%s", iommu->name);
5019 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5020 		iommu_device_register(&iommu->iommu);
5021 	}
5022 	up_read(&dmar_global_lock);
5023 
5024 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5025 	if (si_domain && !hw_pass_through)
5026 		register_memory_notifier(&intel_iommu_memory_nb);
5027 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5028 			  intel_iommu_cpu_dead);
5029 
5030 	down_read(&dmar_global_lock);
5031 	if (probe_acpi_namespace_devices())
5032 		pr_warn("ACPI name space devices didn't probe correctly\n");
5033 
5034 	/* Finally, we enable the DMA remapping hardware. */
5035 	for_each_iommu(iommu, drhd) {
5036 		if (!drhd->ignored && !translation_pre_enabled(iommu))
5037 			iommu_enable_translation(iommu);
5038 
5039 		iommu_disable_protect_mem_regions(iommu);
5040 	}
5041 	up_read(&dmar_global_lock);
5042 
5043 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5044 
5045 	intel_iommu_enabled = 1;
5046 
5047 	return 0;
5048 
5049 out_free_reserved_range:
5050 	put_iova_domain(&reserved_iova_list);
5051 out_free_dmar:
5052 	intel_iommu_free_dmars();
5053 	up_write(&dmar_global_lock);
5054 	iommu_exit_mempool();
5055 	return ret;
5056 }
5057 
5058 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5059 {
5060 	struct intel_iommu *iommu = opaque;
5061 
5062 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5063 	return 0;
5064 }
5065 
5066 /*
5067  * NB - intel-iommu lacks any sort of reference counting for the users of
5068  * dependent devices.  If multiple endpoints have intersecting dependent
5069  * devices, unbinding the driver from any one of them will possibly leave
5070  * the others unable to operate.
5071  */
5072 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5073 {
5074 	if (!iommu || !dev || !dev_is_pci(dev))
5075 		return;
5076 
5077 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5078 }
5079 
5080 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5081 {
5082 	struct dmar_domain *domain;
5083 	struct intel_iommu *iommu;
5084 	unsigned long flags;
5085 
5086 	assert_spin_locked(&device_domain_lock);
5087 
5088 	if (WARN_ON(!info))
5089 		return;
5090 
5091 	iommu = info->iommu;
5092 	domain = info->domain;
5093 
5094 	if (info->dev) {
5095 		if (dev_is_pci(info->dev) && sm_supported(iommu))
5096 			intel_pasid_tear_down_entry(iommu, info->dev,
5097 					PASID_RID2PASID, false);
5098 
5099 		iommu_disable_dev_iotlb(info);
5100 		if (!dev_is_real_dma_subdevice(info->dev))
5101 			domain_context_clear(iommu, info->dev);
5102 		intel_pasid_free_table(info->dev);
5103 	}
5104 
5105 	unlink_domain_info(info);
5106 
5107 	spin_lock_irqsave(&iommu->lock, flags);
5108 	domain_detach_iommu(domain, iommu);
5109 	spin_unlock_irqrestore(&iommu->lock, flags);
5110 
5111 	free_devinfo_mem(info);
5112 }
5113 
5114 static void dmar_remove_one_dev_info(struct device *dev)
5115 {
5116 	struct device_domain_info *info;
5117 	unsigned long flags;
5118 
5119 	spin_lock_irqsave(&device_domain_lock, flags);
5120 	info = get_domain_info(dev);
5121 	if (info)
5122 		__dmar_remove_one_dev_info(info);
5123 	spin_unlock_irqrestore(&device_domain_lock, flags);
5124 }
5125 
5126 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5127 {
5128 	int adjust_width;
5129 
5130 	/* calculate AGAW */
5131 	domain->gaw = guest_width;
5132 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5133 	domain->agaw = width_to_agaw(adjust_width);
5134 
5135 	domain->iommu_coherency = 0;
5136 	domain->iommu_snooping = 0;
5137 	domain->iommu_superpage = 0;
5138 	domain->max_addr = 0;
5139 
5140 	/* always allocate the top pgd */
5141 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5142 	if (!domain->pgd)
5143 		return -ENOMEM;
5144 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5145 	return 0;
5146 }
5147 
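/*
 * Initialize the IOVA allocator for a DMA-API domain, inheriting the
 * globally reserved ranges and, unless strict mode is enabled, a deferred
 * flush queue for lazy IOTLB invalidation.
 */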
5148 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5149 {
5150 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5151 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5152 
5153 	if (!intel_iommu_strict &&
5154 	    init_iova_flush_queue(&dmar_domain->iovad,
5155 				  iommu_flush_iova, iova_entry_free))
5156 		pr_info("iova flush queue initialization failed\n");
5157 }
5158 
5159 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5160 {
5161 	struct dmar_domain *dmar_domain;
5162 	struct iommu_domain *domain;
5163 
5164 	switch (type) {
5165 	case IOMMU_DOMAIN_DMA:
5166 	case IOMMU_DOMAIN_UNMANAGED:
5167 		dmar_domain = alloc_domain(0);
5168 		if (!dmar_domain) {
5169 			pr_err("Can't allocate dmar_domain\n");
5170 			return NULL;
5171 		}
5172 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5173 			pr_err("Domain initialization failed\n");
5174 			domain_exit(dmar_domain);
5175 			return NULL;
5176 		}
5177 
5178 		if (type == IOMMU_DOMAIN_DMA)
5179 			intel_init_iova_domain(dmar_domain);
5180 
5181 		domain = &dmar_domain->domain;
5182 		domain->geometry.aperture_start = 0;
5183 		domain->geometry.aperture_end   =
5184 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5185 		domain->geometry.force_aperture = true;
5186 
5187 		return domain;
5188 	case IOMMU_DOMAIN_IDENTITY:
5189 		return &si_domain->domain;
5190 	default:
5191 		return NULL;
5192 	}
5193 
5194 	return NULL;
5195 }
5196 
5197 static void intel_iommu_domain_free(struct iommu_domain *domain)
5198 {
5199 	if (domain != &si_domain->domain)
5200 		domain_exit(to_dmar_domain(domain));
5201 }
5202 
5203 /*
5204  * Check whether a @domain could be attached to the @dev through the
5205  * aux-domain attach/detach APIs.
5206  */
5207 static inline bool
5208 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5209 {
5210 	struct device_domain_info *info = get_domain_info(dev);
5211 
5212 	return info && info->auxd_enabled &&
5213 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5214 }
5215 
5216 static void auxiliary_link_device(struct dmar_domain *domain,
5217 				  struct device *dev)
5218 {
5219 	struct device_domain_info *info = get_domain_info(dev);
5220 
5221 	assert_spin_locked(&device_domain_lock);
5222 	if (WARN_ON(!info))
5223 		return;
5224 
5225 	domain->auxd_refcnt++;
5226 	list_add(&domain->auxd, &info->auxiliary_domains);
5227 }
5228 
5229 static void auxiliary_unlink_device(struct dmar_domain *domain,
5230 				    struct device *dev)
5231 {
5232 	struct device_domain_info *info = get_domain_info(dev);
5233 
5234 	assert_spin_locked(&device_domain_lock);
5235 	if (WARN_ON(!info))
5236 		return;
5237 
5238 	list_del(&domain->auxd);
5239 	domain->auxd_refcnt--;
5240 
5241 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5242 		ioasid_free(domain->default_pasid);
5243 }
5244 
5245 static int aux_domain_add_dev(struct dmar_domain *domain,
5246 			      struct device *dev)
5247 {
5248 	int ret;
5249 	unsigned long flags;
5250 	struct intel_iommu *iommu;
5251 
5252 	iommu = device_to_iommu(dev, NULL, NULL);
5253 	if (!iommu)
5254 		return -ENODEV;
5255 
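	/*
	 * A single "default" PASID is allocated lazily per aux domain and is
	 * shared by every device attached to it in aux mode; the PASID table
	 * entry set up below translates through this domain's page tables.
	 */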
5256 	if (domain->default_pasid <= 0) {
5257 		u32 pasid;
5258 
5259 		/* No private data needed for the default pasid */
5260 		pasid = ioasid_alloc(NULL, PASID_MIN,
5261 				     pci_max_pasids(to_pci_dev(dev)) - 1,
5262 				     NULL);
5263 		if (pasid == INVALID_IOASID) {
5264 			pr_err("Can't allocate default pasid\n");
5265 			return -ENODEV;
5266 		}
5267 		domain->default_pasid = pasid;
5268 	}
5269 
5270 	spin_lock_irqsave(&device_domain_lock, flags);
5271 	/*
5272 	 * iommu->lock must be held to attach domain to iommu and setup the
5273 	 * pasid entry for second level translation.
5274 	 */
5275 	spin_lock(&iommu->lock);
5276 	ret = domain_attach_iommu(domain, iommu);
5277 	if (ret)
5278 		goto attach_failed;
5279 
5280 	/* Setup the PASID entry for mediated devices: */
5281 	if (domain_use_first_level(domain))
5282 		ret = domain_setup_first_level(iommu, domain, dev,
5283 					       domain->default_pasid);
5284 	else
5285 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5286 						     domain->default_pasid);
5287 	if (ret)
5288 		goto table_failed;
5289 	spin_unlock(&iommu->lock);
5290 
5291 	auxiliary_link_device(domain, dev);
5292 
5293 	spin_unlock_irqrestore(&device_domain_lock, flags);
5294 
5295 	return 0;
5296 
5297 table_failed:
5298 	domain_detach_iommu(domain, iommu);
5299 attach_failed:
5300 	spin_unlock(&iommu->lock);
5301 	spin_unlock_irqrestore(&device_domain_lock, flags);
5302 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5303 		ioasid_free(domain->default_pasid);
5304 
5305 	return ret;
5306 }
5307 
5308 static void aux_domain_remove_dev(struct dmar_domain *domain,
5309 				  struct device *dev)
5310 {
5311 	struct device_domain_info *info;
5312 	struct intel_iommu *iommu;
5313 	unsigned long flags;
5314 
5315 	if (!is_aux_domain(dev, &domain->domain))
5316 		return;
5317 
5318 	spin_lock_irqsave(&device_domain_lock, flags);
5319 	info = get_domain_info(dev);
5320 	iommu = info->iommu;
5321 
5322 	auxiliary_unlink_device(domain, dev);
5323 
5324 	spin_lock(&iommu->lock);
5325 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5326 	domain_detach_iommu(domain, iommu);
5327 	spin_unlock(&iommu->lock);
5328 
5329 	spin_unlock_irqrestore(&device_domain_lock, flags);
5330 }
5331 
5332 static int prepare_domain_attach_device(struct iommu_domain *domain,
5333 					struct device *dev)
5334 {
5335 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5336 	struct intel_iommu *iommu;
5337 	int addr_width;
5338 
5339 	iommu = device_to_iommu(dev, NULL, NULL);
5340 	if (!iommu)
5341 		return -ENODEV;
5342 
5343 	/* check if this iommu agaw is sufficient for max mapped address */
5344 	addr_width = agaw_to_width(iommu->agaw);
5345 	if (addr_width > cap_mgaw(iommu->cap))
5346 		addr_width = cap_mgaw(iommu->cap);
5347 
5348 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5349 		dev_err(dev, "%s: iommu width (%d) is not "
5350 		        "sufficient for the mapped address (%llx)\n",
5351 		        __func__, addr_width, dmar_domain->max_addr);
5352 		return -EFAULT;
5353 	}
5354 	dmar_domain->gaw = addr_width;
5355 
5356 	/*
5357 	 * Knock out extra levels of page tables if necessary
5358 	 */
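	/*
	 * e.g. a domain built with a 5-level table attaching to an IOMMU that
	 * only supports 4 levels drops its top level here: the pgd is replaced
	 * by the table referenced from its first entry, which covers the whole
	 * (already range-checked) address space.
	 */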
5359 	while (iommu->agaw < dmar_domain->agaw) {
5360 		struct dma_pte *pte;
5361 
5362 		pte = dmar_domain->pgd;
5363 		if (dma_pte_present(pte)) {
5364 			dmar_domain->pgd = (struct dma_pte *)
5365 				phys_to_virt(dma_pte_addr(pte));
5366 			free_pgtable_page(pte);
5367 		}
5368 		dmar_domain->agaw--;
5369 	}
5370 
5371 	return 0;
5372 }
5373 
5374 static int intel_iommu_attach_device(struct iommu_domain *domain,
5375 				     struct device *dev)
5376 {
5377 	int ret;
5378 
5379 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5380 	    device_is_rmrr_locked(dev)) {
5381 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5382 		return -EPERM;
5383 	}
5384 
5385 	if (is_aux_domain(dev, domain))
5386 		return -EPERM;
5387 
5388 	/* normally dev is not mapped */
5389 	if (unlikely(domain_context_mapped(dev))) {
5390 		struct dmar_domain *old_domain;
5391 
5392 		old_domain = find_domain(dev);
5393 		if (old_domain)
5394 			dmar_remove_one_dev_info(dev);
5395 	}
5396 
5397 	ret = prepare_domain_attach_device(domain, dev);
5398 	if (ret)
5399 		return ret;
5400 
5401 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5402 }
5403 
5404 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5405 					 struct device *dev)
5406 {
5407 	int ret;
5408 
5409 	if (!is_aux_domain(dev, domain))
5410 		return -EPERM;
5411 
5412 	ret = prepare_domain_attach_device(domain, dev);
5413 	if (ret)
5414 		return ret;
5415 
5416 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5417 }
5418 
5419 static void intel_iommu_detach_device(struct iommu_domain *domain,
5420 				      struct device *dev)
5421 {
5422 	dmar_remove_one_dev_info(dev);
5423 }
5424 
5425 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5426 					  struct device *dev)
5427 {
5428 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5429 }
5430 
5431 #ifdef CONFIG_INTEL_IOMMU_SVM
5432 /*
5433  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5434  * VT-d granularity. Invalidation is typically included in the unmap operation
5435  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5436  * owns the first level page tables. Invalidations of translation caches in the
5437  * guest are trapped and passed down to the host.
5438  *
5439  * vIOMMU in the guest will only expose first level page tables, therefore
5440  * we do not support IOTLB granularity for requests without a PASID (second level).
5441  *
5442  * For example, to find the VT-d granularity encoding for IOTLB
5443  * type and page selective granularity within PASID:
5444  * X: indexed by iommu cache type
5445  * Y: indexed by enum iommu_inv_granularity
5446  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5447  */
5448 
5449 static const int
5450 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5451 	/*
5452 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5453 	 * page selective (address granularity)
5454 	 */
5455 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5456 	/* PASID based dev TLBs */
5457 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5458 	/* PASID cache */
5459 	{-EINVAL, -EINVAL, -EINVAL}
5460 };
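/*
 * e.g. inv_type_granu_table[IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
 * is QI_GRAN_PSI_PASID (page-selective within a PASID), while every entry in
 * the PASID-cache row is -EINVAL because passdown PASID-cache invalidation is
 * not translated here.
 */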
5461 
5462 static inline int to_vtd_granularity(int type, int granu)
5463 {
5464 	return inv_type_granu_table[type][granu];
5465 }
5466 
5467 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5468 {
5469 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5470 
5471 	/* VT-d encodes the size as 2^size 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
5472 	 * The IOMMU cache invalidate API passes granu_size in bytes and the
5473 	 * number of contiguous granules of that size.
5474 	 */
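	/* e.g. granu_size = 4KiB and nr_granules = 512 give nr_pages = 512,
	 * so the function returns 9, the encoding for a 2MiB range.
	 */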
5475 	return order_base_2(nr_pages);
5476 }
5477 
5478 static int
5479 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5480 			   struct iommu_cache_invalidate_info *inv_info)
5481 {
5482 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5483 	struct device_domain_info *info;
5484 	struct intel_iommu *iommu;
5485 	unsigned long flags;
5486 	int cache_type;
5487 	u8 bus, devfn;
5488 	u16 did, sid;
5489 	int ret = 0;
5490 	u64 size = 0;
5491 
5492 	if (!inv_info || !dmar_domain)
5493 		return -EINVAL;
5494 
5495 	if (!dev || !dev_is_pci(dev))
5496 		return -ENODEV;
5497 
5498 	iommu = device_to_iommu(dev, &bus, &devfn);
5499 	if (!iommu)
5500 		return -ENODEV;
5501 
5502 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5503 		return -EINVAL;
5504 
5505 	spin_lock_irqsave(&device_domain_lock, flags);
5506 	spin_lock(&iommu->lock);
5507 	info = get_domain_info(dev);
5508 	if (!info) {
5509 		ret = -EINVAL;
5510 		goto out_unlock;
5511 	}
5512 	did = dmar_domain->iommu_did[iommu->seq_id];
5513 	sid = PCI_DEVID(bus, devfn);
5514 
5515 	/* Size is only valid in address selective invalidation */
5516 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5517 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5518 				   inv_info->granu.addr_info.nb_granules);
5519 
5520 	for_each_set_bit(cache_type,
5521 			 (unsigned long *)&inv_info->cache,
5522 			 IOMMU_CACHE_INV_TYPE_NR) {
5523 		int granu = 0;
5524 		u64 pasid = 0;
5525 		u64 addr = 0;
5526 
5527 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5528 		if (granu == -EINVAL) {
5529 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5530 					   cache_type, inv_info->granularity);
5531 			break;
5532 		}
5533 
5534 		/*
5535 		 * PASID is stored in different locations based on the
5536 		 * granularity.
5537 		 */
5538 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5539 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5540 			pasid = inv_info->granu.pasid_info.pasid;
5541 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5542 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5543 			pasid = inv_info->granu.addr_info.pasid;
5544 
5545 		switch (BIT(cache_type)) {
5546 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5547 			/* HW will ignore LSB bits based on address mask */
5548 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5549 			    size &&
5550 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5551 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5552 						   inv_info->granu.addr_info.addr, size);
5553 			}
5554 
5555 			/*
5556 			 * If granu is PASID-selective, address is ignored.
5557 			 * We use npages = -1 to indicate that.
5558 			 */
5559 			qi_flush_piotlb(iommu, did, pasid,
5560 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5561 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5562 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5563 
5564 			if (!info->ats_enabled)
5565 				break;
5566 			/*
5567 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5568 			 * in the guest may assume IOTLB flush is inclusive,
5569 			 * which is more efficient.
5570 			 */
5571 			fallthrough;
5572 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5573 			/*
5574 			 * PASID based device TLB invalidation does not support
5575 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5576 			 * IOMMU_INV_GRANU_ADDR.
5577 			 * The equivalent here is to set the size to cover the
5578 			 * entire 64-bit address range. The user provides only
5579 			 * PASID info, no address info, so addr is set to 0.
5580 			 */
5581 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5582 				size = 64 - VTD_PAGE_SHIFT;
5583 				addr = 0;
5584 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5585 				addr = inv_info->granu.addr_info.addr;
5586 			}
5587 
5588 			if (info->ats_enabled)
5589 				qi_flush_dev_iotlb_pasid(iommu, sid,
5590 						info->pfsid, pasid,
5591 						info->ats_qdep, addr,
5592 						size);
5593 			else
5594 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5595 			break;
5596 		default:
5597 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5598 					    cache_type);
5599 			ret = -EINVAL;
5600 		}
5601 	}
5602 out_unlock:
5603 	spin_unlock(&iommu->lock);
5604 	spin_unlock_irqrestore(&device_domain_lock, flags);
5605 
5606 	return ret;
5607 }
5608 #endif
5609 
5610 static int intel_iommu_map(struct iommu_domain *domain,
5611 			   unsigned long iova, phys_addr_t hpa,
5612 			   size_t size, int iommu_prot, gfp_t gfp)
5613 {
5614 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5615 	u64 max_addr;
5616 	int prot = 0;
5617 	int ret;
5618 
5619 	if (iommu_prot & IOMMU_READ)
5620 		prot |= DMA_PTE_READ;
5621 	if (iommu_prot & IOMMU_WRITE)
5622 		prot |= DMA_PTE_WRITE;
5623 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5624 		prot |= DMA_PTE_SNP;
5625 
5626 	max_addr = iova + size;
5627 	if (dmar_domain->max_addr < max_addr) {
5628 		u64 end;
5629 
5630 		/* check if minimum agaw is sufficient for mapped address */
5631 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5632 		if (end < max_addr) {
5633 			pr_err("%s: iommu width (%d) is not "
5634 			       "sufficient for the mapped address (%llx)\n",
5635 			       __func__, dmar_domain->gaw, max_addr);
5636 			return -EFAULT;
5637 		}
5638 		dmar_domain->max_addr = max_addr;
5639 	}
5640 	/* Round up size to next multiple of PAGE_SIZE, if it and
5641 	   the low bits of hpa would take us onto the next page */
5642 	size = aligned_nrpages(hpa, size);
5643 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5644 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5645 	return ret;
5646 }
5647 
5648 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5649 				unsigned long iova, size_t size,
5650 				struct iommu_iotlb_gather *gather)
5651 {
5652 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5653 	struct page *freelist = NULL;
5654 	unsigned long start_pfn, last_pfn;
5655 	unsigned int npages;
5656 	int iommu_id, level = 0;
5657 
5658 	/* Cope with horrid API which requires us to unmap more than the
5659 	   size argument if it happens to be a large-page mapping. */
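	/* e.g. a 4KiB unmap request that lands inside a 2MiB superpage (level 2
	   PTE) is widened below to cover the whole 2MiB mapping. */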
5660 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5661 
5662 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5663 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5664 
5665 	start_pfn = iova >> VTD_PAGE_SHIFT;
5666 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5667 
5668 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5669 
5670 	npages = last_pfn - start_pfn + 1;
5671 
5672 	for_each_domain_iommu(iommu_id, dmar_domain)
5673 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5674 				      start_pfn, npages, !freelist, 0);
5675 
5676 	dma_free_pagelist(freelist);
5677 
5678 	if (dmar_domain->max_addr == iova + size)
5679 		dmar_domain->max_addr = iova;
5680 
5681 	return size;
5682 }
5683 
5684 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5685 					    dma_addr_t iova)
5686 {
5687 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5688 	struct dma_pte *pte;
5689 	int level = 0;
5690 	u64 phys = 0;
5691 
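	/*
	 * dma_pte_addr() returns the base of the page or superpage backing the
	 * IOVA; the masked low IOVA bits supply the offset into it (4KiB worth
	 * at level 1, 2MiB at level 2, 1GiB at level 3).
	 */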
5692 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5693 	if (pte && dma_pte_present(pte))
5694 		phys = dma_pte_addr(pte) +
5695 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5696 						VTD_PAGE_SHIFT) - 1));
5697 
5698 	return phys;
5699 }
5700 
5701 static inline bool scalable_mode_support(void)
5702 {
5703 	struct dmar_drhd_unit *drhd;
5704 	struct intel_iommu *iommu;
5705 	bool ret = true;
5706 
5707 	rcu_read_lock();
5708 	for_each_active_iommu(iommu, drhd) {
5709 		if (!sm_supported(iommu)) {
5710 			ret = false;
5711 			break;
5712 		}
5713 	}
5714 	rcu_read_unlock();
5715 
5716 	return ret;
5717 }
5718 
5719 static inline bool iommu_pasid_support(void)
5720 {
5721 	struct dmar_drhd_unit *drhd;
5722 	struct intel_iommu *iommu;
5723 	bool ret = true;
5724 
5725 	rcu_read_lock();
5726 	for_each_active_iommu(iommu, drhd) {
5727 		if (!pasid_supported(iommu)) {
5728 			ret = false;
5729 			break;
5730 		}
5731 	}
5732 	rcu_read_unlock();
5733 
5734 	return ret;
5735 }
5736 
5737 static inline bool nested_mode_support(void)
5738 {
5739 	struct dmar_drhd_unit *drhd;
5740 	struct intel_iommu *iommu;
5741 	bool ret = true;
5742 
5743 	rcu_read_lock();
5744 	for_each_active_iommu(iommu, drhd) {
5745 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5746 			ret = false;
5747 			break;
5748 		}
5749 	}
5750 	rcu_read_unlock();
5751 
5752 	return ret;
5753 }
5754 
5755 static bool intel_iommu_capable(enum iommu_cap cap)
5756 {
5757 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5758 		return domain_update_iommu_snooping(NULL) == 1;
5759 	if (cap == IOMMU_CAP_INTR_REMAP)
5760 		return irq_remapping_enabled == 1;
5761 
5762 	return false;
5763 }
5764 
5765 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5766 {
5767 	struct intel_iommu *iommu;
5768 
5769 	iommu = device_to_iommu(dev, NULL, NULL);
5770 	if (!iommu)
5771 		return ERR_PTR(-ENODEV);
5772 
5773 	if (translation_pre_enabled(iommu))
5774 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5775 
5776 	return &iommu->iommu;
5777 }
5778 
5779 static void intel_iommu_release_device(struct device *dev)
5780 {
5781 	struct intel_iommu *iommu;
5782 
5783 	iommu = device_to_iommu(dev, NULL, NULL);
5784 	if (!iommu)
5785 		return;
5786 
5787 	dmar_remove_one_dev_info(dev);
5788 
5789 	set_dma_ops(dev, NULL);
5790 }
5791 
5792 static void intel_iommu_probe_finalize(struct device *dev)
5793 {
5794 	struct iommu_domain *domain;
5795 
5796 	domain = iommu_get_domain_for_dev(dev);
5797 	if (device_needs_bounce(dev))
5798 		set_dma_ops(dev, &bounce_dma_ops);
5799 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5800 		set_dma_ops(dev, &intel_dma_ops);
5801 	else
5802 		set_dma_ops(dev, NULL);
5803 }
5804 
5805 static void intel_iommu_get_resv_regions(struct device *device,
5806 					 struct list_head *head)
5807 {
5808 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5809 	struct iommu_resv_region *reg;
5810 	struct dmar_rmrr_unit *rmrr;
5811 	struct device *i_dev;
5812 	int i;
5813 
5814 	down_read(&dmar_global_lock);
5815 	for_each_rmrr_units(rmrr) {
5816 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5817 					  i, i_dev) {
5818 			struct iommu_resv_region *resv;
5819 			enum iommu_resv_type type;
5820 			size_t length;
5821 
5822 			if (i_dev != device &&
5823 			    !is_downstream_to_pci_bridge(device, i_dev))
5824 				continue;
5825 
5826 			length = rmrr->end_address - rmrr->base_address + 1;
5827 
5828 			type = device_rmrr_is_relaxable(device) ?
5829 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5830 
5831 			resv = iommu_alloc_resv_region(rmrr->base_address,
5832 						       length, prot, type);
5833 			if (!resv)
5834 				break;
5835 
5836 			list_add_tail(&resv->list, head);
5837 		}
5838 	}
5839 	up_read(&dmar_global_lock);
5840 
5841 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5842 	if (dev_is_pci(device)) {
5843 		struct pci_dev *pdev = to_pci_dev(device);
5844 
5845 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
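			/*
			 * Reserve the first 16MiB (1UL << 24) as a relaxable
			 * direct-mapped region for legacy ISA/LPC DMA (e.g. the
			 * floppy controller this workaround is named after).
			 */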
5846 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5847 						   IOMMU_RESV_DIRECT_RELAXABLE);
5848 			if (reg)
5849 				list_add_tail(&reg->list, head);
5850 		}
5851 	}
5852 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5853 
5854 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5855 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5856 				      0, IOMMU_RESV_MSI);
5857 	if (!reg)
5858 		return;
5859 	list_add_tail(&reg->list, head);
5860 }
5861 
5862 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5863 {
5864 	struct device_domain_info *info;
5865 	struct context_entry *context;
5866 	struct dmar_domain *domain;
5867 	unsigned long flags;
5868 	u64 ctx_lo;
5869 	int ret;
5870 
5871 	domain = find_domain(dev);
5872 	if (!domain)
5873 		return -EINVAL;
5874 
5875 	spin_lock_irqsave(&device_domain_lock, flags);
5876 	spin_lock(&iommu->lock);
5877 
5878 	ret = -EINVAL;
5879 	info = get_domain_info(dev);
5880 	if (!info || !info->pasid_supported)
5881 		goto out;
5882 
5883 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5884 	if (WARN_ON(!context))
5885 		goto out;
5886 
5887 	ctx_lo = context[0].lo;
5888 
5889 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5890 		ctx_lo |= CONTEXT_PASIDE;
5891 		context[0].lo = ctx_lo;
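		/*
		 * Make sure the context-entry update is ordered before the
		 * context-cache invalidation issued below.
		 */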
5892 		wmb();
5893 		iommu->flush.flush_context(iommu,
5894 					   domain->iommu_did[iommu->seq_id],
5895 					   PCI_DEVID(info->bus, info->devfn),
5896 					   DMA_CCMD_MASK_NOBIT,
5897 					   DMA_CCMD_DEVICE_INVL);
5898 	}
5899 
5900 	/* Enable PASID support in the device, if it wasn't already */
5901 	if (!info->pasid_enabled)
5902 		iommu_enable_dev_iotlb(info);
5903 
5904 	ret = 0;
5905 
5906  out:
5907 	spin_unlock(&iommu->lock);
5908 	spin_unlock_irqrestore(&device_domain_lock, flags);
5909 
5910 	return ret;
5911 }
5912 
5913 static void intel_iommu_apply_resv_region(struct device *dev,
5914 					  struct iommu_domain *domain,
5915 					  struct iommu_resv_region *region)
5916 {
5917 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5918 	unsigned long start, end;
5919 
5920 	start = IOVA_PFN(region->start);
5921 	end   = IOVA_PFN(region->start + region->length - 1);
5922 
5923 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5924 }
5925 
5926 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5927 {
5928 	if (dev_is_pci(dev))
5929 		return pci_device_group(dev);
5930 	return generic_device_group(dev);
5931 }
5932 
5933 static int intel_iommu_enable_auxd(struct device *dev)
5934 {
5935 	struct device_domain_info *info;
5936 	struct intel_iommu *iommu;
5937 	unsigned long flags;
5938 	int ret;
5939 
5940 	iommu = device_to_iommu(dev, NULL, NULL);
5941 	if (!iommu || dmar_disabled)
5942 		return -EINVAL;
5943 
5944 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5945 		return -EINVAL;
5946 
5947 	ret = intel_iommu_enable_pasid(iommu, dev);
5948 	if (ret)
5949 		return -ENODEV;
5950 
5951 	spin_lock_irqsave(&device_domain_lock, flags);
5952 	info = get_domain_info(dev);
5953 	info->auxd_enabled = 1;
5954 	spin_unlock_irqrestore(&device_domain_lock, flags);
5955 
5956 	return 0;
5957 }
5958 
5959 static int intel_iommu_disable_auxd(struct device *dev)
5960 {
5961 	struct device_domain_info *info;
5962 	unsigned long flags;
5963 
5964 	spin_lock_irqsave(&device_domain_lock, flags);
5965 	info = get_domain_info(dev);
5966 	if (!WARN_ON(!info))
5967 		info->auxd_enabled = 0;
5968 	spin_unlock_irqrestore(&device_domain_lock, flags);
5969 
5970 	return 0;
5971 }
5972 
5973 /*
5974  * A PCI Express Designated Vendor-Specific Extended Capability is defined
5975  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5976  * so that system software and tools can detect endpoint devices supporting
5977  * Intel Scalable I/O Virtualization without a host driver dependency.
5978  *
5979  * Returns the address of the matching extended capability structure within
5980  * the device's PCI configuration space or 0 if the device does not support
5981  * it.
5982  */
5983 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5984 {
5985 	int pos;
5986 	u16 vendor, id;
5987 
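	/*
	 * 0x23 is the PCIe Designated Vendor-Specific Extended Capability
	 * (DVSEC) ID; within each instance the DVSEC vendor ID sits at offset
	 * 4 and the DVSEC ID at offset 8. Intel advertises Scalable IOV
	 * support with DVSEC ID 5.
	 */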
5988 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5989 	while (pos) {
5990 		pci_read_config_word(pdev, pos + 4, &vendor);
5991 		pci_read_config_word(pdev, pos + 8, &id);
5992 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5993 			return pos;
5994 
5995 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5996 	}
5997 
5998 	return 0;
5999 }
6000 
6001 static bool
6002 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6003 {
6004 	if (feat == IOMMU_DEV_FEAT_AUX) {
6005 		int ret;
6006 
6007 		if (!dev_is_pci(dev) || dmar_disabled ||
6008 		    !scalable_mode_support() || !iommu_pasid_support())
6009 			return false;
6010 
6011 		ret = pci_pasid_features(to_pci_dev(dev));
6012 		if (ret < 0)
6013 			return false;
6014 
6015 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
6016 	}
6017 
6018 	if (feat == IOMMU_DEV_FEAT_SVA) {
6019 		struct device_domain_info *info = get_domain_info(dev);
6020 
6021 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
6022 			info->pasid_supported && info->pri_supported &&
6023 			info->ats_supported;
6024 	}
6025 
6026 	return false;
6027 }
6028 
6029 static int
6030 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6031 {
6032 	if (feat == IOMMU_DEV_FEAT_AUX)
6033 		return intel_iommu_enable_auxd(dev);
6034 
6035 	if (feat == IOMMU_DEV_FEAT_SVA) {
6036 		struct device_domain_info *info = get_domain_info(dev);
6037 
6038 		if (!info)
6039 			return -EINVAL;
6040 
6041 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6042 			return 0;
6043 	}
6044 
6045 	return -ENODEV;
6046 }
6047 
6048 static int
6049 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6050 {
6051 	if (feat == IOMMU_DEV_FEAT_AUX)
6052 		return intel_iommu_disable_auxd(dev);
6053 
6054 	return -ENODEV;
6055 }
6056 
6057 static bool
6058 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6059 {
6060 	struct device_domain_info *info = get_domain_info(dev);
6061 
6062 	if (feat == IOMMU_DEV_FEAT_AUX)
6063 		return scalable_mode_support() && info && info->auxd_enabled;
6064 
6065 	return false;
6066 }
6067 
6068 static int
6069 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6070 {
6071 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6072 
6073 	return dmar_domain->default_pasid > 0 ?
6074 			dmar_domain->default_pasid : -EINVAL;
6075 }
6076 
6077 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6078 					   struct device *dev)
6079 {
6080 	return attach_deferred(dev);
6081 }
6082 
6083 static int
6084 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6085 			    enum iommu_attr attr, void *data)
6086 {
6087 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6088 	unsigned long flags;
6089 	int ret = 0;
6090 
6091 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6092 		return -EINVAL;
6093 
6094 	switch (attr) {
6095 	case DOMAIN_ATTR_NESTING:
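		/*
		 * Nesting can only be turned on while no device is attached,
		 * and only if every IOMMU supports scalable mode with nested
		 * translation; first-level use is cleared so this domain's
		 * table can serve as the second level under a guest-owned
		 * first level.
		 */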
6096 		spin_lock_irqsave(&device_domain_lock, flags);
6097 		if (nested_mode_support() &&
6098 		    list_empty(&dmar_domain->devices)) {
6099 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6100 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6101 		} else {
6102 			ret = -ENODEV;
6103 		}
6104 		spin_unlock_irqrestore(&device_domain_lock, flags);
6105 		break;
6106 	default:
6107 		ret = -EINVAL;
6108 		break;
6109 	}
6110 
6111 	return ret;
6112 }
6113 
6114 /*
6115  * Check that the device does not live on an external facing PCI port that is
6116  * marked as untrusted. Such devices should not be able to apply quirks and
6117  * thus not be able to bypass the IOMMU restrictions.
6118  */
6119 static bool risky_device(struct pci_dev *pdev)
6120 {
6121 	if (pdev->untrusted) {
6122 		pci_info(pdev,
6123 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6124 			 pdev->vendor, pdev->device);
6125 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6126 		return true;
6127 	}
6128 	return false;
6129 }
6130 
6131 const struct iommu_ops intel_iommu_ops = {
6132 	.capable		= intel_iommu_capable,
6133 	.domain_alloc		= intel_iommu_domain_alloc,
6134 	.domain_free		= intel_iommu_domain_free,
6135 	.domain_set_attr	= intel_iommu_domain_set_attr,
6136 	.attach_dev		= intel_iommu_attach_device,
6137 	.detach_dev		= intel_iommu_detach_device,
6138 	.aux_attach_dev		= intel_iommu_aux_attach_device,
6139 	.aux_detach_dev		= intel_iommu_aux_detach_device,
6140 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6141 	.map			= intel_iommu_map,
6142 	.unmap			= intel_iommu_unmap,
6143 	.iova_to_phys		= intel_iommu_iova_to_phys,
6144 	.probe_device		= intel_iommu_probe_device,
6145 	.probe_finalize		= intel_iommu_probe_finalize,
6146 	.release_device		= intel_iommu_release_device,
6147 	.get_resv_regions	= intel_iommu_get_resv_regions,
6148 	.put_resv_regions	= generic_iommu_put_resv_regions,
6149 	.apply_resv_region	= intel_iommu_apply_resv_region,
6150 	.device_group		= intel_iommu_device_group,
6151 	.dev_has_feat		= intel_iommu_dev_has_feat,
6152 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6153 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6154 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6155 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6156 	.def_domain_type	= device_def_domain_type,
6157 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6158 #ifdef CONFIG_INTEL_IOMMU_SVM
6159 	.cache_invalidate	= intel_iommu_sva_invalidate,
6160 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6161 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6162 	.sva_bind		= intel_svm_bind,
6163 	.sva_unbind		= intel_svm_unbind,
6164 	.sva_get_pasid		= intel_svm_get_pasid,
6165 	.page_response		= intel_svm_page_response,
6166 #endif
6167 };
6168 
6169 static void quirk_iommu_igfx(struct pci_dev *dev)
6170 {
6171 	if (risky_device(dev))
6172 		return;
6173 
6174 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6175 	dmar_map_gfx = 0;
6176 }
6177 
6178 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6179 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6186 
6187 /* Broadwell igfx malfunctions with dmar */
6188 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6190 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6191 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6192 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6194 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6196 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6199 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6200 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6202 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6204 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6206 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6208 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6209 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6210 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6212 
6213 static void quirk_iommu_rwbf(struct pci_dev *dev)
6214 {
6215 	if (risky_device(dev))
6216 		return;
6217 
6218 	/*
6219 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6220 	 * but needs it. Same seems to hold for the desktop versions.
6221 	 */
6222 	pci_info(dev, "Forcing write-buffer flush capability\n");
6223 	rwbf_quirk = 1;
6224 }
6225 
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6233 
6234 #define GGC 0x52
6235 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6236 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6237 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6238 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6239 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6240 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6241 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6242 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6243 
6244 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6245 {
6246 	unsigned short ggc;
6247 
6248 	if (risky_device(dev))
6249 		return;
6250 
6251 	if (pci_read_config_word(dev, GGC, &ggc))
6252 		return;
6253 
6254 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6255 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6256 		dmar_map_gfx = 0;
6257 	} else if (dmar_map_gfx) {
6258 		/* we have to ensure the gfx device is idle before we flush */
6259 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6260 		intel_iommu_strict = 1;
6261 	}
6262 }
6263 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6267 
6268 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6269 {
6270 	unsigned short ver;
6271 
6272 	if (!IS_GFX_DEVICE(dev))
6273 		return;
6274 
6275 	ver = (dev->device >> 8) & 0xff;
6276 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6277 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6278 	    ver != 0x9a)
6279 		return;
6280 
6281 	if (risky_device(dev))
6282 		return;
6283 
6284 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6285 	iommu_skip_te_disable = 1;
6286 }
6287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6288 
6289 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6290    ISOCH DMAR unit for the Azalia sound device, but not give it any
6291    TLB entries, which causes it to deadlock. Check for that.  We do
6292    this in a function called from init_dmars(), instead of in a PCI
6293    quirk, because we don't want to print the obnoxious "BIOS broken"
6294    message if VT-d is actually disabled.
6295 */
6296 static void __init check_tylersburg_isoch(void)
6297 {
6298 	struct pci_dev *pdev;
6299 	uint32_t vtisochctrl;
6300 
6301 	/* If there's no Azalia in the system anyway, forget it. */
6302 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6303 	if (!pdev)
6304 		return;
6305 
6306 	if (risky_device(pdev)) {
6307 		pci_dev_put(pdev);
6308 		return;
6309 	}
6310 
6311 	pci_dev_put(pdev);
6312 
6313 	/* System Management Registers. Might be hidden, in which case
6314 	   we can't do the sanity check. But that's OK, because the
6315 	   known-broken BIOSes _don't_ actually hide it, so far. */
6316 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6317 	if (!pdev)
6318 		return;
6319 
6320 	if (risky_device(pdev)) {
6321 		pci_dev_put(pdev);
6322 		return;
6323 	}
6324 
6325 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6326 		pci_dev_put(pdev);
6327 		return;
6328 	}
6329 
6330 	pci_dev_put(pdev);
6331 
6332 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6333 	if (vtisochctrl & 1)
6334 		return;
6335 
6336 	/* Drop all bits other than the number of TLB entries */
6337 	vtisochctrl &= 0x1c;
6338 
6339 	/* If we have the recommended number of TLB entries (16), fine. */
6340 	if (vtisochctrl == 0x10)
6341 		return;
6342 
6343 	/* Zero TLB entries? You get to ride the short bus to school. */
6344 	if (!vtisochctrl) {
6345 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6346 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6347 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6348 		     dmi_get_system_info(DMI_BIOS_VERSION),
6349 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6350 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6351 		return;
6352 	}
6353 
6354 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6355 	       vtisochctrl);
6356 }
6357