1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20 
21 #define pr_fmt(fmt)     "DMAR: " fmt
22 
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
38 #include <linux/io.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/crash_dump.h>
49 #include <asm/irq_remapping.h>
50 #include <asm/cacheflush.h>
51 #include <asm/iommu.h>
52 
53 #include "irq_remapping.h"
54 
55 #define ROOT_SIZE		VTD_PAGE_SIZE
56 #define CONTEXT_SIZE		VTD_PAGE_SIZE
57 
58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
62 
63 #define IOAPIC_RANGE_START	(0xfee00000)
64 #define IOAPIC_RANGE_END	(0xfeefffff)
65 #define IOVA_START_ADDR		(0x1000)
66 
67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
68 
69 #define MAX_AGAW_WIDTH 64
70 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
71 
72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
74 
75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
77 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
78 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
79 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
80 
81 /* IO virtual address start page frame number */
82 #define IOVA_START_PFN		(1)
83 
84 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
85 #define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
86 #define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
87 
88 /* page table handling */
89 #define LEVEL_STRIDE		(9)
90 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
91 
92 /*
93  * This bitmap is used to advertise the page sizes our hardware supports
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is an order of a 4KiB page and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are an order of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
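/*
 * Illustrative note (not part of the original file): ~0xFFFUL leaves bits
 * 0-11 clear and sets every bit from 12 upwards, so the IOMMU core is told
 * that every power-of-two size >= 4KiB (bit 12 = 4KiB, bit 13 = 8KiB, ...)
 * can be mapped, matching the "all page sizes that are an order of 4KiB"
 * behaviour described above.
 */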
109 
110 static inline int agaw_to_level(int agaw)
111 {
112 	return agaw + 2;
113 }
114 
115 static inline int agaw_to_width(int agaw)
116 {
117 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119 
120 static inline int width_to_agaw(int width)
121 {
122 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
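/*
 * Worked example (illustrative, not from the original source): with the
 * default 48-bit domain address width, width_to_agaw(48) = DIV_ROUND_UP(18, 9)
 * = 2, agaw_to_level(2) = 4 (a four-level page table), and agaw_to_width(2)
 * gives back 48.
 */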
124 
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127 	return (level - 1) * LEVEL_STRIDE;
128 }
129 
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
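/*
 * Illustrative example: level_to_offset_bits(1) = 0 and
 * level_to_offset_bits(2) = 9, so pfn_level_offset(pfn, 2) extracts bits 9-17
 * of the page frame number, i.e. the index into the second-level table whose
 * entries each cover 2MiB.
 */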
134 
135 static inline unsigned long level_mask(int level)
136 {
137 	return -1UL << level_to_offset_bits(level);
138 }
139 
140 static inline unsigned long level_size(int level)
141 {
142 	return 1UL << level_to_offset_bits(level);
143 }
144 
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147 	return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149 
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152 	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
154 
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156    are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161 
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168 	return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172 	return page_to_dma_pfn(virt_to_page(p));
173 }
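/*
 * Illustrative note: on x86 with 4KiB pages PAGE_SHIFT == VTD_PAGE_SHIFT, so
 * the conversions above are identity operations. They only shift when the MM
 * page size is larger than 4KiB (e.g. a hypothetical 64KiB PAGE_SIZE would
 * make one mm pfn correspond to 16 VT-d pfns), which is why VT-d pages must
 * never be larger than MM pages.
 */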
174 
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177 
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180 
181 /*
182  * set to 1 to panic kernel if can't successfully enable VT-d
183  * (used when kernel is launched w/ TXT)
184  */
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187 
188 /*
189  * 0: Present
190  * 1-11: Reserved
191  * 12-63: Context Ptr (12 - (haw-1))
192  * 64-127: Reserved
193  */
194 struct root_entry {
195 	u64	lo;
196 	u64	hi;
197 };
198 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
199 
200 /*
201  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
202  * if marked present.
203  */
204 static phys_addr_t root_entry_lctp(struct root_entry *re)
205 {
206 	if (!(re->lo & 1))
207 		return 0;
208 
209 	return re->lo & VTD_PAGE_MASK;
210 }
211 
212 /*
213  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
214  * if marked present.
215  */
216 static phys_addr_t root_entry_uctp(struct root_entry *re)
217 {
218 	if (!(re->hi & 1))
219 		return 0;
220 
221 	return re->hi & VTD_PAGE_MASK;
222 }
223 /*
224  * low 64 bits:
225  * 0: present
226  * 1: fault processing disable
227  * 2-3: translation type
228  * 12-63: address space root
229  * high 64 bits:
230  * 0-2: address width
231  * 3-6: aval
232  * 8-23: domain id
233  */
234 struct context_entry {
235 	u64 lo;
236 	u64 hi;
237 };
238 
239 static inline void context_clear_pasid_enable(struct context_entry *context)
240 {
241 	context->lo &= ~(1ULL << 11);
242 }
243 
244 static inline bool context_pasid_enabled(struct context_entry *context)
245 {
246 	return !!(context->lo & (1ULL << 11));
247 }
248 
249 static inline void context_set_copied(struct context_entry *context)
250 {
251 	context->hi |= (1ull << 3);
252 }
253 
254 static inline bool context_copied(struct context_entry *context)
255 {
256 	return !!(context->hi & (1ULL << 3));
257 }
258 
259 static inline bool __context_present(struct context_entry *context)
260 {
261 	return (context->lo & 1);
262 }
263 
264 static inline bool context_present(struct context_entry *context)
265 {
266 	return context_pasid_enabled(context) ?
267 	     __context_present(context) :
268 	     __context_present(context) && !context_copied(context);
269 }
270 
271 static inline void context_set_present(struct context_entry *context)
272 {
273 	context->lo |= 1;
274 }
275 
276 static inline void context_set_fault_enable(struct context_entry *context)
277 {
278 	context->lo &= (((u64)-1) << 2) | 1;
279 }
280 
281 static inline void context_set_translation_type(struct context_entry *context,
282 						unsigned long value)
283 {
284 	context->lo &= (((u64)-1) << 4) | 3;
285 	context->lo |= (value & 3) << 2;
286 }
287 
288 static inline void context_set_address_root(struct context_entry *context,
289 					    unsigned long value)
290 {
291 	context->lo &= ~VTD_PAGE_MASK;
292 	context->lo |= value & VTD_PAGE_MASK;
293 }
294 
295 static inline void context_set_address_width(struct context_entry *context,
296 					     unsigned long value)
297 {
298 	context->hi |= value & 7;
299 }
300 
301 static inline void context_set_domain_id(struct context_entry *context,
302 					 unsigned long value)
303 {
304 	context->hi |= (value & ((1 << 16) - 1)) << 8;
305 }
306 
307 static inline int context_domain_id(struct context_entry *c)
308 {
309 	return((c->hi >> 8) & 0xffff);
310 }
311 
312 static inline void context_clear_entry(struct context_entry *context)
313 {
314 	context->lo = 0;
315 	context->hi = 0;
316 }
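/*
 * Illustrative sketch (an assumption about typical use, not part of the
 * original file) of how the helpers above are combined when a context entry
 * is programmed for a domain:
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, did);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */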
317 
318 /*
319  * 0: readable
320  * 1: writable
321  * 2-6: reserved
322  * 7: super page
323  * 8-10: available
324  * 11: snoop behavior
325  * 12-63: Host physical address
326  */
327 struct dma_pte {
328 	u64 val;
329 };
330 
331 static inline void dma_clear_pte(struct dma_pte *pte)
332 {
333 	pte->val = 0;
334 }
335 
336 static inline u64 dma_pte_addr(struct dma_pte *pte)
337 {
338 #ifdef CONFIG_64BIT
339 	return pte->val & VTD_PAGE_MASK;
340 #else
341 	/* Must have a full atomic 64-bit read */
342 	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
343 #endif
344 }
345 
346 static inline bool dma_pte_present(struct dma_pte *pte)
347 {
348 	return (pte->val & 3) != 0;
349 }
350 
351 static inline bool dma_pte_superpage(struct dma_pte *pte)
352 {
353 	return (pte->val & DMA_PTE_LARGE_PAGE);
354 }
355 
356 static inline int first_pte_in_page(struct dma_pte *pte)
357 {
358 	return !((unsigned long)pte & ~VTD_PAGE_MASK);
359 }
360 
361 /*
362  * This domain is a statically identity mapping domain.
363  *	1. This domain creates a static 1:1 mapping to all usable memory.
364  * 	2. It maps to each iommu if successful.
365  *	3. Each iommu maps to this domain if successful.
366  */
367 static struct dmar_domain *si_domain;
368 static int hw_pass_through = 1;
369 
370 /*
371  * Domain represents a virtual machine; more than one device
372  * across iommus may be owned by one domain, e.g. a kvm guest.
373  */
374 #define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)
375 
376 /* si_domain contains multiple devices */
377 #define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
378 
379 #define for_each_domain_iommu(idx, domain)			\
380 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
381 		if (domain->iommu_refcnt[idx])
382 
383 struct dmar_domain {
384 	int	nid;			/* node id */
385 
386 	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
387 					/* Refcount of devices per iommu */
388 
389 
390 	u16		iommu_did[DMAR_UNITS_SUPPORTED];
391 					/* Domain ids per IOMMU. Use u16 since
392 					 * domain ids are 16 bit wide according
393 					 * to VT-d spec, section 9.3 */
394 
395 	bool has_iotlb_device;
396 	struct list_head devices;	/* all devices' list */
397 	struct iova_domain iovad;	/* iova's that belong to this domain */
398 
399 	struct dma_pte	*pgd;		/* virtual address */
400 	int		gaw;		/* max guest address width */
401 
402 	/* adjusted guest address width, 0 is level 2 30-bit */
403 	int		agaw;
404 
405 	int		flags;		/* flags to find out type of domain */
406 
407 	int		iommu_coherency;/* indicate coherency of iommu access */
408 	int		iommu_snooping; /* indicate snooping control feature*/
409 	int		iommu_count;	/* reference count of iommu */
410 	int		iommu_superpage;/* Level of superpages supported:
411 					   0 == 4KiB (no superpages), 1 == 2MiB,
412 					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
413 	u64		max_addr;	/* maximum mapped address */
414 
415 	struct iommu_domain domain;	/* generic domain data structure for
416 					   iommu core */
417 };
418 
419 /* PCI domain-device relationship */
420 struct device_domain_info {
421 	struct list_head link;	/* link to domain siblings */
422 	struct list_head global; /* link to global list */
423 	u8 bus;			/* PCI bus number */
424 	u8 devfn;		/* PCI devfn number */
425 	u16 pfsid;		/* SRIOV physical function source ID */
426 	u8 pasid_supported:3;
427 	u8 pasid_enabled:1;
428 	u8 pri_supported:1;
429 	u8 pri_enabled:1;
430 	u8 ats_supported:1;
431 	u8 ats_enabled:1;
432 	u8 ats_qdep;
433 	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
434 	struct intel_iommu *iommu; /* IOMMU used by this device */
435 	struct dmar_domain *domain; /* pointer to domain */
436 };
437 
438 struct dmar_rmrr_unit {
439 	struct list_head list;		/* list of rmrr units	*/
440 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
441 	u64	base_address;		/* reserved base address*/
442 	u64	end_address;		/* reserved end address */
443 	struct dmar_dev_scope *devices;	/* target devices */
444 	int	devices_cnt;		/* target device count */
445 };
446 
447 struct dmar_atsr_unit {
448 	struct list_head list;		/* list of ATSR units */
449 	struct acpi_dmar_header *hdr;	/* ACPI header */
450 	struct dmar_dev_scope *devices;	/* target devices */
451 	int devices_cnt;		/* target device count */
452 	u8 include_all:1;		/* include all ports */
453 };
454 
455 static LIST_HEAD(dmar_atsr_units);
456 static LIST_HEAD(dmar_rmrr_units);
457 
458 #define for_each_rmrr_units(rmrr) \
459 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
460 
461 /* number of intel_iommus, used to size and index g_iommus */
462 static int g_num_of_iommus;
463 
464 static void domain_exit(struct dmar_domain *domain);
465 static void domain_remove_dev_info(struct dmar_domain *domain);
466 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
467 				     struct device *dev);
468 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
469 static void domain_context_clear(struct intel_iommu *iommu,
470 				 struct device *dev);
471 static int domain_detach_iommu(struct dmar_domain *domain,
472 			       struct intel_iommu *iommu);
473 
474 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
475 int dmar_disabled = 0;
476 #else
477 int dmar_disabled = 1;
478 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
479 
480 int intel_iommu_enabled = 0;
481 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
482 
483 static int dmar_map_gfx = 1;
484 static int dmar_forcedac;
485 static int intel_iommu_strict;
486 static int intel_iommu_superpage = 1;
487 static int intel_iommu_ecs = 1;
488 static int intel_iommu_pasid28;
489 static int iommu_identity_mapping;
490 
491 #define IDENTMAP_ALL		1
492 #define IDENTMAP_GFX		2
493 #define IDENTMAP_AZALIA		4
494 
495 /* Broadwell and Skylake have broken ECS support — normal so-called "second
496  * level" translation of DMA requests-without-PASID doesn't actually happen
497  * unless you also set the NESTE bit in an extended context-entry. Which of
498  * course means that SVM doesn't work because it's trying to do nested
499  * translation of the physical addresses it finds in the process page tables,
500  * through the IOVA->phys mapping found in the "second level" page tables.
501  *
502  * The VT-d specification was retroactively changed to change the definition
503  * of the capability bits and pretend that Broadwell/Skylake never happened...
504  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
505  * for some reason it was the PASID capability bit which was redefined (from
506  * bit 28 on BDW/SKL to bit 40 in future).
507  *
508  * So our test for ECS needs to eschew those implementations which set the old
509  * PASID capability bit 28, since those are the ones on which ECS is broken.
510  * Unless we are working around the 'pasid28' limitations, that is, by putting
511  * the device into passthrough mode for normal DMA and thus masking the bug.
512  */
513 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
514 			    (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
515 /* PASID support is thus enabled if ECS is enabled and *either* of the old
516  * or new capability bits is set. */
517 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&			\
518 			      (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
519 
520 int intel_iommu_gfx_mapped;
521 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
522 
523 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
524 static DEFINE_SPINLOCK(device_domain_lock);
525 static LIST_HEAD(device_domain_list);
526 
527 const struct iommu_ops intel_iommu_ops;
528 
529 static bool translation_pre_enabled(struct intel_iommu *iommu)
530 {
531 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
532 }
533 
534 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
535 {
536 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
537 }
538 
539 static void init_translation_status(struct intel_iommu *iommu)
540 {
541 	u32 gsts;
542 
543 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
544 	if (gsts & DMA_GSTS_TES)
545 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
546 }
547 
548 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
549 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
550 {
551 	return container_of(dom, struct dmar_domain, domain);
552 }
553 
554 static int __init intel_iommu_setup(char *str)
555 {
556 	if (!str)
557 		return -EINVAL;
558 	while (*str) {
559 		if (!strncmp(str, "on", 2)) {
560 			dmar_disabled = 0;
561 			pr_info("IOMMU enabled\n");
562 		} else if (!strncmp(str, "off", 3)) {
563 			dmar_disabled = 1;
564 			pr_info("IOMMU disabled\n");
565 		} else if (!strncmp(str, "igfx_off", 8)) {
566 			dmar_map_gfx = 0;
567 			pr_info("Disable GFX device mapping\n");
568 		} else if (!strncmp(str, "forcedac", 8)) {
569 			pr_info("Forcing DAC for PCI devices\n");
570 			dmar_forcedac = 1;
571 		} else if (!strncmp(str, "strict", 6)) {
572 			pr_info("Disable batched IOTLB flush\n");
573 			intel_iommu_strict = 1;
574 		} else if (!strncmp(str, "sp_off", 6)) {
575 			pr_info("Disable supported super page\n");
576 			intel_iommu_superpage = 0;
577 		} else if (!strncmp(str, "ecs_off", 7)) {
578 			printk(KERN_INFO
579 				"Intel-IOMMU: disable extended context table support\n");
580 			intel_iommu_ecs = 0;
581 		} else if (!strncmp(str, "pasid28", 7)) {
582 			printk(KERN_INFO
583 				"Intel-IOMMU: enable pre-production PASID support\n");
584 			intel_iommu_pasid28 = 1;
585 			iommu_identity_mapping |= IDENTMAP_GFX;
586 		} else if (!strncmp(str, "tboot_noforce", 13)) {
587 			printk(KERN_INFO
588 				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
589 			intel_iommu_tboot_noforce = 1;
590 		}
591 
592 		str += strcspn(str, ",");
593 		while (*str == ',')
594 			str++;
595 	}
596 	return 0;
597 }
598 __setup("intel_iommu=", intel_iommu_setup);
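/*
 * Usage example (illustrative): options are comma-separated on the kernel
 * command line, so e.g.
 *
 *	intel_iommu=on,sp_off,strict
 *
 * enables the IOMMU, disables superpage support and disables batched IOTLB
 * flushing, as parsed by intel_iommu_setup() above.
 */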
599 
600 static struct kmem_cache *iommu_domain_cache;
601 static struct kmem_cache *iommu_devinfo_cache;
602 
603 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
604 {
605 	struct dmar_domain **domains;
606 	int idx = did >> 8;
607 
608 	domains = iommu->domains[idx];
609 	if (!domains)
610 		return NULL;
611 
612 	return domains[did & 0xff];
613 }
614 
615 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
616 			     struct dmar_domain *domain)
617 {
618 	struct dmar_domain **domains;
619 	int idx = did >> 8;
620 
621 	if (!iommu->domains[idx]) {
622 		size_t size = 256 * sizeof(struct dmar_domain *);
623 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
624 	}
625 
626 	domains = iommu->domains[idx];
627 	if (WARN_ON(!domains))
628 		return;
629 	else
630 		domains[did & 0xff] = domain;
631 }
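/*
 * Illustrative note: domain IDs are resolved through a two-level table,
 * e.g. did 0x1234 maps to iommu->domains[0x12][0x34]; each second-level
 * chunk holds 256 dmar_domain pointers and is allocated on demand in
 * set_iommu_domain().
 */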
632 
633 static inline void *alloc_pgtable_page(int node)
634 {
635 	struct page *page;
636 	void *vaddr = NULL;
637 
638 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
639 	if (page)
640 		vaddr = page_address(page);
641 	return vaddr;
642 }
643 
644 static inline void free_pgtable_page(void *vaddr)
645 {
646 	free_page((unsigned long)vaddr);
647 }
648 
649 static inline void *alloc_domain_mem(void)
650 {
651 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
652 }
653 
654 static void free_domain_mem(void *vaddr)
655 {
656 	kmem_cache_free(iommu_domain_cache, vaddr);
657 }
658 
659 static inline void *alloc_devinfo_mem(void)
660 {
661 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
662 }
663 
664 static inline void free_devinfo_mem(void *vaddr)
665 {
666 	kmem_cache_free(iommu_devinfo_cache, vaddr);
667 }
668 
669 static inline int domain_type_is_vm(struct dmar_domain *domain)
670 {
671 	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
672 }
673 
674 static inline int domain_type_is_si(struct dmar_domain *domain)
675 {
676 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
677 }
678 
679 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
680 {
681 	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
682 				DOMAIN_FLAG_STATIC_IDENTITY);
683 }
684 
685 static inline int domain_pfn_supported(struct dmar_domain *domain,
686 				       unsigned long pfn)
687 {
688 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
689 
690 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
691 }
692 
693 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
694 {
695 	unsigned long sagaw;
696 	int agaw = -1;
697 
698 	sagaw = cap_sagaw(iommu->cap);
699 	for (agaw = width_to_agaw(max_gaw);
700 	     agaw >= 0; agaw--) {
701 		if (test_bit(agaw, &sagaw))
702 			break;
703 	}
704 
705 	return agaw;
706 }
707 
708 /*
709  * Calculate max SAGAW for each iommu.
710  */
711 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
712 {
713 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
714 }
715 
716 /*
717  * calculate agaw for each iommu.
718  * "SAGAW" may be different across iommus, use a default agaw, and
719  * get a supported less agaw for iommus that don't support the default agaw.
720  */
721 int iommu_calculate_agaw(struct intel_iommu *iommu)
722 {
723 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
724 }
725 
726 /* This function only returns a single iommu in a domain */
727 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
728 {
729 	int iommu_id;
730 
731 	/* si_domain and vm domain should not get here. */
732 	BUG_ON(domain_type_is_vm_or_si(domain));
733 	for_each_domain_iommu(iommu_id, domain)
734 		break;
735 
736 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
737 		return NULL;
738 
739 	return g_iommus[iommu_id];
740 }
741 
742 static void domain_update_iommu_coherency(struct dmar_domain *domain)
743 {
744 	struct dmar_drhd_unit *drhd;
745 	struct intel_iommu *iommu;
746 	bool found = false;
747 	int i;
748 
749 	domain->iommu_coherency = 1;
750 
751 	for_each_domain_iommu(i, domain) {
752 		found = true;
753 		if (!ecap_coherent(g_iommus[i]->ecap)) {
754 			domain->iommu_coherency = 0;
755 			break;
756 		}
757 	}
758 	if (found)
759 		return;
760 
761 	/* No hardware attached; use lowest common denominator */
762 	rcu_read_lock();
763 	for_each_active_iommu(iommu, drhd) {
764 		if (!ecap_coherent(iommu->ecap)) {
765 			domain->iommu_coherency = 0;
766 			break;
767 		}
768 	}
769 	rcu_read_unlock();
770 }
771 
772 static int domain_update_iommu_snooping(struct intel_iommu *skip)
773 {
774 	struct dmar_drhd_unit *drhd;
775 	struct intel_iommu *iommu;
776 	int ret = 1;
777 
778 	rcu_read_lock();
779 	for_each_active_iommu(iommu, drhd) {
780 		if (iommu != skip) {
781 			if (!ecap_sc_support(iommu->ecap)) {
782 				ret = 0;
783 				break;
784 			}
785 		}
786 	}
787 	rcu_read_unlock();
788 
789 	return ret;
790 }
791 
792 static int domain_update_iommu_superpage(struct intel_iommu *skip)
793 {
794 	struct dmar_drhd_unit *drhd;
795 	struct intel_iommu *iommu;
796 	int mask = 0xf;
797 
798 	if (!intel_iommu_superpage) {
799 		return 0;
800 	}
801 
802 	/* set iommu_superpage to the smallest common denominator */
803 	rcu_read_lock();
804 	for_each_active_iommu(iommu, drhd) {
805 		if (iommu != skip) {
806 			mask &= cap_super_page_val(iommu->cap);
807 			if (!mask)
808 				break;
809 		}
810 	}
811 	rcu_read_unlock();
812 
813 	return fls(mask);
814 }
815 
816 /* Some capabilities may be different across iommus */
817 static void domain_update_iommu_cap(struct dmar_domain *domain)
818 {
819 	domain_update_iommu_coherency(domain);
820 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
821 	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
822 }
823 
824 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
825 						       u8 bus, u8 devfn, int alloc)
826 {
827 	struct root_entry *root = &iommu->root_entry[bus];
828 	struct context_entry *context;
829 	u64 *entry;
830 
831 	entry = &root->lo;
832 	if (ecs_enabled(iommu)) {
833 		if (devfn >= 0x80) {
834 			devfn -= 0x80;
835 			entry = &root->hi;
836 		}
837 		devfn *= 2;
838 	}
839 	if (*entry & 1)
840 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
841 	else {
842 		unsigned long phy_addr;
843 		if (!alloc)
844 			return NULL;
845 
846 		context = alloc_pgtable_page(iommu->node);
847 		if (!context)
848 			return NULL;
849 
850 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
851 		phy_addr = virt_to_phys((void *)context);
852 		*entry = phy_addr | 1;
853 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
854 	}
855 	return &context[devfn];
856 }
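/*
 * Illustrative note (assumption based on the devfn handling above): with
 * extended context support each root entry half covers 128 device functions
 * and each extended context entry is twice the size of a legacy one, hence
 * the switch to root->hi for devfn >= 0x80 and the "devfn *= 2" scaling;
 * e.g. devfn 0x81 ends up at index 2 of the upper context table.
 */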
857 
858 static int iommu_dummy(struct device *dev)
859 {
860 	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
861 }
862 
863 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
864 {
865 	struct dmar_drhd_unit *drhd = NULL;
866 	struct intel_iommu *iommu;
867 	struct device *tmp;
868 	struct pci_dev *ptmp, *pdev = NULL;
869 	u16 segment = 0;
870 	int i;
871 
872 	if (iommu_dummy(dev))
873 		return NULL;
874 
875 	if (dev_is_pci(dev)) {
876 		struct pci_dev *pf_pdev;
877 
878 		pdev = to_pci_dev(dev);
879 
880 #ifdef CONFIG_X86
881 		/* VMD child devices currently cannot be handled individually */
882 		if (is_vmd(pdev->bus))
883 			return NULL;
884 #endif
885 
886 		/* VFs aren't listed in scope tables; we need to look up
887 		 * the PF instead to find the IOMMU. */
888 		pf_pdev = pci_physfn(pdev);
889 		dev = &pf_pdev->dev;
890 		segment = pci_domain_nr(pdev->bus);
891 	} else if (has_acpi_companion(dev))
892 		dev = &ACPI_COMPANION(dev)->dev;
893 
894 	rcu_read_lock();
895 	for_each_active_iommu(iommu, drhd) {
896 		if (pdev && segment != drhd->segment)
897 			continue;
898 
899 		for_each_active_dev_scope(drhd->devices,
900 					  drhd->devices_cnt, i, tmp) {
901 			if (tmp == dev) {
902 				/* For a VF use its original BDF# not that of the PF
903 				 * which we used for the IOMMU lookup. Strictly speaking
904 				 * we could do this for all PCI devices; we only need to
905 				 * get the BDF# from the scope table for ACPI matches. */
906 				if (pdev && pdev->is_virtfn)
907 					goto got_pdev;
908 
909 				*bus = drhd->devices[i].bus;
910 				*devfn = drhd->devices[i].devfn;
911 				goto out;
912 			}
913 
914 			if (!pdev || !dev_is_pci(tmp))
915 				continue;
916 
917 			ptmp = to_pci_dev(tmp);
918 			if (ptmp->subordinate &&
919 			    ptmp->subordinate->number <= pdev->bus->number &&
920 			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
921 				goto got_pdev;
922 		}
923 
924 		if (pdev && drhd->include_all) {
925 		got_pdev:
926 			*bus = pdev->bus->number;
927 			*devfn = pdev->devfn;
928 			goto out;
929 		}
930 	}
931 	iommu = NULL;
932  out:
933 	rcu_read_unlock();
934 
935 	return iommu;
936 }
937 
938 static void domain_flush_cache(struct dmar_domain *domain,
939 			       void *addr, int size)
940 {
941 	if (!domain->iommu_coherency)
942 		clflush_cache_range(addr, size);
943 }
944 
945 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
946 {
947 	struct context_entry *context;
948 	int ret = 0;
949 	unsigned long flags;
950 
951 	spin_lock_irqsave(&iommu->lock, flags);
952 	context = iommu_context_addr(iommu, bus, devfn, 0);
953 	if (context)
954 		ret = context_present(context);
955 	spin_unlock_irqrestore(&iommu->lock, flags);
956 	return ret;
957 }
958 
959 static void free_context_table(struct intel_iommu *iommu)
960 {
961 	int i;
962 	unsigned long flags;
963 	struct context_entry *context;
964 
965 	spin_lock_irqsave(&iommu->lock, flags);
966 	if (!iommu->root_entry) {
967 		goto out;
968 	}
969 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
970 		context = iommu_context_addr(iommu, i, 0, 0);
971 		if (context)
972 			free_pgtable_page(context);
973 
974 		if (!ecs_enabled(iommu))
975 			continue;
976 
977 		context = iommu_context_addr(iommu, i, 0x80, 0);
978 		if (context)
979 			free_pgtable_page(context);
980 
981 	}
982 	free_pgtable_page(iommu->root_entry);
983 	iommu->root_entry = NULL;
984 out:
985 	spin_unlock_irqrestore(&iommu->lock, flags);
986 }
987 
988 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
989 				      unsigned long pfn, int *target_level)
990 {
991 	struct dma_pte *parent, *pte = NULL;
992 	int level = agaw_to_level(domain->agaw);
993 	int offset;
994 
995 	BUG_ON(!domain->pgd);
996 
997 	if (!domain_pfn_supported(domain, pfn))
998 		/* Address beyond IOMMU's addressing capabilities. */
999 		return NULL;
1000 
1001 	parent = domain->pgd;
1002 
1003 	while (1) {
1004 		void *tmp_page;
1005 
1006 		offset = pfn_level_offset(pfn, level);
1007 		pte = &parent[offset];
1008 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009 			break;
1010 		if (level == *target_level)
1011 			break;
1012 
1013 		if (!dma_pte_present(pte)) {
1014 			uint64_t pteval;
1015 
1016 			tmp_page = alloc_pgtable_page(domain->nid);
1017 
1018 			if (!tmp_page)
1019 				return NULL;
1020 
1021 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1022 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1023 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1024 				/* Someone else set it while we were thinking; use theirs. */
1025 				free_pgtable_page(tmp_page);
1026 			else
1027 				domain_flush_cache(domain, pte, sizeof(*pte));
1028 		}
1029 		if (level == 1)
1030 			break;
1031 
1032 		parent = phys_to_virt(dma_pte_addr(pte));
1033 		level--;
1034 	}
1035 
1036 	if (!*target_level)
1037 		*target_level = level;
1038 
1039 	return pte;
1040 }
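/*
 * Usage note (illustrative): callers pass the level they want in
 * *target_level, e.g. 1 for a 4KiB leaf PTE or 2 when installing a 2MiB
 * superpage; passing 0 means "walk as far as possible", stopping at the
 * first superpage or non-present entry, with the level actually reached
 * written back through *target_level.
 */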
1041 
1042 
1043 /* return address's pte at specific level */
1044 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045 					 unsigned long pfn,
1046 					 int level, int *large_page)
1047 {
1048 	struct dma_pte *parent, *pte = NULL;
1049 	int total = agaw_to_level(domain->agaw);
1050 	int offset;
1051 
1052 	parent = domain->pgd;
1053 	while (level <= total) {
1054 		offset = pfn_level_offset(pfn, total);
1055 		pte = &parent[offset];
1056 		if (level == total)
1057 			return pte;
1058 
1059 		if (!dma_pte_present(pte)) {
1060 			*large_page = total;
1061 			break;
1062 		}
1063 
1064 		if (dma_pte_superpage(pte)) {
1065 			*large_page = total;
1066 			return pte;
1067 		}
1068 
1069 		parent = phys_to_virt(dma_pte_addr(pte));
1070 		total--;
1071 	}
1072 	return NULL;
1073 }
1074 
1075 /* clear last level pte, a tlb flush should be followed */
1076 static void dma_pte_clear_range(struct dmar_domain *domain,
1077 				unsigned long start_pfn,
1078 				unsigned long last_pfn)
1079 {
1080 	unsigned int large_page = 1;
1081 	struct dma_pte *first_pte, *pte;
1082 
1083 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1084 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1085 	BUG_ON(start_pfn > last_pfn);
1086 
1087 	/* we don't need lock here; nobody else touches the iova range */
1088 	do {
1089 		large_page = 1;
1090 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091 		if (!pte) {
1092 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1093 			continue;
1094 		}
1095 		do {
1096 			dma_clear_pte(pte);
1097 			start_pfn += lvl_to_nr_pages(large_page);
1098 			pte++;
1099 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100 
1101 		domain_flush_cache(domain, first_pte,
1102 				   (void *)pte - (void *)first_pte);
1103 
1104 	} while (start_pfn && start_pfn <= last_pfn);
1105 }
1106 
1107 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1108 			       int retain_level, struct dma_pte *pte,
1109 			       unsigned long pfn, unsigned long start_pfn,
1110 			       unsigned long last_pfn)
1111 {
1112 	pfn = max(start_pfn, pfn);
1113 	pte = &pte[pfn_level_offset(pfn, level)];
1114 
1115 	do {
1116 		unsigned long level_pfn;
1117 		struct dma_pte *level_pte;
1118 
1119 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120 			goto next;
1121 
1122 		level_pfn = pfn & level_mask(level);
1123 		level_pte = phys_to_virt(dma_pte_addr(pte));
1124 
1125 		if (level > 2) {
1126 			dma_pte_free_level(domain, level - 1, retain_level,
1127 					   level_pte, level_pfn, start_pfn,
1128 					   last_pfn);
1129 		}
1130 
1131 		/*
1132 		 * Free the page table if we're below the level we want to
1133 		 * retain and the range covers the entire table.
1134 		 */
1135 		if (level < retain_level && !(start_pfn > level_pfn ||
1136 		      last_pfn < level_pfn + level_size(level) - 1)) {
1137 			dma_clear_pte(pte);
1138 			domain_flush_cache(domain, pte, sizeof(*pte));
1139 			free_pgtable_page(level_pte);
1140 		}
1141 next:
1142 		pfn += level_size(level);
1143 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144 }
1145 
1146 /*
1147  * clear last level (leaf) ptes and free page table pages below the
1148  * level we wish to keep intact.
1149  */
1150 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1151 				   unsigned long start_pfn,
1152 				   unsigned long last_pfn,
1153 				   int retain_level)
1154 {
1155 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1156 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1157 	BUG_ON(start_pfn > last_pfn);
1158 
1159 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1160 
1161 	/* We don't need lock here; nobody else touches the iova range */
1162 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1163 			   domain->pgd, 0, start_pfn, last_pfn);
1164 
1165 	/* free pgd */
1166 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167 		free_pgtable_page(domain->pgd);
1168 		domain->pgd = NULL;
1169 	}
1170 }
1171 
1172 /* When a page at a given level is being unlinked from its parent, we don't
1173    need to *modify* it at all. All we need to do is make a list of all the
1174    pages which can be freed just as soon as we've flushed the IOTLB and we
1175    know the hardware page-walk will no longer touch them.
1176    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177    be freed. */
1178 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1179 					    int level, struct dma_pte *pte,
1180 					    struct page *freelist)
1181 {
1182 	struct page *pg;
1183 
1184 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1185 	pg->freelist = freelist;
1186 	freelist = pg;
1187 
1188 	if (level == 1)
1189 		return freelist;
1190 
1191 	pte = page_address(pg);
1192 	do {
1193 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1194 			freelist = dma_pte_list_pagetables(domain, level - 1,
1195 							   pte, freelist);
1196 		pte++;
1197 	} while (!first_pte_in_page(pte));
1198 
1199 	return freelist;
1200 }
1201 
1202 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1203 					struct dma_pte *pte, unsigned long pfn,
1204 					unsigned long start_pfn,
1205 					unsigned long last_pfn,
1206 					struct page *freelist)
1207 {
1208 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209 
1210 	pfn = max(start_pfn, pfn);
1211 	pte = &pte[pfn_level_offset(pfn, level)];
1212 
1213 	do {
1214 		unsigned long level_pfn;
1215 
1216 		if (!dma_pte_present(pte))
1217 			goto next;
1218 
1219 		level_pfn = pfn & level_mask(level);
1220 
1221 		/* If range covers entire pagetable, free it */
1222 		if (start_pfn <= level_pfn &&
1223 		    last_pfn >= level_pfn + level_size(level) - 1) {
1224 			/* These subordinate page tables are going away entirely. Don't
1225 			   bother to clear them; we're just going to *free* them. */
1226 			if (level > 1 && !dma_pte_superpage(pte))
1227 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228 
1229 			dma_clear_pte(pte);
1230 			if (!first_pte)
1231 				first_pte = pte;
1232 			last_pte = pte;
1233 		} else if (level > 1) {
1234 			/* Recurse down into a level that isn't *entirely* obsolete */
1235 			freelist = dma_pte_clear_level(domain, level - 1,
1236 						       phys_to_virt(dma_pte_addr(pte)),
1237 						       level_pfn, start_pfn, last_pfn,
1238 						       freelist);
1239 		}
1240 next:
1241 		pfn += level_size(level);
1242 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243 
1244 	if (first_pte)
1245 		domain_flush_cache(domain, first_pte,
1246 				   (void *)++last_pte - (void *)first_pte);
1247 
1248 	return freelist;
1249 }
1250 
1251 /* We can't just free the pages because the IOMMU may still be walking
1252    the page tables, and may have cached the intermediate levels. The
1253    pages can only be freed after the IOTLB flush has been done. */
1254 static struct page *domain_unmap(struct dmar_domain *domain,
1255 				 unsigned long start_pfn,
1256 				 unsigned long last_pfn)
1257 {
1258 	struct page *freelist = NULL;
1259 
1260 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1261 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1262 	BUG_ON(start_pfn > last_pfn);
1263 
1264 	/* we don't need lock here; nobody else touches the iova range */
1265 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1266 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1267 
1268 	/* free pgd */
1269 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1270 		struct page *pgd_page = virt_to_page(domain->pgd);
1271 		pgd_page->freelist = freelist;
1272 		freelist = pgd_page;
1273 
1274 		domain->pgd = NULL;
1275 	}
1276 
1277 	return freelist;
1278 }
1279 
1280 static void dma_free_pagelist(struct page *freelist)
1281 {
1282 	struct page *pg;
1283 
1284 	while ((pg = freelist)) {
1285 		freelist = pg->freelist;
1286 		free_pgtable_page(page_address(pg));
1287 	}
1288 }
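/*
 * Illustrative usage sketch (an assumption based on the comment above
 * domain_unmap(), not a verbatim call site):
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	iommu_flush_iotlb_psi(iommu, domain, start_pfn, npages, 0, 0);
 *	dma_free_pagelist(freelist);
 *
 * i.e. the page-table pages collected on the freelist are only returned to
 * the allocator once the IOTLB can no longer reference them.
 */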
1289 
1290 static void iova_entry_free(unsigned long data)
1291 {
1292 	struct page *freelist = (struct page *)data;
1293 
1294 	dma_free_pagelist(freelist);
1295 }
1296 
1297 /* iommu handling */
1298 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299 {
1300 	struct root_entry *root;
1301 	unsigned long flags;
1302 
1303 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304 	if (!root) {
1305 		pr_err("Allocating root entry for %s failed\n",
1306 			iommu->name);
1307 		return -ENOMEM;
1308 	}
1309 
1310 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1311 
1312 	spin_lock_irqsave(&iommu->lock, flags);
1313 	iommu->root_entry = root;
1314 	spin_unlock_irqrestore(&iommu->lock, flags);
1315 
1316 	return 0;
1317 }
1318 
1319 static void iommu_set_root_entry(struct intel_iommu *iommu)
1320 {
1321 	u64 addr;
1322 	u32 sts;
1323 	unsigned long flag;
1324 
1325 	addr = virt_to_phys(iommu->root_entry);
1326 	if (ecs_enabled(iommu))
1327 		addr |= DMA_RTADDR_RTT;
1328 
1329 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1330 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331 
1332 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333 
1334 	/* Make sure hardware complete it */
1335 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1336 		      readl, (sts & DMA_GSTS_RTPS), sts);
1337 
1338 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 }
1340 
1341 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1342 {
1343 	u32 val;
1344 	unsigned long flag;
1345 
1346 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347 		return;
1348 
1349 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351 
1352 	/* Make sure hardware complete it */
1353 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1354 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1355 
1356 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357 }
1358 
1359 /* return value determines whether we need a write buffer flush */
1360 static void __iommu_flush_context(struct intel_iommu *iommu,
1361 				  u16 did, u16 source_id, u8 function_mask,
1362 				  u64 type)
1363 {
1364 	u64 val = 0;
1365 	unsigned long flag;
1366 
1367 	switch (type) {
1368 	case DMA_CCMD_GLOBAL_INVL:
1369 		val = DMA_CCMD_GLOBAL_INVL;
1370 		break;
1371 	case DMA_CCMD_DOMAIN_INVL:
1372 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373 		break;
1374 	case DMA_CCMD_DEVICE_INVL:
1375 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1376 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1377 		break;
1378 	default:
1379 		BUG();
1380 	}
1381 	val |= DMA_CCMD_ICC;
1382 
1383 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1384 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385 
1386 	/* Make sure hardware complete it */
1387 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1388 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389 
1390 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391 }
1392 
1393 /* return value determines whether we need a write buffer flush */
1394 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1395 				u64 addr, unsigned int size_order, u64 type)
1396 {
1397 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1398 	u64 val = 0, val_iva = 0;
1399 	unsigned long flag;
1400 
1401 	switch (type) {
1402 	case DMA_TLB_GLOBAL_FLUSH:
1403 		/* global flush doesn't need set IVA_REG */
1404 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405 		break;
1406 	case DMA_TLB_DSI_FLUSH:
1407 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408 		break;
1409 	case DMA_TLB_PSI_FLUSH:
1410 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411 		/* IH bit is passed in as part of address */
1412 		val_iva = size_order | addr;
1413 		break;
1414 	default:
1415 		BUG();
1416 	}
1417 	/* Note: set drain read/write */
1418 #if 0
1419 	/*
1420 	 * This is probably only here to be extra safe; it looks like we can
1421 	 * ignore it without any impact.
1422 	 */
1423 	if (cap_read_drain(iommu->cap))
1424 		val |= DMA_TLB_READ_DRAIN;
1425 #endif
1426 	if (cap_write_drain(iommu->cap))
1427 		val |= DMA_TLB_WRITE_DRAIN;
1428 
1429 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1430 	/* Note: Only uses first TLB reg currently */
1431 	if (val_iva)
1432 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1433 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434 
1435 	/* Make sure hardware complete it */
1436 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1437 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438 
1439 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440 
1441 	/* check IOTLB invalidation granularity */
1442 	if (DMA_TLB_IAIG(val) == 0)
1443 		pr_err("Flush IOTLB failed\n");
1444 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1445 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1446 			(unsigned long long)DMA_TLB_IIRG(type),
1447 			(unsigned long long)DMA_TLB_IAIG(val));
1448 }
1449 
1450 static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1451 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1452 			 u8 bus, u8 devfn)
1453 {
1454 	struct device_domain_info *info;
1455 
1456 	assert_spin_locked(&device_domain_lock);
1457 
1458 	if (!iommu->qi)
1459 		return NULL;
1460 
1461 	list_for_each_entry(info, &domain->devices, link)
1462 		if (info->iommu == iommu && info->bus == bus &&
1463 		    info->devfn == devfn) {
1464 			if (info->ats_supported && info->dev)
1465 				return info;
1466 			break;
1467 		}
1468 
1469 	return NULL;
1470 }
1471 
1472 static void domain_update_iotlb(struct dmar_domain *domain)
1473 {
1474 	struct device_domain_info *info;
1475 	bool has_iotlb_device = false;
1476 
1477 	assert_spin_locked(&device_domain_lock);
1478 
1479 	list_for_each_entry(info, &domain->devices, link) {
1480 		struct pci_dev *pdev;
1481 
1482 		if (!info->dev || !dev_is_pci(info->dev))
1483 			continue;
1484 
1485 		pdev = to_pci_dev(info->dev);
1486 		if (pdev->ats_enabled) {
1487 			has_iotlb_device = true;
1488 			break;
1489 		}
1490 	}
1491 
1492 	domain->has_iotlb_device = has_iotlb_device;
1493 }
1494 
1495 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496 {
1497 	struct pci_dev *pdev;
1498 
1499 	assert_spin_locked(&device_domain_lock);
1500 
1501 	if (!info || !dev_is_pci(info->dev))
1502 		return;
1503 
1504 	pdev = to_pci_dev(info->dev);
1505 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1506 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1507 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1508 	 * reserved, which should be set to 0.
1509 	 */
1510 	if (!ecap_dit(info->iommu->ecap))
1511 		info->pfsid = 0;
1512 	else {
1513 		struct pci_dev *pf_pdev;
1514 
1515 		/* pdev will be returned if device is not a vf */
1516 		pf_pdev = pci_physfn(pdev);
1517 		info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1518 	}
1519 
1520 #ifdef CONFIG_INTEL_IOMMU_SVM
1521 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1522 	   the device if you enable PASID support after ATS support is
1523 	   undefined. So always enable PASID support on devices which
1524 	   have it, even if we can't yet know if we're ever going to
1525 	   use it. */
1526 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1527 		info->pasid_enabled = 1;
1528 
1529 	if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1530 		info->pri_enabled = 1;
1531 #endif
1532 	if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1533 		info->ats_enabled = 1;
1534 		domain_update_iotlb(info->domain);
1535 		info->ats_qdep = pci_ats_queue_depth(pdev);
1536 	}
1537 }
1538 
1539 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1540 {
1541 	struct pci_dev *pdev;
1542 
1543 	assert_spin_locked(&device_domain_lock);
1544 
1545 	if (!dev_is_pci(info->dev))
1546 		return;
1547 
1548 	pdev = to_pci_dev(info->dev);
1549 
1550 	if (info->ats_enabled) {
1551 		pci_disable_ats(pdev);
1552 		info->ats_enabled = 0;
1553 		domain_update_iotlb(info->domain);
1554 	}
1555 #ifdef CONFIG_INTEL_IOMMU_SVM
1556 	if (info->pri_enabled) {
1557 		pci_disable_pri(pdev);
1558 		info->pri_enabled = 0;
1559 	}
1560 	if (info->pasid_enabled) {
1561 		pci_disable_pasid(pdev);
1562 		info->pasid_enabled = 0;
1563 	}
1564 #endif
1565 }
1566 
1567 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1568 				  u64 addr, unsigned mask)
1569 {
1570 	u16 sid, qdep;
1571 	unsigned long flags;
1572 	struct device_domain_info *info;
1573 
1574 	if (!domain->has_iotlb_device)
1575 		return;
1576 
1577 	spin_lock_irqsave(&device_domain_lock, flags);
1578 	list_for_each_entry(info, &domain->devices, link) {
1579 		if (!info->ats_enabled)
1580 			continue;
1581 
1582 		sid = info->bus << 8 | info->devfn;
1583 		qdep = info->ats_qdep;
1584 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1585 				qdep, addr, mask);
1586 	}
1587 	spin_unlock_irqrestore(&device_domain_lock, flags);
1588 }
1589 
1590 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1591 				  struct dmar_domain *domain,
1592 				  unsigned long pfn, unsigned int pages,
1593 				  int ih, int map)
1594 {
1595 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1596 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1597 	u16 did = domain->iommu_did[iommu->seq_id];
1598 
1599 	BUG_ON(pages == 0);
1600 
1601 	if (ih)
1602 		ih = 1 << 6;
1603 	/*
1604 	 * Fallback to domain selective flush if no PSI support or the size is
1605 	 * too big.
1606 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
1607 	 * PSI requires the page size to be 2 ^ x, and the base address to be
1608 	 * naturally aligned to that size.
1609 	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1610 		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1611 						DMA_TLB_DSI_FLUSH);
1612 	else
1613 		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1614 						DMA_TLB_PSI_FLUSH);
1615 
1616 	/*
1617 	 * In caching mode, changes of pages from non-present to present require
1618 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1619 	 */
1620 	if (!cap_caching_mode(iommu->cap) || !map)
1621 		iommu_flush_dev_iotlb(domain, addr, mask);
1622 }
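/*
 * Worked example (illustrative): a request to flush 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = 3, so the hardware invalidates a
 * naturally aligned block of 2^3 = 8 4KiB pages; if that mask exceeds
 * cap_max_amask_val() the code above falls back to a domain-selective flush.
 */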
1623 
1624 static void iommu_flush_iova(struct iova_domain *iovad)
1625 {
1626 	struct dmar_domain *domain;
1627 	int idx;
1628 
1629 	domain = container_of(iovad, struct dmar_domain, iovad);
1630 
1631 	for_each_domain_iommu(idx, domain) {
1632 		struct intel_iommu *iommu = g_iommus[idx];
1633 		u16 did = domain->iommu_did[iommu->seq_id];
1634 
1635 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1636 
1637 		if (!cap_caching_mode(iommu->cap))
1638 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1639 					      0, MAX_AGAW_PFN_WIDTH);
1640 	}
1641 }
1642 
1643 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1644 {
1645 	u32 pmen;
1646 	unsigned long flags;
1647 
1648 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1649 		return;
1650 
1651 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1652 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1653 	pmen &= ~DMA_PMEN_EPM;
1654 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1655 
1656 	/* wait for the protected region status bit to clear */
1657 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1658 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1659 
1660 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1661 }
1662 
1663 static void iommu_enable_translation(struct intel_iommu *iommu)
1664 {
1665 	u32 sts;
1666 	unsigned long flags;
1667 
1668 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1669 	iommu->gcmd |= DMA_GCMD_TE;
1670 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1671 
1672 	/* Make sure hardware complete it */
1673 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1674 		      readl, (sts & DMA_GSTS_TES), sts);
1675 
1676 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1677 }
1678 
1679 static void iommu_disable_translation(struct intel_iommu *iommu)
1680 {
1681 	u32 sts;
1682 	unsigned long flag;
1683 
1684 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685 	iommu->gcmd &= ~DMA_GCMD_TE;
1686 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687 
1688 	/* Make sure hardware complete it */
1689 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1691 
1692 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693 }
1694 
1695 
1696 static int iommu_init_domains(struct intel_iommu *iommu)
1697 {
1698 	u32 ndomains, nlongs;
1699 	size_t size;
1700 
1701 	ndomains = cap_ndoms(iommu->cap);
1702 	pr_debug("%s: Number of Domains supported <%d>\n",
1703 		 iommu->name, ndomains);
1704 	nlongs = BITS_TO_LONGS(ndomains);
1705 
1706 	spin_lock_init(&iommu->lock);
1707 
1708 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1709 	if (!iommu->domain_ids) {
1710 		pr_err("%s: Allocating domain id array failed\n",
1711 		       iommu->name);
1712 		return -ENOMEM;
1713 	}
1714 
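	/*
	 * iommu->domains is a two-level array: a top-level array of
	 * pointers, each entry covering a chunk of 256 domain pointers.
	 * (ALIGN(ndomains, 256) >> 8) is the number of such chunks, e.g.
	 * 65536 supported domain-ids need 256 chunks. Only chunk 0 is
	 * preallocated here; the remaining chunks are expected to be
	 * allocated on demand when a domain-id in their range is first
	 * used (see set_iommu_domain()).
	 */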
1715 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1716 	iommu->domains = kzalloc(size, GFP_KERNEL);
1717 
1718 	if (iommu->domains) {
1719 		size = 256 * sizeof(struct dmar_domain *);
1720 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1721 	}
1722 
1723 	if (!iommu->domains || !iommu->domains[0]) {
1724 		pr_err("%s: Allocating domain array failed\n",
1725 		       iommu->name);
1726 		kfree(iommu->domain_ids);
1727 		kfree(iommu->domains);
1728 		iommu->domain_ids = NULL;
1729 		iommu->domains    = NULL;
1730 		return -ENOMEM;
1731 	}
1732 
1733 
1734 
1735 	/*
1736 	 * If Caching mode is set, then invalid translations are tagged
1737 	 * with domain-id 0, hence we need to pre-allocate it. We also
1738 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1739 	 * make sure it is not used for a real domain.
1740 	 */
1741 	set_bit(0, iommu->domain_ids);
1742 
1743 	return 0;
1744 }
1745 
1746 static void disable_dmar_iommu(struct intel_iommu *iommu)
1747 {
1748 	struct device_domain_info *info, *tmp;
1749 	unsigned long flags;
1750 
1751 	if (!iommu->domains || !iommu->domain_ids)
1752 		return;
1753 
1754 again:
1755 	spin_lock_irqsave(&device_domain_lock, flags);
1756 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1757 		struct dmar_domain *domain;
1758 
1759 		if (info->iommu != iommu)
1760 			continue;
1761 
1762 		if (!info->dev || !info->domain)
1763 			continue;
1764 
1765 		domain = info->domain;
1766 
1767 		__dmar_remove_one_dev_info(info);
1768 
1769 		if (!domain_type_is_vm_or_si(domain)) {
1770 			/*
1771 			 * The domain_exit() function can't be called under
1772 			 * device_domain_lock, as it takes this lock itself.
1773 			 * So release the lock here and re-run the loop
1774 			 * afterwards.
1775 			 */
1776 			spin_unlock_irqrestore(&device_domain_lock, flags);
1777 			domain_exit(domain);
1778 			goto again;
1779 		}
1780 	}
1781 	spin_unlock_irqrestore(&device_domain_lock, flags);
1782 
1783 	if (iommu->gcmd & DMA_GCMD_TE)
1784 		iommu_disable_translation(iommu);
1785 }
1786 
1787 static void free_dmar_iommu(struct intel_iommu *iommu)
1788 {
1789 	if ((iommu->domains) && (iommu->domain_ids)) {
1790 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1791 		int i;
1792 
1793 		for (i = 0; i < elems; i++)
1794 			kfree(iommu->domains[i]);
1795 		kfree(iommu->domains);
1796 		kfree(iommu->domain_ids);
1797 		iommu->domains = NULL;
1798 		iommu->domain_ids = NULL;
1799 	}
1800 
1801 	g_iommus[iommu->seq_id] = NULL;
1802 
1803 	/* free context mapping */
1804 	free_context_table(iommu);
1805 
1806 #ifdef CONFIG_INTEL_IOMMU_SVM
1807 	if (pasid_enabled(iommu)) {
1808 		if (ecap_prs(iommu->ecap))
1809 			intel_svm_finish_prq(iommu);
1810 		intel_svm_free_pasid_tables(iommu);
1811 	}
1812 #endif
1813 }
1814 
1815 static struct dmar_domain *alloc_domain(int flags)
1816 {
1817 	struct dmar_domain *domain;
1818 
1819 	domain = alloc_domain_mem();
1820 	if (!domain)
1821 		return NULL;
1822 
1823 	memset(domain, 0, sizeof(*domain));
1824 	domain->nid = -1;
1825 	domain->flags = flags;
1826 	domain->has_iotlb_device = false;
1827 	INIT_LIST_HEAD(&domain->devices);
1828 
1829 	return domain;
1830 }
1831 
1832 /* Must be called with iommu->lock */
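/*
 * A domain-id (DID) is allocated per (domain, iommu) pair the first time a
 * device behind that iommu is attached to the domain; iommu_refcnt[] counts
 * the attached devices so the DID can be released again in
 * domain_detach_iommu() once the last one goes away.
 */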
1833 static int domain_attach_iommu(struct dmar_domain *domain,
1834 			       struct intel_iommu *iommu)
1835 {
1836 	unsigned long ndomains;
1837 	int num;
1838 
1839 	assert_spin_locked(&device_domain_lock);
1840 	assert_spin_locked(&iommu->lock);
1841 
1842 	domain->iommu_refcnt[iommu->seq_id] += 1;
1843 	domain->iommu_count += 1;
1844 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1845 		ndomains = cap_ndoms(iommu->cap);
1846 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1847 
1848 		if (num >= ndomains) {
1849 			pr_err("%s: No free domain ids\n", iommu->name);
1850 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1851 			domain->iommu_count -= 1;
1852 			return -ENOSPC;
1853 		}
1854 
1855 		set_bit(num, iommu->domain_ids);
1856 		set_iommu_domain(iommu, num, domain);
1857 
1858 		domain->iommu_did[iommu->seq_id] = num;
1859 		domain->nid			 = iommu->node;
1860 
1861 		domain_update_iommu_cap(domain);
1862 	}
1863 
1864 	return 0;
1865 }
1866 
1867 static int domain_detach_iommu(struct dmar_domain *domain,
1868 			       struct intel_iommu *iommu)
1869 {
1870 	int num, count = INT_MAX;
1871 
1872 	assert_spin_locked(&device_domain_lock);
1873 	assert_spin_locked(&iommu->lock);
1874 
1875 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1876 	count = --domain->iommu_count;
1877 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1878 		num = domain->iommu_did[iommu->seq_id];
1879 		clear_bit(num, iommu->domain_ids);
1880 		set_iommu_domain(iommu, num, NULL);
1881 
1882 		domain_update_iommu_cap(domain);
1883 		domain->iommu_did[iommu->seq_id] = 0;
1884 	}
1885 
1886 	return count;
1887 }
1888 
1889 static struct iova_domain reserved_iova_list;
1890 static struct lock_class_key reserved_rbtree_key;
1891 
1892 static int dmar_init_reserved_ranges(void)
1893 {
1894 	struct pci_dev *pdev = NULL;
1895 	struct iova *iova;
1896 	int i;
1897 
1898 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1899 			DMA_32BIT_PFN);
1900 
1901 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1902 		&reserved_rbtree_key);
1903 
1904 	/* IOAPIC ranges shouldn't be accessed by DMA */
1905 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1906 		IOVA_PFN(IOAPIC_RANGE_END));
1907 	if (!iova) {
1908 		pr_err("Reserve IOAPIC range failed\n");
1909 		return -ENODEV;
1910 	}
1911 
1912 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1913 	for_each_pci_dev(pdev) {
1914 		struct resource *r;
1915 
1916 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1917 			r = &pdev->resource[i];
1918 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1919 				continue;
1920 			iova = reserve_iova(&reserved_iova_list,
1921 					    IOVA_PFN(r->start),
1922 					    IOVA_PFN(r->end));
1923 			if (!iova) {
1924 				pr_err("Reserve iova failed\n");
1925 				return -ENODEV;
1926 			}
1927 		}
1928 	}
1929 	return 0;
1930 }
1931 
1932 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1933 {
1934 	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1935 }
1936 
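/*
 * Round a guest address width up to an "adjusted" width of the form
 * 12 + 9 * n (a 12-bit page offset plus whole 9-bit page-table levels),
 * capped at 64 bits. For example, gaw = 48 stays 48, while gaw = 40
 * becomes 40 + 9 - 1 = 48.
 */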
1937 static inline int guestwidth_to_adjustwidth(int gaw)
1938 {
1939 	int agaw;
1940 	int r = (gaw - 12) % 9;
1941 
1942 	if (r == 0)
1943 		agaw = gaw;
1944 	else
1945 		agaw = gaw + 9 - r;
1946 	if (agaw > 64)
1947 		agaw = 64;
1948 	return agaw;
1949 }
1950 
1951 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1952 		       int guest_width)
1953 {
1954 	int adjust_width, agaw;
1955 	unsigned long sagaw;
1956 	int err;
1957 
1958 	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1959 			DMA_32BIT_PFN);
1960 
1961 	err = init_iova_flush_queue(&domain->iovad,
1962 				    iommu_flush_iova, iova_entry_free);
1963 	if (err)
1964 		return err;
1965 
1966 	domain_reserve_special_ranges(domain);
1967 
1968 	/* calculate AGAW */
1969 	if (guest_width > cap_mgaw(iommu->cap))
1970 		guest_width = cap_mgaw(iommu->cap);
1971 	domain->gaw = guest_width;
1972 	adjust_width = guestwidth_to_adjustwidth(guest_width);
1973 	agaw = width_to_agaw(adjust_width);
1974 	sagaw = cap_sagaw(iommu->cap);
1975 	if (!test_bit(agaw, &sagaw)) {
1976 		/* hardware doesn't support it, choose a bigger one */
1977 		pr_debug("Hardware doesn't support agaw %d\n", agaw);
1978 		agaw = find_next_bit(&sagaw, 5, agaw);
1979 		if (agaw >= 5)
1980 			return -ENODEV;
1981 	}
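	/*
	 * At this point agaw names a page-table depth the hardware reports
	 * in SAGAW; if the exact depth for the requested guest width was
	 * not supported, the next larger supported depth was chosen above
	 * (e.g. a width that only needs 3 levels may end up using a
	 * 4-level table on hardware that advertises 4-level support only).
	 */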
1982 	domain->agaw = agaw;
1983 
1984 	if (ecap_coherent(iommu->ecap))
1985 		domain->iommu_coherency = 1;
1986 	else
1987 		domain->iommu_coherency = 0;
1988 
1989 	if (ecap_sc_support(iommu->ecap))
1990 		domain->iommu_snooping = 1;
1991 	else
1992 		domain->iommu_snooping = 0;
1993 
1994 	if (intel_iommu_superpage)
1995 		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1996 	else
1997 		domain->iommu_superpage = 0;
1998 
1999 	domain->nid = iommu->node;
2000 
2001 	/* always allocate the top pgd */
2002 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
2003 	if (!domain->pgd)
2004 		return -ENOMEM;
2005 	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
2006 	return 0;
2007 }
2008 
2009 static void domain_exit(struct dmar_domain *domain)
2010 {
2011 	struct page *freelist = NULL;
2012 
2013 	/* Domain 0 is reserved, so don't process it */
2014 	if (!domain)
2015 		return;
2016 
2017 	/* Remove associated devices and clear attached or cached domains */
2018 	rcu_read_lock();
2019 	domain_remove_dev_info(domain);
2020 	rcu_read_unlock();
2021 
2022 	/* destroy iovas */
2023 	put_iova_domain(&domain->iovad);
2024 
2025 	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2026 
2027 	dma_free_pagelist(freelist);
2028 
2029 	free_domain_mem(domain);
2030 }
2031 
2032 static int domain_context_mapping_one(struct dmar_domain *domain,
2033 				      struct intel_iommu *iommu,
2034 				      u8 bus, u8 devfn)
2035 {
2036 	u16 did = domain->iommu_did[iommu->seq_id];
2037 	int translation = CONTEXT_TT_MULTI_LEVEL;
2038 	struct device_domain_info *info = NULL;
2039 	struct context_entry *context;
2040 	unsigned long flags;
2041 	struct dma_pte *pgd;
2042 	int ret, agaw;
2043 
2044 	WARN_ON(did == 0);
2045 
2046 	if (hw_pass_through && domain_type_is_si(domain))
2047 		translation = CONTEXT_TT_PASS_THROUGH;
2048 
2049 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2050 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2051 
2052 	BUG_ON(!domain->pgd);
2053 
2054 	spin_lock_irqsave(&device_domain_lock, flags);
2055 	spin_lock(&iommu->lock);
2056 
2057 	ret = -ENOMEM;
2058 	context = iommu_context_addr(iommu, bus, devfn, 1);
2059 	if (!context)
2060 		goto out_unlock;
2061 
2062 	ret = 0;
2063 	if (context_present(context))
2064 		goto out_unlock;
2065 
2066 	/*
2067 	 * For kdump cases, old valid entries may be cached due to the
2068 	 * in-flight DMA and copied pgtable, but there is no unmapping
2069 	 * behaviour for them, thus we need an explicit cache flush for
2070 	 * the newly-mapped device. For kdump, at this point, the device
2071 	 * is supposed to finish reset at its driver probe stage, so no
2072 	 * in-flight DMA will exist, and we don't need to worry about it
2073 	 * hereafter.
2074 	 */
2075 	if (context_copied(context)) {
2076 		u16 did_old = context_domain_id(context);
2077 
2078 		if (did_old >= 0 && did_old < cap_ndoms(iommu->cap)) {
2079 			iommu->flush.flush_context(iommu, did_old,
2080 						   (((u16)bus) << 8) | devfn,
2081 						   DMA_CCMD_MASK_NOBIT,
2082 						   DMA_CCMD_DEVICE_INVL);
2083 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2084 						 DMA_TLB_DSI_FLUSH);
2085 		}
2086 	}
2087 
2088 	pgd = domain->pgd;
2089 
2090 	context_clear_entry(context);
2091 	context_set_domain_id(context, did);
2092 
2093 	/*
2094 	 * Skip top levels of page tables for iommu which has less agaw
2095 	 * than default.  Unnecessary for PT mode.
2096 	 */
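	/*
	 * Example: if the domain was built with a 4-level table (48-bit
	 * AGAW) but this iommu only supports 3 levels, descend one level
	 * so that the context entry's address root points at a table of a
	 * depth this hardware can actually walk.
	 */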
2097 	if (translation != CONTEXT_TT_PASS_THROUGH) {
2098 		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2099 			ret = -ENOMEM;
2100 			pgd = phys_to_virt(dma_pte_addr(pgd));
2101 			if (!dma_pte_present(pgd))
2102 				goto out_unlock;
2103 		}
2104 
2105 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2106 		if (info && info->ats_supported)
2107 			translation = CONTEXT_TT_DEV_IOTLB;
2108 		else
2109 			translation = CONTEXT_TT_MULTI_LEVEL;
2110 
2111 		context_set_address_root(context, virt_to_phys(pgd));
2112 		context_set_address_width(context, agaw);
2113 	} else {
2114 		/*
2115 		 * In pass through mode, AW must be programmed to
2116 		 * indicate the largest AGAW value supported by
2117 		 * hardware. And ASR is ignored by hardware.
2118 		 */
2119 		context_set_address_width(context, iommu->msagaw);
2120 	}
2121 
2122 	context_set_translation_type(context, translation);
2123 	context_set_fault_enable(context);
2124 	context_set_present(context);
2125 	domain_flush_cache(domain, context, sizeof(*context));
2126 
2127 	/*
2128 	 * It's a non-present to present mapping. If hardware doesn't cache
2129 	 * non-present entries we only need to flush the write-buffer. If it
2130 	 * _does_ cache non-present entries, then it does so in the special
2131 	 * domain #0, which we have to flush:
2132 	 */
2133 	if (cap_caching_mode(iommu->cap)) {
2134 		iommu->flush.flush_context(iommu, 0,
2135 					   (((u16)bus) << 8) | devfn,
2136 					   DMA_CCMD_MASK_NOBIT,
2137 					   DMA_CCMD_DEVICE_INVL);
2138 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2139 	} else {
2140 		iommu_flush_write_buffer(iommu);
2141 	}
2142 	iommu_enable_dev_iotlb(info);
2143 
2144 	ret = 0;
2145 
2146 out_unlock:
2147 	spin_unlock(&iommu->lock);
2148 	spin_unlock_irqrestore(&device_domain_lock, flags);
2149 
2150 	return ret;
2151 }
2152 
2153 struct domain_context_mapping_data {
2154 	struct dmar_domain *domain;
2155 	struct intel_iommu *iommu;
2156 };
2157 
2158 static int domain_context_mapping_cb(struct pci_dev *pdev,
2159 				     u16 alias, void *opaque)
2160 {
2161 	struct domain_context_mapping_data *data = opaque;
2162 
2163 	return domain_context_mapping_one(data->domain, data->iommu,
2164 					  PCI_BUS_NUM(alias), alias & 0xff);
2165 }
2166 
2167 static int
2168 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2169 {
2170 	struct intel_iommu *iommu;
2171 	u8 bus, devfn;
2172 	struct domain_context_mapping_data data;
2173 
2174 	iommu = device_to_iommu(dev, &bus, &devfn);
2175 	if (!iommu)
2176 		return -ENODEV;
2177 
2178 	if (!dev_is_pci(dev))
2179 		return domain_context_mapping_one(domain, iommu, bus, devfn);
2180 
2181 	data.domain = domain;
2182 	data.iommu = iommu;
2183 
2184 	return pci_for_each_dma_alias(to_pci_dev(dev),
2185 				      &domain_context_mapping_cb, &data);
2186 }
2187 
2188 static int domain_context_mapped_cb(struct pci_dev *pdev,
2189 				    u16 alias, void *opaque)
2190 {
2191 	struct intel_iommu *iommu = opaque;
2192 
2193 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2194 }
2195 
2196 static int domain_context_mapped(struct device *dev)
2197 {
2198 	struct intel_iommu *iommu;
2199 	u8 bus, devfn;
2200 
2201 	iommu = device_to_iommu(dev, &bus, &devfn);
2202 	if (!iommu)
2203 		return -ENODEV;
2204 
2205 	if (!dev_is_pci(dev))
2206 		return device_context_mapped(iommu, bus, devfn);
2207 
2208 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2209 				       domain_context_mapped_cb, iommu);
2210 }
2211 
2212 /* Returns a number of VTD pages, but aligned to MM page size */
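/*
 * With 4KiB pages on both sides this is simply the number of pages the
 * buffer touches, e.g. a 0x20-byte buffer starting at page offset 0xff0
 * crosses a page boundary and counts as two pages. With a larger MM
 * PAGE_SIZE the result is still expressed in 4KiB VT-d pages, but
 * rounded up to cover a whole MM page.
 */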
2213 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2214 					    size_t size)
2215 {
2216 	host_addr &= ~PAGE_MASK;
2217 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2218 }
2219 
2220 /* Return largest possible superpage level for a given mapping */
2221 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2222 					  unsigned long iov_pfn,
2223 					  unsigned long phy_pfn,
2224 					  unsigned long pages)
2225 {
2226 	int support, level = 1;
2227 	unsigned long pfnmerge;
2228 
2229 	support = domain->iommu_superpage;
2230 
2231 	/* To use a large page, the virtual *and* physical addresses
2232 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2233 	   of them will mean we have to use smaller pages. So just
2234 	   merge them and check both at once. */
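	/* With VTD_STRIDE_SHIFT == 9, level 1 is a 4KiB page, level 2 a
	   2MiB superpage and level 3 a 1GiB superpage. E.g. mapping 512
	   pages with both pfns aligned to 512 returns level 2, assuming
	   the hardware advertises 2MiB superpage support. */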
2235 	pfnmerge = iov_pfn | phy_pfn;
2236 
2237 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2238 		pages >>= VTD_STRIDE_SHIFT;
2239 		if (!pages)
2240 			break;
2241 		pfnmerge >>= VTD_STRIDE_SHIFT;
2242 		level++;
2243 		support--;
2244 	}
2245 	return level;
2246 }
2247 
2248 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2249 			    struct scatterlist *sg, unsigned long phys_pfn,
2250 			    unsigned long nr_pages, int prot)
2251 {
2252 	struct dma_pte *first_pte = NULL, *pte = NULL;
2253 	phys_addr_t uninitialized_var(pteval);
2254 	unsigned long sg_res = 0;
2255 	unsigned int largepage_lvl = 0;
2256 	unsigned long lvl_pages = 0;
2257 
2258 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2259 
2260 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2261 		return -EINVAL;
2262 
2263 	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2264 
2265 	if (!sg) {
2266 		sg_res = nr_pages;
2267 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2268 	}
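	/*
	 * Two modes of operation: with sg == NULL the caller supplies one
	 * physically contiguous range (phys_pfn, nr_pages) and pteval just
	 * advances linearly; with an sg list, sg_res tracks how many VT-d
	 * pages remain in the current scatterlist element before the next
	 * element is picked up at the bottom of the loop.
	 */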
2269 
2270 	while (nr_pages > 0) {
2271 		uint64_t tmp;
2272 
2273 		if (!sg_res) {
2274 			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2275 
2276 			sg_res = aligned_nrpages(sg->offset, sg->length);
2277 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2278 			sg->dma_length = sg->length;
2279 			pteval = (sg_phys(sg) - pgoff) | prot;
2280 			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2281 		}
2282 
2283 		if (!pte) {
2284 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2285 
2286 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2287 			if (!pte)
2288 				return -ENOMEM;
2289 			/* It is a large page */
2290 			if (largepage_lvl > 1) {
2291 				unsigned long nr_superpages, end_pfn;
2292 
2293 				pteval |= DMA_PTE_LARGE_PAGE;
2294 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2295 
2296 				nr_superpages = sg_res / lvl_pages;
2297 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2298 
2299 				/*
2300 				 * Ensure that old small page tables are
2301 				 * removed to make room for superpage(s).
2302 				 * We're adding new large pages, so make sure
2303 				 * we don't remove their parent tables.
2304 				 */
2305 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2306 						       largepage_lvl + 1);
2307 			} else {
2308 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2309 			}
2310 
2311 		}
2312 		/* We don't need a lock here, nobody else
2313 		 * touches the iova range
2314 		 */
2315 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2316 		if (tmp) {
2317 			static int dumps = 5;
2318 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2319 				iov_pfn, tmp, (unsigned long long)pteval);
2320 			if (dumps) {
2321 				dumps--;
2322 				debug_dma_dump_mappings(NULL);
2323 			}
2324 			WARN_ON(1);
2325 		}
2326 
2327 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2328 
2329 		BUG_ON(nr_pages < lvl_pages);
2330 		BUG_ON(sg_res < lvl_pages);
2331 
2332 		nr_pages -= lvl_pages;
2333 		iov_pfn += lvl_pages;
2334 		phys_pfn += lvl_pages;
2335 		pteval += lvl_pages * VTD_PAGE_SIZE;
2336 		sg_res -= lvl_pages;
2337 
2338 		/* If the next PTE would be the first in a new page, then we
2339 		   need to flush the cache on the entries we've just written.
2340 		   And then we'll need to recalculate 'pte', so clear it and
2341 		   let it get set again in the if (!pte) block above.
2342 
2343 		   If we're done (!nr_pages) we need to flush the cache too.
2344 
2345 		   Also if we've been setting superpages, we may need to
2346 		   recalculate 'pte' and switch back to smaller pages for the
2347 		   end of the mapping, if the trailing size is not enough to
2348 		   use another superpage (i.e. sg_res < lvl_pages). */
2349 		pte++;
2350 		if (!nr_pages || first_pte_in_page(pte) ||
2351 		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2352 			domain_flush_cache(domain, first_pte,
2353 					   (void *)pte - (void *)first_pte);
2354 			pte = NULL;
2355 		}
2356 
2357 		if (!sg_res && nr_pages)
2358 			sg = sg_next(sg);
2359 	}
2360 	return 0;
2361 }
2362 
2363 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2364 				    struct scatterlist *sg, unsigned long nr_pages,
2365 				    int prot)
2366 {
2367 	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2368 }
2369 
2370 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2371 				     unsigned long phys_pfn, unsigned long nr_pages,
2372 				     int prot)
2373 {
2374 	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2375 }
2376 
2377 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2378 {
2379 	unsigned long flags;
2380 	struct context_entry *context;
2381 	u16 did_old;
2382 
2383 	if (!iommu)
2384 		return;
2385 
2386 	spin_lock_irqsave(&iommu->lock, flags);
2387 	context = iommu_context_addr(iommu, bus, devfn, 0);
2388 	if (!context) {
2389 		spin_unlock_irqrestore(&iommu->lock, flags);
2390 		return;
2391 	}
2392 	did_old = context_domain_id(context);
2393 	context_clear_entry(context);
2394 	__iommu_flush_cache(iommu, context, sizeof(*context));
2395 	spin_unlock_irqrestore(&iommu->lock, flags);
2396 	iommu->flush.flush_context(iommu,
2397 				   did_old,
2398 				   (((u16)bus) << 8) | devfn,
2399 				   DMA_CCMD_MASK_NOBIT,
2400 				   DMA_CCMD_DEVICE_INVL);
2401 	iommu->flush.flush_iotlb(iommu,
2402 				 did_old,
2403 				 0,
2404 				 0,
2405 				 DMA_TLB_DSI_FLUSH);
2406 }
2407 
2408 static inline void unlink_domain_info(struct device_domain_info *info)
2409 {
2410 	assert_spin_locked(&device_domain_lock);
2411 	list_del(&info->link);
2412 	list_del(&info->global);
2413 	if (info->dev)
2414 		info->dev->archdata.iommu = NULL;
2415 }
2416 
2417 static void domain_remove_dev_info(struct dmar_domain *domain)
2418 {
2419 	struct device_domain_info *info, *tmp;
2420 	unsigned long flags;
2421 
2422 	spin_lock_irqsave(&device_domain_lock, flags);
2423 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2424 		__dmar_remove_one_dev_info(info);
2425 	spin_unlock_irqrestore(&device_domain_lock, flags);
2426 }
2427 
2428 /*
2429  * find_domain
2430  * Note: we use struct device->archdata.iommu to store the info
2431  */
2432 static struct dmar_domain *find_domain(struct device *dev)
2433 {
2434 	struct device_domain_info *info;
2435 
2436 	/* No lock here, assumes no domain exit in normal case */
2437 	info = dev->archdata.iommu;
2438 	if (likely(info))
2439 		return info->domain;
2440 	return NULL;
2441 }
2442 
2443 static inline struct device_domain_info *
2444 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2445 {
2446 	struct device_domain_info *info;
2447 
2448 	list_for_each_entry(info, &device_domain_list, global)
2449 		if (info->iommu->segment == segment && info->bus == bus &&
2450 		    info->devfn == devfn)
2451 			return info;
2452 
2453 	return NULL;
2454 }
2455 
2456 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2457 						    int bus, int devfn,
2458 						    struct device *dev,
2459 						    struct dmar_domain *domain)
2460 {
2461 	struct dmar_domain *found = NULL;
2462 	struct device_domain_info *info;
2463 	unsigned long flags;
2464 	int ret;
2465 
2466 	info = alloc_devinfo_mem();
2467 	if (!info)
2468 		return NULL;
2469 
2470 	info->bus = bus;
2471 	info->devfn = devfn;
2472 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2473 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2474 	info->ats_qdep = 0;
2475 	info->dev = dev;
2476 	info->domain = domain;
2477 	info->iommu = iommu;
2478 
2479 	if (dev && dev_is_pci(dev)) {
2480 		struct pci_dev *pdev = to_pci_dev(info->dev);
2481 
2482 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2483 		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2484 		    dmar_find_matched_atsr_unit(pdev))
2485 			info->ats_supported = 1;
2486 
2487 		if (ecs_enabled(iommu)) {
2488 			if (pasid_enabled(iommu)) {
2489 				int features = pci_pasid_features(pdev);
2490 				if (features >= 0)
2491 					info->pasid_supported = features | 1;
2492 			}
2493 
2494 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2495 			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2496 				info->pri_supported = 1;
2497 		}
2498 	}
2499 
2500 	spin_lock_irqsave(&device_domain_lock, flags);
2501 	if (dev)
2502 		found = find_domain(dev);
2503 
2504 	if (!found) {
2505 		struct device_domain_info *info2;
2506 		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2507 		if (info2) {
2508 			found      = info2->domain;
2509 			info2->dev = dev;
2510 		}
2511 	}
2512 
2513 	if (found) {
2514 		spin_unlock_irqrestore(&device_domain_lock, flags);
2515 		free_devinfo_mem(info);
2516 		/* Caller must free the original domain */
2517 		return found;
2518 	}
2519 
2520 	spin_lock(&iommu->lock);
2521 	ret = domain_attach_iommu(domain, iommu);
2522 	spin_unlock(&iommu->lock);
2523 
2524 	if (ret) {
2525 		spin_unlock_irqrestore(&device_domain_lock, flags);
2526 		free_devinfo_mem(info);
2527 		return NULL;
2528 	}
2529 
2530 	list_add(&info->link, &domain->devices);
2531 	list_add(&info->global, &device_domain_list);
2532 	if (dev)
2533 		dev->archdata.iommu = info;
2534 	spin_unlock_irqrestore(&device_domain_lock, flags);
2535 
2536 	if (dev && domain_context_mapping(domain, dev)) {
2537 		pr_err("Domain context map for %s failed\n", dev_name(dev));
2538 		dmar_remove_one_dev_info(domain, dev);
2539 		return NULL;
2540 	}
2541 
2542 	return domain;
2543 }
2544 
2545 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2546 {
2547 	*(u16 *)opaque = alias;
2548 	return 0;
2549 }
2550 
2551 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2552 {
2553 	struct device_domain_info *info = NULL;
2554 	struct dmar_domain *domain = NULL;
2555 	struct intel_iommu *iommu;
2556 	u16 req_id, dma_alias;
2557 	unsigned long flags;
2558 	u8 bus, devfn;
2559 
2560 	iommu = device_to_iommu(dev, &bus, &devfn);
2561 	if (!iommu)
2562 		return NULL;
2563 
2564 	req_id = ((u16)bus << 8) | devfn;
2565 
2566 	if (dev_is_pci(dev)) {
2567 		struct pci_dev *pdev = to_pci_dev(dev);
2568 
2569 		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2570 
2571 		spin_lock_irqsave(&device_domain_lock, flags);
2572 		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2573 						      PCI_BUS_NUM(dma_alias),
2574 						      dma_alias & 0xff);
2575 		if (info) {
2576 			iommu = info->iommu;
2577 			domain = info->domain;
2578 		}
2579 		spin_unlock_irqrestore(&device_domain_lock, flags);
2580 
2581 		/* DMA alias already has a domain, use it */
2582 		if (info)
2583 			goto out;
2584 	}
2585 
2586 	/* Allocate and initialize new domain for the device */
2587 	domain = alloc_domain(0);
2588 	if (!domain)
2589 		return NULL;
2590 	if (domain_init(domain, iommu, gaw)) {
2591 		domain_exit(domain);
2592 		return NULL;
2593 	}
2594 
2595 out:
2596 
2597 	return domain;
2598 }
2599 
2600 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2601 					      struct dmar_domain *domain)
2602 {
2603 	struct intel_iommu *iommu;
2604 	struct dmar_domain *tmp;
2605 	u16 req_id, dma_alias;
2606 	u8 bus, devfn;
2607 
2608 	iommu = device_to_iommu(dev, &bus, &devfn);
2609 	if (!iommu)
2610 		return NULL;
2611 
2612 	req_id = ((u16)bus << 8) | devfn;
2613 
2614 	if (dev_is_pci(dev)) {
2615 		struct pci_dev *pdev = to_pci_dev(dev);
2616 
2617 		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2618 
2619 		/* register PCI DMA alias device */
2620 		if (req_id != dma_alias) {
2621 			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2622 					dma_alias & 0xff, NULL, domain);
2623 
2624 			if (!tmp || tmp != domain)
2625 				return tmp;
2626 		}
2627 	}
2628 
2629 	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2630 	if (!tmp || tmp != domain)
2631 		return tmp;
2632 
2633 	return domain;
2634 }
2635 
2636 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2637 {
2638 	struct dmar_domain *domain, *tmp;
2639 
2640 	domain = find_domain(dev);
2641 	if (domain)
2642 		goto out;
2643 
2644 	domain = find_or_alloc_domain(dev, gaw);
2645 	if (!domain)
2646 		goto out;
2647 
2648 	tmp = set_domain_for_dev(dev, domain);
2649 	if (!tmp || domain != tmp) {
2650 		domain_exit(domain);
2651 		domain = tmp;
2652 	}
2653 
2654 out:
2655 
2656 	return domain;
2657 }
2658 
2659 static int iommu_domain_identity_map(struct dmar_domain *domain,
2660 				     unsigned long long start,
2661 				     unsigned long long end)
2662 {
2663 	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2664 	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2665 
2666 	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2667 			  dma_to_mm_pfn(last_vpfn))) {
2668 		pr_err("Reserving iova failed\n");
2669 		return -ENOMEM;
2670 	}
2671 
2672 	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2673 	/*
2674 	 * RMRR range might have overlap with physical memory range,
2675 	 * clear it first
2676 	 */
2677 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2678 
2679 	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2680 				  last_vpfn - first_vpfn + 1,
2681 				  DMA_PTE_READ|DMA_PTE_WRITE);
2682 }
2683 
2684 static int domain_prepare_identity_map(struct device *dev,
2685 				       struct dmar_domain *domain,
2686 				       unsigned long long start,
2687 				       unsigned long long end)
2688 {
2689 	/* For _hardware_ passthrough, don't bother. But for software
2690 	   passthrough, we do it anyway -- it may indicate a memory
2691 	   range which is reserved in E820, and so didn't get set
2692 	   up to start with in si_domain */
2693 	if (domain == si_domain && hw_pass_through) {
2694 		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2695 			dev_name(dev), start, end);
2696 		return 0;
2697 	}
2698 
2699 	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2700 		dev_name(dev), start, end);
2701 
2702 	if (end < start) {
2703 		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2704 			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2705 			dmi_get_system_info(DMI_BIOS_VENDOR),
2706 			dmi_get_system_info(DMI_BIOS_VERSION),
2707 			dmi_get_system_info(DMI_PRODUCT_VERSION));
2708 		return -EIO;
2709 	}
2710 
2711 	if (end >> agaw_to_width(domain->agaw)) {
2712 		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2713 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2714 		     agaw_to_width(domain->agaw),
2715 		     dmi_get_system_info(DMI_BIOS_VENDOR),
2716 		     dmi_get_system_info(DMI_BIOS_VERSION),
2717 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2718 		return -EIO;
2719 	}
2720 
2721 	return iommu_domain_identity_map(domain, start, end);
2722 }
2723 
2724 static int iommu_prepare_identity_map(struct device *dev,
2725 				      unsigned long long start,
2726 				      unsigned long long end)
2727 {
2728 	struct dmar_domain *domain;
2729 	int ret;
2730 
2731 	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2732 	if (!domain)
2733 		return -ENOMEM;
2734 
2735 	ret = domain_prepare_identity_map(dev, domain, start, end);
2736 	if (ret)
2737 		domain_exit(domain);
2738 
2739 	return ret;
2740 }
2741 
2742 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2743 					 struct device *dev)
2744 {
2745 	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2746 		return 0;
2747 	return iommu_prepare_identity_map(dev, rmrr->base_address,
2748 					  rmrr->end_address);
2749 }
2750 
2751 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2752 static inline void iommu_prepare_isa(void)
2753 {
2754 	struct pci_dev *pdev;
2755 	int ret;
2756 
2757 	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2758 	if (!pdev)
2759 		return;
2760 
2761 	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2762 	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2763 
2764 	if (ret)
2765 		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2766 
2767 	pci_dev_put(pdev);
2768 }
2769 #else
2770 static inline void iommu_prepare_isa(void)
2771 {
2772 	return;
2773 }
2774 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2775 
2776 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2777 
2778 static int __init si_domain_init(int hw)
2779 {
2780 	int nid, ret = 0;
2781 
2782 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2783 	if (!si_domain)
2784 		return -EFAULT;
2785 
2786 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2787 		domain_exit(si_domain);
2788 		return -EFAULT;
2789 	}
2790 
2791 	pr_debug("Identity mapping domain allocated\n");
2792 
2793 	if (hw)
2794 		return 0;
2795 
2796 	for_each_online_node(nid) {
2797 		unsigned long start_pfn, end_pfn;
2798 		int i;
2799 
2800 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2801 			ret = iommu_domain_identity_map(si_domain,
2802 					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2803 			if (ret)
2804 				return ret;
2805 		}
2806 	}
2807 
2808 	return 0;
2809 }
2810 
2811 static int identity_mapping(struct device *dev)
2812 {
2813 	struct device_domain_info *info;
2814 
2815 	if (likely(!iommu_identity_mapping))
2816 		return 0;
2817 
2818 	info = dev->archdata.iommu;
2819 	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2820 		return (info->domain == si_domain);
2821 
2822 	return 0;
2823 }
2824 
2825 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2826 {
2827 	struct dmar_domain *ndomain;
2828 	struct intel_iommu *iommu;
2829 	u8 bus, devfn;
2830 
2831 	iommu = device_to_iommu(dev, &bus, &devfn);
2832 	if (!iommu)
2833 		return -ENODEV;
2834 
2835 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2836 	if (ndomain != domain)
2837 		return -EBUSY;
2838 
2839 	return 0;
2840 }
2841 
2842 static bool device_has_rmrr(struct device *dev)
2843 {
2844 	struct dmar_rmrr_unit *rmrr;
2845 	struct device *tmp;
2846 	int i;
2847 
2848 	rcu_read_lock();
2849 	for_each_rmrr_units(rmrr) {
2850 		/*
2851 		 * Return TRUE if this RMRR contains the device that
2852 		 * is passed in.
2853 		 */
2854 		for_each_active_dev_scope(rmrr->devices,
2855 					  rmrr->devices_cnt, i, tmp)
2856 			if (tmp == dev) {
2857 				rcu_read_unlock();
2858 				return true;
2859 			}
2860 	}
2861 	rcu_read_unlock();
2862 	return false;
2863 }
2864 
2865 /*
2866  * There are a couple of cases where we need to restrict the functionality of
2867  * devices associated with RMRRs.  The first is when evaluating a device for
2868  * identity mapping because problems exist when devices are moved in and out
2869  * of domains and their respective RMRR information is lost.  This means that
2870  * a device with associated RMRRs will never be in a "passthrough" domain.
2871  * The second is use of the device through the IOMMU API.  This interface
2872  * expects to have full control of the IOVA space for the device.  We cannot
2873  * satisfy both the requirement that RMRR access is maintained and have an
2874  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2875  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2876  * We therefore prevent devices associated with an RMRR from participating in
2877  * the IOMMU API, which eliminates them from device assignment.
2878  *
2879  * In both cases we assume that PCI USB devices with RMRRs have them largely
2880  * for historical reasons and that the RMRR space is not actively used post
2881  * boot.  This exclusion may change if vendors begin to abuse it.
2882  *
2883  * The same exception is made for graphics devices, with the requirement that
2884  * any use of the RMRR regions will be torn down before assigning the device
2885  * to a guest.
2886  */
2887 static bool device_is_rmrr_locked(struct device *dev)
2888 {
2889 	if (!device_has_rmrr(dev))
2890 		return false;
2891 
2892 	if (dev_is_pci(dev)) {
2893 		struct pci_dev *pdev = to_pci_dev(dev);
2894 
2895 		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2896 			return false;
2897 	}
2898 
2899 	return true;
2900 }
2901 
2902 static int iommu_should_identity_map(struct device *dev, int startup)
2903 {
2904 
2905 	if (dev_is_pci(dev)) {
2906 		struct pci_dev *pdev = to_pci_dev(dev);
2907 
2908 		if (device_is_rmrr_locked(dev))
2909 			return 0;
2910 
2911 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2912 			return 1;
2913 
2914 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2915 			return 1;
2916 
2917 		if (!(iommu_identity_mapping & IDENTMAP_ALL))
2918 			return 0;
2919 
2920 		/*
2921 		 * We want to start off with all devices in the 1:1 domain, and
2922 		 * take them out later if we find they can't access all of memory.
2923 		 *
2924 		 * However, we can't do this for PCI devices behind bridges,
2925 		 * because all PCI devices behind the same bridge will end up
2926 		 * with the same source-id on their transactions.
2927 		 *
2928 		 * Practically speaking, we can't change things around for these
2929 		 * devices at run-time, because we can't be sure there'll be no
2930 		 * DMA transactions in flight for any of their siblings.
2931 		 *
2932 		 * So PCI devices (unless they're on the root bus) as well as
2933 		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2934 		 * the 1:1 domain, just in _case_ one of their siblings turns out
2935 		 * not to be able to map all of memory.
2936 		 */
2937 		if (!pci_is_pcie(pdev)) {
2938 			if (!pci_is_root_bus(pdev->bus))
2939 				return 0;
2940 			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2941 				return 0;
2942 		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2943 			return 0;
2944 	} else {
2945 		if (device_has_rmrr(dev))
2946 			return 0;
2947 	}
2948 
2949 	/*
2950 	 * At boot time, we don't yet know if devices will be 64-bit capable.
2951 	 * Assume that they will — if they turn out not to be, then we can
2952 	 * take them out of the 1:1 domain later.
2953 	 */
2954 	if (!startup) {
2955 		/*
2956 		 * If the device's dma_mask is less than the system's memory
2957 		 * size then this is not a candidate for identity mapping.
2958 		 */
2959 		u64 dma_mask = *dev->dma_mask;
2960 
2961 		if (dev->coherent_dma_mask &&
2962 		    dev->coherent_dma_mask < dma_mask)
2963 			dma_mask = dev->coherent_dma_mask;
2964 
2965 		return dma_mask >= dma_get_required_mask(dev);
2966 	}
2967 
2968 	return 1;
2969 }
2970 
2971 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2972 {
2973 	int ret;
2974 
2975 	if (!iommu_should_identity_map(dev, 1))
2976 		return 0;
2977 
2978 	ret = domain_add_dev_info(si_domain, dev);
2979 	if (!ret)
2980 		pr_info("%s identity mapping for device %s\n",
2981 			hw ? "Hardware" : "Software", dev_name(dev));
2982 	else if (ret == -ENODEV)
2983 		/* device not associated with an iommu */
2984 		ret = 0;
2985 
2986 	return ret;
2987 }
2988 
2989 
2990 static int __init iommu_prepare_static_identity_mapping(int hw)
2991 {
2992 	struct pci_dev *pdev = NULL;
2993 	struct dmar_drhd_unit *drhd;
2994 	struct intel_iommu *iommu;
2995 	struct device *dev;
2996 	int i;
2997 	int ret = 0;
2998 
2999 	for_each_pci_dev(pdev) {
3000 		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3001 		if (ret)
3002 			return ret;
3003 	}
3004 
3005 	for_each_active_iommu(iommu, drhd)
3006 		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3007 			struct acpi_device_physical_node *pn;
3008 			struct acpi_device *adev;
3009 
3010 			if (dev->bus != &acpi_bus_type)
3011 				continue;
3012 
3013 			adev = to_acpi_device(dev);
3014 			mutex_lock(&adev->physical_node_lock);
3015 			list_for_each_entry(pn, &adev->physical_node_list, node) {
3016 				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3017 				if (ret)
3018 					break;
3019 			}
3020 			mutex_unlock(&adev->physical_node_lock);
3021 			if (ret)
3022 				return ret;
3023 		}
3024 
3025 	return 0;
3026 }
3027 
3028 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3029 {
3030 	/*
3031 	 * Start from a sane IOMMU hardware state.
3032 	 * If queued invalidation was already initialized by us
3033 	 * (for example, while enabling interrupt remapping), then
3034 	 * things are already rolling from a sane state.
3035 	 */
3036 	if (!iommu->qi) {
3037 		/*
3038 		 * Clear any previous faults.
3039 		 */
3040 		dmar_fault(-1, iommu);
3041 		/*
3042 		 * Disable queued invalidation if supported and already enabled
3043 		 * before OS handover.
3044 		 */
3045 		dmar_disable_qi(iommu);
3046 	}
3047 
3048 	if (dmar_enable_qi(iommu)) {
3049 		/*
3050 		 * Queued invalidation is not enabled; use register-based invalidation.
3051 		 */
3052 		iommu->flush.flush_context = __iommu_flush_context;
3053 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3054 		pr_info("%s: Using Register based invalidation\n",
3055 			iommu->name);
3056 	} else {
3057 		iommu->flush.flush_context = qi_flush_context;
3058 		iommu->flush.flush_iotlb = qi_flush_iotlb;
3059 		pr_info("%s: Using Queued invalidation\n", iommu->name);
3060 	}
3061 }
3062 
3063 static int copy_context_table(struct intel_iommu *iommu,
3064 			      struct root_entry *old_re,
3065 			      struct context_entry **tbl,
3066 			      int bus, bool ext)
3067 {
3068 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3069 	struct context_entry *new_ce = NULL, ce;
3070 	struct context_entry *old_ce = NULL;
3071 	struct root_entry re;
3072 	phys_addr_t old_ce_phys;
3073 
3074 	tbl_idx = ext ? bus * 2 : bus;
3075 	memcpy(&re, old_re, sizeof(re));
3076 
3077 	for (devfn = 0; devfn < 256; devfn++) {
3078 		/* First calculate the correct index */
3079 		idx = (ext ? devfn * 2 : devfn) % 256;
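		/*
		 * With extended context entries each entry is twice the
		 * size, so a single 4KiB table only covers 128 devfns:
		 * devfn 0x00-0x7f is reached through the lower context
		 * table pointer and 0x80-0xff through the upper one, which
		 * is why idx wraps modulo 256 and tbl_idx is bus * 2.
		 */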
3080 
3081 		if (idx == 0) {
3082 			/* First save what we may have and clean up */
3083 			if (new_ce) {
3084 				tbl[tbl_idx] = new_ce;
3085 				__iommu_flush_cache(iommu, new_ce,
3086 						    VTD_PAGE_SIZE);
3087 				pos = 1;
3088 			}
3089 
3090 			if (old_ce)
3091 				memunmap(old_ce);
3092 
3093 			ret = 0;
3094 			if (devfn < 0x80)
3095 				old_ce_phys = root_entry_lctp(&re);
3096 			else
3097 				old_ce_phys = root_entry_uctp(&re);
3098 
3099 			if (!old_ce_phys) {
3100 				if (ext && devfn == 0) {
3101 					/* No LCTP, try UCTP */
3102 					devfn = 0x7f;
3103 					continue;
3104 				} else {
3105 					goto out;
3106 				}
3107 			}
3108 
3109 			ret = -ENOMEM;
3110 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3111 					MEMREMAP_WB);
3112 			if (!old_ce)
3113 				goto out;
3114 
3115 			new_ce = alloc_pgtable_page(iommu->node);
3116 			if (!new_ce)
3117 				goto out_unmap;
3118 
3119 			ret = 0;
3120 		}
3121 
3122 		/* Now copy the context entry */
3123 		memcpy(&ce, old_ce + idx, sizeof(ce));
3124 
3125 		if (!__context_present(&ce))
3126 			continue;
3127 
3128 		did = context_domain_id(&ce);
3129 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3130 			set_bit(did, iommu->domain_ids);
3131 
3132 		/*
3133 		 * We need a marker for copied context entries. This
3134 		 * marker needs to work for the old format as well as
3135 		 * for extended context entries.
3136 		 *
3137 		 * Bit 67 of the context entry is used. In the old
3138 		 * format this bit is available to software, in the
3139 		 * extended format it is the PGE bit, but PGE is ignored
3140 		 * by HW if PASIDs are disabled (and thus still
3141 		 * available).
3142 		 *
3143 		 * So disable PASIDs first and then mark the entry
3144 		 * copied. This means that we don't copy PASID
3145 		 * translations from the old kernel, but this is fine as
3146 		 * faults there are not fatal.
3147 		 */
3148 		context_clear_pasid_enable(&ce);
3149 		context_set_copied(&ce);
3150 
3151 		new_ce[idx] = ce;
3152 	}
3153 
3154 	tbl[tbl_idx + pos] = new_ce;
3155 
3156 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3157 
3158 out_unmap:
3159 	memunmap(old_ce);
3160 
3161 out:
3162 	return ret;
3163 }
3164 
3165 static int copy_translation_tables(struct intel_iommu *iommu)
3166 {
3167 	struct context_entry **ctxt_tbls;
3168 	struct root_entry *old_rt;
3169 	phys_addr_t old_rt_phys;
3170 	int ctxt_table_entries;
3171 	unsigned long flags;
3172 	u64 rtaddr_reg;
3173 	int bus, ret;
3174 	bool new_ext, ext;
3175 
3176 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3177 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3178 	new_ext    = !!ecap_ecs(iommu->ecap);
3179 
3180 	/*
3181 	 * The RTT bit can only be changed when translation is disabled,
3182 	 * but disabling translation means to open a window for data
3183 	 * corruption. So bail out and don't copy anything if we would
3184 	 * have to change the bit.
3185 	 */
3186 	if (new_ext != ext)
3187 		return -EINVAL;
3188 
3189 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3190 	if (!old_rt_phys)
3191 		return -EINVAL;
3192 
3193 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3194 	if (!old_rt)
3195 		return -ENOMEM;
3196 
3197 	/* This is too big for the stack - allocate it from slab */
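	/*
	 * One context table per bus, or two per bus in extended mode (one
	 * for devfn 0x00-0x7f and one for 0x80-0xff), hence 256 vs. 512
	 * entries in the temporary pointer array.
	 */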
3198 	ctxt_table_entries = ext ? 512 : 256;
3199 	ret = -ENOMEM;
3200 	ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3201 	if (!ctxt_tbls)
3202 		goto out_unmap;
3203 
3204 	for (bus = 0; bus < 256; bus++) {
3205 		ret = copy_context_table(iommu, &old_rt[bus],
3206 					 ctxt_tbls, bus, ext);
3207 		if (ret) {
3208 			pr_err("%s: Failed to copy context table for bus %d\n",
3209 				iommu->name, bus);
3210 			continue;
3211 		}
3212 	}
3213 
3214 	spin_lock_irqsave(&iommu->lock, flags);
3215 
3216 	/* Context tables are copied, now write them to the root_entry table */
3217 	for (bus = 0; bus < 256; bus++) {
3218 		int idx = ext ? bus * 2 : bus;
3219 		u64 val;
3220 
3221 		if (ctxt_tbls[idx]) {
3222 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3223 			iommu->root_entry[bus].lo = val;
3224 		}
3225 
3226 		if (!ext || !ctxt_tbls[idx + 1])
3227 			continue;
3228 
3229 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3230 		iommu->root_entry[bus].hi = val;
3231 	}
3232 
3233 	spin_unlock_irqrestore(&iommu->lock, flags);
3234 
3235 	kfree(ctxt_tbls);
3236 
3237 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3238 
3239 	ret = 0;
3240 
3241 out_unmap:
3242 	memunmap(old_rt);
3243 
3244 	return ret;
3245 }
3246 
3247 static int __init init_dmars(void)
3248 {
3249 	struct dmar_drhd_unit *drhd;
3250 	struct dmar_rmrr_unit *rmrr;
3251 	bool copied_tables = false;
3252 	struct device *dev;
3253 	struct intel_iommu *iommu;
3254 	int i, ret;
3255 
3256 	/*
3257 	 * for each drhd
3258 	 *    allocate root
3259 	 *    initialize and program root entry to not present
3260 	 * endfor
3261 	 */
3262 	for_each_drhd_unit(drhd) {
3263 		/*
3264 		 * lock not needed as this is only incremented in the single
3265 		 * threaded kernel __init code path all other access are read
3266 		 * only
3267 		 */
3268 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3269 			g_num_of_iommus++;
3270 			continue;
3271 		}
3272 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3273 	}
3274 
3275 	/* Preallocate enough resources for IOMMU hot-addition */
3276 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3277 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3278 
3279 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3280 			GFP_KERNEL);
3281 	if (!g_iommus) {
3282 		pr_err("Allocating global iommu array failed\n");
3283 		ret = -ENOMEM;
3284 		goto error;
3285 	}
3286 
3287 	for_each_active_iommu(iommu, drhd) {
3288 		g_iommus[iommu->seq_id] = iommu;
3289 
3290 		intel_iommu_init_qi(iommu);
3291 
3292 		ret = iommu_init_domains(iommu);
3293 		if (ret)
3294 			goto free_iommu;
3295 
3296 		init_translation_status(iommu);
3297 
3298 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3299 			iommu_disable_translation(iommu);
3300 			clear_translation_pre_enabled(iommu);
3301 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3302 				iommu->name);
3303 		}
3304 
3305 		/*
3306 		 * TBD:
3307 		 * we could share the same root & context tables
3308 		 * among all IOMMUs. Need to split it later.
3309 		 */
3310 		ret = iommu_alloc_root_entry(iommu);
3311 		if (ret)
3312 			goto free_iommu;
3313 
3314 		if (translation_pre_enabled(iommu)) {
3315 			pr_info("Translation already enabled - trying to copy translation structures\n");
3316 
3317 			ret = copy_translation_tables(iommu);
3318 			if (ret) {
3319 				/*
3320 				 * We found the IOMMU with translation
3321 				 * enabled - but failed to copy over the
3322 				 * old root-entry table. Try to proceed
3323 				 * by disabling translation now and
3324 				 * allocating a clean root-entry table.
3325 				 * This might cause DMAR faults, but
3326 				 * probably the dump will still succeed.
3327 				 */
3328 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3329 				       iommu->name);
3330 				iommu_disable_translation(iommu);
3331 				clear_translation_pre_enabled(iommu);
3332 			} else {
3333 				pr_info("Copied translation tables from previous kernel for %s\n",
3334 					iommu->name);
3335 				copied_tables = true;
3336 			}
3337 		}
3338 
3339 		if (!ecap_pass_through(iommu->ecap))
3340 			hw_pass_through = 0;
3341 #ifdef CONFIG_INTEL_IOMMU_SVM
3342 		if (pasid_enabled(iommu))
3343 			intel_svm_alloc_pasid_tables(iommu);
3344 #endif
3345 	}
3346 
3347 	/*
3348 	 * Now that qi is enabled on all iommus, set the root entry and flush
3349 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3350 	 * flush_context function will loop forever and the boot hangs.
3351 	 */
3352 	for_each_active_iommu(iommu, drhd) {
3353 		iommu_flush_write_buffer(iommu);
3354 		iommu_set_root_entry(iommu);
3355 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3356 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3357 	}
3358 
3359 	if (iommu_pass_through)
3360 		iommu_identity_mapping |= IDENTMAP_ALL;
3361 
3362 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3363 	dmar_map_gfx = 0;
3364 #endif
3365 
3366 	if (!dmar_map_gfx)
3367 		iommu_identity_mapping |= IDENTMAP_GFX;
3368 
3369 	check_tylersburg_isoch();
3370 
3371 	if (iommu_identity_mapping) {
3372 		ret = si_domain_init(hw_pass_through);
3373 		if (ret)
3374 			goto free_iommu;
3375 	}
3376 
3377 
3378 	/*
3379 	 * If we copied translations from a previous kernel in the kdump
3380 	 * case, we can not assign the devices to domains now, as that
3381 	 * would eliminate the old mappings. So skip this part and defer
3382 	 * the assignment to device driver initialization time.
3383 	 */
3384 	if (copied_tables)
3385 		goto domains_done;
3386 
3387 	/*
3388 	 * If pass through is not set or not enabled, setup context entries for
3389 	 * identity mappings for rmrr, gfx, and isa and may fall back to static
3390 	 * identity mapping if iommu_identity_mapping is set.
3391 	 */
3392 	if (iommu_identity_mapping) {
3393 		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3394 		if (ret) {
3395 			pr_crit("Failed to setup IOMMU pass-through\n");
3396 			goto free_iommu;
3397 		}
3398 	}
3399 	/*
3400 	 * For each rmrr
3401 	 *   for each dev attached to rmrr
3402 	 *   do
3403 	 *     locate drhd for dev, alloc domain for dev
3404 	 *     allocate free domain
3405 	 *     allocate page table entries for rmrr
3406 	 *     if context not allocated for bus
3407 	 *           allocate and init context
3408 	 *           set present in root table for this bus
3409 	 *     init context with domain, translation etc
3410 	 *    endfor
3411 	 * endfor
3412 	 */
3413 	pr_info("Setting RMRR:\n");
3414 	for_each_rmrr_units(rmrr) {
3415 		/* some BIOSes list non-existent devices in the DMAR table. */
3416 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3417 					  i, dev) {
3418 			ret = iommu_prepare_rmrr_dev(rmrr, dev);
3419 			if (ret)
3420 				pr_err("Mapping reserved region failed\n");
3421 		}
3422 	}
3423 
3424 	iommu_prepare_isa();
3425 
3426 domains_done:
3427 
3428 	/*
3429 	 * for each drhd
3430 	 *   enable fault log
3431 	 *   global invalidate context cache
3432 	 *   global invalidate iotlb
3433 	 *   enable translation
3434 	 */
3435 	for_each_iommu(iommu, drhd) {
3436 		if (drhd->ignored) {
3437 			/*
3438 			 * we always have to disable PMRs or DMA may fail on
3439 			 * this device
3440 			 */
3441 			if (force_on)
3442 				iommu_disable_protect_mem_regions(iommu);
3443 			continue;
3444 		}
3445 
3446 		iommu_flush_write_buffer(iommu);
3447 
3448 #ifdef CONFIG_INTEL_IOMMU_SVM
3449 		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3450 			ret = intel_svm_enable_prq(iommu);
3451 			if (ret)
3452 				goto free_iommu;
3453 		}
3454 #endif
3455 		ret = dmar_set_interrupt(iommu);
3456 		if (ret)
3457 			goto free_iommu;
3458 
3459 		if (!translation_pre_enabled(iommu))
3460 			iommu_enable_translation(iommu);
3461 
3462 		iommu_disable_protect_mem_regions(iommu);
3463 	}
3464 
3465 	return 0;
3466 
3467 free_iommu:
3468 	for_each_active_iommu(iommu, drhd) {
3469 		disable_dmar_iommu(iommu);
3470 		free_dmar_iommu(iommu);
3471 	}
3472 
3473 	kfree(g_iommus);
3474 
3475 error:
3476 	return ret;
3477 }
3478 
3479 /* This takes a number of _MM_ pages, not VTD pages */
3480 static unsigned long intel_alloc_iova(struct device *dev,
3481 				     struct dmar_domain *domain,
3482 				     unsigned long nrpages, uint64_t dma_mask)
3483 {
3484 	unsigned long iova_pfn = 0;
3485 
3486 	/* Restrict dma_mask to the width that the iommu can handle */
3487 	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3488 	/* Ensure we reserve the whole size-aligned region */
3489 	nrpages = __roundup_pow_of_two(nrpages);
3490 
3491 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3492 		/*
3493 		 * First try to allocate an io virtual address in
3494 		 * DMA_BIT_MASK(32) and if that fails then try allocating
3495 		 * from higher range
3496 		 */
3497 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3498 					   IOVA_PFN(DMA_BIT_MASK(32)));
3499 		if (iova_pfn)
3500 			return iova_pfn;
3501 	}
3502 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3503 	if (unlikely(!iova_pfn)) {
3504 		pr_err("Allocating %ld-page iova for %s failed\n",
3505 		       nrpages, dev_name(dev));
3506 		return 0;
3507 	}
3508 
3509 	return iova_pfn;
3510 }
3511 
3512 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3513 {
3514 	struct dmar_domain *domain, *tmp;
3515 	struct dmar_rmrr_unit *rmrr;
3516 	struct device *i_dev;
3517 	int i, ret;
3518 
3519 	domain = find_domain(dev);
3520 	if (domain)
3521 		goto out;
3522 
3523 	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3524 	if (!domain)
3525 		goto out;
3526 
3527 	/* We have a new domain - setup possible RMRRs for the device */
3528 	rcu_read_lock();
3529 	for_each_rmrr_units(rmrr) {
3530 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3531 					  i, i_dev) {
3532 			if (i_dev != dev)
3533 				continue;
3534 
3535 			ret = domain_prepare_identity_map(dev, domain,
3536 							  rmrr->base_address,
3537 							  rmrr->end_address);
3538 			if (ret)
3539 				dev_err(dev, "Mapping reserved region failed\n");
3540 		}
3541 	}
3542 	rcu_read_unlock();
3543 
3544 	tmp = set_domain_for_dev(dev, domain);
3545 	if (!tmp || domain != tmp) {
3546 		domain_exit(domain);
3547 		domain = tmp;
3548 	}
3549 
3550 out:
3551 
3552 	if (!domain)
3553 		pr_err("Allocating domain for %s failed\n", dev_name(dev));
3554 
3555 
3556 	return domain;
3557 }
3558 
3559 /* Check if the dev needs to go through the non-identity map and unmap process. */
3560 static int iommu_no_mapping(struct device *dev)
3561 {
3562 	int found;
3563 
3564 	if (iommu_dummy(dev))
3565 		return 1;
3566 
3567 	if (!iommu_identity_mapping)
3568 		return 0;
3569 
3570 	found = identity_mapping(dev);
3571 	if (found) {
3572 		if (iommu_should_identity_map(dev, 0))
3573 			return 1;
3574 		else {
3575 			/*
3576 			 * 32 bit DMA is removed from si_domain and falls back
3577 			 * to non-identity mapping.
3578 			 */
3579 			dmar_remove_one_dev_info(si_domain, dev);
3580 			pr_info("32bit %s uses non-identity mapping\n",
3581 				dev_name(dev));
3582 			return 0;
3583 		}
3584 	} else {
3585 		/*
3586 		 * In case a 64 bit DMA device is detached from a VM, the device
3587 		 * is put into si_domain for identity mapping.
3588 		 */
3589 		if (iommu_should_identity_map(dev, 0)) {
3590 			int ret;
3591 			ret = domain_add_dev_info(si_domain, dev);
3592 			if (!ret) {
3593 				pr_info("64bit %s uses identity mapping\n",
3594 					dev_name(dev));
3595 				return 1;
3596 			}
3597 		}
3598 	}
3599 
3600 	return 0;
3601 }
3602 
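/*
 * Map a single physically contiguous buffer for DMA: allocate an IOVA
 * range, install page table entries with the read/write permissions
 * implied by @dir, and flush the IOTLB (in caching mode) or the write
 * buffer before returning the bus address. Devices that bypass
 * translation simply get the physical address back.
 */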
3603 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3604 				     size_t size, int dir, u64 dma_mask)
3605 {
3606 	struct dmar_domain *domain;
3607 	phys_addr_t start_paddr;
3608 	unsigned long iova_pfn;
3609 	int prot = 0;
3610 	int ret;
3611 	struct intel_iommu *iommu;
3612 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3613 
3614 	BUG_ON(dir == DMA_NONE);
3615 
3616 	if (iommu_no_mapping(dev))
3617 		return paddr;
3618 
3619 	domain = get_valid_domain_for_dev(dev);
3620 	if (!domain)
3621 		return 0;
3622 
3623 	iommu = domain_get_iommu(domain);
3624 	size = aligned_nrpages(paddr, size);
3625 
3626 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3627 	if (!iova_pfn)
3628 		goto error;
3629 
3630 	/*
3631 	 * Check if DMAR supports zero-length reads on write only
3632 	 * mappings.
3633 	 */
3634 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3635 			!cap_zlr(iommu->cap))
3636 		prot |= DMA_PTE_READ;
3637 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3638 		prot |= DMA_PTE_WRITE;
3639 	/*
3640 	 * paddr ~ (paddr + size) might span a partial page, so we should map
3641 	 * the whole page.  Note: if two parts of one page are mapped
3642 	 * separately, we might have two guest addresses mapping to the same
3643 	 * host paddr, but this is not a big problem.
3644 	 */
3645 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3646 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3647 	if (ret)
3648 		goto error;
3649 
3650 	/* it's a non-present to present mapping. Only flush if caching mode */
3651 	if (cap_caching_mode(iommu->cap))
3652 		iommu_flush_iotlb_psi(iommu, domain,
3653 				      mm_to_dma_pfn(iova_pfn),
3654 				      size, 0, 1);
3655 	else
3656 		iommu_flush_write_buffer(iommu);
3657 
3658 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3659 	start_paddr += paddr & ~PAGE_MASK;
3660 	return start_paddr;
3661 
3662 error:
3663 	if (iova_pfn)
3664 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3665 	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3666 		dev_name(dev), size, (unsigned long long)paddr, dir);
3667 	return 0;
3668 }
3669 
3670 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3671 				 unsigned long offset, size_t size,
3672 				 enum dma_data_direction dir,
3673 				 unsigned long attrs)
3674 {
3675 	return __intel_map_single(dev, page_to_phys(page) + offset, size,
3676 				  dir, *dev->dma_mask);
3677 }
3678 
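/*
 * Tear down the IOVA range backing a previous mapping. In strict mode
 * (or without a flush queue) the IOTLB is flushed and the IOVA freed
 * synchronously; otherwise the release is deferred via queue_iova().
 */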
3679 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3680 {
3681 	struct dmar_domain *domain;
3682 	unsigned long start_pfn, last_pfn;
3683 	unsigned long nrpages;
3684 	unsigned long iova_pfn;
3685 	struct intel_iommu *iommu;
3686 	struct page *freelist;
3687 
3688 	if (iommu_no_mapping(dev))
3689 		return;
3690 
3691 	domain = find_domain(dev);
3692 	BUG_ON(!domain);
3693 
3694 	iommu = domain_get_iommu(domain);
3695 
3696 	iova_pfn = IOVA_PFN(dev_addr);
3697 
3698 	nrpages = aligned_nrpages(dev_addr, size);
3699 	start_pfn = mm_to_dma_pfn(iova_pfn);
3700 	last_pfn = start_pfn + nrpages - 1;
3701 
3702 	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3703 		 dev_name(dev), start_pfn, last_pfn);
3704 
3705 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3706 
3707 	if (intel_iommu_strict || !has_iova_flush_queue(&domain->iovad)) {
3708 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3709 				      nrpages, !freelist, 0);
3710 		/* free iova */
3711 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3712 		dma_free_pagelist(freelist);
3713 	} else {
3714 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3715 			   (unsigned long)freelist);
3716 		/*
3717 		 * queue up the release of the unmap to save the 1/6th of the
3718 		 * CPU time otherwise used up by the iotlb flush operation...
3719 		 */
3720 	}
3721 }
3722 
3723 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3724 			     size_t size, enum dma_data_direction dir,
3725 			     unsigned long attrs)
3726 {
3727 	intel_unmap(dev, dev_addr, size);
3728 }
3729 
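/*
 * Allocate a coherent buffer, preferring CMA when blocking is allowed,
 * and map it bidirectionally through __intel_map_single(). GFP_DMA and
 * GFP_DMA32 are only applied for devices that bypass translation and
 * have a small coherent DMA mask.
 */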
3730 static void *intel_alloc_coherent(struct device *dev, size_t size,
3731 				  dma_addr_t *dma_handle, gfp_t flags,
3732 				  unsigned long attrs)
3733 {
3734 	struct page *page = NULL;
3735 	int order;
3736 
3737 	size = PAGE_ALIGN(size);
3738 	order = get_order(size);
3739 
3740 	if (!iommu_no_mapping(dev))
3741 		flags &= ~(GFP_DMA | GFP_DMA32);
3742 	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3743 		if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3744 			flags |= GFP_DMA;
3745 		else
3746 			flags |= GFP_DMA32;
3747 	}
3748 
3749 	if (gfpflags_allow_blocking(flags)) {
3750 		unsigned int count = size >> PAGE_SHIFT;
3751 
3752 		page = dma_alloc_from_contiguous(dev, count, order, flags);
3753 		if (page && iommu_no_mapping(dev) &&
3754 		    page_to_phys(page) + size > dev->coherent_dma_mask) {
3755 			dma_release_from_contiguous(dev, page, count);
3756 			page = NULL;
3757 		}
3758 	}
3759 
3760 	if (!page)
3761 		page = alloc_pages(flags, order);
3762 	if (!page)
3763 		return NULL;
3764 	memset(page_address(page), 0, size);
3765 
3766 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3767 					 DMA_BIDIRECTIONAL,
3768 					 dev->coherent_dma_mask);
3769 	if (*dma_handle)
3770 		return page_address(page);
3771 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3772 		__free_pages(page, order);
3773 
3774 	return NULL;
3775 }
3776 
3777 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3778 				dma_addr_t dma_handle, unsigned long attrs)
3779 {
3780 	int order;
3781 	struct page *page = virt_to_page(vaddr);
3782 
3783 	size = PAGE_ALIGN(size);
3784 	order = get_order(size);
3785 
3786 	intel_unmap(dev, dma_handle, size);
3787 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3788 		__free_pages(page, order);
3789 }
3790 
3791 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3792 			   int nelems, enum dma_data_direction dir,
3793 			   unsigned long attrs)
3794 {
3795 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3796 	unsigned long nrpages = 0;
3797 	struct scatterlist *sg;
3798 	int i;
3799 
3800 	for_each_sg(sglist, sg, nelems, i) {
3801 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3802 	}
3803 
3804 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3805 }
3806 
3807 static int intel_nontranslate_map_sg(struct device *hddev,
3808 	struct scatterlist *sglist, int nelems, int dir)
3809 {
3810 	int i;
3811 	struct scatterlist *sg;
3812 
3813 	for_each_sg(sglist, sg, nelems, i) {
3814 		BUG_ON(!sg_page(sg));
3815 		sg->dma_address = sg_phys(sg);
3816 		sg->dma_length = sg->length;
3817 	}
3818 	return nelems;
3819 }
3820 
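/*
 * Map a scatterlist into one contiguous IOVA range: the range is sized
 * to cover every segment, mapped with domain_sg_mapping(), and the
 * whole allocation is rolled back if the mapping fails. Devices that
 * bypass translation fall back to intel_nontranslate_map_sg().
 */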
3821 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3822 			enum dma_data_direction dir, unsigned long attrs)
3823 {
3824 	int i;
3825 	struct dmar_domain *domain;
3826 	size_t size = 0;
3827 	int prot = 0;
3828 	unsigned long iova_pfn;
3829 	int ret;
3830 	struct scatterlist *sg;
3831 	unsigned long start_vpfn;
3832 	struct intel_iommu *iommu;
3833 
3834 	BUG_ON(dir == DMA_NONE);
3835 	if (iommu_no_mapping(dev))
3836 		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3837 
3838 	domain = get_valid_domain_for_dev(dev);
3839 	if (!domain)
3840 		return 0;
3841 
3842 	iommu = domain_get_iommu(domain);
3843 
3844 	for_each_sg(sglist, sg, nelems, i)
3845 		size += aligned_nrpages(sg->offset, sg->length);
3846 
3847 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3848 				*dev->dma_mask);
3849 	if (!iova_pfn) {
3850 		sglist->dma_length = 0;
3851 		return 0;
3852 	}
3853 
3854 	/*
3855 	 * Check if DMAR supports zero-length reads on write only
3856 	 * mappings.
3857 	 */
3858 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3859 			!cap_zlr(iommu->cap))
3860 		prot |= DMA_PTE_READ;
3861 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3862 		prot |= DMA_PTE_WRITE;
3863 
3864 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3865 
3866 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3867 	if (unlikely(ret)) {
3868 		dma_pte_free_pagetable(domain, start_vpfn,
3869 				       start_vpfn + size - 1,
3870 				       agaw_to_level(domain->agaw) + 1);
3871 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3872 		return 0;
3873 	}
3874 
3875 	/* it's a non-present to present mapping. Only flush if caching mode */
3876 	if (cap_caching_mode(iommu->cap))
3877 		iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3878 	else
3879 		iommu_flush_write_buffer(iommu);
3880 
3881 	return nelems;
3882 }
3883 
3884 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3885 {
3886 	return !dma_addr;
3887 }
3888 
3889 const struct dma_map_ops intel_dma_ops = {
3890 	.alloc = intel_alloc_coherent,
3891 	.free = intel_free_coherent,
3892 	.map_sg = intel_map_sg,
3893 	.unmap_sg = intel_unmap_sg,
3894 	.map_page = intel_map_page,
3895 	.unmap_page = intel_unmap_page,
3896 	.mapping_error = intel_mapping_error,
3897 #ifdef CONFIG_X86
3898 	.dma_supported = x86_dma_supported,
3899 #endif
3900 };
3901 
3902 static inline int iommu_domain_cache_init(void)
3903 {
3904 	int ret = 0;
3905 
3906 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3907 					 sizeof(struct dmar_domain),
3908 					 0,
3909 					 SLAB_HWCACHE_ALIGN,
3911 					 NULL);
3912 	if (!iommu_domain_cache) {
3913 		pr_err("Couldn't create iommu_domain cache\n");
3914 		ret = -ENOMEM;
3915 	}
3916 
3917 	return ret;
3918 }
3919 
3920 static inline int iommu_devinfo_cache_init(void)
3921 {
3922 	int ret = 0;
3923 
3924 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3925 					 sizeof(struct device_domain_info),
3926 					 0,
3927 					 SLAB_HWCACHE_ALIGN,
3928 					 NULL);
3929 	if (!iommu_devinfo_cache) {
3930 		pr_err("Couldn't create devinfo cache\n");
3931 		ret = -ENOMEM;
3932 	}
3933 
3934 	return ret;
3935 }
3936 
3937 static int __init iommu_init_mempool(void)
3938 {
3939 	int ret;
3940 	ret = iova_cache_get();
3941 	if (ret)
3942 		return ret;
3943 
3944 	ret = iommu_domain_cache_init();
3945 	if (ret)
3946 		goto domain_error;
3947 
3948 	ret = iommu_devinfo_cache_init();
3949 	if (!ret)
3950 		return ret;
3951 
3952 	kmem_cache_destroy(iommu_domain_cache);
3953 domain_error:
3954 	iova_cache_put();
3955 
3956 	return -ENOMEM;
3957 }
3958 
3959 static void __init iommu_exit_mempool(void)
3960 {
3961 	kmem_cache_destroy(iommu_devinfo_cache);
3962 	kmem_cache_destroy(iommu_domain_cache);
3963 	iova_cache_put();
3964 }
3965 
3966 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3967 {
3968 	struct dmar_drhd_unit *drhd;
3969 	u32 vtbar;
3970 	int rc;
3971 
3972 	/* We know that this device on this chipset has its own IOMMU.
3973 	 * If we find it under a different IOMMU, then the BIOS is lying
3974 	 * to us. Hope that the IOMMU for this device is actually
3975 	 * disabled, and it needs no translation...
3976 	 */
3977 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3978 	if (rc) {
3979 		/* "can't" happen */
3980 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3981 		return;
3982 	}
3983 	vtbar &= 0xffff0000;
3984 
3985 	/* we know that this iommu should be at offset 0xa000 from vtbar */
3986 	drhd = dmar_find_matched_drhd_unit(pdev);
3987 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3988 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3989 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3990 		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3991 	}
3992 }
3993 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3994 
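/*
 * Mark DMAR units that have an empty device scope, or (when dmar_map_gfx
 * is clear) units that cover only graphics devices, as ignored so that
 * no translation is set up behind them.
 */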
3995 static void __init init_no_remapping_devices(void)
3996 {
3997 	struct dmar_drhd_unit *drhd;
3998 	struct device *dev;
3999 	int i;
4000 
4001 	for_each_drhd_unit(drhd) {
4002 		if (!drhd->include_all) {
4003 			for_each_active_dev_scope(drhd->devices,
4004 						  drhd->devices_cnt, i, dev)
4005 				break;
4006 			/* ignore DMAR unit if no devices exist */
4007 			if (i == drhd->devices_cnt)
4008 				drhd->ignored = 1;
4009 		}
4010 	}
4011 
4012 	for_each_active_drhd_unit(drhd) {
4013 		if (drhd->include_all)
4014 			continue;
4015 
4016 		for_each_active_dev_scope(drhd->devices,
4017 					  drhd->devices_cnt, i, dev)
4018 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4019 				break;
4020 		if (i < drhd->devices_cnt)
4021 			continue;
4022 
4023 		/* This IOMMU has *only* gfx devices. Either bypass it or
4024 		   set the gfx_mapped flag, as appropriate */
4025 		if (!dmar_map_gfx) {
4026 			drhd->ignored = 1;
4027 			for_each_active_dev_scope(drhd->devices,
4028 						  drhd->devices_cnt, i, dev)
4029 				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4030 		}
4031 	}
4032 }
4033 
4034 #ifdef CONFIG_SUSPEND
4035 static int init_iommu_hw(void)
4036 {
4037 	struct dmar_drhd_unit *drhd;
4038 	struct intel_iommu *iommu = NULL;
4039 
4040 	for_each_active_iommu(iommu, drhd)
4041 		if (iommu->qi)
4042 			dmar_reenable_qi(iommu);
4043 
4044 	for_each_iommu(iommu, drhd) {
4045 		if (drhd->ignored) {
4046 			/*
4047 			 * we always have to disable PMRs or DMA may fail on
4048 			 * this device
4049 			 */
4050 			if (force_on)
4051 				iommu_disable_protect_mem_regions(iommu);
4052 			continue;
4053 		}
4054 
4055 		iommu_flush_write_buffer(iommu);
4056 
4057 		iommu_set_root_entry(iommu);
4058 
4059 		iommu->flush.flush_context(iommu, 0, 0, 0,
4060 					   DMA_CCMD_GLOBAL_INVL);
4061 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4062 		iommu_enable_translation(iommu);
4063 		iommu_disable_protect_mem_regions(iommu);
4064 	}
4065 
4066 	return 0;
4067 }
4068 
4069 static void iommu_flush_all(void)
4070 {
4071 	struct dmar_drhd_unit *drhd;
4072 	struct intel_iommu *iommu;
4073 
4074 	for_each_active_iommu(iommu, drhd) {
4075 		iommu->flush.flush_context(iommu, 0, 0, 0,
4076 					   DMA_CCMD_GLOBAL_INVL);
4077 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4078 					 DMA_TLB_GLOBAL_FLUSH);
4079 	}
4080 }
4081 
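/*
 * System suspend: flush all caches, disable translation and save the
 * fault event control/data/address registers of every active IOMMU so
 * that iommu_resume() can restore them.
 */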
4082 static int iommu_suspend(void)
4083 {
4084 	struct dmar_drhd_unit *drhd;
4085 	struct intel_iommu *iommu = NULL;
4086 	unsigned long flag;
4087 
4088 	for_each_active_iommu(iommu, drhd) {
4089 		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4090 						 GFP_ATOMIC);
4091 		if (!iommu->iommu_state)
4092 			goto nomem;
4093 	}
4094 
4095 	iommu_flush_all();
4096 
4097 	for_each_active_iommu(iommu, drhd) {
4098 		iommu_disable_translation(iommu);
4099 
4100 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4101 
4102 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4103 			readl(iommu->reg + DMAR_FECTL_REG);
4104 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4105 			readl(iommu->reg + DMAR_FEDATA_REG);
4106 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4107 			readl(iommu->reg + DMAR_FEADDR_REG);
4108 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4109 			readl(iommu->reg + DMAR_FEUADDR_REG);
4110 
4111 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4112 	}
4113 	return 0;
4114 
4115 nomem:
4116 	for_each_active_iommu(iommu, drhd)
4117 		kfree(iommu->iommu_state);
4118 
4119 	return -ENOMEM;
4120 }
4121 
4122 static void iommu_resume(void)
4123 {
4124 	struct dmar_drhd_unit *drhd;
4125 	struct intel_iommu *iommu = NULL;
4126 	unsigned long flag;
4127 
4128 	if (init_iommu_hw()) {
4129 		if (force_on)
4130 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4131 		else
4132 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4133 		return;
4134 	}
4135 
4136 	for_each_active_iommu(iommu, drhd) {
4137 
4138 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4139 
4140 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4141 			iommu->reg + DMAR_FECTL_REG);
4142 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4143 			iommu->reg + DMAR_FEDATA_REG);
4144 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4145 			iommu->reg + DMAR_FEADDR_REG);
4146 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4147 			iommu->reg + DMAR_FEUADDR_REG);
4148 
4149 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4150 	}
4151 
4152 	for_each_active_iommu(iommu, drhd)
4153 		kfree(iommu->iommu_state);
4154 }
4155 
4156 static struct syscore_ops iommu_syscore_ops = {
4157 	.resume		= iommu_resume,
4158 	.suspend	= iommu_suspend,
4159 };
4160 
4161 static void __init init_iommu_pm_ops(void)
4162 {
4163 	register_syscore_ops(&iommu_syscore_ops);
4164 }
4165 
4166 #else
4167 static inline void init_iommu_pm_ops(void) {}
4168 #endif	/* CONFIG_SUSPEND */
4169 
4170 
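/*
 * Parse an ACPI RMRR entry, record its address range and device scope,
 * and add it to the global dmar_rmrr_units list.
 */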
4171 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4172 {
4173 	struct acpi_dmar_reserved_memory *rmrr;
4174 	struct dmar_rmrr_unit *rmrru;
4175 	size_t length;
4176 
4177 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4178 	if (!rmrru)
4179 		goto out;
4180 
4181 	rmrru->hdr = header;
4182 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4183 	rmrru->base_address = rmrr->base_address;
4184 	rmrru->end_address = rmrr->end_address;
4185 
4186 	length = rmrr->end_address - rmrr->base_address + 1;
4187 
4188 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4189 				((void *)rmrr) + rmrr->header.length,
4190 				&rmrru->devices_cnt);
4191 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4192 		goto free_rmrru;
4193 
4194 	list_add(&rmrru->list, &dmar_rmrr_units);
4195 
4196 	return 0;
4197 free_rmrru:
4198 	kfree(rmrru);
4199 out:
4200 	return -ENOMEM;
4201 }
4202 
4203 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4204 {
4205 	struct dmar_atsr_unit *atsru;
4206 	struct acpi_dmar_atsr *tmp;
4207 
4208 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4209 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4210 		if (atsr->segment != tmp->segment)
4211 			continue;
4212 		if (atsr->header.length != tmp->header.length)
4213 			continue;
4214 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4215 			return atsru;
4216 	}
4217 
4218 	return NULL;
4219 }
4220 
4221 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4222 {
4223 	struct acpi_dmar_atsr *atsr;
4224 	struct dmar_atsr_unit *atsru;
4225 
4226 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4227 		return 0;
4228 
4229 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4230 	atsru = dmar_find_atsr(atsr);
4231 	if (atsru)
4232 		return 0;
4233 
4234 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4235 	if (!atsru)
4236 		return -ENOMEM;
4237 
4238 	/*
4239 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4240 	 * copy the memory content because the memory buffer will be freed
4241 	 * on return.
4242 	 */
4243 	atsru->hdr = (void *)(atsru + 1);
4244 	memcpy(atsru->hdr, hdr, hdr->length);
4245 	atsru->include_all = atsr->flags & 0x1;
4246 	if (!atsru->include_all) {
4247 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4248 				(void *)atsr + atsr->header.length,
4249 				&atsru->devices_cnt);
4250 		if (atsru->devices_cnt && atsru->devices == NULL) {
4251 			kfree(atsru);
4252 			return -ENOMEM;
4253 		}
4254 	}
4255 
4256 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4257 
4258 	return 0;
4259 }
4260 
4261 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4262 {
4263 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4264 	kfree(atsru);
4265 }
4266 
4267 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4268 {
4269 	struct acpi_dmar_atsr *atsr;
4270 	struct dmar_atsr_unit *atsru;
4271 
4272 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4273 	atsru = dmar_find_atsr(atsr);
4274 	if (atsru) {
4275 		list_del_rcu(&atsru->list);
4276 		synchronize_rcu();
4277 		intel_iommu_free_atsr(atsru);
4278 	}
4279 
4280 	return 0;
4281 }
4282 
4283 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4284 {
4285 	int i;
4286 	struct device *dev;
4287 	struct acpi_dmar_atsr *atsr;
4288 	struct dmar_atsr_unit *atsru;
4289 
4290 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4291 	atsru = dmar_find_atsr(atsr);
4292 	if (!atsru)
4293 		return 0;
4294 
4295 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4296 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4297 					  i, dev)
4298 			return -EBUSY;
4299 	}
4300 
4301 	return 0;
4302 }
4303 
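/*
 * Bring up a newly discovered (hot-added) DMAR unit: sanity-check its
 * capabilities against the current global settings, allocate domains
 * and a root entry, then enable queued invalidation, interrupts and
 * translation unless the unit is ignored.
 */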
4304 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4305 {
4306 	int sp, ret = 0;
4307 	struct intel_iommu *iommu = dmaru->iommu;
4308 
4309 	if (g_iommus[iommu->seq_id])
4310 		return 0;
4311 
4312 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4313 		pr_warn("%s: Doesn't support hardware pass through.\n",
4314 			iommu->name);
4315 		return -ENXIO;
4316 	}
4317 	if (!ecap_sc_support(iommu->ecap) &&
4318 	    domain_update_iommu_snooping(iommu)) {
4319 		pr_warn("%s: Doesn't support snooping.\n",
4320 			iommu->name);
4321 		return -ENXIO;
4322 	}
4323 	sp = domain_update_iommu_superpage(iommu) - 1;
4324 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4325 		pr_warn("%s: Doesn't support large page.\n",
4326 			iommu->name);
4327 		return -ENXIO;
4328 	}
4329 
4330 	/*
4331 	 * Disable translation if already enabled prior to OS handover.
4332 	 */
4333 	if (iommu->gcmd & DMA_GCMD_TE)
4334 		iommu_disable_translation(iommu);
4335 
4336 	g_iommus[iommu->seq_id] = iommu;
4337 	ret = iommu_init_domains(iommu);
4338 	if (ret == 0)
4339 		ret = iommu_alloc_root_entry(iommu);
4340 	if (ret)
4341 		goto out;
4342 
4343 #ifdef CONFIG_INTEL_IOMMU_SVM
4344 	if (pasid_enabled(iommu))
4345 		intel_svm_alloc_pasid_tables(iommu);
4346 #endif
4347 
4348 	if (dmaru->ignored) {
4349 		/*
4350 		 * we always have to disable PMRs or DMA may fail on this device
4351 		 */
4352 		if (force_on)
4353 			iommu_disable_protect_mem_regions(iommu);
4354 		return 0;
4355 	}
4356 
4357 	intel_iommu_init_qi(iommu);
4358 	iommu_flush_write_buffer(iommu);
4359 
4360 #ifdef CONFIG_INTEL_IOMMU_SVM
4361 	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4362 		ret = intel_svm_enable_prq(iommu);
4363 		if (ret)
4364 			goto disable_iommu;
4365 	}
4366 #endif
4367 	ret = dmar_set_interrupt(iommu);
4368 	if (ret)
4369 		goto disable_iommu;
4370 
4371 	iommu_set_root_entry(iommu);
4372 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4373 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4374 	iommu_enable_translation(iommu);
4375 
4376 	iommu_disable_protect_mem_regions(iommu);
4377 	return 0;
4378 
4379 disable_iommu:
4380 	disable_dmar_iommu(iommu);
4381 out:
4382 	free_dmar_iommu(iommu);
4383 	return ret;
4384 }
4385 
4386 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4387 {
4388 	int ret = 0;
4389 	struct intel_iommu *iommu = dmaru->iommu;
4390 
4391 	if (!intel_iommu_enabled)
4392 		return 0;
4393 	if (iommu == NULL)
4394 		return -EINVAL;
4395 
4396 	if (insert) {
4397 		ret = intel_iommu_add(dmaru);
4398 	} else {
4399 		disable_dmar_iommu(iommu);
4400 		free_dmar_iommu(iommu);
4401 	}
4402 
4403 	return ret;
4404 }
4405 
4406 static void intel_iommu_free_dmars(void)
4407 {
4408 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4409 	struct dmar_atsr_unit *atsru, *atsr_n;
4410 
4411 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4412 		list_del(&rmrru->list);
4413 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4414 		kfree(rmrru);
4415 	}
4416 
4417 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4418 		list_del(&atsru->list);
4419 		intel_iommu_free_atsr(atsru);
4420 	}
4421 }
4422 
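/*
 * Return 1 if ATS is allowed for @dev, i.e. the device is integrated
 * (no parent bridge) or its root port is listed in an ATSR (or covered
 * by an include-all ATSR); return 0 otherwise.
 */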
4423 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4424 {
4425 	int i, ret = 1;
4426 	struct pci_bus *bus;
4427 	struct pci_dev *bridge = NULL;
4428 	struct device *tmp;
4429 	struct acpi_dmar_atsr *atsr;
4430 	struct dmar_atsr_unit *atsru;
4431 
4432 	dev = pci_physfn(dev);
4433 	for (bus = dev->bus; bus; bus = bus->parent) {
4434 		bridge = bus->self;
4435 		/* If it's an integrated device, allow ATS */
4436 		if (!bridge)
4437 			return 1;
4438 		/* Connected via non-PCIe: no ATS */
4439 		if (!pci_is_pcie(bridge) ||
4440 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4441 			return 0;
4442 		/* If we found the root port, look it up in the ATSR */
4443 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4444 			break;
4445 	}
4446 
4447 	rcu_read_lock();
4448 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4449 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4450 		if (atsr->segment != pci_domain_nr(dev->bus))
4451 			continue;
4452 
4453 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4454 			if (tmp == &bridge->dev)
4455 				goto out;
4456 
4457 		if (atsru->include_all)
4458 			goto out;
4459 	}
4460 	ret = 0;
4461 out:
4462 	rcu_read_unlock();
4463 
4464 	return ret;
4465 }
4466 
4467 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4468 {
4469 	int ret = 0;
4470 	struct dmar_rmrr_unit *rmrru;
4471 	struct dmar_atsr_unit *atsru;
4472 	struct acpi_dmar_atsr *atsr;
4473 	struct acpi_dmar_reserved_memory *rmrr;
4474 
4475 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4476 		return 0;
4477 
4478 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4479 		rmrr = container_of(rmrru->hdr,
4480 				    struct acpi_dmar_reserved_memory, header);
4481 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4482 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4483 				((void *)rmrr) + rmrr->header.length,
4484 				rmrr->segment, rmrru->devices,
4485 				rmrru->devices_cnt);
4486 			if (ret < 0)
4487 				return ret;
4488 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4489 			dmar_remove_dev_scope(info, rmrr->segment,
4490 				rmrru->devices, rmrru->devices_cnt);
4491 		}
4492 	}
4493 
4494 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4495 		if (atsru->include_all)
4496 			continue;
4497 
4498 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4499 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4500 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4501 					(void *)atsr + atsr->header.length,
4502 					atsr->segment, atsru->devices,
4503 					atsru->devices_cnt);
4504 			if (ret > 0)
4505 				break;
4506 			else if (ret < 0)
4507 				return ret;
4508 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4509 			if (dmar_remove_dev_scope(info, atsr->segment,
4510 					atsru->devices, atsru->devices_cnt))
4511 				break;
4512 		}
4513 	}
4514 
4515 	return 0;
4516 }
4517 
4518 /*
4519  * Here we only respond to a device being unbound from its driver.
4520  *
4521  * A newly added device is not attached to its DMAR domain here yet. That
4522  * happens when the device is first mapped to an iova.
4523  */
4524 static int device_notifier(struct notifier_block *nb,
4525 				  unsigned long action, void *data)
4526 {
4527 	struct device *dev = data;
4528 	struct dmar_domain *domain;
4529 
4530 	if (iommu_dummy(dev))
4531 		return 0;
4532 
4533 	if (action != BUS_NOTIFY_REMOVED_DEVICE)
4534 		return 0;
4535 
4536 	domain = find_domain(dev);
4537 	if (!domain)
4538 		return 0;
4539 
4540 	dmar_remove_one_dev_info(domain, dev);
4541 	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4542 		domain_exit(domain);
4543 
4544 	return 0;
4545 }
4546 
4547 static struct notifier_block device_nb = {
4548 	.notifier_call = device_notifier,
4549 };
4550 
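/*
 * Keep the static identity (si) domain in sync with memory hotplug:
 * map newly onlined ranges 1:1 and unmap ranges that go offline,
 * flushing the IOTLB of every active IOMMU.
 */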
4551 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4552 				       unsigned long val, void *v)
4553 {
4554 	struct memory_notify *mhp = v;
4555 	unsigned long long start, end;
4556 	unsigned long start_vpfn, last_vpfn;
4557 
4558 	switch (val) {
4559 	case MEM_GOING_ONLINE:
4560 		start = mhp->start_pfn << PAGE_SHIFT;
4561 		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4562 		if (iommu_domain_identity_map(si_domain, start, end)) {
4563 			pr_warn("Failed to build identity map for [%llx-%llx]\n",
4564 				start, end);
4565 			return NOTIFY_BAD;
4566 		}
4567 		break;
4568 
4569 	case MEM_OFFLINE:
4570 	case MEM_CANCEL_ONLINE:
4571 		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4572 		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4573 		while (start_vpfn <= last_vpfn) {
4574 			struct iova *iova;
4575 			struct dmar_drhd_unit *drhd;
4576 			struct intel_iommu *iommu;
4577 			struct page *freelist;
4578 
4579 			iova = find_iova(&si_domain->iovad, start_vpfn);
4580 			if (iova == NULL) {
4581 				pr_debug("Failed to get IOVA for PFN %lx\n",
4582 					 start_vpfn);
4583 				break;
4584 			}
4585 
4586 			iova = split_and_remove_iova(&si_domain->iovad, iova,
4587 						     start_vpfn, last_vpfn);
4588 			if (iova == NULL) {
4589 				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4590 					start_vpfn, last_vpfn);
4591 				return NOTIFY_BAD;
4592 			}
4593 
4594 			freelist = domain_unmap(si_domain, iova->pfn_lo,
4595 					       iova->pfn_hi);
4596 
4597 			rcu_read_lock();
4598 			for_each_active_iommu(iommu, drhd)
4599 				iommu_flush_iotlb_psi(iommu, si_domain,
4600 					iova->pfn_lo, iova_size(iova),
4601 					!freelist, 0);
4602 			rcu_read_unlock();
4603 			dma_free_pagelist(freelist);
4604 
4605 			start_vpfn = iova->pfn_hi + 1;
4606 			free_iova_mem(iova);
4607 		}
4608 		break;
4609 	}
4610 
4611 	return NOTIFY_OK;
4612 }
4613 
4614 static struct notifier_block intel_iommu_memory_nb = {
4615 	.notifier_call = intel_iommu_memory_notifier,
4616 	.priority = 0
4617 };
4618 
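/* Drop the per-CPU IOVA caches of every domain on every IOMMU for @cpu. */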
4619 static void free_all_cpu_cached_iovas(unsigned int cpu)
4620 {
4621 	int i;
4622 
4623 	for (i = 0; i < g_num_of_iommus; i++) {
4624 		struct intel_iommu *iommu = g_iommus[i];
4625 		struct dmar_domain *domain;
4626 		int did;
4627 
4628 		if (!iommu)
4629 			continue;
4630 
4631 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632 			domain = get_iommu_domain(iommu, (u16)did);
4633 
4634 			if (!domain)
4635 				continue;
4636 			free_cpu_cached_iovas(cpu, &domain->iovad);
4637 		}
4638 	}
4639 }
4640 
4641 static int intel_iommu_cpu_dead(unsigned int cpu)
4642 {
4643 	free_all_cpu_cached_iovas(cpu);
4644 	return 0;
4645 }
4646 
4647 static void intel_disable_iommus(void)
4648 {
4649 	struct intel_iommu *iommu = NULL;
4650 	struct dmar_drhd_unit *drhd;
4651 
4652 	for_each_iommu(iommu, drhd)
4653 		iommu_disable_translation(iommu);
4654 }
4655 
4656 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4657 {
4658 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4659 
4660 	return container_of(iommu_dev, struct intel_iommu, iommu);
4661 }
4662 
4663 static ssize_t intel_iommu_show_version(struct device *dev,
4664 					struct device_attribute *attr,
4665 					char *buf)
4666 {
4667 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4668 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4669 	return sprintf(buf, "%d:%d\n",
4670 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4671 }
4672 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4673 
4674 static ssize_t intel_iommu_show_address(struct device *dev,
4675 					struct device_attribute *attr,
4676 					char *buf)
4677 {
4678 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4679 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4680 }
4681 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4682 
4683 static ssize_t intel_iommu_show_cap(struct device *dev,
4684 				    struct device_attribute *attr,
4685 				    char *buf)
4686 {
4687 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4688 	return sprintf(buf, "%llx\n", iommu->cap);
4689 }
4690 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4691 
4692 static ssize_t intel_iommu_show_ecap(struct device *dev,
4693 				    struct device_attribute *attr,
4694 				    char *buf)
4695 {
4696 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4697 	return sprintf(buf, "%llx\n", iommu->ecap);
4698 }
4699 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4700 
4701 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4702 				      struct device_attribute *attr,
4703 				      char *buf)
4704 {
4705 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4706 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4707 }
4708 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4709 
4710 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4711 					   struct device_attribute *attr,
4712 					   char *buf)
4713 {
4714 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4715 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4716 						  cap_ndoms(iommu->cap)));
4717 }
4718 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4719 
4720 static struct attribute *intel_iommu_attrs[] = {
4721 	&dev_attr_version.attr,
4722 	&dev_attr_address.attr,
4723 	&dev_attr_cap.attr,
4724 	&dev_attr_ecap.attr,
4725 	&dev_attr_domains_supported.attr,
4726 	&dev_attr_domains_used.attr,
4727 	NULL,
4728 };
4729 
4730 static struct attribute_group intel_iommu_group = {
4731 	.name = "intel-iommu",
4732 	.attrs = intel_iommu_attrs,
4733 };
4734 
4735 const struct attribute_group *intel_iommu_groups[] = {
4736 	&intel_iommu_group,
4737 	NULL,
4738 };
4739 
4740 int __init intel_iommu_init(void)
4741 {
4742 	int ret = -ENODEV;
4743 	struct dmar_drhd_unit *drhd;
4744 	struct intel_iommu *iommu;
4745 
4746 	/* VT-d is required for a TXT/tboot launch, so enforce that */
4747 	force_on = tboot_force_iommu();
4748 
4749 	if (iommu_init_mempool()) {
4750 		if (force_on)
4751 			panic("tboot: Failed to initialize iommu memory\n");
4752 		return -ENOMEM;
4753 	}
4754 
4755 	down_write(&dmar_global_lock);
4756 	if (dmar_table_init()) {
4757 		if (force_on)
4758 			panic("tboot: Failed to initialize DMAR table\n");
4759 		goto out_free_dmar;
4760 	}
4761 
4762 	if (dmar_dev_scope_init() < 0) {
4763 		if (force_on)
4764 			panic("tboot: Failed to initialize DMAR device scope\n");
4765 		goto out_free_dmar;
4766 	}
4767 
4768 	if (no_iommu || dmar_disabled) {
4769 		/*
4770 		 * We exit the function here to ensure IOMMU's remapping and
4771 		 * mempool aren't setup, which means that the IOMMU's PMRs
4772 		 * won't be disabled via the call to init_dmars(). So disable
4773 		 * it explicitly here. The PMRs were setup by tboot prior to
4774 		 * calling SENTER, but the kernel is expected to reset/tear
4775 		 * down the PMRs.
4776 		 */
4777 		if (intel_iommu_tboot_noforce) {
4778 			for_each_iommu(iommu, drhd)
4779 				iommu_disable_protect_mem_regions(iommu);
4780 		}
4781 
4782 		/*
4783 		 * Make sure the IOMMUs are switched off, even when we
4784 		 * boot into a kexec kernel and the previous kernel left
4785 		 * them enabled
4786 		 */
4787 		intel_disable_iommus();
4788 		goto out_free_dmar;
4789 	}
4790 
4791 	if (list_empty(&dmar_rmrr_units))
4792 		pr_info("No RMRR found\n");
4793 
4794 	if (list_empty(&dmar_atsr_units))
4795 		pr_info("No ATSR found\n");
4796 
4797 	if (dmar_init_reserved_ranges()) {
4798 		if (force_on)
4799 			panic("tboot: Failed to reserve iommu ranges\n");
4800 		goto out_free_reserved_range;
4801 	}
4802 
4803 	if (dmar_map_gfx)
4804 		intel_iommu_gfx_mapped = 1;
4805 
4806 	init_no_remapping_devices();
4807 
4808 	ret = init_dmars();
4809 	if (ret) {
4810 		if (force_on)
4811 			panic("tboot: Failed to initialize DMARs\n");
4812 		pr_err("Initialization failed\n");
4813 		goto out_free_reserved_range;
4814 	}
4815 	up_write(&dmar_global_lock);
4816 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4817 
4818 #ifdef CONFIG_SWIOTLB
4819 	swiotlb = 0;
4820 #endif
4821 	dma_ops = &intel_dma_ops;
4822 
4823 	init_iommu_pm_ops();
4824 
4825 	for_each_active_iommu(iommu, drhd) {
4826 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4827 				       intel_iommu_groups,
4828 				       "%s", iommu->name);
4829 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4830 		iommu_device_register(&iommu->iommu);
4831 	}
4832 
4833 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4834 	bus_register_notifier(&pci_bus_type, &device_nb);
4835 	if (si_domain && !hw_pass_through)
4836 		register_memory_notifier(&intel_iommu_memory_nb);
4837 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4838 			  intel_iommu_cpu_dead);
4839 	intel_iommu_enabled = 1;
4840 
4841 	return 0;
4842 
4843 out_free_reserved_range:
4844 	put_iova_domain(&reserved_iova_list);
4845 out_free_dmar:
4846 	intel_iommu_free_dmars();
4847 	up_write(&dmar_global_lock);
4848 	iommu_exit_mempool();
4849 	return ret;
4850 }
4851 
4852 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4853 {
4854 	struct intel_iommu *iommu = opaque;
4855 
4856 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4857 	return 0;
4858 }
4859 
4860 /*
4861  * NB - intel-iommu lacks any sort of reference counting for the users of
4862  * dependent devices.  If multiple endpoints have intersecting dependent
4863  * devices, unbinding the driver from any one of them will possibly leave
4864  * the others unable to operate.
4865  */
4866 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4867 {
4868 	if (!iommu || !dev || !dev_is_pci(dev))
4869 		return;
4870 
4871 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4872 }
4873 
4874 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4875 {
4876 	struct intel_iommu *iommu;
4877 	unsigned long flags;
4878 
4879 	assert_spin_locked(&device_domain_lock);
4880 
4881 	if (WARN_ON(!info))
4882 		return;
4883 
4884 	iommu = info->iommu;
4885 
4886 	if (info->dev) {
4887 		iommu_disable_dev_iotlb(info);
4888 		domain_context_clear(iommu, info->dev);
4889 	}
4890 
4891 	unlink_domain_info(info);
4892 
4893 	spin_lock_irqsave(&iommu->lock, flags);
4894 	domain_detach_iommu(info->domain, iommu);
4895 	spin_unlock_irqrestore(&iommu->lock, flags);
4896 
4897 	free_devinfo_mem(info);
4898 }
4899 
4900 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4901 				     struct device *dev)
4902 {
4903 	struct device_domain_info *info;
4904 	unsigned long flags;
4905 
4906 	spin_lock_irqsave(&device_domain_lock, flags);
4907 	info = dev->archdata.iommu;
4908 	__dmar_remove_one_dev_info(info);
4909 	spin_unlock_irqrestore(&device_domain_lock, flags);
4910 }
4911 
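/*
 * Minimal domain setup used for externally managed (iommu API) domains:
 * initialise the IOVA allocator, reserve the special ranges, compute
 * the adjusted guest address width and allocate the top level page
 * directory.
 */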
4912 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4913 {
4914 	int adjust_width;
4915 
4916 	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4917 			DMA_32BIT_PFN);
4918 	domain_reserve_special_ranges(domain);
4919 
4920 	/* calculate AGAW */
4921 	domain->gaw = guest_width;
4922 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4923 	domain->agaw = width_to_agaw(adjust_width);
4924 
4925 	domain->iommu_coherency = 0;
4926 	domain->iommu_snooping = 0;
4927 	domain->iommu_superpage = 0;
4928 	domain->max_addr = 0;
4929 
4930 	/* always allocate the top pgd */
4931 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4932 	if (!domain->pgd)
4933 		return -ENOMEM;
4934 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4935 	return 0;
4936 }
4937 
4938 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4939 {
4940 	struct dmar_domain *dmar_domain;
4941 	struct iommu_domain *domain;
4942 
4943 	if (type != IOMMU_DOMAIN_UNMANAGED)
4944 		return NULL;
4945 
4946 	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4947 	if (!dmar_domain) {
4948 		pr_err("Can't allocate dmar_domain\n");
4949 		return NULL;
4950 	}
4951 	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4952 		pr_err("Domain initialization failed\n");
4953 		domain_exit(dmar_domain);
4954 		return NULL;
4955 	}
4956 	domain_update_iommu_cap(dmar_domain);
4957 
4958 	domain = &dmar_domain->domain;
4959 	domain->geometry.aperture_start = 0;
4960 	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4961 	domain->geometry.force_aperture = true;
4962 
4963 	return domain;
4964 }
4965 
4966 static void intel_iommu_domain_free(struct iommu_domain *domain)
4967 {
4968 	domain_exit(to_dmar_domain(domain));
4969 }
4970 
4971 static int intel_iommu_attach_device(struct iommu_domain *domain,
4972 				     struct device *dev)
4973 {
4974 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4975 	struct intel_iommu *iommu;
4976 	int addr_width;
4977 	u8 bus, devfn;
4978 
4979 	if (device_is_rmrr_locked(dev)) {
4980 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4981 		return -EPERM;
4982 	}
4983 
4984 	/* normally dev is not mapped */
4985 	if (unlikely(domain_context_mapped(dev))) {
4986 		struct dmar_domain *old_domain;
4987 
4988 		old_domain = find_domain(dev);
4989 		if (old_domain) {
4990 			rcu_read_lock();
4991 			dmar_remove_one_dev_info(old_domain, dev);
4992 			rcu_read_unlock();
4993 
4994 			if (!domain_type_is_vm_or_si(old_domain) &&
4995 			     list_empty(&old_domain->devices))
4996 				domain_exit(old_domain);
4997 		}
4998 	}
4999 
5000 	iommu = device_to_iommu(dev, &bus, &devfn);
5001 	if (!iommu)
5002 		return -ENODEV;
5003 
5004 	/* check if this iommu agaw is sufficient for max mapped address */
5005 	addr_width = agaw_to_width(iommu->agaw);
5006 	if (addr_width > cap_mgaw(iommu->cap))
5007 		addr_width = cap_mgaw(iommu->cap);
5008 
5009 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5010 		pr_err("%s: iommu width (%d) is not "
5011 		       "sufficient for the mapped address (%llx)\n",
5012 		       __func__, addr_width, dmar_domain->max_addr);
5013 		return -EFAULT;
5014 	}
5015 	dmar_domain->gaw = addr_width;
5016 
5017 	/*
5018 	 * Knock out extra levels of page tables if necessary
5019 	 */
5020 	while (iommu->agaw < dmar_domain->agaw) {
5021 		struct dma_pte *pte;
5022 
5023 		pte = dmar_domain->pgd;
5024 		if (dma_pte_present(pte)) {
5025 			dmar_domain->pgd = (struct dma_pte *)
5026 				phys_to_virt(dma_pte_addr(pte));
5027 			free_pgtable_page(pte);
5028 		}
5029 		dmar_domain->agaw--;
5030 	}
5031 
5032 	return domain_add_dev_info(dmar_domain, dev);
5033 }
5034 
5035 static void intel_iommu_detach_device(struct iommu_domain *domain,
5036 				      struct device *dev)
5037 {
5038 	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5039 }
5040 
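/*
 * iommu_ops->map callback: translate IOMMU_READ/WRITE/CACHE into DMA
 * PTE bits, grow max_addr if needed (checking that it still fits the
 * domain address width) and install the mapping with
 * domain_pfn_mapping().
 */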
5041 static int intel_iommu_map(struct iommu_domain *domain,
5042 			   unsigned long iova, phys_addr_t hpa,
5043 			   size_t size, int iommu_prot)
5044 {
5045 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5046 	u64 max_addr;
5047 	int prot = 0;
5048 	int ret;
5049 
5050 	if (iommu_prot & IOMMU_READ)
5051 		prot |= DMA_PTE_READ;
5052 	if (iommu_prot & IOMMU_WRITE)
5053 		prot |= DMA_PTE_WRITE;
5054 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5055 		prot |= DMA_PTE_SNP;
5056 
5057 	max_addr = iova + size;
5058 	if (dmar_domain->max_addr < max_addr) {
5059 		u64 end;
5060 
5061 		/* check if minimum agaw is sufficient for mapped address */
5062 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5063 		if (end < max_addr) {
5064 			pr_err("%s: iommu width (%d) is not "
5065 			       "sufficient for the mapped address (%llx)\n",
5066 			       __func__, dmar_domain->gaw, max_addr);
5067 			return -EFAULT;
5068 		}
5069 		dmar_domain->max_addr = max_addr;
5070 	}
5071 	/* Round up size to next multiple of PAGE_SIZE, if it and
5072 	   the low bits of hpa would take us onto the next page */
5073 	size = aligned_nrpages(hpa, size);
5074 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5075 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5076 	return ret;
5077 }
5078 
5079 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5080 				unsigned long iova, size_t size)
5081 {
5082 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5083 	struct page *freelist = NULL;
5084 	struct intel_iommu *iommu;
5085 	unsigned long start_pfn, last_pfn;
5086 	unsigned int npages;
5087 	int iommu_id, level = 0;
5088 
5089 	/* Cope with horrid API which requires us to unmap more than the
5090 	   size argument if it happens to be a large-page mapping. */
5091 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5092 
5093 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5094 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5095 
5096 	start_pfn = iova >> VTD_PAGE_SHIFT;
5097 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5098 
5099 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5100 
5101 	npages = last_pfn - start_pfn + 1;
5102 
5103 	for_each_domain_iommu(iommu_id, dmar_domain) {
5104 		iommu = g_iommus[iommu_id];
5105 
5106 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5107 				      start_pfn, npages, !freelist, 0);
5108 	}
5109 
5110 	dma_free_pagelist(freelist);
5111 
5112 	if (dmar_domain->max_addr == iova + size)
5113 		dmar_domain->max_addr = iova;
5114 
5115 	return size;
5116 }
5117 
5118 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5119 					    dma_addr_t iova)
5120 {
5121 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5122 	struct dma_pte *pte;
5123 	int level = 0;
5124 	u64 phys = 0;
5125 
5126 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5127 	if (pte && dma_pte_present(pte))
5128 		phys = dma_pte_addr(pte) +
5129 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5130 						VTD_PAGE_SHIFT) - 1));
5131 
5132 	return phys;
5133 }
5134 
5135 static bool intel_iommu_capable(enum iommu_cap cap)
5136 {
5137 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5138 		return domain_update_iommu_snooping(NULL) == 1;
5139 	if (cap == IOMMU_CAP_INTR_REMAP)
5140 		return irq_remapping_enabled == 1;
5141 
5142 	return false;
5143 }
5144 
5145 static int intel_iommu_add_device(struct device *dev)
5146 {
5147 	struct intel_iommu *iommu;
5148 	struct iommu_group *group;
5149 	u8 bus, devfn;
5150 
5151 	iommu = device_to_iommu(dev, &bus, &devfn);
5152 	if (!iommu)
5153 		return -ENODEV;
5154 
5155 	iommu_device_link(&iommu->iommu, dev);
5156 
5157 	group = iommu_group_get_for_dev(dev);
5158 
5159 	if (IS_ERR(group))
5160 		return PTR_ERR(group);
5161 
5162 	iommu_group_put(group);
5163 	return 0;
5164 }
5165 
5166 static void intel_iommu_remove_device(struct device *dev)
5167 {
5168 	struct intel_iommu *iommu;
5169 	u8 bus, devfn;
5170 
5171 	iommu = device_to_iommu(dev, &bus, &devfn);
5172 	if (!iommu)
5173 		return;
5174 
5175 	iommu_group_remove_device(dev);
5176 
5177 	iommu_device_unlink(&iommu->iommu, dev);
5178 }
5179 
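/*
 * Report reserved regions for @device: a direct-mapped region for each
 * RMRR that names the device, plus the MSI/IOAPIC range.
 */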
5180 static void intel_iommu_get_resv_regions(struct device *device,
5181 					 struct list_head *head)
5182 {
5183 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5184 	struct iommu_resv_region *reg;
5185 	struct dmar_rmrr_unit *rmrr;
5186 	struct device *i_dev;
5187 	int i;
5188 
5189 	down_read(&dmar_global_lock);
5190 	for_each_rmrr_units(rmrr) {
5191 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5192 					  i, i_dev) {
5193 			struct iommu_resv_region *resv;
5194 			size_t length;
5195 
5196 			if (i_dev != device)
5197 				continue;
5198 
5199 			length = rmrr->end_address - rmrr->base_address + 1;
5200 			resv = iommu_alloc_resv_region(rmrr->base_address,
5201 						       length, prot,
5202 						       IOMMU_RESV_DIRECT);
5203 			if (!resv)
5204 				break;
5205 
5206 			list_add_tail(&resv->list, head);
5207 		}
5208 	}
5209 	up_read(&dmar_global_lock);
5210 
5211 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5212 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5213 				      0, IOMMU_RESV_MSI);
5214 	if (!reg)
5215 		return;
5216 	list_add_tail(&reg->list, head);
5217 }
5218 
5219 static void intel_iommu_put_resv_regions(struct device *dev,
5220 					 struct list_head *head)
5221 {
5222 	struct iommu_resv_region *entry, *next;
5223 
5224 	list_for_each_entry_safe(entry, next, head, list)
5225 		kfree(entry);
5226 }
5227 
5228 #ifdef CONFIG_INTEL_IOMMU_SVM
5229 #define MAX_NR_PASID_BITS (20)
5230 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5231 {
5232 	/*
5233 	 * Convert ecap_pss to the extended context entry pts encoding, and also
5234 	 * respect the soft pasid_max value set by the iommu.
5235 	 * - number of PASID bits = ecap_pss + 1
5236 	 * - number of PASID table entries = 2^(pts + 5)
5237 	 * Therefore, pts = ecap_pss - 4
5238 	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5239 	 */
5240 	if (ecap_pss(iommu->ecap) < 5)
5241 		return 0;
5242 
5243 	/* pasid_max is encoded as actual number of entries not the bits */
5244 	return find_first_bit((unsigned long *)&iommu->pasid_max,
5245 			MAX_NR_PASID_BITS) - 5;
5246 }
5247 
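/*
 * Enable PASID (and, where supported, device IOTLB and page request)
 * handling for an SVM-capable device by updating its extended context
 * entry and flushing the context cache.
 */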
5248 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5249 {
5250 	struct device_domain_info *info;
5251 	struct context_entry *context;
5252 	struct dmar_domain *domain;
5253 	unsigned long flags;
5254 	u64 ctx_lo;
5255 	int ret;
5256 
5257 	domain = get_valid_domain_for_dev(sdev->dev);
5258 	if (!domain)
5259 		return -EINVAL;
5260 
5261 	spin_lock_irqsave(&device_domain_lock, flags);
5262 	spin_lock(&iommu->lock);
5263 
5264 	ret = -EINVAL;
5265 	info = sdev->dev->archdata.iommu;
5266 	if (!info || !info->pasid_supported)
5267 		goto out;
5268 
5269 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5270 	if (WARN_ON(!context))
5271 		goto out;
5272 
5273 	ctx_lo = context[0].lo;
5274 
5275 	sdev->did = domain->iommu_did[iommu->seq_id];
5276 	sdev->sid = PCI_DEVID(info->bus, info->devfn);
5277 
5278 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5279 		if (iommu->pasid_state_table)
5280 			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5281 		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5282 			intel_iommu_get_pts(iommu);
5283 
5284 		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked, which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
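
/*
 * Note (descriptive, added for clarity): intel_iommu_enable_pasid() is kept
 * non-static so that the SVM support code (intel-svm.c) can use it to turn
 * on PASID support in the extended context entry before binding a process
 * address space to 'sdev'; nothing in this file calls it directly.
 */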

struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */

const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
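
/*
 * These callbacks are handed to the IOMMU core during driver initialisation;
 * intel_iommu_init() registers them for the PCI bus, roughly as in this
 * sketch (shown here for orientation only):
 *
 *	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 */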

static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
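
/*
 * DECLARE_PCI_FIXUP_HEADER() runs the quirk early, when the device's config
 * header is read during PCI enumeration, so dmar_map_gfx is already cleared
 * before the VT-d code consults it.  Covering another device ID would only
 * take one more DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, <id>, ...)
 * line like the ones above.
 */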

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
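
/*
 * Note: this is the same set of device IDs as the quirk_iommu_g4x_gfx list
 * above; the affected G4x/GM45 chipsets need both the forced write-buffer
 * flush and the graphics DMAR opt-out.
 */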

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

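/*
 * Worked example (illustrative only): a GGC value of 0x0b00 decodes as
 * GGC_MEMORY_SIZE_4M_VT; the GGC_MEMORY_VT_ENABLED bit (0x8 << 8) is set in
 * that encoding, so the quirk below leaves the graphics IOMMU enabled and
 * only switches to strict (unbatched) IOTLB flushing.  A value such as
 * 0x0300 (GGC_MEMORY_SIZE_2M, VT bit clear) makes it disable dmar_map_gfx
 * instead.
 */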
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;
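
	/*
	 * Worked example (illustrative only): a raw register value of 0x12
	 * has bit 0 clear, so Azalia DMA is routed to the isoch unit; the
	 * masked value 0x12 & 0x1c == 0x10 means 16 TLB entries, and the
	 * check above returns without complaint.  A masked value of zero
	 * triggers the WARN and the Azalia identity-map workaround below.
	 */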

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}
