1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
4  *
5  * Rewrite, cleanup, new allocation schemes, virtual merging:
6  * Copyright (C) 2004 Olof Johansson, IBM Corporation
7  *               and  Ben. Herrenschmidt, IBM Corporation
8  *
9  * Dynamic DMA mapping support, bus-independent parts.
10  */
11 
12 
13 #include <linux/init.h>
14 #include <linux/types.h>
15 #include <linux/slab.h>
16 #include <linux/mm.h>
17 #include <linux/spinlock.h>
18 #include <linux/string.h>
19 #include <linux/dma-mapping.h>
20 #include <linux/bitmap.h>
21 #include <linux/iommu-helper.h>
22 #include <linux/crash_dump.h>
23 #include <linux/hash.h>
24 #include <linux/fault-inject.h>
25 #include <linux/pci.h>
26 #include <linux/iommu.h>
27 #include <linux/sched.h>
28 #include <asm/io.h>
29 #include <asm/prom.h>
30 #include <asm/iommu.h>
31 #include <asm/pci-bridge.h>
32 #include <asm/machdep.h>
33 #include <asm/kdump.h>
34 #include <asm/fadump.h>
35 #include <asm/vio.h>
36 #include <asm/tce.h>
37 #include <asm/mmu_context.h>
38 
39 #define DBG(...)
40 
41 static int novmerge;
42 
43 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
44 
45 static int __init setup_iommu(char *str)
46 {
47 	if (!strcmp(str, "novmerge"))
48 		novmerge = 1;
49 	else if (!strcmp(str, "vmerge"))
50 		novmerge = 0;
51 	return 1;
52 }
53 
54 __setup("iommu=", setup_iommu);
55 
56 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
57 
58 /*
59  * We precalculate the hash to avoid doing it on every allocation.
60  *
61  * The hash is important for spreading CPUs across all the pools: on a
62  * POWER7 with 4-way SMT, interrupts are taken on the primary threads,
63  * and with 4 pools all primary threads would otherwise map to the same pool.
64  */
65 static int __init setup_iommu_pool_hash(void)
66 {
67 	unsigned int i;
68 
69 	for_each_possible_cpu(i)
70 		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
71 
72 	return 0;
73 }
74 subsys_initcall(setup_iommu_pool_hash);
75 
76 #ifdef CONFIG_FAIL_IOMMU
77 
78 static DECLARE_FAULT_ATTR(fail_iommu);
79 
80 static int __init setup_fail_iommu(char *str)
81 {
82 	return setup_fault_attr(&fail_iommu, str);
83 }
84 __setup("fail_iommu=", setup_fail_iommu);
85 
86 static bool should_fail_iommu(struct device *dev)
87 {
88 	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
89 }
90 
91 static int __init fail_iommu_debugfs(void)
92 {
93 	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
94 						       NULL, &fail_iommu);
95 
96 	return PTR_ERR_OR_ZERO(dir);
97 }
98 late_initcall(fail_iommu_debugfs);
99 
100 static ssize_t fail_iommu_show(struct device *dev,
101 			       struct device_attribute *attr, char *buf)
102 {
103 	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
104 }
105 
106 static ssize_t fail_iommu_store(struct device *dev,
107 				struct device_attribute *attr, const char *buf,
108 				size_t count)
109 {
110 	int i;
111 
112 	if (count > 0 && sscanf(buf, "%d", &i) > 0)
113 		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
114 
115 	return count;
116 }
117 
118 static DEVICE_ATTR_RW(fail_iommu);
119 
120 static int fail_iommu_bus_notify(struct notifier_block *nb,
121 				 unsigned long action, void *data)
122 {
123 	struct device *dev = data;
124 
125 	if (action == BUS_NOTIFY_ADD_DEVICE) {
126 		if (device_create_file(dev, &dev_attr_fail_iommu))
127 			pr_warn("Unable to create IOMMU fault injection sysfs "
128 				"entries\n");
129 	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
130 		device_remove_file(dev, &dev_attr_fail_iommu);
131 	}
132 
133 	return 0;
134 }
135 
136 /*
137  * PCI and VIO buses need separate notifier_block structs, since they're linked
138  * list nodes.  Sharing a notifier_block would mean that any notifiers later
139  * registered for PCI buses would also get called by VIO buses and vice versa.
140  */
141 static struct notifier_block fail_iommu_pci_bus_notifier = {
142 	.notifier_call = fail_iommu_bus_notify
143 };
144 
145 #ifdef CONFIG_IBMVIO
146 static struct notifier_block fail_iommu_vio_bus_notifier = {
147 	.notifier_call = fail_iommu_bus_notify
148 };
149 #endif
150 
151 static int __init fail_iommu_setup(void)
152 {
153 #ifdef CONFIG_PCI
154 	bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier);
155 #endif
156 #ifdef CONFIG_IBMVIO
157 	bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier);
158 #endif
159 
160 	return 0;
161 }
162 /*
163  * Must execute after PCI and VIO subsystem have initialised but before
164  * devices are probed.
165  */
166 arch_initcall(fail_iommu_setup);
167 #else
168 static inline bool should_fail_iommu(struct device *dev)
169 {
170 	return false;
171 }
172 #endif
173 
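/*
 * Allocate a run of @npages contiguous entries from one of the table's
 * pools, honouring the DMA @mask and the requested @align_order.  Large
 * requests (more than 15 pages) are served from the dedicated large pool.
 * Returns the starting index into the table's allocation bitmap (callers
 * add tbl->it_offset), or DMA_MAPPING_ERROR if no free range was found.
 */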
174 static unsigned long iommu_range_alloc(struct device *dev,
175 				       struct iommu_table *tbl,
176                                        unsigned long npages,
177                                        unsigned long *handle,
178                                        unsigned long mask,
179                                        unsigned int align_order)
180 {
181 	unsigned long n, end, start;
182 	unsigned long limit;
183 	int largealloc = npages > 15;
184 	int pass = 0;
185 	unsigned long align_mask;
186 	unsigned long boundary_size;
187 	unsigned long flags;
188 	unsigned int pool_nr;
189 	struct iommu_pool *pool;
190 
191 	align_mask = (1ull << align_order) - 1;
192 
193 	/* This allocator was derived from x86_64's bit string search */
194 
195 	/* Sanity check */
196 	if (unlikely(npages == 0)) {
197 		if (printk_ratelimit())
198 			WARN_ON(1);
199 		return DMA_MAPPING_ERROR;
200 	}
201 
202 	if (should_fail_iommu(dev))
203 		return DMA_MAPPING_ERROR;
204 
205 	/*
206 	 * We don't need to disable preemption here because any CPU can
207 	 * safely use any IOMMU pool.
208 	 */
209 	pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
210 
211 	if (largealloc)
212 		pool = &(tbl->large_pool);
213 	else
214 		pool = &(tbl->pools[pool_nr]);
215 
216 	spin_lock_irqsave(&(pool->lock), flags);
217 
218 again:
219 	if ((pass == 0) && handle && *handle &&
220 	    (*handle >= pool->start) && (*handle < pool->end))
221 		start = *handle;
222 	else
223 		start = pool->hint;
224 
225 	limit = pool->end;
226 
227 	/* The case below can happen if we have a small segment appended
228 	 * to a large one, or when the previous alloc was at the very end
229 	 * of the available space. If so, go back to the initial start.
230 	 */
231 	if (start >= limit)
232 		start = pool->start;
233 
234 	if (limit + tbl->it_offset > mask) {
235 		limit = mask - tbl->it_offset + 1;
236 		/* If we're constrained on address range, first try
237 		 * at the masked hint to avoid O(n) search complexity,
238 		 * but on second pass, start at 0 in pool 0.
239 		 */
240 		if ((start & mask) >= limit || pass > 0) {
241 			spin_unlock(&(pool->lock));
242 			pool = &(tbl->pools[0]);
243 			spin_lock(&(pool->lock));
244 			start = pool->start;
245 		} else {
246 			start &= mask;
247 		}
248 	}
249 
250 	if (dev)
251 		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
252 				      1 << tbl->it_page_shift);
253 	else
254 		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
255 	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
256 
257 	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
258 			     boundary_size >> tbl->it_page_shift, align_mask);
259 	if (n == -1) {
260 		if (likely(pass == 0)) {
261 			/* First try the pool from the start */
262 			pool->hint = pool->start;
263 			pass++;
264 			goto again;
265 
266 		} else if (pass <= tbl->nr_pools) {
267 			/* Now try scanning all the other pools */
268 			spin_unlock(&(pool->lock));
269 			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
270 			pool = &tbl->pools[pool_nr];
271 			spin_lock(&(pool->lock));
272 			pool->hint = pool->start;
273 			pass++;
274 			goto again;
275 
276 		} else {
277 			/* Give up */
278 			spin_unlock_irqrestore(&(pool->lock), flags);
279 			return DMA_MAPPING_ERROR;
280 		}
281 	}
282 
283 	end = n + npages;
284 
285 	/* Bump the hint to a new block for small allocs. */
286 	if (largealloc) {
287 		/* Don't bump to new block to avoid fragmentation */
288 		pool->hint = end;
289 	} else {
290 		/* Overflow will be taken care of at the next allocation */
291 		pool->hint = (end + tbl->it_blocksize - 1) &
292 		                ~(tbl->it_blocksize - 1);
293 	}
294 
295 	/* Update handle for SG allocations */
296 	if (handle)
297 		*handle = end;
298 
299 	spin_unlock_irqrestore(&(pool->lock), flags);
300 
301 	return n;
302 }
303 
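/*
 * Allocate IOMMU entries for @npages starting at kernel address @page and
 * program the corresponding TCEs.  Returns the resulting bus address, or
 * DMA_MAPPING_ERROR if either the range allocation or the TCE build failed.
 */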
304 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
305 			      void *page, unsigned int npages,
306 			      enum dma_data_direction direction,
307 			      unsigned long mask, unsigned int align_order,
308 			      unsigned long attrs)
309 {
310 	unsigned long entry;
311 	dma_addr_t ret = DMA_MAPPING_ERROR;
312 	int build_fail;
313 
314 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
315 
316 	if (unlikely(entry == DMA_MAPPING_ERROR))
317 		return DMA_MAPPING_ERROR;
318 
319 	entry += tbl->it_offset;	/* Offset into real TCE table */
320 	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
321 
322 	/* Put the TCEs in the HW table */
323 	build_fail = tbl->it_ops->set(tbl, entry, npages,
324 				      (unsigned long)page &
325 				      IOMMU_PAGE_MASK(tbl), direction, attrs);
326 
327 	/* tbl->it_ops->set() only returns non-zero for transient errors.
328 	 * Clean up the table bitmap in this case and return
329 	 * DMA_MAPPING_ERROR. For all other errors the functionality is
330 	 * not altered.
331 	 */
332 	if (unlikely(build_fail)) {
333 		__iommu_free(tbl, ret, npages);
334 		return DMA_MAPPING_ERROR;
335 	}
336 
337 	/* Flush/invalidate TLB caches if necessary */
338 	if (tbl->it_ops->flush)
339 		tbl->it_ops->flush(tbl);
340 
341 	/* Make sure updates are seen by hardware */
342 	mb();
343 
344 	return ret;
345 }
346 
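/*
 * Sanity-check a free request: make sure @dma_addr/@npages lies within the
 * table before the bitmap and TCEs are touched.  Logs the offending
 * parameters and returns false for an invalid entry.
 */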
347 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
348 			     unsigned int npages)
349 {
350 	unsigned long entry, free_entry;
351 
352 	entry = dma_addr >> tbl->it_page_shift;
353 	free_entry = entry - tbl->it_offset;
354 
355 	if (((free_entry + npages) > tbl->it_size) ||
356 	    (entry < tbl->it_offset)) {
357 		if (printk_ratelimit()) {
358 			printk(KERN_INFO "iommu_free: invalid entry\n");
359 			printk(KERN_INFO "\tentry     = 0x%lx\n", entry);
360 			printk(KERN_INFO "\tdma_addr  = 0x%llx\n", (u64)dma_addr);
361 			printk(KERN_INFO "\tTable     = 0x%llx\n", (u64)tbl);
362 			printk(KERN_INFO "\tbus#      = 0x%llx\n", (u64)tbl->it_busno);
363 			printk(KERN_INFO "\tsize      = 0x%llx\n", (u64)tbl->it_size);
364 			printk(KERN_INFO "\tstartOff  = 0x%llx\n", (u64)tbl->it_offset);
365 			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
366 			WARN_ON(1);
367 		}
368 
369 		return false;
370 	}
371 
372 	return true;
373 }
374 
375 static struct iommu_pool *get_pool(struct iommu_table *tbl,
376 				   unsigned long entry)
377 {
378 	struct iommu_pool *p;
379 	unsigned long largepool_start = tbl->large_pool.start;
380 
381 	/* The large pool is the last pool at the top of the table */
382 	if (entry >= largepool_start) {
383 		p = &tbl->large_pool;
384 	} else {
385 		unsigned int pool_nr = entry / tbl->poolsize;
386 
387 		BUG_ON(pool_nr > tbl->nr_pools);
388 		p = &tbl->pools[pool_nr];
389 	}
390 
391 	return p;
392 }
393 
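/*
 * Clear the TCEs and the allocation bitmap for @npages at @dma_addr.
 * Unlike iommu_free(), this does not flush the TCE cache; callers that
 * need a flush must do it themselves.
 */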
394 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
395 			 unsigned int npages)
396 {
397 	unsigned long entry, free_entry;
398 	unsigned long flags;
399 	struct iommu_pool *pool;
400 
401 	entry = dma_addr >> tbl->it_page_shift;
402 	free_entry = entry - tbl->it_offset;
403 
404 	pool = get_pool(tbl, free_entry);
405 
406 	if (!iommu_free_check(tbl, dma_addr, npages))
407 		return;
408 
409 	tbl->it_ops->clear(tbl, entry, npages);
410 
411 	spin_lock_irqsave(&(pool->lock), flags);
412 	bitmap_clear(tbl->it_map, free_entry, npages);
413 	spin_unlock_irqrestore(&(pool->lock), flags);
414 }
415 
416 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
417 		unsigned int npages)
418 {
419 	__iommu_free(tbl, dma_addr, npages);
420 
421 	/* Make sure TLB cache is flushed if the HW needs it. We do
422 	 * not do an mb() here on purpose, it is not needed on any of
423 	 * the current platforms.
424 	 */
425 	if (tbl->it_ops->flush)
426 		tbl->it_ops->flush(tbl);
427 }
428 
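/*
 * Map a scatterlist for DMA.  Consecutive entries whose bus addresses turn
 * out to be contiguous are merged into a single DMA segment unless
 * "iommu=novmerge" was given or the merged length would exceed the device's
 * maximum segment size.  Returns the number of DMA segments produced, or 0
 * on failure after undoing any partial mappings.
 */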
429 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
430 		     struct scatterlist *sglist, int nelems,
431 		     unsigned long mask, enum dma_data_direction direction,
432 		     unsigned long attrs)
433 {
434 	dma_addr_t dma_next = 0, dma_addr;
435 	struct scatterlist *s, *outs, *segstart;
436 	int outcount, incount, i, build_fail = 0;
437 	unsigned int align;
438 	unsigned long handle;
439 	unsigned int max_seg_size;
440 
441 	BUG_ON(direction == DMA_NONE);
442 
443 	if ((nelems == 0) || !tbl)
444 		return 0;
445 
446 	outs = s = segstart = &sglist[0];
447 	outcount = 1;
448 	incount = nelems;
449 	handle = 0;
450 
451 	/* Init first segment length for backout at failure */
452 	outs->dma_length = 0;
453 
454 	DBG("sg mapping %d elements:\n", nelems);
455 
456 	max_seg_size = dma_get_max_seg_size(dev);
457 	for_each_sg(sglist, s, nelems, i) {
458 		unsigned long vaddr, npages, entry, slen;
459 
460 		slen = s->length;
461 		/* Sanity check */
462 		if (slen == 0) {
463 			dma_next = 0;
464 			continue;
465 		}
466 		/* Allocate iommu entries for that segment */
467 		vaddr = (unsigned long) sg_virt(s);
468 		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
469 		align = 0;
470 		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
471 		    (vaddr & ~PAGE_MASK) == 0)
472 			align = PAGE_SHIFT - tbl->it_page_shift;
473 		entry = iommu_range_alloc(dev, tbl, npages, &handle,
474 					  mask >> tbl->it_page_shift, align);
475 
476 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
477 
478 		/* Handle failure */
479 		if (unlikely(entry == DMA_MAPPING_ERROR)) {
480 			if (!(attrs & DMA_ATTR_NO_WARN) &&
481 			    printk_ratelimit())
482 				dev_info(dev, "iommu_alloc failed, tbl %p "
483 					 "vaddr %lx npages %lu\n", tbl, vaddr,
484 					 npages);
485 			goto failure;
486 		}
487 
488 		/* Convert entry to a dma_addr_t */
489 		entry += tbl->it_offset;
490 		dma_addr = entry << tbl->it_page_shift;
491 		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
492 
493 		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
494 			    npages, entry, dma_addr);
495 
496 		/* Insert into HW table */
497 		build_fail = tbl->it_ops->set(tbl, entry, npages,
498 					      vaddr & IOMMU_PAGE_MASK(tbl),
499 					      direction, attrs);
500 		if (unlikely(build_fail))
501 			goto failure;
502 
503 		/* If we are in an open segment, try merging */
504 		if (segstart != s) {
505 			DBG("  - trying merge...\n");
506 			/* We cannot merge if:
507 			 * - allocated dma_addr isn't contiguous to previous allocation
508 			 */
509 			if (novmerge || (dma_addr != dma_next) ||
510 			    (outs->dma_length + s->length > max_seg_size)) {
511 				/* Can't merge: create a new segment */
512 				segstart = s;
513 				outcount++;
514 				outs = sg_next(outs);
515 				DBG("    can't merge, new segment.\n");
516 			} else {
517 				outs->dma_length += s->length;
518 				DBG("    merged, new len: %ux\n", outs->dma_length);
519 			}
520 		}
521 
522 		if (segstart == s) {
523 			/* This is a new segment, fill entries */
524 			DBG("  - filling new segment.\n");
525 			outs->dma_address = dma_addr;
526 			outs->dma_length = slen;
527 		}
528 
529 		/* Calculate next page pointer for contiguous check */
530 		dma_next = dma_addr + slen;
531 
532 		DBG("  - dma next is: %lx\n", dma_next);
533 	}
534 
535 	/* Flush/invalidate TLB caches if necessary */
536 	if (tbl->it_ops->flush)
537 		tbl->it_ops->flush(tbl);
538 
539 	DBG("mapped %d elements:\n", outcount);
540 
541 	/* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
542 	 * next entry of the sglist if we didn't fill the list completely
543 	 */
544 	if (outcount < incount) {
545 		outs = sg_next(outs);
546 		outs->dma_address = DMA_MAPPING_ERROR;
547 		outs->dma_length = 0;
548 	}
549 
550 	/* Make sure updates are seen by hardware */
551 	mb();
552 
553 	return outcount;
554 
555  failure:
556 	for_each_sg(sglist, s, nelems, i) {
557 		if (s->dma_length != 0) {
558 			unsigned long vaddr, npages;
559 
560 			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
561 			npages = iommu_num_pages(s->dma_address, s->dma_length,
562 						 IOMMU_PAGE_SIZE(tbl));
563 			__iommu_free(tbl, vaddr, npages);
564 			s->dma_address = DMA_MAPPING_ERROR;
565 			s->dma_length = 0;
566 		}
567 		if (s == outs)
568 			break;
569 	}
570 	return 0;
571 }
572 
573 
574 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
575 			int nelems, enum dma_data_direction direction,
576 			unsigned long attrs)
577 {
578 	struct scatterlist *sg;
579 
580 	BUG_ON(direction == DMA_NONE);
581 
582 	if (!tbl)
583 		return;
584 
585 	sg = sglist;
586 	while (nelems--) {
587 		unsigned int npages;
588 		dma_addr_t dma_handle = sg->dma_address;
589 
590 		if (sg->dma_length == 0)
591 			break;
592 		npages = iommu_num_pages(dma_handle, sg->dma_length,
593 					 IOMMU_PAGE_SIZE(tbl));
594 		__iommu_free(tbl, dma_handle, npages);
595 		sg = sg_next(sg);
596 	}
597 
598 	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
599 	 * do not do an mb() here, the affected platforms do not need it
600 	 * when freeing.
601 	 */
602 	if (tbl->it_ops->flush)
603 		tbl->it_ops->flush(tbl);
604 }
605 
606 static void iommu_table_clear(struct iommu_table *tbl)
607 {
608 	/*
609 	 * With firmware-assisted dump, the system goes through a clean
610 	 * reboot at the time of a crash, so it is safe to clear the TCE
611 	 * entries when firmware-assisted dump is active.
612 	 */
613 	if (!is_kdump_kernel() || is_fadump_active()) {
614 		/* Clear the table in case firmware left allocations in it */
615 		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
616 		return;
617 	}
618 
619 #ifdef CONFIG_CRASH_DUMP
620 	if (tbl->it_ops->get) {
621 		unsigned long index, tceval, tcecount = 0;
622 
623 		/* Reserve the existing mappings left by the first kernel. */
624 		for (index = 0; index < tbl->it_size; index++) {
625 			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
626 			/*
627 			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
628 			 */
629 			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
630 				__set_bit(index, tbl->it_map);
631 				tcecount++;
632 			}
633 		}
634 
635 		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
636 			printk(KERN_WARNING "TCE table is full; freeing ");
637 			printk(KERN_WARNING "%d entries for the kdump boot\n",
638 				KDUMP_MIN_TCE_ENTRIES);
639 			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
640 				index < tbl->it_size; index++)
641 				__clear_bit(index, tbl->it_map);
642 		}
643 	}
644 #endif
645 }
646 
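/*
 * Mark page 0 (when this table maps it) and the res_start..res_end window
 * as in-use in the allocation bitmap so they are never handed out to
 * devices.
 */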
647 static void iommu_table_reserve_pages(struct iommu_table *tbl,
648 		unsigned long res_start, unsigned long res_end)
649 {
650 	int i;
651 
652 	WARN_ON_ONCE(res_end < res_start);
653 	/*
654 	 * Reserve page 0 so it will not be used for any mappings.
655 	 * This stops buggy drivers that consider page 0 to be invalid
656 	 * from crashing the machine or even losing data.
657 	 */
658 	if (tbl->it_offset == 0)
659 		set_bit(0, tbl->it_map);
660 
661 	tbl->it_reserved_start = res_start;
662 	tbl->it_reserved_end = res_end;
663 
664 	/* Skip the range if it is non-empty but lies entirely outside the table */
665 	if (res_start && res_end &&
666 			(tbl->it_offset + tbl->it_size < res_start ||
667 			 res_end < tbl->it_offset))
668 		return;
669 
670 	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
671 		set_bit(i - tbl->it_offset, tbl->it_map);
672 }
673 
674 static void iommu_table_release_pages(struct iommu_table *tbl)
675 {
676 	int i;
677 
678 	/*
679 	 * In case we have reserved the first bit, we should not emit
680 	 * the warning below.
681 	 */
682 	if (tbl->it_offset == 0)
683 		clear_bit(0, tbl->it_map);
684 
685 	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
686 		clear_bit(i - tbl->it_offset, tbl->it_map);
687 }
688 
689 /*
690  * Build an iommu_table structure.  This contains a bitmap which
691  * is used to manage allocation of the TCE space.
692  */
693 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
694 		unsigned long res_start, unsigned long res_end)
695 {
696 	unsigned long sz;
697 	static int welcomed = 0;
698 	struct page *page;
699 	unsigned int i;
700 	struct iommu_pool *p;
701 
702 	BUG_ON(!tbl->it_ops);
703 
704 	/* number of bytes needed for the bitmap */
705 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
706 
707 	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
708 	if (!page)
709 		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
710 	tbl->it_map = page_address(page);
711 	memset(tbl->it_map, 0, sz);
712 
713 	iommu_table_reserve_pages(tbl, res_start, res_end);
714 
715 	/* We only split the IOMMU table if we have 1GB or more of space */
716 	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
717 		tbl->nr_pools = IOMMU_NR_POOLS;
718 	else
719 		tbl->nr_pools = 1;
720 
721 	/* We reserve the top 1/4 of the table for large allocations */
722 	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
723 
724 	for (i = 0; i < tbl->nr_pools; i++) {
725 		p = &tbl->pools[i];
726 		spin_lock_init(&(p->lock));
727 		p->start = tbl->poolsize * i;
728 		p->hint = p->start;
729 		p->end = p->start + tbl->poolsize;
730 	}
731 
732 	p = &tbl->large_pool;
733 	spin_lock_init(&(p->lock));
734 	p->start = tbl->poolsize * i;
735 	p->hint = p->start;
736 	p->end = tbl->it_size;
737 
738 	iommu_table_clear(tbl);
739 
740 	if (!welcomed) {
741 		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
742 		       novmerge ? "disabled" : "enabled");
743 		welcomed = 1;
744 	}
745 
746 	return tbl;
747 }
748 
749 static void iommu_table_free(struct kref *kref)
750 {
751 	unsigned long bitmap_sz;
752 	unsigned int order;
753 	struct iommu_table *tbl;
754 
755 	tbl = container_of(kref, struct iommu_table, it_kref);
756 
757 	if (tbl->it_ops->free)
758 		tbl->it_ops->free(tbl);
759 
760 	if (!tbl->it_map) {
761 		kfree(tbl);
762 		return;
763 	}
764 
765 	iommu_table_release_pages(tbl);
766 
767 	/* verify that table contains no entries */
768 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
769 		pr_warn("%s: Unexpected TCEs\n", __func__);
770 
771 	/* calculate bitmap size in bytes */
772 	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
773 
774 	/* free bitmap */
775 	order = get_order(bitmap_sz);
776 	free_pages((unsigned long) tbl->it_map, order);
777 
778 	/* free table */
779 	kfree(tbl);
780 }
781 
782 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
783 {
784 	if (kref_get_unless_zero(&tbl->it_kref))
785 		return tbl;
786 
787 	return NULL;
788 }
789 EXPORT_SYMBOL_GPL(iommu_tce_table_get);
790 
791 int iommu_tce_table_put(struct iommu_table *tbl)
792 {
793 	if (WARN_ON(!tbl))
794 		return 0;
795 
796 	return kref_put(&tbl->it_kref, iommu_table_free);
797 }
798 EXPORT_SYMBOL_GPL(iommu_tce_table_put);
799 
800 /* Creates TCEs for a user provided buffer.  The user buffer must be
801  * contiguous real kernel storage (not vmalloc).  The address passed here
802  * comprises a page address and offset into that page. The dma_addr_t
803  * returned will point to the same byte within the page as was passed in.
804  */
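/*
 * A minimal usage sketch, assuming the device's iommu_table is reachable
 * via get_iommu_table_base(), which is how the powerpc DMA ops typically
 * call this function:
 *
 *	dma_addr_t dma = iommu_map_page(dev, get_iommu_table_base(dev),
 *					virt_to_page(buf), offset_in_page(buf),
 *					len, dma_get_mask(dev),
 *					DMA_TO_DEVICE, 0);
 *	if (dma == DMA_MAPPING_ERROR)
 *		return -EIO;
 */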
805 dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
806 			  struct page *page, unsigned long offset, size_t size,
807 			  unsigned long mask, enum dma_data_direction direction,
808 			  unsigned long attrs)
809 {
810 	dma_addr_t dma_handle = DMA_MAPPING_ERROR;
811 	void *vaddr;
812 	unsigned long uaddr;
813 	unsigned int npages, align;
814 
815 	BUG_ON(direction == DMA_NONE);
816 
817 	vaddr = page_address(page) + offset;
818 	uaddr = (unsigned long)vaddr;
819 
820 	if (tbl) {
821 		npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
822 		align = 0;
823 		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
824 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
825 			align = PAGE_SHIFT - tbl->it_page_shift;
826 
827 		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
828 					 mask >> tbl->it_page_shift, align,
829 					 attrs);
830 		if (dma_handle == DMA_MAPPING_ERROR) {
831 			if (!(attrs & DMA_ATTR_NO_WARN) &&
832 			    printk_ratelimit())  {
833 				dev_info(dev, "iommu_alloc failed, tbl %p "
834 					 "vaddr %p npages %d\n", tbl, vaddr,
835 					 npages);
836 			}
837 		} else
838 			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
839 	}
840 
841 	return dma_handle;
842 }
843 
844 void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
845 		      size_t size, enum dma_data_direction direction,
846 		      unsigned long attrs)
847 {
848 	unsigned int npages;
849 
850 	BUG_ON(direction == DMA_NONE);
851 
852 	if (tbl) {
853 		npages = iommu_num_pages(dma_handle, size,
854 					 IOMMU_PAGE_SIZE(tbl));
855 		iommu_free(tbl, dma_handle, npages);
856 	}
857 }
858 
859 /* Allocates a contiguous real buffer and creates mappings over it.
860  * Returns the virtual address of the buffer and sets dma_handle
861  * to the dma address (mapping) of the first page.
862  */
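/*
 * A minimal usage sketch, assuming the table and NUMA node are obtained the
 * way the powerpc DMA ops usually do:
 *
 *	void *buf = iommu_alloc_coherent(dev, get_iommu_table_base(dev),
 *					 size, &dma_handle,
 *					 dev->coherent_dma_mask, GFP_KERNEL,
 *					 dev_to_node(dev));
 */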
863 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
864 			   size_t size,	dma_addr_t *dma_handle,
865 			   unsigned long mask, gfp_t flag, int node)
866 {
867 	void *ret = NULL;
868 	dma_addr_t mapping;
869 	unsigned int order;
870 	unsigned int nio_pages, io_order;
871 	struct page *page;
872 
873 	size = PAGE_ALIGN(size);
874 	order = get_order(size);
875 
876  	/*
877 	 * Client asked for way too much space.  This is checked later
878 	 * anyway.  It is easier to debug here for the drivers than in
879 	 * the tce tables.
880 	 */
881 	if (order >= IOMAP_MAX_ORDER) {
882 		dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
883 			 size);
884 		return NULL;
885 	}
886 
887 	if (!tbl)
888 		return NULL;
889 
890 	/* Alloc enough pages (and possibly more) */
891 	page = alloc_pages_node(node, flag, order);
892 	if (!page)
893 		return NULL;
894 	ret = page_address(page);
895 	memset(ret, 0, size);
896 
897 	/* Set up tces to cover the allocated range */
898 	nio_pages = size >> tbl->it_page_shift;
899 	io_order = get_iommu_order(size, tbl);
900 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
901 			      mask >> tbl->it_page_shift, io_order, 0);
902 	if (mapping == DMA_MAPPING_ERROR) {
903 		free_pages((unsigned long)ret, order);
904 		return NULL;
905 	}
906 	*dma_handle = mapping;
907 	return ret;
908 }
909 
910 void iommu_free_coherent(struct iommu_table *tbl, size_t size,
911 			 void *vaddr, dma_addr_t dma_handle)
912 {
913 	if (tbl) {
914 		unsigned int nio_pages;
915 
916 		size = PAGE_ALIGN(size);
917 		nio_pages = size >> tbl->it_page_shift;
918 		iommu_free(tbl, dma_handle, nio_pages);
919 		size = PAGE_ALIGN(size);
920 		free_pages((unsigned long)vaddr, get_order(size));
921 	}
922 }
923 
924 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
925 {
926 	switch (dir) {
927 	case DMA_BIDIRECTIONAL:
928 		return TCE_PCI_READ | TCE_PCI_WRITE;
929 	case DMA_FROM_DEVICE:
930 		return TCE_PCI_WRITE;
931 	case DMA_TO_DEVICE:
932 		return TCE_PCI_READ;
933 	default:
934 		return 0;
935 	}
936 }
937 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
938 
939 #ifdef CONFIG_IOMMU_API
940 /*
941  * SPAPR TCE API
942  */
943 static void group_release(void *iommu_data)
944 {
945 	struct iommu_table_group *table_group = iommu_data;
946 
947 	table_group->group = NULL;
948 }
949 
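/*
 * Create an IOMMU group for the given PE and name it after the PCI domain
 * and PE number so it can be identified easily in sysfs.
 */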
950 void iommu_register_group(struct iommu_table_group *table_group,
951 		int pci_domain_number, unsigned long pe_num)
952 {
953 	struct iommu_group *grp;
954 	char *name;
955 
956 	grp = iommu_group_alloc();
957 	if (IS_ERR(grp)) {
958 		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
959 				PTR_ERR(grp));
960 		return;
961 	}
962 	table_group->group = grp;
963 	iommu_group_set_iommudata(grp, table_group, group_release);
964 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
965 			pci_domain_number, pe_num);
966 	if (!name)
967 		return;
968 	iommu_group_set_name(grp, name);
969 	kfree(name);
970 }
971 
972 enum dma_data_direction iommu_tce_direction(unsigned long tce)
973 {
974 	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
975 		return DMA_BIDIRECTIONAL;
976 	else if (tce & TCE_PCI_READ)
977 		return DMA_TO_DEVICE;
978 	else if (tce & TCE_PCI_WRITE)
979 		return DMA_FROM_DEVICE;
980 	else
981 		return DMA_NONE;
982 }
983 EXPORT_SYMBOL_GPL(iommu_tce_direction);
984 
985 void iommu_flush_tce(struct iommu_table *tbl)
986 {
987 	/* Flush/invalidate TLB caches if necessary */
988 	if (tbl->it_ops->flush)
989 		tbl->it_ops->flush(tbl);
990 
991 	/* Make sure updates are seen by hardware */
992 	mb();
993 }
994 EXPORT_SYMBOL_GPL(iommu_flush_tce);
995 
996 int iommu_tce_check_ioba(unsigned long page_shift,
997 		unsigned long offset, unsigned long size,
998 		unsigned long ioba, unsigned long npages)
999 {
1000 	unsigned long mask = (1UL << page_shift) - 1;
1001 
1002 	if (ioba & mask)
1003 		return -EINVAL;
1004 
1005 	ioba >>= page_shift;
1006 	if (ioba < offset)
1007 		return -EINVAL;
1008 
1009 	if ((ioba + 1) > (offset + size))
1010 		return -EINVAL;
1011 
1012 	return 0;
1013 }
1014 EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
1015 
1016 int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
1017 {
1018 	unsigned long mask = (1UL << page_shift) - 1;
1019 
1020 	if (gpa & mask)
1021 		return -EINVAL;
1022 
1023 	return 0;
1024 }
1025 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
1026 
1027 extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
1028 		struct iommu_table *tbl,
1029 		unsigned long entry, unsigned long *hpa,
1030 		enum dma_data_direction *direction)
1031 {
1032 	long ret;
1033 	unsigned long size = 0;
1034 
1035 	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false);
1036 	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
1037 			(*direction == DMA_BIDIRECTIONAL)) &&
1038 			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
1039 					&size))
1040 		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
1041 
1042 	return ret;
1043 }
1044 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
1045 
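/*
 * Invalidate the TCE cache for @pages entries starting at @entry, if the
 * platform provides a tce_kill hook.
 */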
1046 void iommu_tce_kill(struct iommu_table *tbl,
1047 		unsigned long entry, unsigned long pages)
1048 {
1049 	if (tbl->it_ops->tce_kill)
1050 		tbl->it_ops->tce_kill(tbl, entry, pages, false);
1051 }
1052 EXPORT_SYMBOL_GPL(iommu_tce_kill);
1053 
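/*
 * Switch the table to external (e.g. VFIO) control: mark every entry as
 * allocated so the kernel DMA API will no longer hand out entries from it.
 * Fails with -EBUSY if the table still contains kernel mappings.
 */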
1054 int iommu_take_ownership(struct iommu_table *tbl)
1055 {
1056 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
1057 	int ret = 0;
1058 
1059 	/*
1060 	 * VFIO does not control TCE entry allocation and the guest
1061 	 * can write new TCEs on top of existing ones, so iommu_tce_build()
1062 	 * must be able to release old pages. This requires the
1063 	 * exchange() callback to be defined, so if it is not
1064 	 * implemented we disallow taking ownership of the table.
1065 	 */
1066 	if (!tbl->it_ops->xchg_no_kill)
1067 		return -EINVAL;
1068 
1069 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
1070 	for (i = 0; i < tbl->nr_pools; i++)
1071 		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
1072 
1073 	iommu_table_release_pages(tbl);
1074 
1075 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
1076 		pr_err("iommu_tce: it_map is not empty");
1077 		ret = -EBUSY;
1078 		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
1079 		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
1080 				tbl->it_reserved_end);
1081 	} else {
1082 		memset(tbl->it_map, 0xff, sz);
1083 	}
1084 
1085 	for (i = 0; i < tbl->nr_pools; i++)
1086 		spin_unlock(&tbl->pools[i].lock);
1087 	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
1088 
1089 	return ret;
1090 }
1091 EXPORT_SYMBOL_GPL(iommu_take_ownership);
1092 
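/*
 * Return the table to kernel control: clear the allocation bitmap and
 * re-reserve page 0 and the table's reserved range.
 */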
1093 void iommu_release_ownership(struct iommu_table *tbl)
1094 {
1095 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
1096 
1097 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
1098 	for (i = 0; i < tbl->nr_pools; i++)
1099 		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
1100 
1101 	memset(tbl->it_map, 0, sz);
1102 
1103 	iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
1104 			tbl->it_reserved_end);
1105 
1106 	for (i = 0; i < tbl->nr_pools; i++)
1107 		spin_unlock(&tbl->pools[i].lock);
1108 	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
1109 }
1110 EXPORT_SYMBOL_GPL(iommu_release_ownership);
1111 
1112 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
1113 {
1114 	/*
1115 	 * The sysfs entries should be populated before
1116 	 * binding the IOMMU group. If the sysfs entries
1117 	 * aren't ready, we simply bail.
1118 	 */
1119 	if (!device_is_registered(dev))
1120 		return -ENOENT;
1121 
1122 	if (device_iommu_mapped(dev)) {
1123 		pr_debug("%s: Skipping device %s with iommu group %d\n",
1124 			 __func__, dev_name(dev),
1125 			 iommu_group_id(dev->iommu_group));
1126 		return -EBUSY;
1127 	}
1128 
1129 	pr_debug("%s: Adding %s to iommu group %d\n",
1130 		 __func__, dev_name(dev),  iommu_group_id(table_group->group));
1131 
1132 	return iommu_group_add_device(table_group->group, dev);
1133 }
1134 EXPORT_SYMBOL_GPL(iommu_add_device);
1135 
1136 void iommu_del_device(struct device *dev)
1137 {
1138 	/*
1139 	 * Some devices might not have an IOMMU table and group,
1140 	 * so we needn't detach them from the associated
1141 	 * IOMMU groups.
1142 	 */
1143 	if (!device_iommu_mapped(dev)) {
1144 		pr_debug("iommu_tce: skipping device %s with no tbl\n",
1145 			 dev_name(dev));
1146 		return;
1147 	}
1148 
1149 	iommu_group_remove_device(dev);
1150 }
1151 EXPORT_SYMBOL_GPL(iommu_del_device);
1152 #endif /* CONFIG_IOMMU_API */
1153