1 /*
2  * Copyright 2002 Andi Kleen, SuSE Labs.
3  * Thanks to Ben LaHaise for precious feedback.
4  */
5 #include <linux/highmem.h>
6 #include <linux/bootmem.h>
7 #include <linux/module.h>
8 #include <linux/sched.h>
9 #include <linux/mm.h>
10 #include <linux/interrupt.h>
11 #include <linux/seq_file.h>
12 #include <linux/debugfs.h>
13 #include <linux/pfn.h>
14 #include <linux/percpu.h>
15 #include <linux/gfp.h>
16 #include <linux/pci.h>
17 
18 #include <asm/e820.h>
19 #include <asm/processor.h>
20 #include <asm/tlbflush.h>
21 #include <asm/sections.h>
22 #include <asm/setup.h>
23 #include <asm/uaccess.h>
24 #include <asm/pgalloc.h>
25 #include <asm/proto.h>
26 #include <asm/pat.h>
27 
28 /*
29  * The current flushing context - we pass it instead of 5 arguments:
30  */
31 struct cpa_data {
32 	unsigned long	*vaddr;
33 	pgd_t		*pgd;
34 	pgprot_t	mask_set;
35 	pgprot_t	mask_clr;
36 	unsigned long	numpages;
37 	int		flags;
38 	unsigned long	pfn;
39 	unsigned	force_split : 1;
40 	int		curpage;
41 	struct page	**pages;
42 };
43 
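/*
 * Illustrative sketch (assumed usage, not a definitive recipe): callers in
 * this file typically fill a cpa_data on the stack and hand it to
 * __change_page_attr_set_clr(), roughly like:
 *
 *	struct cpa_data cpa = { .vaddr	  = &addr,
 *				.numpages = numpages,
 *				.mask_set = __pgprot(_PAGE_RW),
 *				.mask_clr = __pgprot(0),
 *				.flags	  = 0 };
 *	__change_page_attr_set_clr(&cpa, 0);
 *
 * See __set_pages_p()/__set_pages_np() further down for real instances.
 */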
44 /*
45  * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
46  * using cpa_lock, so that we don't allow any other CPU with stale large TLB
47  * entries to change the page attribute in parallel while another CPU is
48  * splitting a large page entry and changing the attribute.
49  */
50 static DEFINE_SPINLOCK(cpa_lock);
51 
52 #define CPA_FLUSHTLB 1
53 #define CPA_ARRAY 2
54 #define CPA_PAGES_ARRAY 4
55 
56 #ifdef CONFIG_PROC_FS
57 static unsigned long direct_pages_count[PG_LEVEL_NUM];
58 
59 void update_page_count(int level, unsigned long pages)
60 {
61 	/* Protect against CPA */
62 	spin_lock(&pgd_lock);
63 	direct_pages_count[level] += pages;
64 	spin_unlock(&pgd_lock);
65 }
66 
67 static void split_page_count(int level)
68 {
69 	direct_pages_count[level]--;
70 	direct_pages_count[level - 1] += PTRS_PER_PTE;
71 }
72 
73 void arch_report_meminfo(struct seq_file *m)
74 {
75 	seq_printf(m, "DirectMap4k:    %8lu kB\n",
76 			direct_pages_count[PG_LEVEL_4K] << 2);
77 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
78 	seq_printf(m, "DirectMap2M:    %8lu kB\n",
79 			direct_pages_count[PG_LEVEL_2M] << 11);
80 #else
81 	seq_printf(m, "DirectMap4M:    %8lu kB\n",
82 			direct_pages_count[PG_LEVEL_2M] << 12);
83 #endif
84 #ifdef CONFIG_X86_64
85 	if (direct_gbpages)
86 		seq_printf(m, "DirectMap1G:    %8lu kB\n",
87 			direct_pages_count[PG_LEVEL_1G] << 20);
88 #endif
89 }
90 #else
91 static inline void split_page_count(int level) { }
92 #endif
93 
94 #ifdef CONFIG_X86_64
95 
96 static inline unsigned long highmap_start_pfn(void)
97 {
98 	return __pa_symbol(_text) >> PAGE_SHIFT;
99 }
100 
101 static inline unsigned long highmap_end_pfn(void)
102 {
103 	return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
104 }
105 
106 #endif
107 
108 #ifdef CONFIG_DEBUG_PAGEALLOC
109 # define debug_pagealloc 1
110 #else
111 # define debug_pagealloc 0
112 #endif
113 
114 static inline int
115 within(unsigned long addr, unsigned long start, unsigned long end)
116 {
117 	return addr >= start && addr < end;
118 }
119 
120 /*
121  * Flushing functions
122  */
123 
124 /**
125  * clflush_cache_range - flush a cache range with clflush
126  * @vaddr:	virtual start address
127  * @size:	number of bytes to flush
128  *
129  * clflushopt is an unordered instruction which needs fencing with mfence or
130  * sfence to avoid ordering issues.
131  */
132 void clflush_cache_range(void *vaddr, unsigned int size)
133 {
134 	void *vend = vaddr + size - 1;
135 
136 	mb();
137 
138 	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
139 		clflushopt(vaddr);
140 	/*
141 	 * Flush any possible final partial cacheline:
142 	 */
143 	clflushopt(vend);
144 
145 	mb();
146 }
147 EXPORT_SYMBOL_GPL(clflush_cache_range);
148 
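/*
 * Usage sketch (illustrative, assuming a page-aligned lowmem buffer): after
 * writing data that a non-coherent agent will read, a caller can write back
 * just the affected lines instead of flushing the whole cache:
 *
 *	memcpy(shared_buf, data, len);
 *	clflush_cache_range(shared_buf, len);
 *
 * The mb() fences inside the function handle the ordering of the unordered
 * clflush/clflushopt instructions, so the caller does not have to.
 */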
149 static void __cpa_flush_all(void *arg)
150 {
151 	unsigned long cache = (unsigned long)arg;
152 
153 	/*
154 	 * Flush all to work around Errata in early athlons regarding
155 	 * large page flushing.
156 	 */
157 	__flush_tlb_all();
158 
159 	if (cache && boot_cpu_data.x86 >= 4)
160 		wbinvd();
161 }
162 
163 static void cpa_flush_all(unsigned long cache)
164 {
165 	BUG_ON(irqs_disabled());
166 
167 	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
168 }
169 
170 static void __cpa_flush_range(void *arg)
171 {
172 	/*
173 	 * We could optimize that further and do individual per page
174 	 * tlb invalidates for a low number of pages. Caveat: we must
175 	 * flush the high aliases on 64bit as well.
176 	 */
177 	__flush_tlb_all();
178 }
179 
180 static void cpa_flush_range(unsigned long start, int numpages, int cache)
181 {
182 	unsigned int i, level;
183 	unsigned long addr;
184 
185 	BUG_ON(irqs_disabled());
186 	WARN_ON(PAGE_ALIGN(start) != start);
187 
188 	on_each_cpu(__cpa_flush_range, NULL, 1);
189 
190 	if (!cache)
191 		return;
192 
193 	/*
194 	 * We only need to flush on one CPU,
195 	 * clflush is a MESI-coherent instruction that
196 	 * will cause all other CPUs to flush the same
197 	 * cachelines:
198 	 */
199 	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
200 		pte_t *pte = lookup_address(addr, &level);
201 
202 		/*
203 		 * Only flush present addresses:
204 		 */
205 		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
206 			clflush_cache_range((void *) addr, PAGE_SIZE);
207 	}
208 }
209 
210 static void cpa_flush_array(unsigned long *start, int numpages, int cache,
211 			    int in_flags, struct page **pages)
212 {
213 	unsigned int i, level;
214 	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
215 
216 	BUG_ON(irqs_disabled());
217 
218 	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
219 
220 	if (!cache || do_wbinvd)
221 		return;
222 
223 	/*
224 	 * We only need to flush on one CPU,
225 	 * clflush is a MESI-coherent instruction that
226 	 * will cause all other CPUs to flush the same
227 	 * cachelines:
228 	 */
229 	for (i = 0; i < numpages; i++) {
230 		unsigned long addr;
231 		pte_t *pte;
232 
233 		if (in_flags & CPA_PAGES_ARRAY)
234 			addr = (unsigned long)page_address(pages[i]);
235 		else
236 			addr = start[i];
237 
238 		pte = lookup_address(addr, &level);
239 
240 		/*
241 		 * Only flush present addresses:
242 		 */
243 		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
244 			clflush_cache_range((void *)addr, PAGE_SIZE);
245 	}
246 }
247 
248 /*
249  * Certain areas of memory on x86 require very specific protection flags,
250  * for example the BIOS area or kernel text. Callers don't always get this
251  * right (again, ioremap() on BIOS memory is not uncommon) so this function
252  * checks and fixes these known static required protection bits.
253  */
254 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 				   unsigned long pfn)
256 {
257 	pgprot_t forbidden = __pgprot(0);
258 
259 	/*
260 	 * The BIOS area between 640k and 1Mb needs to be executable for
261 	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 	 */
263 #ifdef CONFIG_PCI_BIOS
264 	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
265 		pgprot_val(forbidden) |= _PAGE_NX;
266 #endif
267 
268 	/*
269  * The kernel text needs to be executable for obvious reasons.
270  * This does not cover __inittext, since that is gone later on. On
271  * 64bit we do not enforce !NX on the low mapping.
272 	 */
273 	if (within(address, (unsigned long)_text, (unsigned long)_etext))
274 		pgprot_val(forbidden) |= _PAGE_NX;
275 
276 	/*
277 	 * The .rodata section needs to be read-only. Using the pfn
278 	 * catches all aliases.
279 	 */
280 	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
281 		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
282 		pgprot_val(forbidden) |= _PAGE_RW;
283 
284 #if defined(CONFIG_X86_64)
285 	/*
286 	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
287  * the kernel text mappings for the large page aligned text and rodata
288  * sections will always be read-only. The kernel identity mappings covering
289  * the holes caused by this alignment can be anything the user asks for.
290 	 *
291 	 * This will preserve the large page mappings for kernel text/data
292 	 * at no extra cost.
293 	 */
294 	if (kernel_set_to_readonly &&
295 	    within(address, (unsigned long)_text,
296 		   (unsigned long)__end_rodata_hpage_align)) {
297 		unsigned int level;
298 
299 		/*
300 		 * Don't enforce the !RW mapping for the kernel text mapping,
301 		 * if the current mapping is already using small page mapping.
302 		 * No need to work hard to preserve large page mappings in this
303 		 * case.
304 		 *
305 		 * This also fixes the Linux Xen paravirt guest boot failure
306 		 * (because of unexpected read-only mappings for kernel identity
307 		 * mappings). In this paravirt guest case, the kernel text
308 		 * mapping and the kernel identity mapping share the same
309 		 * page-table pages. Thus we can't really use different
310 		 * protections for the kernel text and identity mappings. Also,
311 		 * these shared mappings are made of small page mappings.
312 		 * Thus, not enforcing the !RW mapping for small page kernel
313 		 * text mappings also helps the Linux Xen paravirt guest
314 		 * boot.
315 		 */
316 		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
317 			pgprot_val(forbidden) |= _PAGE_RW;
318 	}
319 #endif
320 
321 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
322 
323 	return prot;
324 }
325 
326 /*
327  * Lookup the page table entry for a virtual address in a specific pgd.
328  * Return a pointer to the entry and the level of the mapping.
329  */
330 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
331 			     unsigned int *level)
332 {
333 	pud_t *pud;
334 	pmd_t *pmd;
335 
336 	*level = PG_LEVEL_NONE;
337 
338 	if (pgd_none(*pgd))
339 		return NULL;
340 
341 	pud = pud_offset(pgd, address);
342 	if (pud_none(*pud))
343 		return NULL;
344 
345 	*level = PG_LEVEL_1G;
346 	if (pud_large(*pud) || !pud_present(*pud))
347 		return (pte_t *)pud;
348 
349 	pmd = pmd_offset(pud, address);
350 	if (pmd_none(*pmd))
351 		return NULL;
352 
353 	*level = PG_LEVEL_2M;
354 	if (pmd_large(*pmd) || !pmd_present(*pmd))
355 		return (pte_t *)pmd;
356 
357 	*level = PG_LEVEL_4K;
358 
359 	return pte_offset_kernel(pmd, address);
360 }
361 
362 /*
363  * Lookup the page table entry for a virtual address. Return a pointer
364  * to the entry and the level of the mapping.
365  *
366  * Note: We return pud and pmd either when the entry is marked large
367  * or when the present bit is not set. Otherwise we would return a
368  * pointer to a nonexistent mapping.
369  */
370 pte_t *lookup_address(unsigned long address, unsigned int *level)
371 {
372         return lookup_address_in_pgd(pgd_offset_k(address), address, level);
373 }
374 EXPORT_SYMBOL_GPL(lookup_address);
375 
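/*
 * Example (illustrative sketch): checking whether a kernel virtual address
 * is currently backed by a large page:
 *
 *	unsigned int level;
 *	bool large = false;
 *	pte_t *pte = lookup_address(addr, &level);
 *
 *	if (pte && pte_present(*pte) && level != PG_LEVEL_4K)
 *		large = true;
 *
 * As noted above, the returned pointer may actually be a pud/pmd entry, so
 * it must always be interpreted together with *level.
 */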
376 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
377 				  unsigned int *level)
378 {
379         if (cpa->pgd)
380 		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
381 					       address, level);
382 
383         return lookup_address(address, level);
384 }
385 
386 /*
387  * This is necessary because __pa() does not work on some
388  * kinds of memory, like vmalloc() or the alloc_remap()
389  * areas on 32-bit NUMA systems.  The percpu areas can
390  * end up in this kind of memory, for instance.
391  *
392  * This could be optimized, but it is only intended to be
393  * used at initialization time, and keeping it
394  * unoptimized should increase the testing coverage for
395  * the more obscure platforms.
396  */
397 phys_addr_t slow_virt_to_phys(void *__virt_addr)
398 {
399 	unsigned long virt_addr = (unsigned long)__virt_addr;
400 	phys_addr_t phys_addr;
401 	unsigned long offset;
402 	enum pg_level level;
403 	unsigned long psize;
404 	unsigned long pmask;
405 	pte_t *pte;
406 
407 	pte = lookup_address(virt_addr, &level);
408 	BUG_ON(!pte);
409 	psize = page_level_size(level);
410 	pmask = page_level_mask(level);
411 	offset = virt_addr & ~pmask;
412 	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
413 	return (phys_addr | offset);
414 }
415 EXPORT_SYMBOL_GPL(slow_virt_to_phys);
416 
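/*
 * Example (illustrative): __pa() is only valid for the direct mapping, so
 * for a vmalloc() or percpu address the slow page table walk is the safe
 * way to obtain the physical address:
 *
 *	void *buf = vmalloc(PAGE_SIZE);
 *	phys_addr_t phys = slow_virt_to_phys(buf);
 *
 * The address must be mapped (vmalloc() guarantees that), since the lookup
 * above BUG()s on a missing pte.
 */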
417 /*
418  * Set the new pmd in all the pgds we know about:
419  */
420 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
421 {
422 	/* change init_mm */
423 	set_pte_atomic(kpte, pte);
424 #ifdef CONFIG_X86_32
425 	if (!SHARED_KERNEL_PMD) {
426 		struct page *page;
427 
428 		list_for_each_entry(page, &pgd_list, lru) {
429 			pgd_t *pgd;
430 			pud_t *pud;
431 			pmd_t *pmd;
432 
433 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
434 			pud = pud_offset(pgd, address);
435 			pmd = pmd_offset(pud, address);
436 			set_pte_atomic((pte_t *)pmd, pte);
437 		}
438 	}
439 #endif
440 }
441 
442 static int
443 try_preserve_large_page(pte_t *kpte, unsigned long address,
444 			struct cpa_data *cpa)
445 {
446 	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
447 	pte_t new_pte, old_pte, *tmp;
448 	pgprot_t old_prot, new_prot, req_prot;
449 	int i, do_split = 1;
450 	enum pg_level level;
451 
452 	if (cpa->force_split)
453 		return 1;
454 
455 	spin_lock(&pgd_lock);
456 	/*
457 	 * Check for races, another CPU might have split this page
458 	 * up already:
459 	 */
460 	tmp = _lookup_address_cpa(cpa, address, &level);
461 	if (tmp != kpte)
462 		goto out_unlock;
463 
464 	switch (level) {
465 	case PG_LEVEL_2M:
466 #ifdef CONFIG_X86_64
467 	case PG_LEVEL_1G:
468 #endif
469 		psize = page_level_size(level);
470 		pmask = page_level_mask(level);
471 		break;
472 	default:
473 		do_split = -EINVAL;
474 		goto out_unlock;
475 	}
476 
477 	/*
478 	 * Calculate the number of pages, which fit into this large
479 	 * page starting at address:
480 	 */
481 	nextpage_addr = (address + psize) & pmask;
482 	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
483 	if (numpages < cpa->numpages)
484 		cpa->numpages = numpages;
485 
486 	/*
487 	 * We are safe now. Check whether the new pgprot is the same:
488 	 */
489 	old_pte = *kpte;
490 	old_prot = req_prot = pte_pgprot(old_pte);
491 
492 	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
493 	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
494 
495 	/*
496 	 * Set the PSE and GLOBAL flags only if the PRESENT flag is
497 	 * set otherwise pmd_present/pmd_huge will return true even on
498 	 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
499 	 * for the ancient hardware that doesn't support it.
500 	 */
501 	if (pgprot_val(req_prot) & _PAGE_PRESENT)
502 		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
503 	else
504 		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
505 
506 	req_prot = canon_pgprot(req_prot);
507 
508 	/*
509 	 * old_pte points to the large page base address. So we need
510 	 * to add the offset of the virtual address:
511 	 */
512 	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
513 	cpa->pfn = pfn;
514 
515 	new_prot = static_protections(req_prot, address, pfn);
516 
517 	/*
518 	 * We need to check the full range, whether
519 	 * static_protection() requires a different pgprot for one of
520 	 * the pages in the range we try to preserve:
521 	 */
522 	addr = address & pmask;
523 	pfn = pte_pfn(old_pte);
524 	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
525 		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
526 
527 		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
528 			goto out_unlock;
529 	}
530 
531 	/*
532 	 * If there are no changes, return. maxpages has been updated
533 	 * above:
534 	 */
535 	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
536 		do_split = 0;
537 		goto out_unlock;
538 	}
539 
540 	/*
541 	 * We need to change the attributes. Check, whether we can
542 	 * change the large page in one go. We request a split, when
543 	 * the address is not aligned and the number of pages is
544 	 * smaller than the number of pages in the large page. Note
545 	 * that we limited the number of possible pages already to
546 	 * the number of pages in the large page.
547 	 */
548 	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
549 		/*
550 		 * The address is aligned and the number of pages
551 		 * covers the full page.
552 		 */
553 		new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
554 		__set_pmd_pte(kpte, address, new_pte);
555 		cpa->flags |= CPA_FLUSHTLB;
556 		do_split = 0;
557 	}
558 
559 out_unlock:
560 	spin_unlock(&pgd_lock);
561 
562 	return do_split;
563 }
564 
565 static int
566 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
567 		   struct page *base)
568 {
569 	pte_t *pbase = (pte_t *)page_address(base);
570 	unsigned long pfn, pfninc = 1;
571 	unsigned int i, level;
572 	pte_t *tmp;
573 	pgprot_t ref_prot;
574 
575 	spin_lock(&pgd_lock);
576 	/*
577 	 * Check for races, another CPU might have split this page
578 	 * up for us already:
579 	 */
580 	tmp = _lookup_address_cpa(cpa, address, &level);
581 	if (tmp != kpte) {
582 		spin_unlock(&pgd_lock);
583 		return 1;
584 	}
585 
586 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
587 	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
588 	/*
589 	 * If we ever want to utilize the PAT bit, we need to
590 	 * update this function to make sure it's converted from
591 	 * bit 12 to bit 7 when we cross from the 2MB level to
592 	 * the 4K level:
593 	 */
594 	WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
595 
596 #ifdef CONFIG_X86_64
597 	if (level == PG_LEVEL_1G) {
598 		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
599 		/*
600 		 * Set the PSE flags only if the PRESENT flag is set
601 		 * otherwise pmd_present/pmd_huge will return true
602 		 * even on a non present pmd.
603 		 */
604 		if (pgprot_val(ref_prot) & _PAGE_PRESENT)
605 			pgprot_val(ref_prot) |= _PAGE_PSE;
606 		else
607 			pgprot_val(ref_prot) &= ~_PAGE_PSE;
608 	}
609 #endif
610 
611 	/*
612 	 * Set the GLOBAL flags only if the PRESENT flag is set
613 	 * otherwise pmd/pte_present will return true even on a non
614 	 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
615 	 * for the ancient hardware that doesn't support it.
616 	 */
617 	if (pgprot_val(ref_prot) & _PAGE_PRESENT)
618 		pgprot_val(ref_prot) |= _PAGE_GLOBAL;
619 	else
620 		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
621 
622 	/*
623 	 * Get the target pfn from the original entry:
624 	 */
625 	pfn = pte_pfn(*kpte);
626 	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
627 		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
628 
629 	if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
630 				PFN_DOWN(__pa(address)) + 1))
631 		split_page_count(level);
632 
633 	/*
634 	 * Install the new, split up pagetable.
635 	 *
636 	 * We use the standard kernel pagetable protections for the new
637 	 * pagetable protections, the actual ptes set above control the
638 	 * primary protection behavior:
639 	 */
640 	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
641 
642 	/*
643 	 * Intel Atom errata AAH41 workaround.
644 	 *
645 	 * The real fix should be in hw or in a microcode update, but
646 	 * we also probabilistically try to reduce the window of having
647 	 * a large TLB mixed with 4K TLBs while instruction fetches are
648 	 * going on.
649 	 */
650 	__flush_tlb_all();
651 	spin_unlock(&pgd_lock);
652 
653 	return 0;
654 }
655 
656 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
657 			    unsigned long address)
658 {
659 	struct page *base;
660 
661 	if (!debug_pagealloc)
662 		spin_unlock(&cpa_lock);
663 	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
664 	if (!debug_pagealloc)
665 		spin_lock(&cpa_lock);
666 	if (!base)
667 		return -ENOMEM;
668 
669 	if (__split_large_page(cpa, kpte, address, base))
670 		__free_page(base);
671 
672 	return 0;
673 }
674 
675 static bool try_to_free_pte_page(pte_t *pte)
676 {
677 	int i;
678 
679 	for (i = 0; i < PTRS_PER_PTE; i++)
680 		if (!pte_none(pte[i]))
681 			return false;
682 
683 	free_page((unsigned long)pte);
684 	return true;
685 }
686 
687 static bool try_to_free_pmd_page(pmd_t *pmd)
688 {
689 	int i;
690 
691 	for (i = 0; i < PTRS_PER_PMD; i++)
692 		if (!pmd_none(pmd[i]))
693 			return false;
694 
695 	free_page((unsigned long)pmd);
696 	return true;
697 }
698 
699 static bool try_to_free_pud_page(pud_t *pud)
700 {
701 	int i;
702 
703 	for (i = 0; i < PTRS_PER_PUD; i++)
704 		if (!pud_none(pud[i]))
705 			return false;
706 
707 	free_page((unsigned long)pud);
708 	return true;
709 }
710 
711 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
712 {
713 	pte_t *pte = pte_offset_kernel(pmd, start);
714 
715 	while (start < end) {
716 		set_pte(pte, __pte(0));
717 
718 		start += PAGE_SIZE;
719 		pte++;
720 	}
721 
722 	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
723 		pmd_clear(pmd);
724 		return true;
725 	}
726 	return false;
727 }
728 
729 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
730 			      unsigned long start, unsigned long end)
731 {
732 	if (unmap_pte_range(pmd, start, end))
733 		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
734 			pud_clear(pud);
735 }
736 
737 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
738 {
739 	pmd_t *pmd = pmd_offset(pud, start);
740 
741 	/*
742 	 * Not on a 2MB page boundary?
743 	 */
744 	if (start & (PMD_SIZE - 1)) {
745 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
746 		unsigned long pre_end = min_t(unsigned long, end, next_page);
747 
748 		__unmap_pmd_range(pud, pmd, start, pre_end);
749 
750 		start = pre_end;
751 		pmd++;
752 	}
753 
754 	/*
755 	 * Try to unmap in 2M chunks.
756 	 */
757 	while (end - start >= PMD_SIZE) {
758 		if (pmd_large(*pmd))
759 			pmd_clear(pmd);
760 		else
761 			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
762 
763 		start += PMD_SIZE;
764 		pmd++;
765 	}
766 
767 	/*
768 	 * 4K leftovers?
769 	 */
770 	if (start < end)
771 		return __unmap_pmd_range(pud, pmd, start, end);
772 
773 	/*
774 	 * Try again to free the PMD page if haven't succeeded above.
775 	 */
776 	if (!pud_none(*pud))
777 		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
778 			pud_clear(pud);
779 }
780 
781 static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
782 {
783 	pud_t *pud = pud_offset(pgd, start);
784 
785 	/*
786 	 * Not on a GB page boundary?
787 	 */
788 	if (start & (PUD_SIZE - 1)) {
789 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
790 		unsigned long pre_end	= min_t(unsigned long, end, next_page);
791 
792 		unmap_pmd_range(pud, start, pre_end);
793 
794 		start = pre_end;
795 		pud++;
796 	}
797 
798 	/*
799 	 * Try to unmap in 1G chunks?
800 	 */
801 	while (end - start >= PUD_SIZE) {
802 
803 		if (pud_large(*pud))
804 			pud_clear(pud);
805 		else
806 			unmap_pmd_range(pud, start, start + PUD_SIZE);
807 
808 		start += PUD_SIZE;
809 		pud++;
810 	}
811 
812 	/*
813 	 * 2M leftovers?
814 	 */
815 	if (start < end)
816 		unmap_pmd_range(pud, start, end);
817 
818 	/*
819 	 * No need to try to free the PUD page because we'll free it in
820 	 * populate_pgd's error path
821 	 */
822 }
823 
824 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
825 {
826 	pgd_t *pgd_entry = root + pgd_index(addr);
827 
828 	unmap_pud_range(pgd_entry, addr, end);
829 
830 	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
831 		pgd_clear(pgd_entry);
832 }
833 
834 static int alloc_pte_page(pmd_t *pmd)
835 {
836 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
837 	if (!pte)
838 		return -1;
839 
840 	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
841 	return 0;
842 }
843 
844 static int alloc_pmd_page(pud_t *pud)
845 {
846 	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
847 	if (!pmd)
848 		return -1;
849 
850 	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
851 	return 0;
852 }
853 
854 static void populate_pte(struct cpa_data *cpa,
855 			 unsigned long start, unsigned long end,
856 			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
857 {
858 	pte_t *pte;
859 
860 	pte = pte_offset_kernel(pmd, start);
861 
862 	while (num_pages-- && start < end) {
863 
864 		/* deal with the NX bit */
865 		if (!(pgprot_val(pgprot) & _PAGE_NX))
866 			cpa->pfn &= ~_PAGE_NX;
867 
868 		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
869 
870 		start	 += PAGE_SIZE;
871 		cpa->pfn += PAGE_SIZE;
872 		pte++;
873 	}
874 }
875 
876 static int populate_pmd(struct cpa_data *cpa,
877 			unsigned long start, unsigned long end,
878 			unsigned num_pages, pud_t *pud, pgprot_t pgprot)
879 {
880 	unsigned int cur_pages = 0;
881 	pmd_t *pmd;
882 
883 	/*
884 	 * Not on a 2M boundary?
885 	 */
886 	if (start & (PMD_SIZE - 1)) {
887 		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
888 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
889 
890 		pre_end   = min_t(unsigned long, pre_end, next_page);
891 		cur_pages = (pre_end - start) >> PAGE_SHIFT;
892 		cur_pages = min_t(unsigned int, num_pages, cur_pages);
893 
894 		/*
895 		 * Need a PTE page?
896 		 */
897 		pmd = pmd_offset(pud, start);
898 		if (pmd_none(*pmd))
899 			if (alloc_pte_page(pmd))
900 				return -1;
901 
902 		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
903 
904 		start = pre_end;
905 	}
906 
907 	/*
908 	 * We mapped them all?
909 	 */
910 	if (num_pages == cur_pages)
911 		return cur_pages;
912 
913 	while (end - start >= PMD_SIZE) {
914 
915 		/*
916 		 * We cannot use a 1G page so allocate a PMD page if needed.
917 		 */
918 		if (pud_none(*pud))
919 			if (alloc_pmd_page(pud))
920 				return -1;
921 
922 		pmd = pmd_offset(pud, start);
923 
924 		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
925 
926 		start	  += PMD_SIZE;
927 		cpa->pfn  += PMD_SIZE;
928 		cur_pages += PMD_SIZE >> PAGE_SHIFT;
929 	}
930 
931 	/*
932 	 * Map trailing 4K pages.
933 	 */
934 	if (start < end) {
935 		pmd = pmd_offset(pud, start);
936 		if (pmd_none(*pmd))
937 			if (alloc_pte_page(pmd))
938 				return -1;
939 
940 		populate_pte(cpa, start, end, num_pages - cur_pages,
941 			     pmd, pgprot);
942 	}
943 	return num_pages;
944 }
945 
946 static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
947 			pgprot_t pgprot)
948 {
949 	pud_t *pud;
950 	unsigned long end;
951 	int cur_pages = 0;
952 
953 	end = start + (cpa->numpages << PAGE_SHIFT);
954 
955 	/*
956 	 * Not on a Gb page boundary? => map everything up to it with
957 	 * smaller pages.
958 	 */
959 	if (start & (PUD_SIZE - 1)) {
960 		unsigned long pre_end;
961 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
962 
963 		pre_end   = min_t(unsigned long, end, next_page);
964 		cur_pages = (pre_end - start) >> PAGE_SHIFT;
965 		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
966 
967 		pud = pud_offset(pgd, start);
968 
969 		/*
970 		 * Need a PMD page?
971 		 */
972 		if (pud_none(*pud))
973 			if (alloc_pmd_page(pud))
974 				return -1;
975 
976 		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
977 					 pud, pgprot);
978 		if (cur_pages < 0)
979 			return cur_pages;
980 
981 		start = pre_end;
982 	}
983 
984 	/* We mapped them all? */
985 	if (cpa->numpages == cur_pages)
986 		return cur_pages;
987 
988 	pud = pud_offset(pgd, start);
989 
990 	/*
991 	 * Map everything starting from the Gb boundary, possibly with 1G pages
992 	 */
993 	while (end - start >= PUD_SIZE) {
994 		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
995 
996 		start	  += PUD_SIZE;
997 		cpa->pfn  += PUD_SIZE;
998 		cur_pages += PUD_SIZE >> PAGE_SHIFT;
999 		pud++;
1000 	}
1001 
1002 	/* Map trailing leftover */
1003 	if (start < end) {
1004 		int tmp;
1005 
1006 		pud = pud_offset(pgd, start);
1007 		if (pud_none(*pud))
1008 			if (alloc_pmd_page(pud))
1009 				return -1;
1010 
1011 		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1012 				   pud, pgprot);
1013 		if (tmp < 0)
1014 			return cur_pages;
1015 
1016 		cur_pages += tmp;
1017 	}
1018 	return cur_pages;
1019 }
1020 
1021 /*
1022  * Restrictions for kernel page table do not necessarily apply when mapping in
1023  * an alternate PGD.
1024  */
1025 static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1026 {
1027 	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1028 	pud_t *pud = NULL;	/* shut up gcc */
1029 	pgd_t *pgd_entry;
1030 	int ret;
1031 
1032 	pgd_entry = cpa->pgd + pgd_index(addr);
1033 
1034 	/*
1035 	 * Allocate a PUD page and hand it down for mapping.
1036 	 */
1037 	if (pgd_none(*pgd_entry)) {
1038 		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1039 		if (!pud)
1040 			return -1;
1041 
1042 		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1043 	}
1044 
1045 	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1046 	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1047 
1048 	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1049 	if (ret < 0) {
1050 		unmap_pgd_range(cpa->pgd, addr,
1051 				addr + (cpa->numpages << PAGE_SHIFT));
1052 		return ret;
1053 	}
1054 
1055 	cpa->numpages = ret;
1056 	return 0;
1057 }
1058 
1059 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1060 			       int primary)
1061 {
1062 	if (cpa->pgd)
1063 		return populate_pgd(cpa, vaddr);
1064 
1065 	/*
1066 	 * Ignore all non primary paths.
1067 	 */
1068 	if (!primary)
1069 		return 0;
1070 
1071 	/*
1072 	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1073 	 * to have holes.
1074 	 * Also set numpages to '1' indicating that we processed cpa req for
1075 	 * one virtual address page and its pfn. TBD: numpages can be set based
1076 	 * on the initial value and the level returned by lookup_address().
1077 	 */
1078 	if (within(vaddr, PAGE_OFFSET,
1079 		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1080 		cpa->numpages = 1;
1081 		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1082 		return 0;
1083 	} else {
1084 		WARN(1, KERN_WARNING "CPA: called for zero pte. "
1085 			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1086 			*cpa->vaddr);
1087 
1088 		return -EFAULT;
1089 	}
1090 }
1091 
1092 static int __change_page_attr(struct cpa_data *cpa, int primary)
1093 {
1094 	unsigned long address;
1095 	int do_split, err;
1096 	unsigned int level;
1097 	pte_t *kpte, old_pte;
1098 
1099 	if (cpa->flags & CPA_PAGES_ARRAY) {
1100 		struct page *page = cpa->pages[cpa->curpage];
1101 		if (unlikely(PageHighMem(page)))
1102 			return 0;
1103 		address = (unsigned long)page_address(page);
1104 	} else if (cpa->flags & CPA_ARRAY)
1105 		address = cpa->vaddr[cpa->curpage];
1106 	else
1107 		address = *cpa->vaddr;
1108 repeat:
1109 	kpte = _lookup_address_cpa(cpa, address, &level);
1110 	if (!kpte)
1111 		return __cpa_process_fault(cpa, address, primary);
1112 
1113 	old_pte = *kpte;
1114 	if (!pte_val(old_pte))
1115 		return __cpa_process_fault(cpa, address, primary);
1116 
1117 	if (level == PG_LEVEL_4K) {
1118 		pte_t new_pte;
1119 		pgprot_t new_prot = pte_pgprot(old_pte);
1120 		unsigned long pfn = pte_pfn(old_pte);
1121 
1122 		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1123 		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1124 
1125 		new_prot = static_protections(new_prot, address, pfn);
1126 
1127 		/*
1128 		 * Set the GLOBAL flags only if the PRESENT flag is
1129 		 * set otherwise pte_present will return true even on
1130 		 * a non present pte. The canon_pgprot will clear
1131 		 * _PAGE_GLOBAL for the ancient hardware that doesn't
1132 		 * support it.
1133 		 */
1134 		if (pgprot_val(new_prot) & _PAGE_PRESENT)
1135 			pgprot_val(new_prot) |= _PAGE_GLOBAL;
1136 		else
1137 			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1138 
1139 		/*
1140 		 * We need to keep the pfn from the existing PTE,
1141 		 * after all we're only going to change its attributes,
1142 		 * not the memory it points to.
1143 		 */
1144 		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1145 		cpa->pfn = pfn;
1146 		/*
1147 		 * Do we really change anything ?
1148 		 */
1149 		if (pte_val(old_pte) != pte_val(new_pte)) {
1150 			set_pte_atomic(kpte, new_pte);
1151 			cpa->flags |= CPA_FLUSHTLB;
1152 		}
1153 		cpa->numpages = 1;
1154 		return 0;
1155 	}
1156 
1157 	/*
1158 	 * Check, whether we can keep the large page intact
1159 	 * and just change the pte:
1160 	 */
1161 	do_split = try_preserve_large_page(kpte, address, cpa);
1162 	/*
1163 	 * When the range fits into the existing large page,
1164 	 * return. cp->numpages and cpa->tlbflush have been updated in
1165 	 * try_large_page:
1166 	 */
1167 	if (do_split <= 0)
1168 		return do_split;
1169 
1170 	/*
1171 	 * We have to split the large page:
1172 	 */
1173 	err = split_large_page(cpa, kpte, address);
1174 	if (!err) {
1175 		/*
1176 	 	 * Do a global flush tlb after splitting the large page
1177 	 	 * and before we do the actual change page attribute in the PTE.
1178 	 	 *
1179 	 	 * Without this, we violate the TLB application note, which says
1180 	 	 * "The TLBs may contain both ordinary and large-page
1181 		 *  translations for a 4-KByte range of linear addresses. This
1182 		 *  may occur if software modifies the paging structures so that
1183 		 *  the page size used for the address range changes. If the two
1184 		 *  translations differ with respect to page frame or attributes
1185 		 *  (e.g., permissions), processor behavior is undefined and may
1186 		 *  be implementation-specific."
1187 	 	 *
1188 	 	 * We do this global tlb flush inside the cpa_lock, so that we
1189 		 * don't allow any other cpu with stale tlb entries to change a
1190 		 * page attribute in parallel for an address that also falls into
1191 		 * the just split large page entry.
1192 	 	 */
1193 		flush_tlb_all();
1194 		goto repeat;
1195 	}
1196 
1197 	return err;
1198 }
1199 
1200 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1201 
1202 static int cpa_process_alias(struct cpa_data *cpa)
1203 {
1204 	struct cpa_data alias_cpa;
1205 	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1206 	unsigned long vaddr;
1207 	int ret;
1208 
1209 	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1210 		return 0;
1211 
1212 	/*
1213 	 * No need to redo, when the primary call touched the direct
1214 	 * mapping already:
1215 	 */
1216 	if (cpa->flags & CPA_PAGES_ARRAY) {
1217 		struct page *page = cpa->pages[cpa->curpage];
1218 		if (unlikely(PageHighMem(page)))
1219 			return 0;
1220 		vaddr = (unsigned long)page_address(page);
1221 	} else if (cpa->flags & CPA_ARRAY)
1222 		vaddr = cpa->vaddr[cpa->curpage];
1223 	else
1224 		vaddr = *cpa->vaddr;
1225 
1226 	if (!(within(vaddr, PAGE_OFFSET,
1227 		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1228 
1229 		alias_cpa = *cpa;
1230 		alias_cpa.vaddr = &laddr;
1231 		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1232 
1233 		ret = __change_page_attr_set_clr(&alias_cpa, 0);
1234 		if (ret)
1235 			return ret;
1236 	}
1237 
1238 #ifdef CONFIG_X86_64
1239 	/*
1240 	 * If the primary call didn't touch the high mapping already
1241 	 * and the physical address is inside the kernel map, we need
1242 	 * to touch the high mapped kernel as well:
1243 	 */
1244 	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1245 	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
1246 		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1247 					       __START_KERNEL_map - phys_base;
1248 		alias_cpa = *cpa;
1249 		alias_cpa.vaddr = &temp_cpa_vaddr;
1250 		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1251 
1252 		/*
1253 		 * The high mapping range is imprecise, so ignore the
1254 		 * return value.
1255 		 */
1256 		__change_page_attr_set_clr(&alias_cpa, 0);
1257 	}
1258 #endif
1259 
1260 	return 0;
1261 }
1262 
1263 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1264 {
1265 	int ret, numpages = cpa->numpages;
1266 
1267 	while (numpages) {
1268 		/*
1269 		 * Store the remaining nr of pages for the large page
1270 		 * preservation check.
1271 		 */
1272 		cpa->numpages = numpages;
1273 		/* for array changes, we can't use large page */
1274 		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1275 			cpa->numpages = 1;
1276 
1277 		if (!debug_pagealloc)
1278 			spin_lock(&cpa_lock);
1279 		ret = __change_page_attr(cpa, checkalias);
1280 		if (!debug_pagealloc)
1281 			spin_unlock(&cpa_lock);
1282 		if (ret)
1283 			return ret;
1284 
1285 		if (checkalias) {
1286 			ret = cpa_process_alias(cpa);
1287 			if (ret)
1288 				return ret;
1289 		}
1290 
1291 		/*
1292 		 * Adjust the number of pages with the result of the
1293 		 * CPA operation. Either a large page has been
1294 		 * preserved or a single page update happened.
1295 		 */
1296 		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
1297 		numpages -= cpa->numpages;
1298 		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1299 			cpa->curpage++;
1300 		else
1301 			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
1302 
1303 	}
1304 	return 0;
1305 }
1306 
1307 static inline int cache_attr(pgprot_t attr)
1308 {
1309 	return pgprot_val(attr) &
1310 		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
1311 }
1312 
1313 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1314 				    pgprot_t mask_set, pgprot_t mask_clr,
1315 				    int force_split, int in_flag,
1316 				    struct page **pages)
1317 {
1318 	struct cpa_data cpa;
1319 	int ret, cache, checkalias;
1320 	unsigned long baddr = 0;
1321 
1322 	memset(&cpa, 0, sizeof(cpa));
1323 
1324 	/*
1325 	 * Check, if we are requested to change a not supported
1326 	 * feature:
1327 	 */
1328 	mask_set = canon_pgprot(mask_set);
1329 	mask_clr = canon_pgprot(mask_clr);
1330 	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1331 		return 0;
1332 
1333 	/* Ensure we are PAGE_SIZE aligned */
1334 	if (in_flag & CPA_ARRAY) {
1335 		int i;
1336 		for (i = 0; i < numpages; i++) {
1337 			if (addr[i] & ~PAGE_MASK) {
1338 				addr[i] &= PAGE_MASK;
1339 				WARN_ON_ONCE(1);
1340 			}
1341 		}
1342 	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
1343 		/*
1344 		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
1345 		 * No need to check in that case.
1346 		 */
1347 		if (*addr & ~PAGE_MASK) {
1348 			*addr &= PAGE_MASK;
1349 			/*
1350 			 * People should not be passing in unaligned addresses:
1351 			 */
1352 			WARN_ON_ONCE(1);
1353 		}
1354 		/*
1355 		 * Save address for cache flush. *addr is modified in the call
1356 		 * to __change_page_attr_set_clr() below.
1357 		 */
1358 		baddr = *addr;
1359 	}
1360 
1361 	/* Must avoid aliasing mappings in the highmem code */
1362 	kmap_flush_unused();
1363 
1364 	vm_unmap_aliases();
1365 
1366 	cpa.vaddr = addr;
1367 	cpa.pages = pages;
1368 	cpa.numpages = numpages;
1369 	cpa.mask_set = mask_set;
1370 	cpa.mask_clr = mask_clr;
1371 	cpa.flags = 0;
1372 	cpa.curpage = 0;
1373 	cpa.force_split = force_split;
1374 
1375 	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1376 		cpa.flags |= in_flag;
1377 
1378 	/* No alias checking for _NX bit modifications */
1379 	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1380 
1381 	ret = __change_page_attr_set_clr(&cpa, checkalias);
1382 
1383 	/*
1384 	 * Check whether we really changed something:
1385 	 */
1386 	if (!(cpa.flags & CPA_FLUSHTLB))
1387 		goto out;
1388 
1389 	/*
1390 	 * No need to flush, when we did not set any of the caching
1391 	 * attributes:
1392 	 */
1393 	cache = cache_attr(mask_set);
1394 
1395 	/*
1396 	 * On success we use CLFLUSH, when the CPU supports it to
1397 	 * avoid the WBINVD. If the CPU does not support it and in the
1398 	 * error case we fall back to cpa_flush_all (which uses
1399 	 * WBINVD):
1400 	 */
1401 	if (!ret && cpu_has_clflush) {
1402 		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1403 			cpa_flush_array(addr, numpages, cache,
1404 					cpa.flags, pages);
1405 		} else
1406 			cpa_flush_range(baddr, numpages, cache);
1407 	} else
1408 		cpa_flush_all(cache);
1409 
1410 out:
1411 	return ret;
1412 }
1413 
1414 static inline int change_page_attr_set(unsigned long *addr, int numpages,
1415 				       pgprot_t mask, int array)
1416 {
1417 	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1418 		(array ? CPA_ARRAY : 0), NULL);
1419 }
1420 
1421 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1422 					 pgprot_t mask, int array)
1423 {
1424 	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1425 		(array ? CPA_ARRAY : 0), NULL);
1426 }
1427 
1428 static inline int cpa_set_pages_array(struct page **pages, int numpages,
1429 				       pgprot_t mask)
1430 {
1431 	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1432 		CPA_PAGES_ARRAY, pages);
1433 }
1434 
1435 static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1436 					 pgprot_t mask)
1437 {
1438 	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1439 		CPA_PAGES_ARRAY, pages);
1440 }
1441 
1442 int _set_memory_uc(unsigned long addr, int numpages)
1443 {
1444 	/*
1445 	 * for now UC MINUS. see comments in ioremap_nocache()
1446 	 */
1447 	return change_page_attr_set(&addr, numpages,
1448 				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
1449 }
1450 
1451 int set_memory_uc(unsigned long addr, int numpages)
1452 {
1453 	int ret;
1454 
1455 	/*
1456 	 * for now UC MINUS. see comments in ioremap_nocache()
1457 	 */
1458 	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1459 			    _PAGE_CACHE_UC_MINUS, NULL);
1460 	if (ret)
1461 		goto out_err;
1462 
1463 	ret = _set_memory_uc(addr, numpages);
1464 	if (ret)
1465 		goto out_free;
1466 
1467 	return 0;
1468 
1469 out_free:
1470 	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1471 out_err:
1472 	return ret;
1473 }
1474 EXPORT_SYMBOL(set_memory_uc);
1475 
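/*
 * Usage sketch (illustrative, assuming 'addr' is a page-aligned address in
 * the kernel direct mapping): map 'numpages' pages uncached and restore
 * write-back before the memory is reused:
 *
 *	if (set_memory_uc(addr, numpages))
 *		goto fail;
 *	...
 *	set_memory_wb(addr, numpages);
 *
 * The memtype reservation and release are handled internally by
 * set_memory_uc()/set_memory_wb(), as shown above.
 */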
1476 static int _set_memory_array(unsigned long *addr, int addrinarray,
1477 		unsigned long new_type)
1478 {
1479 	int i, j;
1480 	int ret;
1481 
1482 	/*
1483 	 * for now UC MINUS. see comments in ioremap_nocache()
1484 	 */
1485 	for (i = 0; i < addrinarray; i++) {
1486 		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1487 					new_type, NULL);
1488 		if (ret)
1489 			goto out_free;
1490 	}
1491 
1492 	ret = change_page_attr_set(addr, addrinarray,
1493 				    __pgprot(_PAGE_CACHE_UC_MINUS), 1);
1494 
1495 	if (!ret && new_type == _PAGE_CACHE_WC)
1496 		ret = change_page_attr_set_clr(addr, addrinarray,
1497 					       __pgprot(_PAGE_CACHE_WC),
1498 					       __pgprot(_PAGE_CACHE_MASK),
1499 					       0, CPA_ARRAY, NULL);
1500 	if (ret)
1501 		goto out_free;
1502 
1503 	return 0;
1504 
1505 out_free:
1506 	for (j = 0; j < i; j++)
1507 		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1508 
1509 	return ret;
1510 }
1511 
1512 int set_memory_array_uc(unsigned long *addr, int addrinarray)
1513 {
1514 	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
1515 }
1516 EXPORT_SYMBOL(set_memory_array_uc);
1517 
1518 int set_memory_array_wc(unsigned long *addr, int addrinarray)
1519 {
1520 	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
1521 }
1522 EXPORT_SYMBOL(set_memory_array_wc);
1523 
1524 int _set_memory_wc(unsigned long addr, int numpages)
1525 {
1526 	int ret;
1527 	unsigned long addr_copy = addr;
1528 
1529 	ret = change_page_attr_set(&addr, numpages,
1530 				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
1531 	if (!ret) {
1532 		ret = change_page_attr_set_clr(&addr_copy, numpages,
1533 					       __pgprot(_PAGE_CACHE_WC),
1534 					       __pgprot(_PAGE_CACHE_MASK),
1535 					       0, 0, NULL);
1536 	}
1537 	return ret;
1538 }
1539 
1540 int set_memory_wc(unsigned long addr, int numpages)
1541 {
1542 	int ret;
1543 
1544 	if (!pat_enabled)
1545 		return set_memory_uc(addr, numpages);
1546 
1547 	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1548 		_PAGE_CACHE_WC, NULL);
1549 	if (ret)
1550 		goto out_err;
1551 
1552 	ret = _set_memory_wc(addr, numpages);
1553 	if (ret)
1554 		goto out_free;
1555 
1556 	return 0;
1557 
1558 out_free:
1559 	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1560 out_err:
1561 	return ret;
1562 }
1563 EXPORT_SYMBOL(set_memory_wc);
1564 
1565 int _set_memory_wb(unsigned long addr, int numpages)
1566 {
1567 	return change_page_attr_clear(&addr, numpages,
1568 				      __pgprot(_PAGE_CACHE_MASK), 0);
1569 }
1570 
1571 int set_memory_wb(unsigned long addr, int numpages)
1572 {
1573 	int ret;
1574 
1575 	ret = _set_memory_wb(addr, numpages);
1576 	if (ret)
1577 		return ret;
1578 
1579 	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1580 	return 0;
1581 }
1582 EXPORT_SYMBOL(set_memory_wb);
1583 
1584 int set_memory_array_wb(unsigned long *addr, int addrinarray)
1585 {
1586 	int i;
1587 	int ret;
1588 
1589 	ret = change_page_attr_clear(addr, addrinarray,
1590 				      __pgprot(_PAGE_CACHE_MASK), 1);
1591 	if (ret)
1592 		return ret;
1593 
1594 	for (i = 0; i < addrinarray; i++)
1595 		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1596 
1597 	return 0;
1598 }
1599 EXPORT_SYMBOL(set_memory_array_wb);
1600 
1601 int set_memory_x(unsigned long addr, int numpages)
1602 {
1603 	if (!(__supported_pte_mask & _PAGE_NX))
1604 		return 0;
1605 
1606 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1607 }
1608 EXPORT_SYMBOL(set_memory_x);
1609 
1610 int set_memory_nx(unsigned long addr, int numpages)
1611 {
1612 	if (!(__supported_pte_mask & _PAGE_NX))
1613 		return 0;
1614 
1615 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1616 }
1617 EXPORT_SYMBOL(set_memory_nx);
1618 
1619 int set_memory_ro(unsigned long addr, int numpages)
1620 {
1621 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1622 }
1623 EXPORT_SYMBOL_GPL(set_memory_ro);
1624 
1625 int set_memory_rw(unsigned long addr, int numpages)
1626 {
1627 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1628 }
1629 EXPORT_SYMBOL_GPL(set_memory_rw);
1630 
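/*
 * Example (illustrative sketch): a common pattern for pages holding
 * generated code is to drop write permission and set execute once the code
 * is final, and to undo that before the memory is reused:
 *
 *	set_memory_ro(addr, numpages);
 *	set_memory_x(addr, numpages);
 *	...
 *	set_memory_rw(addr, numpages);
 *	set_memory_nx(addr, numpages);
 */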
1631 int set_memory_np(unsigned long addr, int numpages)
1632 {
1633 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1634 }
1635 
1636 int set_memory_4k(unsigned long addr, int numpages)
1637 {
1638 	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1639 					__pgprot(0), 1, 0, NULL);
1640 }
1641 
1642 int set_pages_uc(struct page *page, int numpages)
1643 {
1644 	unsigned long addr = (unsigned long)page_address(page);
1645 
1646 	return set_memory_uc(addr, numpages);
1647 }
1648 EXPORT_SYMBOL(set_pages_uc);
1649 
1650 static int _set_pages_array(struct page **pages, int addrinarray,
1651 		unsigned long new_type)
1652 {
1653 	unsigned long start;
1654 	unsigned long end;
1655 	int i;
1656 	int free_idx;
1657 	int ret;
1658 
1659 	for (i = 0; i < addrinarray; i++) {
1660 		if (PageHighMem(pages[i]))
1661 			continue;
1662 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1663 		end = start + PAGE_SIZE;
1664 		if (reserve_memtype(start, end, new_type, NULL))
1665 			goto err_out;
1666 	}
1667 
1668 	ret = cpa_set_pages_array(pages, addrinarray,
1669 			__pgprot(_PAGE_CACHE_UC_MINUS));
1670 	if (!ret && new_type == _PAGE_CACHE_WC)
1671 		ret = change_page_attr_set_clr(NULL, addrinarray,
1672 					       __pgprot(_PAGE_CACHE_WC),
1673 					       __pgprot(_PAGE_CACHE_MASK),
1674 					       0, CPA_PAGES_ARRAY, pages);
1675 	if (ret)
1676 		goto err_out;
1677 	return 0; /* Success */
1678 err_out:
1679 	free_idx = i;
1680 	for (i = 0; i < free_idx; i++) {
1681 		if (PageHighMem(pages[i]))
1682 			continue;
1683 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1684 		end = start + PAGE_SIZE;
1685 		free_memtype(start, end);
1686 	}
1687 	return -EINVAL;
1688 }
1689 
1690 int set_pages_array_uc(struct page **pages, int addrinarray)
1691 {
1692 	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
1693 }
1694 EXPORT_SYMBOL(set_pages_array_uc);
1695 
1696 int set_pages_array_wc(struct page **pages, int addrinarray)
1697 {
1698 	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
1699 }
1700 EXPORT_SYMBOL(set_pages_array_wc);
1701 
1702 int set_pages_wb(struct page *page, int numpages)
1703 {
1704 	unsigned long addr = (unsigned long)page_address(page);
1705 
1706 	return set_memory_wb(addr, numpages);
1707 }
1708 EXPORT_SYMBOL(set_pages_wb);
1709 
1710 int set_pages_array_wb(struct page **pages, int addrinarray)
1711 {
1712 	int retval;
1713 	unsigned long start;
1714 	unsigned long end;
1715 	int i;
1716 
1717 	retval = cpa_clear_pages_array(pages, addrinarray,
1718 			__pgprot(_PAGE_CACHE_MASK));
1719 	if (retval)
1720 		return retval;
1721 
1722 	for (i = 0; i < addrinarray; i++) {
1723 		if (PageHighMem(pages[i]))
1724 			continue;
1725 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1726 		end = start + PAGE_SIZE;
1727 		free_memtype(start, end);
1728 	}
1729 
1730 	return 0;
1731 }
1732 EXPORT_SYMBOL(set_pages_array_wb);
1733 
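/*
 * Usage sketch (illustrative): the pages-array variants batch the memtype
 * bookkeeping and the CPA work for many scattered pages, e.g.:
 *
 *	if (set_pages_array_uc(pages, nr_pages))
 *		goto fail;
 *	...
 *	set_pages_array_wb(pages, nr_pages);
 *
 * Highmem pages are skipped for the memtype reservation, as shown in
 * _set_pages_array() above.
 */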
1734 int set_pages_x(struct page *page, int numpages)
1735 {
1736 	unsigned long addr = (unsigned long)page_address(page);
1737 
1738 	return set_memory_x(addr, numpages);
1739 }
1740 EXPORT_SYMBOL(set_pages_x);
1741 
1742 int set_pages_nx(struct page *page, int numpages)
1743 {
1744 	unsigned long addr = (unsigned long)page_address(page);
1745 
1746 	return set_memory_nx(addr, numpages);
1747 }
1748 EXPORT_SYMBOL(set_pages_nx);
1749 
1750 int set_pages_ro(struct page *page, int numpages)
1751 {
1752 	unsigned long addr = (unsigned long)page_address(page);
1753 
1754 	return set_memory_ro(addr, numpages);
1755 }
1756 
1757 int set_pages_rw(struct page *page, int numpages)
1758 {
1759 	unsigned long addr = (unsigned long)page_address(page);
1760 
1761 	return set_memory_rw(addr, numpages);
1762 }
1763 
1764 #ifdef CONFIG_DEBUG_PAGEALLOC
1765 
1766 static int __set_pages_p(struct page *page, int numpages)
1767 {
1768 	unsigned long tempaddr = (unsigned long) page_address(page);
1769 	struct cpa_data cpa = { .vaddr = &tempaddr,
1770 				.pgd = NULL,
1771 				.numpages = numpages,
1772 				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1773 				.mask_clr = __pgprot(0),
1774 				.flags = 0};
1775 
1776 	/*
1777 	 * No alias checking needed for setting the present flag; otherwise,
1778 	 * we may need to break large pages for 64-bit kernel text
1779 	 * mappings (this adds to complexity if we want to do this from
1780 	 * atomic context especially). Let's keep it simple!
1781 	 */
1782 	return __change_page_attr_set_clr(&cpa, 0);
1783 }
1784 
1785 static int __set_pages_np(struct page *page, int numpages)
1786 {
1787 	unsigned long tempaddr = (unsigned long) page_address(page);
1788 	struct cpa_data cpa = { .vaddr = &tempaddr,
1789 				.pgd = NULL,
1790 				.numpages = numpages,
1791 				.mask_set = __pgprot(0),
1792 				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1793 				.flags = 0};
1794 
1795 	/*
1796 	 * No alias checking needed for setting the not-present flag; otherwise,
1797 	 * we may need to break large pages for 64-bit kernel text
1798 	 * mappings (this adds to complexity if we want to do this from
1799 	 * atomic context especially). Let's keep it simple!
1800 	 */
1801 	return __change_page_attr_set_clr(&cpa, 0);
1802 }
1803 
1804 void kernel_map_pages(struct page *page, int numpages, int enable)
1805 {
1806 	if (PageHighMem(page))
1807 		return;
1808 	if (!enable) {
1809 		debug_check_no_locks_freed(page_address(page),
1810 					   numpages * PAGE_SIZE);
1811 	}
1812 
1813 	/*
1814 	 * The return value is ignored as the calls cannot fail.
1815 	 * Large pages for identity mappings are not used at boot time
1816 	 * and hence no memory allocations during large page split.
1817 	 */
1818 	if (enable)
1819 		__set_pages_p(page, numpages);
1820 	else
1821 		__set_pages_np(page, numpages);
1822 
1823 	/*
1824 	 * We should perform an IPI and flush all tlbs,
1825 	 * but that can deadlock->flush only current cpu:
1826 	 */
1827 	__flush_tlb_all();
1828 
1829 	arch_flush_lazy_mmu_mode();
1830 }
1831 
1832 #ifdef CONFIG_HIBERNATION
1833 
1834 bool kernel_page_present(struct page *page)
1835 {
1836 	unsigned int level;
1837 	pte_t *pte;
1838 
1839 	if (PageHighMem(page))
1840 		return false;
1841 
1842 	pte = lookup_address((unsigned long)page_address(page), &level);
1843 	return (pte_val(*pte) & _PAGE_PRESENT);
1844 }
1845 
1846 #endif /* CONFIG_HIBERNATION */
1847 
1848 #endif /* CONFIG_DEBUG_PAGEALLOC */
1849 
1850 int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1851 			    unsigned numpages, unsigned long page_flags)
1852 {
1853 	int retval = -EINVAL;
1854 
1855 	struct cpa_data cpa = {
1856 		.vaddr = &address,
1857 		.pfn = pfn,
1858 		.pgd = pgd,
1859 		.numpages = numpages,
1860 		.mask_set = __pgprot(0),
1861 		.mask_clr = __pgprot(0),
1862 		.flags = 0,
1863 	};
1864 
1865 	if (!(__supported_pte_mask & _PAGE_NX))
1866 		goto out;
1867 
1868 	if (!(page_flags & _PAGE_NX))
1869 		cpa.mask_clr = __pgprot(_PAGE_NX);
1870 
1871 	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1872 
1873 	retval = __change_page_attr_set_clr(&cpa, 0);
1874 	__flush_tlb_all();
1875 
1876 out:
1877 	return retval;
1878 }
1879 
1880 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1881 			       unsigned numpages)
1882 {
1883 	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1884 }
1885 
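/*
 * Illustrative sketch (assumed caller, e.g. an EFI-style pagetable owner):
 * the *_in_pgd() helpers above let a caller that owns its own top-level
 * page table map a physical range without touching init_mm:
 *
 *	kernel_map_pages_in_pgd(my_pgd, pfn, vaddr, numpages, _PAGE_RW);
 *	...
 *	kernel_unmap_pages_in_pgd(my_pgd, vaddr, numpages);
 *
 * 'my_pgd' is a hypothetical caller-provided pgd here; the requested flags
 * are merged with _PAGE_PRESENT as shown in kernel_map_pages_in_pgd().
 */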
1886 /*
1887  * The testcases use internal knowledge of the implementation that shouldn't
1888  * be exposed to the rest of the kernel. Include these directly here.
1889  */
1890 #ifdef CONFIG_CPA_DEBUG
1891 #include "pageattr-test.c"
1892 #endif
1893