/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>

#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
	unsigned long	*vaddr;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
	int		numpages;
	int		flags;
	unsigned long	pfn;
	unsigned	force_split : 1;
	int		curpage;
};

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU, with stale large TLB
 * entries, to change the page attribute in parallel while some other CPU is
 * splitting a large page entry and changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
	unsigned long flags;

	/* Protect against CPA */
	spin_lock_irqsave(&pgd_lock, flags);
	direct_pages_count[level] += pages;
	spin_unlock_irqrestore(&pgd_lock, flags);
}

static void split_page_count(int level)
{
	direct_pages_count[level]--;
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
#ifdef CONFIG_X86_64
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
#endif
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
	return __pa(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
	return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}

#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
# define debug_pagealloc 1
#else
# define debug_pagealloc 0
#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @addr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflush is an unordered instruction which needs fencing with mfence
 * to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	void *vend = vaddr + size - 1;

	mb();

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clflush(vaddr);
	/*
	 * Flush any possible final partial cacheline:
	 */
	clflush(vend);

	mb();
}
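
/*
 * Illustrative usage sketch (hypothetical caller; "buf" and "buf_len" are
 * made-up names): flush a buffer that a non-snooping agent is about to read:
 *
 *	if (cpu_has_clflush)
 *		clflush_cache_range(buf, buf_len);
 *
 * The mb() fences above make the flush ordered with respect to surrounding
 * stores, so the caller does not need extra barriers.
 */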

static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	/*
	 * Flush all to work around an erratum in early Athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

	if (cache && boot_cpu_data.x86_model >= 4)
		wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
	/*
	 * We could optimize that further and do individual per page
	 * tlb invalidates for a low number of pages. Caveat: we must
	 * flush the high aliases on 64bit as well.
	 */
	__flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
	unsigned int i, level;
	unsigned long addr;

	BUG_ON(irqs_disabled());
	WARN_ON(PAGE_ALIGN(start) != start);

	on_each_cpu(__cpa_flush_range, NULL, 1);

	if (!cache)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *) addr, PAGE_SIZE);
	}
}

static void cpa_flush_array(unsigned long *start, int numpages, int cache)
{
	unsigned int i, level;
	unsigned long *addr;

	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_range, NULL, 1);

	if (!cache)
		return;

	/* 4M threshold */
	if (numpages >= 1024) {
		if (boot_cpu_data.x86_model >= 4)
			wbinvd();
		return;
	}
	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0, addr = start; i < numpages; i++, addr++) {
		pte_t *pte = lookup_address(*addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *) *addr, PAGE_SIZE);
	}
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
				   unsigned long pfn)
{
	pgprot_t forbidden = __pgprot(0);

	/*
	 * The BIOS area between 640k and 1Mb needs to be executable for
	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
	 */
	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The kernel text needs to be executable for obvious reasons.
	 * This does not cover __inittext since that is gone later on. On
	 * 64bit we do not enforce !NX on the low mapping.
	 */
	if (within(address, (unsigned long)_text, (unsigned long)_etext))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The .rodata section needs to be read-only. Using the pfn
	 * catches all aliases.
	 */
	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_RW;

	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

	return prot;
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexistent mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	pgd_t *pgd = pgd_offset_k(address);
	pud_t *pud;
	pmd_t *pmd;

	*level = PG_LEVEL_NONE;

	if (pgd_none(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_large(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
}
EXPORT_SYMBOL_GPL(lookup_address);
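
/*
 * Illustrative usage sketch (hypothetical caller): check whether a kernel
 * virtual address is currently mapped and at which page size:
 *
 *	unsigned int level;
 *	pte_t *pte = lookup_address(addr, &level);
 *
 *	if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 *		printk(KERN_DEBUG "mapped at level %u\n", level);
 *
 * For PG_LEVEL_2M and PG_LEVEL_1G the returned pointer is really the pmd
 * or pud entry cast to pte_t *, as noted in the comment above.
 */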

/*
 * Set the new pmd in all the pgds we know about:
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	/* change init_mm */
	set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
	if (!SHARED_KERNEL_PMD) {
		struct page *page;

		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			pud = pud_offset(pgd, address);
			pmd = pmd_offset(pud, address);
			set_pte_atomic((pte_t *)pmd, pte);
		}
	}
#endif
}

static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
			struct cpa_data *cpa)
{
	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
	pte_t new_pte, old_pte, *tmp;
	pgprot_t old_prot, new_prot;
	int i, do_split = 1;
	unsigned int level;

	if (cpa->force_split)
		return 1;

	spin_lock_irqsave(&pgd_lock, flags);
	/*
	 * Check for races, another CPU might have split this page
	 * up already:
	 */
	tmp = lookup_address(address, &level);
	if (tmp != kpte)
		goto out_unlock;

	switch (level) {
	case PG_LEVEL_2M:
		psize = PMD_PAGE_SIZE;
		pmask = PMD_PAGE_MASK;
		break;
#ifdef CONFIG_X86_64
	case PG_LEVEL_1G:
		psize = PUD_PAGE_SIZE;
		pmask = PUD_PAGE_MASK;
		break;
#endif
	default:
		do_split = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Calculate the number of pages, which fit into this large
	 * page starting at address:
	 */
	nextpage_addr = (address + psize) & pmask;
	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
	if (numpages < cpa->numpages)
		cpa->numpages = numpages;

	/*
	 * We are safe now. Check whether the new pgprot is the same:
	 */
	old_pte = *kpte;
	old_prot = new_prot = pte_pgprot(old_pte);

	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

	/*
	 * old_pte points to the large page base address. So we need
	 * to add the offset of the virtual address:
	 */
	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
	cpa->pfn = pfn;

	new_prot = static_protections(new_prot, address, pfn);

	/*
	 * We need to check the full range, whether
	 * static_protections() requires a different pgprot for one of
	 * the pages in the range we try to preserve:
	 */
	addr = address + PAGE_SIZE;
	pfn++;
	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);

		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
			goto out_unlock;
	}

	/*
	 * If there are no changes, return. cpa->numpages has been
	 * updated above:
	 */
	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
		do_split = 0;
		goto out_unlock;
	}

	/*
	 * We need to change the attributes. Check, whether we can
	 * change the large page in one go. We request a split, when
	 * the address is not aligned and the number of pages is
	 * smaller than the number of pages in the large page. Note
	 * that we limited the number of possible pages already to
	 * the number of pages in the large page.
	 */
	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
		/*
		 * The address is aligned and the number of pages
		 * covers the full page.
		 */
		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
		__set_pmd_pte(kpte, address, new_pte);
		cpa->flags |= CPA_FLUSHTLB;
		do_split = 0;
	}

out_unlock:
	spin_unlock_irqrestore(&pgd_lock, flags);

	return do_split;
}

static int split_large_page(pte_t *kpte, unsigned long address)
{
	unsigned long flags, pfn, pfninc = 1;
	unsigned int i, level;
	pte_t *pbase, *tmp;
	pgprot_t ref_prot;
	struct page *base;

	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);
	base = alloc_pages(GFP_KERNEL, 0);
	if (!debug_pagealloc)
		spin_lock(&cpa_lock);
	if (!base)
		return -ENOMEM;

	spin_lock_irqsave(&pgd_lock, flags);
	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = lookup_address(address, &level);
	if (tmp != kpte)
		goto out_unlock;

	pbase = (pte_t *)page_address(base);
	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
	ref_prot = pte_pgprot(pte_clrhuge(*kpte));

#ifdef CONFIG_X86_64
	if (level == PG_LEVEL_1G) {
		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
		pgprot_val(ref_prot) |= _PAGE_PSE;
	}
#endif

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = pte_pfn(*kpte);
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));

	if (address >= (unsigned long)__va(0) &&
		address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
		split_page_count(level);

#ifdef CONFIG_X86_64
	if (address >= (unsigned long)__va(1UL<<32) &&
		address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
		split_page_count(level);
#endif

	/*
	 * Install the new, split up pagetable.
	 *
	 * We use the standard kernel pagetable protections for the new
	 * pagetable protections, the actual ptes set above control the
	 * primary protection behavior:
	 */
	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

	/*
	 * Intel Atom errata AAH41 workaround.
	 *
	 * The real fix should be in hw or in a microcode update, but
	 * we also probabilistically try to reduce the window of having
	 * a large TLB mixed with 4K TLBs while instruction fetches are
	 * going on.
	 */
	__flush_tlb_all();

	base = NULL;

out_unlock:
	/*
	 * If we dropped out via the lookup_address check under
	 * pgd_lock then stick the page back into the pool:
	 */
	if (base)
		__free_page(base);
	spin_unlock_irqrestore(&pgd_lock, flags);

	return 0;
}

static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
			       int primary)
{
	/*
	 * Ignore all non primary paths.
	 */
	if (!primary)
		return 0;

	/*
	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
	 * to have holes.
	 * Also set numpages to '1' indicating that we processed cpa req for
	 * one virtual address page and its pfn. TBD: numpages can be set based
	 * on the initial value and the level returned by lookup_address().
	 */
	if (within(vaddr, PAGE_OFFSET,
		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
		cpa->numpages = 1;
		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
		return 0;
	} else {
		WARN(1, KERN_WARNING "CPA: called for zero pte. "
			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
			*cpa->vaddr);

		return -EFAULT;
	}
}

static int __change_page_attr(struct cpa_data *cpa, int primary)
{
	unsigned long address;
	int do_split, err;
	unsigned int level;
	pte_t *kpte, old_pte;

	if (cpa->flags & CPA_ARRAY)
		address = cpa->vaddr[cpa->curpage];
	else
		address = *cpa->vaddr;
repeat:
	kpte = lookup_address(address, &level);
	if (!kpte)
		return __cpa_process_fault(cpa, address, primary);

	old_pte = *kpte;
	if (!pte_val(old_pte))
		return __cpa_process_fault(cpa, address, primary);

	if (level == PG_LEVEL_4K) {
		pte_t new_pte;
		pgprot_t new_prot = pte_pgprot(old_pte);
		unsigned long pfn = pte_pfn(old_pte);

		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

		new_prot = static_protections(new_prot, address, pfn);

		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
		 */
		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
		cpa->pfn = pfn;
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = try_preserve_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and cpa->flags have been updated in
	 * try_preserve_large_page():
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(kpte, address);
	if (!err) {
		/*
		 * Do a global tlb flush after splitting the large page
		 * and before we do the actual change page attribute in the PTE.
		 *
		 * Without this, we violate the TLB application note, that says
		 * "The TLBs may contain both ordinary and large-page
		 *  translations for a 4-KByte range of linear addresses. This
		 *  may occur if software modifies the paging structures so that
		 *  the page size used for the address range changes. If the two
		 *  translations differ with respect to page frame or attributes
		 *  (e.g., permissions), processor behavior is undefined and may
		 *  be implementation-specific."
		 *
		 * We do this global tlb flush inside the cpa_lock, so that we
		 * don't allow any other cpu, with stale tlb entries, to change
		 * the page attributes in parallel for an address that also
		 * falls into the just split large page entry.
		 */
		flush_tlb_all();
		goto repeat;
	}

	return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
{
	struct cpa_data alias_cpa;
	int ret = 0;
	unsigned long temp_cpa_vaddr, vaddr;

	if (cpa->pfn >= max_pfn_mapped)
		return 0;

#ifdef CONFIG_X86_64
	if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
		return 0;
#endif
	/*
	 * No need to redo, when the primary call touched the direct
	 * mapping already:
	 */
	if (cpa->flags & CPA_ARRAY)
		vaddr = cpa->vaddr[cpa->curpage];
	else
		vaddr = *cpa->vaddr;

	if (!(within(vaddr, PAGE_OFFSET,
		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

		alias_cpa = *cpa;
		temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
		alias_cpa.vaddr = &temp_cpa_vaddr;
		alias_cpa.flags &= ~CPA_ARRAY;

		ret = __change_page_attr_set_clr(&alias_cpa, 0);
	}

#ifdef CONFIG_X86_64
	if (ret)
		return ret;
	/*
	 * No need to redo, when the primary call touched the high
	 * mapping already:
	 */
	if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
		return 0;

	/*
	 * If the physical address is inside the kernel map, we need
	 * to touch the high mapped kernel as well:
	 */
	if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
		return 0;

	alias_cpa = *cpa;
	temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
	alias_cpa.vaddr = &temp_cpa_vaddr;
	alias_cpa.flags &= ~CPA_ARRAY;

	/*
	 * The high mapping range is imprecise, so ignore the return value.
	 */
	__change_page_attr_set_clr(&alias_cpa, 0);
#endif
	return ret;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
	int ret, numpages = cpa->numpages;

	while (numpages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = numpages;
		/* for array changes, we can't use large page */
		if (cpa->flags & CPA_ARRAY)
			cpa->numpages = 1;

		if (!debug_pagealloc)
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, checkalias);
		if (!debug_pagealloc)
			spin_unlock(&cpa_lock);
		if (ret)
			return ret;

		if (checkalias) {
			ret = cpa_process_alias(cpa);
			if (ret)
				return ret;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > numpages);
		numpages -= cpa->numpages;
		if (cpa->flags & CPA_ARRAY)
			cpa->curpage++;
		else
			*cpa->vaddr += cpa->numpages * PAGE_SIZE;

	}
	return 0;
}

static inline int cache_attr(pgprot_t attr)
{
	return pgprot_val(attr) &
		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int array)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;

	/*
	 * Check, if we are requested to change a not supported
	 * feature:
	 */
	mask_set = canon_pgprot(mask_set);
	mask_clr = canon_pgprot(mask_clr);
	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (!array) {
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
	} else {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	/*
	 * If we're called with lazy mmu updates enabled, the
	 * in-memory pte state may be stale.  Flush pending updates to
	 * bring them up to date.
	 */
	arch_flush_lazy_mmu_mode();

	cpa.vaddr = addr;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = 0;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	if (array)
		cpa.flags |= CPA_ARRAY;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = cache_attr(mask_set);

	/*
	 * On success we use clflush, when the CPU supports it to
	 * avoid the wbinvd. If the CPU does not support it and in the
	 * error case we fall back to cpa_flush_all (which uses
	 * wbinvd):
	 */
	if (!ret && cpu_has_clflush) {
		if (cpa.flags & CPA_ARRAY)
			cpa_flush_array(addr, numpages, cache);
		else
			cpa_flush_range(*addr, numpages, cache);
	} else
		cpa_flush_all(cache);

	/*
	 * If we've been called with lazy mmu updates enabled, then
	 * make sure that everything gets flushed out before we
	 * return.
	 */
	arch_flush_lazy_mmu_mode();

out:
	return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		array);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		array);
}

int _set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * for now UC MINUS. see comments in ioremap_nocache()
	 */
	return change_page_attr_set(&addr, numpages,
				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
}

int set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * for now UC MINUS. see comments in ioremap_nocache()
	 */
	if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			    _PAGE_CACHE_UC_MINUS, NULL))
		return -EINVAL;

	return _set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_uc);
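
/*
 * Illustrative usage sketch (hypothetical driver; "page" and "nrpages" are
 * made-up names): switch a buffer to uncached while a device owns it and
 * restore write-back afterwards:
 *
 *	unsigned long vaddr = (unsigned long)page_address(page);
 *
 *	if (set_memory_uc(vaddr, nrpages))
 *		return -EINVAL;
 *	...
 *	set_memory_wb(vaddr, nrpages);
 *
 * set_memory_uc() reserves the range with reserve_memtype() and
 * set_memory_wb() releases it again with free_memtype(), so the two calls
 * need to stay paired.
 */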

int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
	unsigned long start;
	unsigned long end;
	int i;
	/*
	 * for now UC MINUS. see comments in ioremap_nocache()
	 */
	for (i = 0; i < addrinarray; i++) {
		start = __pa(addr[i]);
		for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
			if (end != __pa(addr[i + 1]))
				break;
			i++;
		}
		if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
			goto out;
	}

	return change_page_attr_set(addr, addrinarray,
				    __pgprot(_PAGE_CACHE_UC_MINUS), 1);
out:
	for (i = 0; i < addrinarray; i++) {
		unsigned long tmp = __pa(addr[i]);

		if (tmp == start)
			break;
		for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
			if (end != __pa(addr[i + 1]))
				break;
			i++;
		}
		free_memtype(tmp, end);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(set_memory_array_uc);

int _set_memory_wc(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    __pgprot(_PAGE_CACHE_WC), 0);
}

int set_memory_wc(unsigned long addr, int numpages)
{
	if (!pat_enabled)
		return set_memory_uc(addr, numpages);

	if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
		_PAGE_CACHE_WC, NULL))
		return -EINVAL;

	return _set_memory_wc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wc);
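
/*
 * Illustrative usage sketch (hypothetical caller; "vaddr", "nrpages" and the
 * error label are made-up): set_memory_wc() degrades to UC- when PAT is
 * disabled, so write-combining can be requested unconditionally and only the
 * error path needs handling:
 *
 *	if (set_memory_wc((unsigned long)vaddr, nrpages))
 *		goto out_unmap;
 */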

int _set_memory_wb(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_CACHE_MASK), 0);
}

int set_memory_wb(unsigned long addr, int numpages)
{
	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

	return _set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wb);

int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
	int i;

	for (i = 0; i < addrinarray; i++) {
		unsigned long start = __pa(addr[i]);
		unsigned long end;

		for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
			if (end != __pa(addr[i + 1]))
				break;
			i++;
		}
		free_memtype(start, end);
	}
	return change_page_attr_clear(addr, addrinarray,
				      __pgprot(_PAGE_CACHE_MASK), 1);
}
EXPORT_SYMBOL(set_memory_array_wb);

int set_memory_x(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
EXPORT_SYMBOL_GPL(set_memory_ro);

int set_memory_rw(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
EXPORT_SYMBOL_GPL(set_memory_rw);
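
/*
 * Illustrative usage sketch (hypothetical caller; "table" and "nr" are
 * made-up names): write-protect a data structure once it is initialized and
 * make it writable again only for updates:
 *
 *	set_memory_ro((unsigned long)table, nr);
 *	...
 *	set_memory_rw((unsigned long)table, nr);
 *
 * These helpers only toggle _PAGE_RW, so unlike the caching variants above
 * they do not involve the PAT memtype bookkeeping.
 */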

int set_memory_np(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

int set_memory_4k(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(0), 1, 0);
}

int set_pages_uc(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

int set_pages_wb(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_x(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

int set_pages_nx(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

int set_pages_ro(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_rw(addr, numpages);
}

#ifdef CONFIG_DEBUG_PAGEALLOC

static int __set_pages_p(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0),
				.flags = 0};

	/*
	 * No alias checking needed for setting the present flag. Otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

static int __set_pages_np(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.flags = 0};

	/*
	 * No alias checking needed for clearing the present flag. Otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

void kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * If the page allocator is not up yet then do not call c_p_a():
	 */
	if (!debug_pagealloc_enabled)
		return;

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time
	 * and hence no memory allocations during large page split.
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu:
	 */
	__flush_tlb_all();
}

#ifdef CONFIG_HIBERNATION

bool kernel_page_present(struct page *page)
{
	unsigned int level;
	pte_t *pte;

	if (PageHighMem(page))
		return false;

	pte = lookup_address((unsigned long)page_address(page), &level);
	return (pte_val(*pte) & _PAGE_PRESENT);
}

#endif /* CONFIG_HIBERNATION */

#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif