/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/sched.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include "mmu_decl.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
				       unsigned long page_size,
				       unsigned long phys)
{
	int rc = htab_bolt_mapping(start, start + page_size, phys,
				   pgprot_val(PAGE_KERNEL),
				   mmu_vmemmap_psize, mmu_kernel_ssize);
	if (rc < 0) {
		int rc2 = htab_remove_mapping(start, start + page_size,
					      mmu_vmemmap_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
			      unsigned long page_size)
{
	int rc = htab_remove_mapping(start, start + page_size,
				     mmu_vmemmap_psize,
				     mmu_kernel_ssize);
	BUG_ON((rc < 0) && (rc != -ENOENT));
	WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page is currently only called by __ioremap; it adds an entry
 * to the ioremap page table and adds an entry to the HPT, possibly
 * bolting it.
 */
int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
						       __pgprot(flags)));
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

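	/*
	 * Make sure the page table (or bolted HPT) update above is
	 * visible before the new mapping gets used.
	 */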
	smp_wmb();
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

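/*
 * Atomically clear the bits in 'clr' and set the bits in 'set' in a
 * hugepage pmd, flushing the hash entries if the old pmd had been
 * hashed.  Returns the old pmd value.
 */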
unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				    pmd_t *pmdp, unsigned long clr,
				    unsigned long set)
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

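	/*
	 * ldarx/stdcx. loop: spin while H_PAGE_BUSY is set, then clear the
	 * 'clr' bits and set the 'set' bits, retrying if the store
	 * conditional loses the reservation.
	 */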
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);

	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}

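/*
 * Used on the THP collapse path: clear the pmd, wait for any concurrent
 * hash_page to finish, then flush the hash entries covered by the pmd.
 */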
pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with a local ptep pointer value. Such a hash_page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range covered by pmd.
	 * This makes sure we take a fault and will find the pmd as none,
	 * which will result in a major fault that takes mmap_sem and
	 * hence waits for collapse to complete. Without this,
	 * __collapse_huge_page_copy can end up copying the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes.
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				  pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;
	assert_spin_locked(&mm->page_table_lock);
	/*
	 * We store the pgtable in the second half of the PMD.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * Expose the deposited pgtable to other cpus before we set the
	 * hugepage PTE at the pmd level; the hash fault code looks at the
	 * deposited pgtable to store hash index values.
	 */
	smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * Zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to keep the pmd marked TRANS HUGE
	 * while we split, but at the same time we want the rest of the
	 * ppc64 code not to insert a hash pte for it, because we will be
	 * modifying the deposited pgtable in the caller of this function.
	 * Hence clear the _PAGE_USER so that we move the fault handling to
	 * a higher level function, which will serialize against the ptl.
	 * We need to flush the existing hash pte entries here even though
	 * the translation is still valid, because we will withdraw the
	 * pgtable_t after this.
	 */
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;
	const struct cpumask *tmp;

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

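	/*
	 * If this mm has only run on the current CPU we can get away
	 * with a cheaper local (non-broadcast) hash flush.
	 */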
	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

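/*
 * Atomically clear the hugepage pmd, returning the old value, and scrub
 * the hash index information cached in the deposited page table fragment.
 */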
pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out the old valid and hash index details so that
	 * the hash fault code doesn't look at stale values.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	/*
	 * Serialize against find_linux_pte_or_hugepte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_linux_pte_or_hugepte to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}


int hash__has_transparent_hugepage(void)
{
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support a 16MB hugepage in a segment
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default.
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok, we only have 4K HPTE.
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */