#include <linux/gfp.h>
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/swapfile.h>
#include <linux/swapops.h>

#include <asm/set_memory.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h>		/* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>

/*
 * We need to define the tracepoints somewhere, and tlb.c
 * is only compiled when SMP=y.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>

#include "mm_internal.h"

/*
 * Tables translating between page_cache_type_t and pte encoding.
 *
 * The default values are defined statically as minimal supported mode;
 * WC and WT fall back to UC-.  pat_init() updates these values to support
 * more cache modes, WC and WT, when it is safe to do so.  See pat_init()
 * for the details.  Note, __early_ioremap() used during early boot-time
 * takes pgprot_t (pte encoding) and does not use these tables.
 *
 *   Index into __cachemode2pte_tbl[] is the cachemode.
 *
 *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
 *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
 */
uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
};
EXPORT_SYMBOL(__cachemode2pte_tbl);

uint8_t __pte2cachemode_tbl[8] = {
	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
EXPORT_SYMBOL(__pte2cachemode_tbl);

static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
static unsigned long __initdata pgt_buf_top;

static unsigned long min_pfn_mapped;

static bool __initdata can_use_brk_pgt = true;

/*
 * Pages returned are already directly mapped.
 *
 * Changing that is likely to break Xen, see commit:
 *
 *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
 *
 * for detailed information.
 */
__ref void *alloc_low_pages(unsigned int num)
{
	unsigned long pfn;
	int i;

	if (after_bootmem) {
		unsigned int order;

		order = get_order((unsigned long)num << PAGE_SHIFT);
		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
	}

	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
		unsigned long ret;
		if (min_pfn_mapped >= max_pfn_mapped)
			panic("alloc_low_pages: ran out of memory");
		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
					max_pfn_mapped << PAGE_SHIFT,
					PAGE_SIZE * num , PAGE_SIZE);
		if (!ret)
			panic("alloc_low_pages: can not alloc memory");
		memblock_reserve(ret, PAGE_SIZE * num);
		pfn = ret >> PAGE_SHIFT;
	} else {
		pfn = pgt_buf_end;
		pgt_buf_end += num;
		printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
			pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
	}

	for (i = 0; i < num; i++) {
		void *adr;

		adr = __va((pfn + i) << PAGE_SHIFT);
		clear_page(adr);
	}

	return __va(pfn << PAGE_SHIFT);
}

/*
 * By default we need 3 4k pages for the initial PMD_SIZE mapping and
 * 3 4k pages for the 0-ISA_END_ADDRESS range. With KASLR memory
 * randomization enabled, depending on the machine e820 memory map and
 * the PUD alignment, we may need twice as many pages.
 */
#ifndef CONFIG_RANDOMIZE_MEMORY
#define INIT_PGD_PAGE_COUNT      6
#else
#define INIT_PGD_PAGE_COUNT      12
#endif
#define INIT_PGT_BUF_SIZE	(INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
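/*
 * Carve INIT_PGT_BUF_SIZE bytes out of the brk area for early page table
 * pages and record the pfn bounds in pgt_buf_start/end/top, from which
 * alloc_low_pages() hands out pages while the direct mapping is built.
 */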
void  __init early_alloc_pgt_buf(void)
{
	unsigned long tables = INIT_PGT_BUF_SIZE;
	phys_addr_t base;

	base = __pa(extend_brk(tables, PAGE_SIZE));

	pgt_buf_start = base >> PAGE_SHIFT;
	pgt_buf_end = pgt_buf_start;
	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
}

int after_bootmem;

early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

static int page_size_mask;

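/* Allow _PAGE_GLOBAL in PTEs only when page table isolation (PTI) is off. */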
static void enable_global_pages(void)
{
	if (!static_cpu_has(X86_FEATURE_PTI))
		__supported_pte_mask |= _PAGE_GLOBAL;
}

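/*
 * Work out which page sizes (4k/2M/1G) the direct mapping may use, based on
 * CPU features and debug options, and enable PSE/PGE in CR4 when available.
 */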
static void __init probe_page_size_mask(void)
{
	/*
	 * For pagealloc debugging, identity mapping will use small pages.
	 * This will simplify cpa(), which otherwise needs to support splitting
	 * large pages into small in interrupt context, etc.
	 */
	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
		page_size_mask |= 1 << PG_LEVEL_2M;
	else
		direct_gbpages = 0;

	/* Enable PSE if available */
	if (boot_cpu_has(X86_FEATURE_PSE))
		cr4_set_bits_and_update_boot(X86_CR4_PSE);

	/* Enable PGE if available */
	__supported_pte_mask &= ~_PAGE_GLOBAL;
	if (boot_cpu_has(X86_FEATURE_PGE)) {
		cr4_set_bits_and_update_boot(X86_CR4_PGE);
		enable_global_pages();
	}

	/* Enable 1 GB linear kernel mappings if available: */
	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
		printk(KERN_INFO "Using GB pages for direct mapping\n");
		page_size_mask |= 1 << PG_LEVEL_1G;
	} else {
		direct_gbpages = 0;
	}
}

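/*
 * On 64-bit, enable CR4.PCIDE when the CPU supports PCID together with PGE;
 * if PCID is advertised without PGE (only seen on misconfigured VMs), clear
 * the PCID feature instead.
 */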
static void setup_pcid(void)
{
	if (!IS_ENABLED(CONFIG_X86_64))
		return;

	if (!boot_cpu_has(X86_FEATURE_PCID))
		return;

	if (boot_cpu_has(X86_FEATURE_PGE)) {
		/*
		 * This can't be cr4_set_bits_and_update_boot() -- the
		 * trampoline code can't handle CR4.PCIDE and it wouldn't
		 * do any good anyway.  Despite the name,
		 * cr4_set_bits_and_update_boot() doesn't actually cause
		 * the bits in question to remain set all the way through
		 * the secondary boot asm.
		 *
		 * Instead, we brute-force it and set CR4.PCIDE manually in
		 * start_secondary().
		 */
		cr4_set_bits(X86_CR4_PCIDE);

		/*
		 * INVPCID's single-context modes (2/3) only work if we set
		 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
		 * on systems that have X86_CR4_PCIDE clear, or that have
		 * no INVPCID support at all.
		 */
		if (boot_cpu_has(X86_FEATURE_INVPCID))
			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
	} else {
		/*
		 * flush_tlb_all(), as currently implemented, won't work if
		 * PCID is on but PGE is not.  Since that combination
		 * doesn't exist on real hardware, there's no reason to try
		 * to fully support it, but it's polite to avoid corrupting
		 * data if we're on an improperly configured VM.
		 */
		setup_clear_cpu_cap(X86_FEATURE_PCID);
	}
}

#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif

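/*
 * Record the range [start_pfn, end_pfn) and its page_size_mask in mr[] and
 * return the updated range count; panic if more than NR_RANGE_MR ranges
 * would be needed.
 */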
static int __meminit save_mr(struct map_range *mr, int nr_range,
			     unsigned long start_pfn, unsigned long end_pfn,
			     unsigned long page_size_mask)
{
	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}

/*
 * Adjust the page_size_mask for small ranges to use a big page size
 * instead of a small one if the surrounding area is RAM too.
 */
static void __ref adjust_range_page_size_mask(struct map_range *mr,
							 int nr_range)
{
	int i;

	for (i = 0; i < nr_range; i++) {
		if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
		    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
			unsigned long start = round_down(mr[i].start, PMD_SIZE);
			unsigned long end = round_up(mr[i].end, PMD_SIZE);

#ifdef CONFIG_X86_32
			if ((end >> PAGE_SHIFT) > max_low_pfn)
				continue;
#endif

			if (memblock_is_region_memory(start, end - start))
				mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
		}
		if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
		    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
			unsigned long start = round_down(mr[i].start, PUD_SIZE);
			unsigned long end = round_up(mr[i].end, PUD_SIZE);

			if (memblock_is_region_memory(start, end - start))
				mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
		}
	}
}

static const char *page_size_string(struct map_range *mr)
{
	static const char str_1g[] = "1G";
	static const char str_2m[] = "2M";
	static const char str_4m[] = "4M";
	static const char str_4k[] = "4k";

	if (mr->page_size_mask & (1<<PG_LEVEL_1G))
		return str_1g;
	/*
	 * 32-bit without PAE has a 4M large page size.
	 * PG_LEVEL_2M is misnamed, but we can at least
	 * print out the right size in the string.
	 */
	if (IS_ENABLED(CONFIG_X86_32) &&
	    !IS_ENABLED(CONFIG_X86_PAE) &&
	    mr->page_size_mask & (1<<PG_LEVEL_2M))
		return str_4m;

	if (mr->page_size_mask & (1<<PG_LEVEL_2M))
		return str_2m;

	return str_4k;
}

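/*
 * Split [start, end) into ranges so that pieces which are not big-page
 * aligned are mapped with 4k pages, while the aligned middle can use 2M
 * (and, on 64-bit, 1G) pages. Contiguous ranges with the same page size
 * are merged afterwards.
 */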
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
				     unsigned long start,
				     unsigned long end)
{
	unsigned long start_pfn, end_pfn, limit_pfn;
	unsigned long pfn;
	int i;

	limit_pfn = PFN_DOWN(end);

	/* head if not big page alignment ? */
	pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
	/*
	 * Don't use a large page for the first 2/4MB of memory
	 * because there are often fixed size MTRRs in there
	 * and overlapping MTRRs into large pages can cause
	 * slowdowns.
	 */
	if (pfn == 0)
		end_pfn = PFN_DOWN(PMD_SIZE);
	else
		end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
	end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
	if (end_pfn > limit_pfn)
		end_pfn = limit_pfn;
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
		pfn = end_pfn;
	}

	/* big page (2M) range */
	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
	end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
	if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
		end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif

	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pfn = end_pfn;
	}

#ifdef CONFIG_X86_64
	/* big page (1G) range */
	start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
	end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
		pfn = end_pfn;
	}

	/* tail is not big page (1G) alignment */
	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pfn = end_pfn;
	}
#endif

	/* tail is not big page (2M) alignment */
	start_pfn = pfn;
	end_pfn = limit_pfn;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	if (!after_bootmem)
		adjust_range_page_size_mask(mr, nr_range);

	/* try to merge contiguous ranges with the same page size */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			(nr_range - 1 - i) * sizeof(struct map_range));
		mr[i--].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		pr_debug(" [mem %#010lx-%#010lx] page %s\n",
				mr[i].start, mr[i].end - 1,
				page_size_string(&mr[i]));

	return nr_range;
}

struct range pfn_mapped[E820_MAX_ENTRIES];
int nr_pfn_mapped;

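/*
 * Record a newly direct-mapped pfn range in pfn_mapped[] and update
 * max_pfn_mapped/max_low_pfn_mapped accordingly.
 */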
static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES,
					     nr_pfn_mapped, start_pfn, end_pfn);
	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);

	max_pfn_mapped = max(max_pfn_mapped, end_pfn);

	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
		max_low_pfn_mapped = max(max_low_pfn_mapped,
					 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
}

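/*
 * Return true if [start_pfn, end_pfn) lies entirely within one of the ranges
 * already recorded as direct-mapped.
 */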
bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

	for (i = 0; i < nr_pfn_mapped; i++)
		if ((start_pfn >= pfn_mapped[i].start) &&
		    (end_pfn <= pfn_mapped[i].end))
			return true;

	return false;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __ref init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	struct map_range mr[NR_RANGE_MR];
	unsigned long ret = 0;
	int nr_range, i;

	pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
	       start, end - 1);

	memset(mr, 0, sizeof(mr));
	nr_range = split_mem_range(mr, 0, start, end);

	for (i = 0; i < nr_range; i++)
		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
						   mr[i].page_size_mask);

	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);

	return ret >> PAGE_SHIFT;
}

/*
 * We need to iterate through the E820 memory map and create direct mappings
 * for only E820_TYPE_RAM and E820_TYPE_RESERVED_KERN regions. We cannot simply
 * create direct mappings for all pfns from [0 to max_low_pfn) and
 * [4GB to max_pfn) because of possible memory holes in high addresses
 * that cannot be marked as UC by fixed/variable range MTRRs.
 * Depending on the alignment of E820 ranges, this may possibly result
 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
 *
 * init_mem_mapping() calls init_range_memory_mapping() with a big range.
 * That range can have holes in the middle or at the ends, and only the RAM
 * parts will be mapped by init_range_memory_mapping().
 */
static unsigned long __init init_range_memory_mapping(
					   unsigned long r_start,
					   unsigned long r_end)
{
	unsigned long start_pfn, end_pfn;
	unsigned long mapped_ram_size = 0;
	int i;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
		u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
		if (start >= end)
			continue;

		/*
		 * if it is overlapping with brk pgt, we need to
		 * alloc pgt buf from memblock instead.
		 */
		can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
		init_memory_mapping(start, end);
		mapped_ram_size += end - start;
		can_use_brk_pgt = true;
	}

	return mapped_ram_size;
}

static unsigned long __init get_new_step_size(unsigned long step_size)
{
	/*
	 * Initial mapped size is PMD_SIZE (2M).
	 * We can not set step_size to be PUD_SIZE (1G) yet.
	 * In the worst case, when we cross the 1G boundary and
	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
	 * to map a 1G range with PTEs. Hence we use one less than the
	 * difference of page table level shifts.
	 *
	 * Don't need to worry about overflow in the top-down case, on 32bit,
	 * when step_size is 0, round_down() returns 0 for start, and that
	 * turns it into 0x100000000ULL.
	 * In the bottom-up case, round_up(x, 0) returns 0 though too, which
	 * needs to be taken into consideration by the code below.
	 */
	return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
}

/**
 * memory_map_top_down - Map [map_start, map_end) top down
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will set up the direct mapping for the memory range
 * [map_start, map_end) in top-down fashion. That is, the page tables
 * will be allocated at the end of the memory, and we map the memory
 * in top-down order.
 */
static void __init memory_map_top_down(unsigned long map_start,
				       unsigned long map_end)
{
	unsigned long real_end, start, last_start;
	unsigned long step_size;
	unsigned long addr;
	unsigned long mapped_ram_size = 0;

	/* Xen has a big range reserved near the end of RAM; skip it at first. */
	addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
	real_end = addr + PMD_SIZE;

	/* step_size needs to be small so the pgt_buf from BRK can cover it */
	step_size = PMD_SIZE;
	max_pfn_mapped = 0; /* will get exact value next */
	min_pfn_mapped = real_end >> PAGE_SHIFT;
	last_start = start = real_end;

	/*
	 * We start from the top (end of memory) and go to the bottom.
	 * The memblock_find_in_range() gets us a block of RAM from the
	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
	 * for page table.
	 */
	while (last_start > map_start) {
		if (last_start > step_size) {
			start = round_down(last_start - 1, step_size);
			if (start < map_start)
				start = map_start;
		} else
			start = map_start;
		mapped_ram_size += init_range_memory_mapping(start,
							last_start);
		last_start = start;
		min_pfn_mapped = last_start >> PAGE_SHIFT;
		if (mapped_ram_size >= step_size)
			step_size = get_new_step_size(step_size);
	}

	if (real_end < map_end)
		init_range_memory_mapping(real_end, map_end);
}

/**
 * memory_map_bottom_up - Map [map_start, map_end) bottom up
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will set up the direct mapping for the memory range
 * [map_start, map_end) in bottom-up fashion. Since we have limited the
 * bottom-up allocation above the kernel, the page tables will be allocated
 * just above the kernel and we map the memory in [map_start, map_end) in
 * bottom-up order.
 */
static void __init memory_map_bottom_up(unsigned long map_start,
					unsigned long map_end)
{
	unsigned long next, start;
	unsigned long mapped_ram_size = 0;
	/* step_size needs to be small so the pgt_buf from BRK can cover it */
	unsigned long step_size = PMD_SIZE;

	start = map_start;
	min_pfn_mapped = start >> PAGE_SHIFT;

	/*
	 * We start from the bottom (@map_start) and go to the top (@map_end).
	 * The memblock_find_in_range() gets us a block of RAM from the
	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
	 * for page table.
	 */
	while (start < map_end) {
		if (step_size && map_end - start > step_size) {
			next = round_up(start + 1, step_size);
			if (next > map_end)
				next = map_end;
		} else {
			next = map_end;
		}

		mapped_ram_size += init_range_memory_mapping(start, next);
		start = next;

		if (mapped_ram_size >= step_size)
			step_size = get_new_step_size(step_size);
	}
}

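/*
 * Set up the direct mapping of all usable physical memory: the ISA range is
 * mapped unconditionally, the rest is mapped top-down or bottom-up depending
 * on the memblock allocation direction, and the new page tables are then
 * activated by loading swapper_pg_dir.
 */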
void __init init_mem_mapping(void)
{
	unsigned long end;

	pti_check_boottime_disable();
	probe_page_size_mask();
	setup_pcid();

#ifdef CONFIG_X86_64
	end = max_pfn << PAGE_SHIFT;
#else
	end = max_low_pfn << PAGE_SHIFT;
#endif

	/* the ISA range is always mapped regardless of memory holes */
	init_memory_mapping(0, ISA_END_ADDRESS);

	/* Init the trampoline, possibly with KASLR memory offset */
	init_trampoline();

	/*
	 * If the allocation is in bottom-up direction, we setup direct mapping
	 * in bottom-up, otherwise we setup direct mapping in top-down.
	 */
	if (memblock_bottom_up()) {
		unsigned long kernel_end = __pa_symbol(_end);

		/*
		 * we need two separate calls here. This is because we want to
		 * allocate page tables above the kernel. So we first map
		 * [kernel_end, end) to make memory above the kernel be mapped
		 * as soon as possible. And then use page tables allocated above
		 * the kernel to map [ISA_END_ADDRESS, kernel_end).
		 */
		memory_map_bottom_up(kernel_end, end);
		memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
	} else {
		memory_map_top_down(ISA_END_ADDRESS, end);
	}

#ifdef CONFIG_X86_64
	if (max_pfn > max_low_pfn) {
		/* can we preserve max_low_pfn ? */
		max_low_pfn = max_pfn;
	}
#else
	early_ioremap_page_table_range_init();
#endif

	load_cr3(swapper_pg_dir);
	__flush_tlb_all();

	x86_init.hyper.init_mem_mapping();

	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area traditionally contains BIOS code and data regions used by X, dosemu,
 * and similar apps. Since they map the entire memory range, the whole range
 * must be allowed (for mapping), but any areas that would otherwise be
 * disallowed are flagged as being "zero filled" instead of rejected.
 * Access has to be given to non-kernel-ram areas as well, these contain the
 * PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
				IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
			!= REGION_DISJOINT) {
		/*
		 * For disallowed memory regions in the low 1MB range,
		 * request that the page be shown as all zeros.
		 */
		if (pagenr < 256)
			return 2;

		return 0;
	}

	/*
	 * This must follow RAM test, since System RAM is considered a
	 * restricted resource under CONFIG_STRICT_IOMEM.
	 */
	if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
		/* Low 1MB bypasses iomem restrictions. */
		if (pagenr < 256)
			return 1;

		return 0;
	}

	return 1;
}

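/*
 * Free the [begin, end) range of init memory, or just mark it not-present
 * when page allocation debugging is enabled so that stray accesses fault.
 */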
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long begin_aligned, end_aligned;

	/* Make sure boundaries are page aligned */
	begin_aligned = PAGE_ALIGN(begin);
	end_aligned   = end & PAGE_MASK;

	if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
		begin = begin_aligned;
		end   = end_aligned;
	}

	if (begin >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
	if (debug_pagealloc_enabled()) {
		pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
			begin, end - 1);
		set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
	} else {
		/*
		 * We just marked the kernel text read only above, now that
		 * we are going to free part of that, we need to make that
		 * writeable and non-executable first.
		 */
		set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
		set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);

		free_reserved_area((void *)begin, (void *)end,
				   POISON_FREE_INITMEM, what);
	}
}

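/*
 * Free the kernel's unused .init sections, after giving the e820 code a
 * chance to reallocate its tables first.
 */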
void __ref free_initmem(void)
{
	e820__reallocate_tables();

	free_init_pages("unused kernel",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
	/*
	 * end could be not aligned, and we can not align it: the
	 * decompressor could be confused by an aligned initrd_end.
	 * We already reserved the trailing partial page in
	 *   - i386_start_kernel()
	 *   - x86_64_start_kernel()
	 *   - relocate_initrd()
	 * so here we can safely use PAGE_ALIGN() to get the partial page freed.
	 */
	free_init_pages("initrd", start, PAGE_ALIGN(end));
}
#endif

/*
 * Calculate the precise size of the DMA zone (first 16 MB of RAM),
 * and pass it to the MM layer - to help it set zone watermarks more
 * accurately.
 *
 * Done on 64-bit systems only for the time being, although 32-bit systems
 * might benefit from this as well.
 */
void __init memblock_find_dma_reserve(void)
{
#ifdef CONFIG_X86_64
	u64 nr_pages = 0, nr_free_pages = 0;
	unsigned long start_pfn, end_pfn;
	phys_addr_t start_addr, end_addr;
	int i;
	u64 u;

	/*
	 * Iterate over all memory ranges (free and reserved ones alike),
	 * to calculate the total number of pages in the first 16 MB of RAM:
	 */
	nr_pages = 0;
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		start_pfn = min(start_pfn, MAX_DMA_PFN);
		end_pfn   = min(end_pfn,   MAX_DMA_PFN);

		nr_pages += end_pfn - start_pfn;
	}

	/*
	 * Iterate over free memory ranges to calculate the number of free
	 * pages in the DMA zone, while not counting potential partial
	 * pages at the beginning or the end of the range:
	 */
	nr_free_pages = 0;
	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
		start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
		end_pfn   = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);

		if (start_pfn < end_pfn)
			nr_free_pages += end_pfn - start_pfn;
	}

	set_dma_reserve(nr_pages - nr_free_pages);
#endif
}

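/*
 * Tell the core MM the highest pfn of each zone (DMA, DMA32, NORMAL and, on
 * 32-bit, HIGHMEM) so that the zone boundaries can be set up.
 */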
void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}

__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
	.loaded_mm = &init_mm,
	.next_asid = 1,
	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);

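/*
 * Keep the two cachemode translation tables above in sync when the PAT code
 * programs entry 'entry' with cache mode 'cache'.
 */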
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
	/* entry 0 MUST be WB (hardwired to speed up translations) */
	BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);

	__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
	__pte2cachemode_tbl[entry] = cache;
}

#ifdef CONFIG_SWAP
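/*
 * With the L1TF mitigation enabled, cap the swap file size so that swap
 * offsets cannot encode a PFN above the mitigation's physical address limit
 * (see l1tf_pfn_limit()).
 */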
unsigned long max_swapfile_size(void)
{
	unsigned long pages;

	pages = generic_max_swapfile_size();

	if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
		/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
		unsigned long long l1tf_limit = l1tf_pfn_limit();
		/*
		 * We encode swap offsets also with 3 bits below those for pfn
		 * which makes the usable limit higher.
		 */
#if CONFIG_PGTABLE_LEVELS > 2
		l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
#endif
		pages = min_t(unsigned long long, l1tf_limit, pages);
	}
	return pages;
}
#endif