• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
/*
 * Page Size Emulation
 *
 * Copyright (c) 2024, Google LLC.
 * Author: Kalesh Singh <kaleshsingh@google.com>
 */
8 
9 #include <linux/errno.h>
10 #include <linux/init.h>
11 #include <linux/kstrtox.h>
12 #include <linux/mm.h>
13 #include <linux/moduleparam.h>
14 #include <linux/pagemap.h>
15 #include <linux/page_size_compat.h>
16 #include <linux/swap.h>
17 #include <linux/perf_event.h>
18 
19 #define MIN_PAGE_SHIFT_COMPAT (PAGE_SHIFT + 1)
20 #define MAX_PAGE_SHIFT_COMPAT 16 /* Max of 64KB */
21 #define __MMAP_RND_BITS(x)      (x - (__PAGE_SHIFT - PAGE_SHIFT))
22 
23 DEFINE_STATIC_KEY_FALSE(page_shift_compat_enabled);
24 EXPORT_SYMBOL_GPL(page_shift_compat_enabled);
25 
26 int page_shift_compat __ro_after_init = MIN_PAGE_SHIFT_COMPAT;
27 EXPORT_SYMBOL_GPL(page_shift_compat);
28 
page_shift_params(char * param,char * val,const char * unused,void * arg)29 static int __init page_shift_params(char *param, char *val,
30 				    const char *unused, void *arg)
31 {
32 	int ret;
33 
34 	if (strcmp(param, "page_shift") != 0)
35 		return 0;
36 
37 	ret = kstrtoint(val, 10, &page_shift_compat);
38 	if (ret)
39 		return ret;
40 
41 	/* Only supported on 4KB kernel */
42 	if (PAGE_SHIFT != 12)
43 		return -ENOTSUPP;
44 
45 	if (page_shift_compat < MIN_PAGE_SHIFT_COMPAT ||
46 		page_shift_compat > MAX_PAGE_SHIFT_COMPAT)
47 		return -EINVAL;
48 
49 	static_branch_enable(&page_shift_compat_enabled);
50 
51 	return 0;
52 }
53 
init_page_shift_compat(void)54 static int __init init_page_shift_compat(void)
55 {
56 	char *err;
57 	char *command_line;
58 
59 	command_line = kstrdup(saved_command_line, GFP_KERNEL);
60 	if (!command_line)
61 		return -ENOMEM;
62 
63 	err = parse_args("page_shift", command_line, NULL, 0, 0, 0, NULL,
64 			page_shift_params);
65 
66 	kfree(command_line);
67 
68 	if (IS_ERR(err))
69 		return -EINVAL;
70 
71 	return 0;
72 }
73 pure_initcall(init_page_shift_compat);
74 
init_mmap_rnd_bits(void)75 static int __init init_mmap_rnd_bits(void)
76 {
77 	if (!static_branch_unlikely(&page_shift_compat_enabled))
78 		return 0;
79 
80 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
81 	mmap_rnd_bits_min = __MMAP_RND_BITS(CONFIG_ARCH_MMAP_RND_BITS_MIN);
82 	mmap_rnd_bits_max = __MMAP_RND_BITS(CONFIG_ARCH_MMAP_RND_BITS_MAX);
83 	mmap_rnd_bits = __MMAP_RND_BITS(CONFIG_ARCH_MMAP_RND_BITS);
84 #endif
85 
86 	return 0;
87 }
88 core_initcall(init_mmap_rnd_bits);
89 
90 /*
91  * Returns size of the portion of the VMA backed by the
92  * underlying file.
93  */
unsigned long ___filemap_len(struct inode *inode, unsigned long pgoff, unsigned long len,
			     unsigned long flags)
{
	unsigned long file_size;
	unsigned long filemap_len;
	pgoff_t max_pgcount;
	pgoff_t last_pgoff;

	/* Fixup mappings created by ___filemap_fixup() are taken as-is. */
	if (flags & __MAP_NO_COMPAT)
		return len;

	file_size = (unsigned long) i_size_read(inode);

	/*
	 * Round up, so that this is a count (not an index). This simplifies
	 * the following calculations.
	 */
	max_pgcount = DIV_ROUND_UP(file_size, PAGE_SIZE);
	/* One past the last kernel page the mapping would cover. */
	last_pgoff = pgoff + (len >> PAGE_SHIFT);

	if (unlikely(last_pgoff >= max_pgcount)) {
		/*
		 * If pgoff itself is past EOF, this unsigned subtraction
		 * wraps; the range check below rejects that case (and zero)
		 * so we fall through and return the full length instead.
		 */
		filemap_len = (max_pgcount - pgoff)  << PAGE_SHIFT;
		/* Careful of underflows in special files */
		if (filemap_len > 0 && filemap_len < len)
			return filemap_len;
	}

	return len;
}
123 
is_shmem_fault(const struct vm_operations_struct * vm_ops)124 static inline bool is_shmem_fault(const struct vm_operations_struct *vm_ops)
125 {
126 #ifdef CONFIG_SHMEM
127 	return vm_ops->fault == shmem_fault;
128 #else
129 	return false;
130 #endif
131 }
132 
is_f2fs_filemap_fault(const struct vm_operations_struct * vm_ops)133 static inline bool is_f2fs_filemap_fault(const struct vm_operations_struct *vm_ops)
134 {
135 #ifdef CONFIG_F2FS_FS
136 	return vm_ops->fault == f2fs_filemap_fault;
137 #else
138 	return false;
139 #endif
140 }
141 
is_filemap_fault(const struct vm_operations_struct * vm_ops)142 static inline bool is_filemap_fault(const struct vm_operations_struct *vm_ops)
143 {
144 	return vm_ops->fault == filemap_fault;
145 }
146 
147 /*
148  * Given a file mapping of 48KiB backed by a file of size 18KiB, the
149  * faulting behaviour of the different page-size configurations is
150  * explained below.
151 
152  * In a 4KiB base page size system, when a file backed mapping extends
153  * past the end of the file, accessed is allowed to the entire last
154  * page that at least partially corresponds to valid offsets on the
155  * file. However, access beyond that page will generate a SIGBUS, since
156  * the offset we are trying to fault doesn't correspond to anywhere on
157  * the backing file.
158  *
159  * This is illustrated below. The offsets are given in units of KiB.
160  *
 *                    Access OK (4KiB page partially backed by file)
162  *                                │
163  *    ┌──────────────────────────┬┼─┬─────────────────────────────────────────┐
164  *    │                          │▼ │                                         │
165  *    │       File backed        │  │     SIGBUS (Invalid filemap_fault)      │
166  *    │                          │  │                                         │
167  *    └──────────────────────────┴──┴─────────────────────────────────────────┘
168  *
169  *    └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
170  *    0     4     8     12   16    20    24    28    32    36    40    44    48
171  *
 * In an x86_64 emulated 16KiB page size system, userspace believes the page
 * size is 16KiB and therefore should be able to access the entire last 16KiB
 * page that is at least partially backed by the file. However, the kernel is
 * still a 4KiB kernel and will fault at each 4KiB page that makes up the
 * "emulated" 16KiB page, which will generate a SIGBUS when any of the 4KiB
 * pages making up the 16KiB page, except the first, is being faulted.
178  *
 *                    Access OK (4KiB page partially backed by file)
180  *                                │
181  *    ┌──────────────────────────┬┼─┬─────────────────────────────────────────┐
182  *    │                          │▼ │                                         │
183  *    │       File backed        │  │     SIGBUS (Invalid filemap_fault)      │
184  *    │                          │  │                                         │
185  *    └──────────────────────────┴──┴─────────────────────────────────────────┘
186  *
187  *    └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
188  *    0     4     8     12   16    20    24    28    32    36    40    44    48
189  *
 * To fix this semantic in the emulated page size mode, an anonymous mapping
 * is inserted to replace the full 4KiB pages that make up the last 16KiB page
 * partially backed by the file.
193  *
194  *
195  *                    Access OK (4KiB page paritially backed by file)
196  *                                │
197  *                                │        ┌─── Access OK
198  *                                │        │   (16KiB page partially backed
199  *                                │        │       by file)
200  *                                │        │
201  *    ┌──────────────────────────┬┼─┬──────┼──────────┬───────────────────────┐
202  *    │                          │▼ │      ▼          │      SIGBUS           │
203  *    │       File backed        │  │   Access OK     │(Invalid filemap fault)│
204  *    │                          │  │  (Anon Mapping) │                       │
205  *    └──────────────────────────┴──┴─────────────────┴───────────────────────┘
206  *
207  *    └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
208  *    0     4     8     12   16    20    24    28    32    36    40    44    48
209  */
void ___filemap_fixup(unsigned long addr, unsigned long prot, unsigned long file_backed_len,
		      unsigned long len)
{
	/* Start of the region to overlay with an anonymous mapping. */
	unsigned long anon_addr = addr + file_backed_len;
	/* Offset of that start within an emulated (__PAGE_SIZE) page. */
	unsigned long __offset = __offset_in_page(anon_addr);
	/* Remainder of the emulated page past the file-backed portion. */
	unsigned long anon_len = __offset ? __PAGE_SIZE - __offset : 0;
	struct mm_struct *mm = current->mm;
	unsigned long populate = 0;
	struct vm_area_struct *vma;
	const struct vm_operations_struct *vm_ops;

	/* File-backed portion ends on an emulated page boundary: no fixup. */
	if (!anon_len)
		return;

	/* By construction anon_len is a strict sub-page remainder. */
	BUG_ON(anon_len >= __PAGE_SIZE);

	/* The original do_mmap() failed */
	if (IS_ERR_VALUE(addr))
		return;

	vma = find_vma(mm, addr);

	/*
	 * This should never happen, VMA was inserted and we still
	 * haven't released the mmap write lock.
	 */
	BUG_ON(!vma);

	/* Anonymous VMAs (vm_ops == NULL) can't SIGBUS past EOF: nothing to do. */
	vm_ops = vma->vm_ops;
	if (!vm_ops)
		return;

	/*
	 * Insert fixup vmas for file backed and shmem backed VMAs.
	 *
	 * Faulting off the end of a file will result in SIGBUS since there is no
	 * file page for the given file offset.
	 *
	 * shmem pages live in page cache or swap cache. Looking up a page cache
	 * page with an index (pgoff) beyond the file is invalid and will result
	 * in shmem_get_folio_gfp() returning -EINVAL.
	 */
	if (!is_filemap_fault(vm_ops) && !is_f2fs_filemap_fault(vm_ops) &&
	    !is_shmem_fault(vm_ops))
		return;

	/*
	 * Override the partial emulated page of the file backed portion of the VMA
	 * with an anonymous mapping.
	 *
	 * NOTE(review): the do_mmap() return value is discarded — presumably a
	 * failed fixup just degrades to the original SIGBUS semantics; confirm
	 * no caller depends on the fixup being present.
	 */
	anon_addr = do_mmap(NULL, anon_addr, anon_len, prot,
					MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|__MAP_NO_COMPAT,
					0, 0, &populate, NULL);
}
264 
/*
 * Folds any anon fixup entries created by ___filemap_fixup()
 * into the previous mapping so that /proc/<pid>/[s]maps doesn't
 * show unaligned entries.
 */
__fold_filemap_fixup_entry(struct vma_iterator * iter,unsigned long * end)270 void __fold_filemap_fixup_entry(struct vma_iterator *iter, unsigned long *end)
271 {
272 	struct vm_area_struct *next_vma;
273 
274 	/* Not emulating page size? */
275 	if (!static_branch_unlikely(&page_shift_compat_enabled))
276 		return;
277 
278 	/* Advance iterator */
279 	next_vma = vma_next(iter);
280 
281 	/* If fixup VMA, adjust the end to cover its extent */
282 	if (next_vma && (next_vma->vm_flags & __VM_NO_COMPAT)) {
283 		*end = next_vma->vm_end;
284 		return;
285 	}
286 
287 	/* Rewind iterator */
288 	vma_prev(iter);
289 }
290 
/*
 * The swap header is usually in the first page, with the magic in the last
 * 10 bytes of the page. In the emulated mode, mkswap tools might place the
 * magic in the last 10 bytes of a __PAGE_SIZE-ed page. Check if this is the
 * case: if so, place the magic on the first page and clear the magic from
 * the original page in which it was found.
 */
__fixup_swap_header(struct file * swap_file,struct address_space * mapping)298 int __fixup_swap_header(struct file *swap_file, struct address_space *mapping)
299 {
300 	union swap_header *swap_header;
301 	struct page *header_page = NULL;
302 	struct page *magic_page = NULL;
303 	int index;
304 	int error = 0;
305 	const char* magic = "SWAPSPACE2";
306 
307 	if (__PAGE_SHIFT == PAGE_SHIFT)
308 		return 0;
309 
310 	index = (1 << (__PAGE_SHIFT  - PAGE_SHIFT)) - 1;
311 	magic_page = read_mapping_page(mapping, index, swap_file);
312 	if (IS_ERR(magic_page)) {
313 		pgcompat_err("Failed reading swap magic page");
314 		return PTR_ERR(magic_page);
315 	}
316 	swap_header = kmap(magic_page);
317 
318 	/* Nothing to do; mkswap tool may have hardcoded a 4096 page size */
319 	if (memcmp(magic, swap_header->magic.magic, 10))
320 		goto free_magic;
321 
322 	memset(swap_header->magic.magic, 0, 10);
323 
324 	index = 0;
325 	header_page = read_mapping_page(mapping, index, swap_file);
326 	if (IS_ERR(header_page)) {
327 		pgcompat_err("Failed reading swap header page");
328 		error = PTR_ERR(header_page);
329 		goto free_magic;
330 	}
331 	swap_header = kmap(header_page);
332 
333 	memcpy(swap_header->magic.magic, magic, 10);
334 
335 	kunmap(header_page);
336 	put_page(header_page);
337 
338 free_magic:
339 	kunmap(magic_page);
340 	put_page(magic_page);
341 
342 	return error;
343 }
344 
345 #if IS_ENABLED(CONFIG_PERF_EVENTS)
init_sysctl_perf_event_mlock(void)346 static int __init init_sysctl_perf_event_mlock(void)
347 {
348 	if (!static_branch_unlikely(&page_shift_compat_enabled))
349 		return 0;
350 
351 	/* Minimum for 512 kiB + 1 user control page */
352 	sysctl_perf_event_mlock = 512 + (__PAGE_SIZE / 1024); /* 'free' kiB per user */
353 
354 	return 0;
355 }
356 core_initcall(init_sysctl_perf_event_mlock);
357 #endif
358