// SPDX-License-Identifier: GPL-2.0
/*
 * Page Size Emulation
 *
 * Copyright (c) 2024, Google LLC.
 * Author: Kalesh Singh <kaleshsingh@google.com>
 */

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
#include <linux/pagemap.h>
#include <linux/page_size_compat.h>
#include <linux/swap.h>
#include <linux/perf_event.h>

#define MIN_PAGE_SHIFT_COMPAT	(PAGE_SHIFT + 1)
#define MAX_PAGE_SHIFT_COMPAT	16	/* Max of 64KB */
#define __MMAP_RND_BITS(x)	((x) - (__PAGE_SHIFT - PAGE_SHIFT))
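
/*
 * Each doubling of the emulated page size consumes one bit of mmap
 * randomization: e.g. with PAGE_SHIFT == 12 and page_shift=14 (16KiB
 * emulation), __MMAP_RND_BITS(18) == 18 - (14 - 12) == 16.
 */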

DEFINE_STATIC_KEY_FALSE(page_shift_compat_enabled);
EXPORT_SYMBOL_GPL(page_shift_compat_enabled);

int page_shift_compat __ro_after_init = MIN_PAGE_SHIFT_COMPAT;
EXPORT_SYMBOL_GPL(page_shift_compat);

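/*
 * Parse the "page_shift" parameter from the kernel command line. For
 * example, booting a 4KiB kernel with page_shift=14 enables 16KiB
 * (1 << 14) page size emulation; valid values are 13 through 16.
 */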
static int __init page_shift_params(char *param, char *val,
				    const char *unused, void *arg)
{
	int ret;

	if (strcmp(param, "page_shift") != 0)
		return 0;

	ret = kstrtoint(val, 10, &page_shift_compat);
	if (ret)
		return ret;

	/* Only supported on 4KB kernel */
	if (PAGE_SHIFT != 12)
		return -ENOTSUPP;

	if (page_shift_compat < MIN_PAGE_SHIFT_COMPAT ||
	    page_shift_compat > MAX_PAGE_SHIFT_COMPAT)
		return -EINVAL;

	static_branch_enable(&page_shift_compat_enabled);

	return 0;
}

static int __init init_page_shift_compat(void)
{
	char *err;
	char *command_line;

	command_line = kstrdup(saved_command_line, GFP_KERNEL);
	if (!command_line)
		return -ENOMEM;

	err = parse_args("page_shift", command_line, NULL, 0, 0, 0, NULL,
			 page_shift_params);

	kfree(command_line);

	if (IS_ERR(err))
		return -EINVAL;

	return 0;
}
pure_initcall(init_page_shift_compat);

static int __init init_mmap_rnd_bits(void)
{
	if (!static_branch_unlikely(&page_shift_compat_enabled))
		return 0;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
	mmap_rnd_bits_min = __MMAP_RND_BITS(CONFIG_ARCH_MMAP_RND_BITS_MIN);
	mmap_rnd_bits_max = __MMAP_RND_BITS(CONFIG_ARCH_MMAP_RND_BITS_MAX);
	mmap_rnd_bits = __MMAP_RND_BITS(CONFIG_ARCH_MMAP_RND_BITS);
#endif

	return 0;
}
core_initcall(init_mmap_rnd_bits);

/*
 * Returns size of the portion of the VMA backed by the
 * underlying file.
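 *
 * For example, with 4KiB kernel pages, an 18KiB file mapped at pgoff 0
 * with len 48KiB (the scenario illustrated further below) gives
 * max_pgcount = 5 and last_pgoff = 12, so the file-backed portion is
 * 5 << PAGE_SHIFT = 20KiB.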
 */
unsigned long ___filemap_len(struct inode *inode, unsigned long pgoff, unsigned long len,
			     unsigned long flags)
{
	unsigned long file_size;
	unsigned long filemap_len;
	pgoff_t max_pgcount;
	pgoff_t last_pgoff;

	if (flags & __MAP_NO_COMPAT)
		return len;

	file_size = (unsigned long) i_size_read(inode);

	/*
	 * Round up, so that this is a count (not an index). This simplifies
	 * the following calculations.
	 */
	max_pgcount = DIV_ROUND_UP(file_size, PAGE_SIZE);
	last_pgoff = pgoff + (len >> PAGE_SHIFT);

	if (unlikely(last_pgoff >= max_pgcount)) {
		filemap_len = (max_pgcount - pgoff) << PAGE_SHIFT;
		/* Careful of underflows in special files */
		if (filemap_len > 0 && filemap_len < len)
			return filemap_len;
	}

	return len;
}

static inline bool is_shmem_fault(const struct vm_operations_struct *vm_ops)
{
#ifdef CONFIG_SHMEM
	return vm_ops->fault == shmem_fault;
#else
	return false;
#endif
}

static inline bool is_f2fs_filemap_fault(const struct vm_operations_struct *vm_ops)
{
#ifdef CONFIG_F2FS_FS
	return vm_ops->fault == f2fs_filemap_fault;
#else
	return false;
#endif
}

static inline bool is_filemap_fault(const struct vm_operations_struct *vm_ops)
{
	return vm_ops->fault == filemap_fault;
}

/*
 * Given a file mapping of 48KiB backed by a file of size 18KiB, the
 * faulting behaviour of the different page-size configurations is
 * explained below.
 *
 * In a 4KiB base page size system, when a file backed mapping extends
 * past the end of the file, access is allowed to the entire last
 * page that at least partially corresponds to valid offsets on the
 * file. However, access beyond that page will generate a SIGBUS, since
 * the offset we are trying to fault doesn't correspond to anywhere on
 * the backing file.
 *
 * This is illustrated below. The offsets are given in units of KiB.
 *
 *                             Access OK (4KiB page partially backed by file)
 *                             │
 * ┌──────────────────────────┬┼─┬─────────────────────────────────────────┐
 * │                          │▼ │                                         │
 * │       File backed        │  │     SIGBUS (Invalid filemap_fault)      │
 * │                          │  │                                         │
 * └──────────────────────────┴──┴─────────────────────────────────────────┘
 *
 * └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
 * 0     4     8     12    16    20    24    28    32    36    40    44    48
 *
 * In an x86_64 emulated 16KiB page size system, userspace believes the page
 * size is 16KiB and therefore should be able to access the entire last 16KiB
 * page that is at least partially backed by the file. However, the kernel is
 * still a 4KiB kernel and will fault on each 4KiB page that makes up the
 * "emulated" 16KiB page, generating a SIGBUS when any of the 4KiB pages
 * making up the 16KiB page, except the first, is faulted.
 *
 *                             Access OK (4KiB page partially backed by file)
 *                             │
 * ┌──────────────────────────┬┼─┬─────────────────────────────────────────┐
 * │                          │▼ │                                         │
 * │       File backed        │  │     SIGBUS (Invalid filemap_fault)      │
 * │                          │  │                                         │
 * └──────────────────────────┴──┴─────────────────────────────────────────┘
 *
 * └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
 * 0     4     8     12    16    20    24    28    32    36    40    44    48
 *
 * To fix this semantic in the emulated page size mode, an anonymous mapping
 * is inserted to replace the full 4KiB pages that make up the last 16KiB
 * page partially backed by the file.
 *
 *                             Access OK (4KiB page partially backed by file)
 *                             │
 *                             │        ┌─── Access OK
 *                             │        │    (16KiB page partially backed
 *                             │        │     by file)
 *                             │        │
 * ┌──────────────────────────┬┼─┬──────┼──────────┬───────────────────────┐
 * │                          │▼ │      ▼          │        SIGBUS         │
 * │       File backed        │  │   Access OK     │(Invalid filemap fault)│
 * │                          │  │ (Anon Mapping)  │                       │
 * └──────────────────────────┴──┴─────────────────┴───────────────────────┘
 *
 * └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
 * 0     4     8     12    16    20    24    28    32    36    40    44    48
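 *
 * In this example: file_backed_len = ___filemap_len() = 20KiB, so
 * (assuming addr is 16KiB-aligned) anon_addr = addr + 20KiB,
 * __offset = __offset_in_page(anon_addr) = 4KiB (20KiB mod 16KiB), and
 * anon_len = 16KiB - 4KiB = 12KiB; the anonymous fixup mapping covers
 * the 20KiB-32KiB range shown above.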
 */
void ___filemap_fixup(unsigned long addr, unsigned long prot, unsigned long file_backed_len,
		      unsigned long len)
{
	unsigned long anon_addr = addr + file_backed_len;
	unsigned long __offset = __offset_in_page(anon_addr);
	unsigned long anon_len = __offset ? __PAGE_SIZE - __offset : 0;
	struct mm_struct *mm = current->mm;
	unsigned long populate = 0;
	struct vm_area_struct *vma;
	const struct vm_operations_struct *vm_ops;

	if (!anon_len)
		return;

	BUG_ON(anon_len >= __PAGE_SIZE);

	/* The original do_mmap() failed */
	if (IS_ERR_VALUE(addr))
		return;

	vma = find_vma(mm, addr);

	/*
	 * This should never happen, VMA was inserted and we still
	 * haven't released the mmap write lock.
	 */
	BUG_ON(!vma);

	vm_ops = vma->vm_ops;
	if (!vm_ops)
		return;

	/*
	 * Insert fixup vmas for file backed and shmem backed VMAs.
	 *
	 * Faulting off the end of a file will result in SIGBUS since there is no
	 * file page for the given file offset.
	 *
	 * shmem pages live in page cache or swap cache. Looking up a page cache
	 * page with an index (pgoff) beyond the file is invalid and will result
	 * in shmem_get_folio_gfp() returning -EINVAL.
	 */
	if (!is_filemap_fault(vm_ops) && !is_f2fs_filemap_fault(vm_ops) &&
	    !is_shmem_fault(vm_ops))
		return;

	/*
	 * Override the partial emulated page of the file backed portion of the VMA
	 * with an anonymous mapping.
	 */
	anon_addr = do_mmap(NULL, anon_addr, anon_len, prot,
			    MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|__MAP_NO_COMPAT,
			    0, 0, &populate, NULL);
}

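/*
 * A sketch of how the two helpers above are meant to compose in the
 * mmap() path (illustrative; the actual call site is not in this file):
 *
 *	file_backed_len = ___filemap_len(inode, pgoff, len, flags);
 *	addr = do_mmap(file, addr, file_backed_len, prot, flags, ...);
 *	___filemap_fixup(addr, prot, file_backed_len, len);
 */
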
/*
 * Folds any anon fixup entries created by ___filemap_fixup()
 * into the previous mapping so that /proc/<pid>/[s]maps don't
 * show unaligned entries.
 */
void __fold_filemap_fixup_entry(struct vma_iterator *iter, unsigned long *end)
{
	struct vm_area_struct *next_vma;

	/* Not emulating page size? */
	if (!static_branch_unlikely(&page_shift_compat_enabled))
		return;

	/* Advance iterator */
	next_vma = vma_next(iter);

	/* If fixup VMA, adjust the end to cover its extent */
	if (next_vma && (next_vma->vm_flags & __VM_NO_COMPAT)) {
		*end = next_vma->vm_end;
		return;
	}

	/* Rewind iterator */
	vma_prev(iter);
}

/*
 * The swap header is usually in the first page, with the magic in the last
 * 10 bytes of the page. In the emulated mode, mkswap tools might place the
 * magic in the last 10 bytes of a __PAGE_SIZE-ed page. Check if this is
 * the case; if so, place the magic on the first page and clear the magic
 * from the original page in which it was found.
 *
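 * For example, with 16KiB emulation on a 4KiB kernel, the magic is looked
 * for at the end of 4KiB page index (1 << (14 - 12)) - 1 = 3, i.e. the
 * last 4KiB page of the first emulated 16KiB page, and is copied into the
 * last 10 bytes of page index 0.
 *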
 */
int __fixup_swap_header(struct file *swap_file, struct address_space *mapping)
{
	union swap_header *swap_header;
	struct page *header_page = NULL;
	struct page *magic_page = NULL;
	int index;
	int error = 0;
	const char *magic = "SWAPSPACE2";

	if (__PAGE_SHIFT == PAGE_SHIFT)
		return 0;

	index = (1 << (__PAGE_SHIFT - PAGE_SHIFT)) - 1;
	magic_page = read_mapping_page(mapping, index, swap_file);
	if (IS_ERR(magic_page)) {
		pgcompat_err("Failed reading swap magic page");
		return PTR_ERR(magic_page);
	}
	swap_header = kmap(magic_page);

	/* Nothing to do; mkswap tool may have hardcoded a 4096 page size */
	if (memcmp(magic, swap_header->magic.magic, 10))
		goto free_magic;

	memset(swap_header->magic.magic, 0, 10);

	index = 0;
	header_page = read_mapping_page(mapping, index, swap_file);
	if (IS_ERR(header_page)) {
		pgcompat_err("Failed reading swap header page");
		error = PTR_ERR(header_page);
		goto free_magic;
	}
	swap_header = kmap(header_page);

	memcpy(swap_header->magic.magic, magic, 10);

	kunmap(header_page);
	put_page(header_page);

free_magic:
	kunmap(magic_page);
	put_page(magic_page);

	return error;
}

#if IS_ENABLED(CONFIG_PERF_EVENTS)
static int __init init_sysctl_perf_event_mlock(void)
{
	if (!static_branch_unlikely(&page_shift_compat_enabled))
		return 0;

	/* Minimum for 512 kiB + 1 user control page */
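	/* e.g. 512 + (16384 / 1024) = 528 kiB when emulating 16KiB pages */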
	sysctl_perf_event_mlock = 512 + (__PAGE_SIZE / 1024); /* 'free' kiB per user */

	return 0;
}
core_initcall(init_sysctl_perf_event_mlock);
#endif