// SPDX-License-Identifier: GPL-2.0
/*
 * Page Size Migration
 *
 * This file contains the core logic of mitigations to ensure
 * app compatibility during the transition from 4kB to 16kB
 * page size in Android.
 *
 * Copyright (c) 2024, Google LLC.
 * Author: Kalesh Singh <kaleshsingh@google.com>
 */

#include <linux/pgsize_migration.h>

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/kstrtox.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>

typedef void (*show_pad_maps_fn) (struct seq_file *m, struct vm_area_struct *vma);
typedef int (*show_pad_smaps_fn) (struct seq_file *m, void *v);

#ifdef CONFIG_64BIT
#if PAGE_SIZE == SZ_4K
DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() (static_branch_likely(&pgsize_migration_enabled))
#else /* PAGE_SIZE != SZ_4K */
DEFINE_STATIC_KEY_FALSE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() (static_branch_unlikely(&pgsize_migration_enabled))
#endif /* PAGE_SIZE == SZ_4K */

static ssize_t show_pgsize_migration_enabled(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	if (is_pgsize_migration_enabled())
		return sprintf(buf, "%d\n", 1);
	else
		return sprintf(buf, "%d\n", 0);
}

static ssize_t store_pgsize_migration_enabled(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t n)
{
	unsigned long val;

	/* Migration is only applicable to 4kB kernels */
	if (PAGE_SIZE != SZ_4K)
		return n;

	if (kstrtoul(buf, 10, &val))
		return -EINVAL;

	if (val > 1)
		return -EINVAL;

	if (val == 1)
		static_branch_enable(&pgsize_migration_enabled);
	else if (val == 0)
		static_branch_disable(&pgsize_migration_enabled);

	return n;
}

static struct kobj_attribute pgsize_migration_enabled_attr = __ATTR(
	enabled,
	0644,
	show_pgsize_migration_enabled,
	store_pgsize_migration_enabled
);

static struct attribute *pgsize_migration_attrs[] = {
	&pgsize_migration_enabled_attr.attr,
	NULL
};

static struct attribute_group pgsize_migration_attr_group = {
	.name = "pgsize_migration",
	.attrs = pgsize_migration_attrs,
};

/**
 * What: /sys/kernel/mm/pgsize_migration/enabled
 * Date: April 2024
 * KernelVersion: v5.4+ (GKI kernels)
 * Contact: Kalesh Singh <kaleshsingh@google.com>
 * Description: /sys/kernel/mm/pgsize_migration/enabled
 *              allows for userspace to turn on or off page size
 *              migration mitigations necessary for app compatibility
 *              during Android's transition from 4kB to 16kB page size.
 *              Such mitigations include preserving /proc/<pid>/[s]maps
 *              output as if there was no segment extension by the
 *              dynamic loader; and preventing fault around in the padding
 *              sections of ELF LOAD segment mappings.
 * Users: Bionic's dynamic linker
 */
static int __init init_pgsize_migration(void)
{
	if (sysfs_create_group(mm_kobj, &pgsize_migration_attr_group))
		pr_err("pgsize_migration: failed to create sysfs group\n");

	return 0;
}
late_initcall(init_pgsize_migration);

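/*
 * Illustrative usage (not part of the original source): the mitigation can be
 * toggled at runtime from a privileged shell via the sysfs node above, e.g.:
 *
 *   echo 0 > /sys/kernel/mm/pgsize_migration/enabled    # disable mitigations
 *   cat /sys/kernel/mm/pgsize_migration/enabled         # reads back 0 or 1
 */
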
#if PAGE_SIZE == SZ_4K
void vma_set_pad_pages(struct vm_area_struct *vma,
		       unsigned long nr_pages)
{
	if (!is_pgsize_migration_enabled())
		return;

	/*
	 * Usually, to modify vm_flags we need to take the exclusive mmap_lock, but
	 * here we only have the lock in read mode, to avoid all DONTNEED/DONTNEED_LOCKED
	 * calls needing the write lock.
	 *
	 * A race to the flags update can only happen with another MADV_DONTNEED on
	 * the same process and same range (VMA).
	 *
	 * In practice, this specific scenario is not possible because the action that
	 * could cause it is usually performed at most once per VMA and only by the
	 * dynamic linker.
	 *
	 * Forego protection for this case, to avoid penalties in the common cases.
	 */
	__vm_flags_mod(vma, 0, VM_PAD_MASK);
	__vm_flags_mod(vma, nr_pages << VM_PAD_SHIFT, 0);
}

unsigned long vma_pad_pages(struct vm_area_struct *vma)
{
	if (!is_pgsize_migration_enabled())
		return 0;

	return (vma->vm_flags & VM_PAD_MASK) >> VM_PAD_SHIFT;
}
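
/*
 * Illustrative example (not from the original source): on a 4kB kernel, a
 * LOAD segment of a 16kB-aligned ELF can carry up to three 4kB pages of
 * padding. After vma_set_pad_pages(vma, 3), vm_flags holds 3 << VM_PAD_SHIFT
 * in the VM_PAD_MASK bits, and vma_pad_pages(vma) returns 3 while the
 * mitigation is enabled (it returns 0 once the static key is disabled).
 */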

static __always_inline bool str_has_suffix(const char *str, const char *suffix)
{
	size_t str_len = strlen(str);
	size_t suffix_len = strlen(suffix);

	if (str_len < suffix_len)
		return false;

	return !strncmp(str + str_len - suffix_len, suffix, suffix_len);
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * The dynamic linker, or interpreter, operates within the process context
 * of the binary that necessitated dynamic linking.
 *
 * Consequently, process context identifiers (like PID, comm, ...) cannot
 * be used to differentiate whether the execution context belongs to the
 * dynamic linker or not.
 *
 * linker_ctx() deduces whether execution is currently in the dynamic linker's
 * context by correlating the current userspace instruction pointer with the
 * VMAs of the current task.
 *
 * Returns true if in linker context, otherwise false.
 */
static inline bool linker_ctx(void)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct file *file;

	if (!regs)
		return false;

	vma = lock_vma_under_rcu(mm, instruction_pointer(regs));

	/*
	 * lock_vma_under_rcu() is a try-lock that can fail if the
	 * VMA is already locked for modification.
	 *
	 * Fall back to finding the vma under the mmap read lock.
	 */
	if (!vma) {
		mmap_read_lock(mm);

		vma = find_vma(mm, instruction_pointer(regs));

		/* Current execution context, the VMA must be present */
		BUG_ON(!vma);

		/*
		 * We cannot use vma_start_read() as it may fail due to a
		 * false locked result (see comment in vma_start_read()). We
		 * can avoid that by using vma_start_read_locked() under the
		 * mmap_lock, which guarantees that nobody can lock the
		 * vma for write (vma_start_write()) under us.
		 */
		BUG_ON(!vma_start_read_locked(vma));

		mmap_read_unlock(mm);
	}

	file = vma->vm_file;
	if (!file)
		goto out;

	if ((vma->vm_flags & VM_EXEC)) {
		char buf[64];
		const int bufsize = sizeof(buf);
		char *path;

		memset(buf, 0, bufsize);
		path = d_path(&file->f_path, buf, bufsize);

		/*
		 * Depending on the interpreter requested, valid paths could be any of:
		 * 1. /system/bin/bootstrap/linker64
		 * 2. /system/bin/linker64
		 * 3. /apex/com.android.runtime/bin/linker64
		 *
		 * Check the base name (linker64).
		 */
		if (!strcmp(kbasename(path), "linker64")) {
			vma_end_read(vma);
			return true;
		}
	}
out:
	vma_end_read(vma);
	return false;
}

#else /* CONFIG_PER_VMA_LOCK */

static inline bool linker_ctx(void) { return false; }

#endif /* CONFIG_PER_VMA_LOCK */

/*
 * Saves the number of padding pages for an ELF segment mapping
 * in vm_flags.
 *
 * The number of padding pages is deduced from the madvise DONTNEED range [start, end)
 * if the following conditions are met:
 * 1) The range is enclosed by a single VMA
 * 2) The range ends at the end address of the VMA
 * 3) The range starts at an address greater than the start address of the VMA
 * 4) The number of pages in the range does not exceed VM_TOTAL_PAD_PAGES.
 * 5) The VMA is a file-backed VMA.
 * 6) The file backing the VMA is a shared library (*.so)
 * 7) The madvise was requested by bionic's dynamic linker.
 */
void madvise_vma_pad_pages(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
{
	unsigned long nr_pad_pages;

	if (!is_pgsize_migration_enabled())
		return;

	/*
	 * If the madvise range is at the end of the VMA, save the number of
	 * pages in vm_flags (only 4 bits are needed for up to 64kB-aligned ELFs).
	 */
	if (start <= vma->vm_start || end != vma->vm_end)
		return;

	nr_pad_pages = (end - start) >> PAGE_SHIFT;

	if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES)
		return;

	/* Only handle this for file backed VMAs */
	if (!vma->vm_file)
		return;

	/* Limit this to only shared libraries (*.so) */
	if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so"))
		return;

	/* Only bionic's dynamic linker needs to hint padding pages. */
	if (!linker_ctx())
		return;

	vma_set_pad_pages(vma, nr_pad_pages);
}
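
/*
 * Illustrative scenario (not from the original source): when a hypothetical
 * /system/lib64/libfoo.so built with 16kB-aligned LOAD segments is mapped on
 * a 4kB kernel, the dynamic linker extends each segment mapping to the next
 * 16kB boundary and calls madvise(MADV_DONTNEED) on the trailing padding,
 * e.g. the last 12kB of the mapping. That range satisfies the conditions
 * above, so nr_pad_pages = 3 is recorded in the VMA's vm_flags.
 */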

static const char *pad_vma_name(struct vm_area_struct *vma)
{
	return "[page size compat]";
}

static const struct vm_operations_struct pad_vma_ops = {
	.name = pad_vma_name,
};

/*
 * Initialize @pad VMA fields with information from the original @vma.
 */
static void init_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad)
{
	memcpy(pad, vma, sizeof(struct vm_area_struct));

	/* Remove file */
	pad->vm_file = NULL;

	/* Add vm_ops->name */
	pad->vm_ops = &pad_vma_ops;

	/* Adjust the start to begin at the start of the padding section */
	pad->vm_start = VMA_PAD_START(pad);

	/*
	 * The below modifications to vm_flags don't need the mmap write lock,
	 * since pad does not belong to the VMA tree.
	 */
	/* Make the pad vma PROT_NONE */
	__vm_flags_mod(pad, 0, VM_READ|VM_WRITE|VM_EXEC);
	/* Remove padding bits */
	__vm_flags_mod(pad, 0, VM_PAD_MASK);
}

/*
 * Calls the show_pad_maps_fn or show_pad_smaps_fn @func on the @pad VMA.
 */
void show_map_pad_vma(struct vm_area_struct *vma, struct seq_file *m,
		      void *func, bool smaps)
{
	if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
		return;

	struct vm_area_struct pad;

	init_pad_vma(vma, &pad);

	/* The pad VMA should be anonymous. */
	BUG_ON(pad.vm_file);

	/* The pad VMA should be PROT_NONE. */
	BUG_ON(pad.vm_flags & (VM_READ|VM_WRITE|VM_EXEC));

	/* The pad VMA itself cannot have padding; infinite recursion */
	BUG_ON(pad.vm_flags & VM_PAD_MASK);

	if (smaps)
		((show_pad_smaps_fn)func)(m, &pad);
	else
		((show_pad_maps_fn)func)(m, &pad);
}
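
/*
 * Illustrative /proc/<pid>/maps output (addresses and library name are
 * hypothetical, not from the original source): the padding portion of an
 * extended LOAD segment is reported as a separate anonymous PROT_NONE
 * entry named via pad_vma_ops, roughly:
 *
 *   7f1234560000-7f1234565000 r-xp 00001000 fe:00 1234  /system/lib64/libfoo.so
 *   7f1234565000-7f1234568000 ---p 00000000 00:00 0     [page size compat]
 */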

/*
 * When splitting a padding VMA there are a couple of cases to handle.
 *
 * Given:
 *
 * | DDDDPPPP |
 *
 * where:
 * - D represents 1 page of data;
 * - P represents 1 page of padding;
 * - | represents the boundaries (start/end) of the VMA
 *
 *
 * 1) Split exactly at the padding boundary
 *
 * | DDDDPPPP | --> | DDDD | PPPP |
 *
 * - Remove padding flags from the first VMA.
 * - The second VMA is all padding
 *
 * 2) Split within the padding area
 *
 * | DDDDPPPP | --> | DDDDPP | PP |
 *
 * - Subtract the length of the second VMA from the first VMA's padding.
 * - The second VMA is all padding, adjust its padding length (flags)
 *
 * 3) Split within the data area
 *
 * | DDDDPPPP | --> | DD | DDPPPP |
 *
 * - Remove padding flags from the first VMA.
 * - The second VMA has the same padding as before the split.
 */
void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new,
		   unsigned long addr, int new_below)
{
	unsigned long nr_pad_pages = vma_pad_pages(vma);
	unsigned long nr_vma2_pages;
	struct vm_area_struct *first;
	struct vm_area_struct *second;

	if (!nr_pad_pages)
		return;

	if (new_below) {
		first = new;
		second = vma;
	} else {
		first = vma;
		second = new;
	}

	nr_vma2_pages = vma_pages(second);

	if (nr_vma2_pages >= nr_pad_pages) { /* Case 1 & 3 */
		vma_set_pad_pages(first, 0);
		vma_set_pad_pages(second, nr_pad_pages);
	} else { /* Case 2 */
		vma_set_pad_pages(first, nr_pad_pages - nr_vma2_pages);
		vma_set_pad_pages(second, nr_vma2_pages);
	}
}
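
/*
 * Worked example (illustrative, not from the original source): for
 * | DDDDPPPP | with nr_pad_pages = 4, a split two pages before the end
 * (case 2) gives nr_vma2_pages = 2 < 4, so the first VMA keeps
 * 4 - 2 = 2 pad pages and the second VMA is recorded as 2 pages of
 * padding. A split four or more pages before the end (cases 1 and 3)
 * clears the first VMA's padding and moves all 4 pad pages to the second.
 */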

/*
 * Merging of padding VMAs is uncommon, as padding is only allowed
 * from the linker context.
 *
 * To simplify the semantics, adjacent VMAs with padding are not
 * allowed to merge.
 */
bool is_mergable_pad_vma(struct vm_area_struct *vma,
			 unsigned long vm_flags)
{
	/* Padding VMAs cannot be merged with other padding or real VMAs */
	return !((vma->vm_flags | vm_flags) & VM_PAD_MASK);
}

unsigned long vma_data_pages(struct vm_area_struct *vma)
{
	return vma_pages(vma) - vma_pad_pages(vma);
}

#endif /* PAGE_SIZE == SZ_4K */
#endif /* CONFIG_64BIT */