// SPDX-License-Identifier: GPL-2.0
/*
 * Page Size Migration
 *
 * This file contains the core logic of mitigations to ensure
 * app compatibility during the transition from 4kB to 16kB
 * page size in Android.
 *
 * Copyright (c) 2024, Google LLC.
 * Author: Kalesh Singh <kaleshsingh@google.com>
 */

#include <linux/pgsize_migration.h>

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/kstrtox.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/sysfs.h>

#ifdef CONFIG_64BIT
#if PAGE_SIZE == SZ_4K
DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() (static_branch_likely(&pgsize_migration_enabled))
#else /* PAGE_SIZE != SZ_4K */
DEFINE_STATIC_KEY_FALSE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() (static_branch_unlikely(&pgsize_migration_enabled))
#endif /* PAGE_SIZE == SZ_4K */
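
/*
 * The static key compiles the common case for a given kernel page size into
 * a patched branch, so the is_pgsize_migration_enabled() checks on the
 * madvise and /proc show paths cost essentially nothing: the key defaults to
 * true on 4kB kernels and false otherwise, and can be flipped at runtime via
 * the sysfs knob below.
 */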

static ssize_t show_pgsize_migration_enabled(struct kobject *kobj,
                                             struct kobj_attribute *attr,
                                             char *buf)
{
        if (is_pgsize_migration_enabled())
                return sprintf(buf, "%d\n", 1);
        else
                return sprintf(buf, "%d\n", 0);
}

static ssize_t store_pgsize_migration_enabled(struct kobject *kobj,
                                              struct kobj_attribute *attr,
                                              const char *buf, size_t n)
{
        unsigned long val;

        /* Migration is only applicable to 4kB kernels */
        if (PAGE_SIZE != SZ_4K)
                return n;

        if (kstrtoul(buf, 10, &val))
                return -EINVAL;

        if (val > 1)
                return -EINVAL;

        if (val == 1)
                static_branch_enable(&pgsize_migration_enabled);
        else if (val == 0)
                static_branch_disable(&pgsize_migration_enabled);

        return n;
}

static struct kobj_attribute pgsize_migration_enabled_attr = __ATTR(
        enabled,
        0644,
        show_pgsize_migration_enabled,
        store_pgsize_migration_enabled
);

static struct attribute *pgsize_migration_attrs[] = {
        &pgsize_migration_enabled_attr.attr,
        NULL
};

static struct attribute_group pgsize_migration_attr_group = {
        .name = "pgsize_migration",
        .attrs = pgsize_migration_attrs,
};

/**
 * What:          /sys/kernel/mm/pgsize_migration/enabled
 * Date:          April 2024
 * KernelVersion: v5.4+ (GKI kernels)
 * Contact:       Kalesh Singh <kaleshsingh@google.com>
 * Description:   /sys/kernel/mm/pgsize_migration/enabled
 *                allows for userspace to turn on or off page size
 *                migration mitigations necessary for app compatibility
 *                during Android's transition from 4kB to 16kB page size.
 *                Such mitigations include preserving /proc/<pid>/[s]maps
 *                output as if there was no segment extension by the
 *                dynamic loader; and preventing fault around in the padding
 *                sections of ELF LOAD segment mappings.
 * Users:         Bionic's dynamic linker
 */
static int __init init_pgsize_migration(void)
{
        if (sysfs_create_group(mm_kobj, &pgsize_migration_attr_group))
                pr_err("pgsize_migration: failed to create sysfs group\n");

        return 0;
}
late_initcall(init_pgsize_migration);
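
/*
 * Illustrative use of the sysfs knob from userspace (not part of this file):
 *
 *   echo 0 > /sys/kernel/mm/pgsize_migration/enabled   # disable mitigations
 *   cat /sys/kernel/mm/pgsize_migration/enabled        # prints 0 or 1
 *
 * Writes other than "0" or "1" fail with -EINVAL; on non-4kB kernels writes
 * are silently accepted but ignored (see store_pgsize_migration_enabled()).
 */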

#if PAGE_SIZE == SZ_4K
void vma_set_pad_pages(struct vm_area_struct *vma,
                       unsigned long nr_pages)
{
        if (!is_pgsize_migration_enabled())
                return;

        vma->vm_flags &= ~VM_PAD_MASK;
        vma->vm_flags |= (nr_pages << VM_PAD_SHIFT);
}

unsigned long vma_pad_pages(struct vm_area_struct *vma)
{
        if (!is_pgsize_migration_enabled())
                return 0;

        return vma->vm_flags >> VM_PAD_SHIFT;
}
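
/*
 * Worked example (illustrative; VM_PAD_SHIFT and VM_PAD_MASK come from
 * <linux/pgsize_migration.h>): with 3 pages of padding,
 *
 *   vma_set_pad_pages(vma, 3);   // vm_flags |= 3UL << VM_PAD_SHIFT
 *   vma_pad_pages(vma);          // returns 3
 *
 * Because vma_pad_pages() only shifts and never masks, the pad count is
 * assumed to occupy the topmost vm_flags bits, bounded by VM_TOTAL_PAD_PAGES.
 */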

static __always_inline bool str_has_suffix(const char *str, const char *suffix)
{
        size_t str_len = strlen(str);
        size_t suffix_len = strlen(suffix);

        if (str_len < suffix_len)
                return false;

        return !strncmp(str + str_len - suffix_len, suffix, suffix_len);
}
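
/*
 * For example, str_has_suffix("libfoo.so", ".so") is true and
 * str_has_suffix("linker64", ".so") is false.
 */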

/*
 * The dynamic linker, or interpreter, operates within the process context
 * of the binary that necessitated dynamic linking.
 *
 * Consequently, process context identifiers (PID, comm, ...) cannot be used
 * to differentiate whether the execution context belongs to the dynamic
 * linker or not.
 *
 * linker_ctx() deduces whether execution is currently in the dynamic linker's
 * context by correlating the current userspace instruction pointer with the
 * VMAs of the current task.
 *
 * Returns true if in linker context, otherwise false.
 *
 * Caller must hold mmap lock in read mode.
 */
static inline bool linker_ctx(void)
{
        struct pt_regs *regs = task_pt_regs(current);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct file *file;

        if (!regs)
                return false;

        vma = find_vma(mm, instruction_pointer(regs));

        /* Current execution context, the VMA must be present */
        BUG_ON(!vma);

        file = vma->vm_file;
        if (!file)
                return false;

        if ((vma->vm_flags & VM_EXEC)) {
                char buf[64];
                const int bufsize = sizeof(buf);
                char *path;

                memset(buf, 0, bufsize);
                path = d_path(&file->f_path, buf, bufsize);

                /* d_path() returns an ERR_PTR if the path does not fit in buf */
                if (IS_ERR(path))
                        return false;

                if (!strcmp(path, "/system/bin/linker64"))
                        return true;
        }

        return false;
}

/*
 * Saves the number of padding pages for an ELF segment mapping
 * in vm_flags.
 *
 * The number of padding pages is deduced from the madvise DONTNEED range [start, end)
 * if the following conditions are met:
 * 1) The range is enclosed by a single VMA
 * 2) The range ends at the end address of the VMA
 * 3) The range starts at an address greater than the start address of the VMA
 * 4) The number of pages in the range does not exceed VM_TOTAL_PAD_PAGES.
 * 5) The VMA is a file backed VMA.
 * 6) The file backing the VMA is a shared library (*.so)
 * 7) The madvise was requested by bionic's dynamic linker.
 */
void madvise_vma_pad_pages(struct vm_area_struct *vma,
                           unsigned long start, unsigned long end)
{
        unsigned long nr_pad_pages;

        if (!is_pgsize_migration_enabled())
                return;

        /*
         * Only act if the madvise range covers the tail end of the VMA; save
         * the number of padding pages in vm_flags (only 4 bits are needed
         * for up to 64kB aligned ELFs).
         */
        if (start <= vma->vm_start || end != vma->vm_end)
                return;

        nr_pad_pages = (end - start) >> PAGE_SHIFT;

        if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES)
                return;

        /* Only handle this for file backed VMAs */
        if (!vma->vm_file)
                return;

        /* Limit this to only shared libraries (*.so) */
        if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so"))
                return;

        /* Only bionic's dynamic linker needs to hint padding pages. */
        if (!linker_ctx())
                return;

        vma_set_pad_pages(vma, nr_pad_pages);
}
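
/*
 * Illustrative call sequence (not taken from this file): after mmap()ing an
 * ELF LOAD segment whose mapping was extended up to the next 16kB boundary,
 * bionic's dynamic linker is expected to issue something like
 *
 *   madvise(seg_file_end, seg_ext_end - seg_file_end, MADV_DONTNEED);
 *
 * where seg_file_end/seg_ext_end are hypothetical names for the original and
 * extended end of the segment. That call reaches madvise_vma_pad_pages()
 * with [start, end) covering only the padding tail of the segment's VMA,
 * satisfying conditions 1-7 above.
 */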

static const char *pad_vma_name(struct vm_area_struct *vma)
{
        return "[page size compat]";
}

static const struct vm_operations_struct pad_vma_ops = {
        .name = pad_vma_name,
};
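
/*
 * The .name callback is what makes the synthetic padding VMA appear as
 * "[page size compat]" in /proc/<pid>/maps and /proc/<pid>/smaps output.
 */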

/*
 * Returns a new VMA representing the padding in @vma;
 * returns NULL if @vma has no padding.
 */
struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *pad;

        if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
                return NULL;

        pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);

        *pad = *vma;

        /* Remove file */
        pad->vm_file = NULL;

        /* Add vm_ops->name */
        pad->vm_ops = &pad_vma_ops;

        /* Adjust the start to begin at the start of the padding section */
        pad->vm_start = VMA_PAD_START(pad);

        /* Make the pad vma PROT_NONE */
        pad->vm_flags &= ~(VM_READ|VM_WRITE|VM_EXEC);

        /* Remove padding bits */
        pad->vm_flags &= ~VM_PAD_MASK;

        return pad;
}

/*
 * Returns a new VMA excluding the padding from @vma;
 * if @vma has no padding, returns @vma.
 */
struct vm_area_struct *get_data_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *data;

        if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
                return vma;

        data = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);

        *data = *vma;

        /* Adjust the end to the start of the padding section */
        data->vm_end = VMA_PAD_START(data);

        return data;
}

/*
 * Calls the show_pad_vma_fn on the @pad VMA, and frees the copies of @vma
 * and @pad.
 */
void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad,
                      struct seq_file *m, show_pad_vma_fn func)
{
        if (!pad)
                return;

        /*
         * This cannot happen. If @pad vma was allocated the corresponding
         * @vma should have the VM_PAD_MASK bit(s) set.
         */
        BUG_ON(!(vma->vm_flags & VM_PAD_MASK));

        /*
         * This cannot happen. @pad is a section of the original VMA.
         * Therefore @vma cannot be null if @pad is not null.
         */
        BUG_ON(!vma);

        func(m, pad);

        kfree(pad);
        kfree(vma);
}
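
/*
 * Rough usage sketch (illustrative; the actual callers live in fs/proc):
 *
 *   pad = get_pad_vma(vma);                  // NULL if @vma has no padding
 *   vma = get_data_vma(vma);                 // @vma, or a copy without the padding
 *   ... show the (data) VMA as usual ...
 *   show_map_pad_vma(vma, pad, m, show_fn);  // shows the padding, frees the copies
 *
 * show_fn here stands in for whatever show_pad_vma_fn the caller passes.
 */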

/*
 * When splitting a padding VMA there are a couple of cases to handle.
 *
 * Given:
 *
 * | DDDDPPPP |
 *
 * where:
 * - D represents 1 page of data;
 * - P represents 1 page of padding;
 * - | represents the boundaries (start/end) of the VMA
 *
 *
 * 1) Split exactly at the padding boundary
 *
 *    | DDDDPPPP | --> | DDDD | PPPP |
 *
 *    - Remove padding flags from the first VMA.
 *    - The second VMA is all padding
 *
 * 2) Split within the padding area
 *
 *    | DDDDPPPP | --> | DDDDPP | PP |
 *
 *    - Subtract the length of the second VMA from the first VMA's padding.
 *    - The second VMA is all padding, adjust its padding length (flags)
 *
 * 3) Split within the data area
 *
 *    | DDDDPPPP | --> | DD | DDPPPP |
 *
 *    - Remove padding flags from the first VMA.
 *    - The second VMA has the same padding as before the split.
 */
void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new,
                   unsigned long addr, int new_below)
{
        unsigned long nr_pad_pages = vma_pad_pages(vma);
        unsigned long nr_vma2_pages;
        struct vm_area_struct *first;
        struct vm_area_struct *second;

        if (!nr_pad_pages)
                return;

        if (new_below) {
                first = new;
                second = vma;
        } else {
                first = vma;
                second = new;
        }

        nr_vma2_pages = vma_pages(second);

        if (nr_vma2_pages == nr_pad_pages) {            /* Case 1 */
                first->vm_flags &= ~VM_PAD_MASK;
                vma_set_pad_pages(second, nr_pad_pages);
        } else if (nr_vma2_pages < nr_pad_pages) {      /* Case 2 */
                vma_set_pad_pages(first, nr_pad_pages - nr_vma2_pages);
                vma_set_pad_pages(second, nr_vma2_pages);
        } else {                                        /* Case 3 */
                first->vm_flags &= ~VM_PAD_MASK;
                vma_set_pad_pages(second, nr_pad_pages);
        }
}
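
/*
 * Worked example for case 2 (numbers illustrative): a VMA with 4 data pages
 * and 4 padding pages that is split 2 pages before its end gives
 * nr_pad_pages = 4 and nr_vma2_pages = 2, so the first VMA keeps
 * 4 - 2 = 2 padding pages and the second VMA is marked as 2 pages of
 * pure padding.
 */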
#endif /* PAGE_SIZE == SZ_4K */
#endif /* CONFIG_64BIT */