// SPDX-License-Identifier: GPL-2.0
/*
 * Page Size Migration
 *
 * This file contains the core logic of mitigations to ensure
 * app compatibility during the transition from 4kB to 16kB
 * page size in Android.
 *
 * Copyright (c) 2024, Google LLC.
 * Author: Kalesh Singh <kaleshsingh@google.com>
 */

#include <linux/pgsize_migration.h>

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/kstrtox.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>

typedef void (*show_pad_maps_fn)	(struct seq_file *m, struct vm_area_struct *vma);
typedef int  (*show_pad_smaps_fn)	(struct seq_file *m, void *v);

#ifdef CONFIG_64BIT
#if PAGE_SIZE == SZ_4K
DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() 	(static_branch_likely(&pgsize_migration_enabled))
#else /* PAGE_SIZE != SZ_4K */
DEFINE_STATIC_KEY_FALSE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() 	(static_branch_unlikely(&pgsize_migration_enabled))
#endif /* PAGE_SIZE == SZ_4K */

static ssize_t show_pgsize_migration_enabled(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	if (is_pgsize_migration_enabled())
		return sprintf(buf, "%d\n", 1);
	else
		return sprintf(buf, "%d\n", 0);
}

static ssize_t store_pgsize_migration_enabled(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t n)
{
	unsigned long val;

	/* Migration is only applicable to 4kB kernels */
	if (PAGE_SIZE != SZ_4K)
		return n;

	if (kstrtoul(buf, 10, &val))
		return -EINVAL;

	if (val > 1)
		return -EINVAL;

	if (val == 1)
		static_branch_enable(&pgsize_migration_enabled);
	else if (val == 0)
		static_branch_disable(&pgsize_migration_enabled);

	return n;
}

static struct kobj_attribute pgsize_migration_enabled_attr = __ATTR(
	enabled,
	0644,
	show_pgsize_migration_enabled,
	store_pgsize_migration_enabled
);

static struct attribute *pgsize_migration_attrs[] = {
	&pgsize_migration_enabled_attr.attr,
	NULL
};

static struct attribute_group pgsize_migration_attr_group = {
	.name = "pgsize_migration",
	.attrs = pgsize_migration_attrs,
};

/**
 * What:          /sys/kernel/mm/pgsize_migration/enabled
 * Date:          April 2024
 * KernelVersion: v5.4+ (GKI kernels)
 * Contact:       Kalesh Singh <kaleshsingh@google.com>
 * Description:   /sys/kernel/mm/pgsize_migration/enabled
 *                allows userspace to turn on or off the page size
 *                migration mitigations necessary for app compatibility
 *                during Android's transition from 4kB to 16kB page size.
 *                Such mitigations include preserving /proc/<pid>/[s]maps
 *                output as if there were no segment extension by the
 *                dynamic loader, and preventing fault-around in the padding
 *                sections of ELF LOAD segment mappings.
 * Users:         Bionic's dynamic linker
 */
static int __init init_pgsize_migration(void)
{
	if (sysfs_create_group(mm_kobj, &pgsize_migration_attr_group))
		pr_err("pgsize_migration: failed to create sysfs group\n");

	return 0;
}
late_initcall(init_pgsize_migration);
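
/*
 * Illustrative usage of the knob registered above: userspace (per the ABI
 * description, Bionic's dynamic linker) is expected to toggle the mitigation
 * via sysfs, e.g. from a root shell:
 *
 *     echo 1 > /sys/kernel/mm/pgsize_migration/enabled    (enable mitigations)
 *     echo 0 > /sys/kernel/mm/pgsize_migration/enabled    (disable mitigations)
 *     cat /sys/kernel/mm/pgsize_migration/enabled         (read back 0 or 1)
 *
 * Any other value written is rejected with -EINVAL by
 * store_pgsize_migration_enabled(); on non-4kB kernels writes succeed but
 * are ignored.
 */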

#if PAGE_SIZE == SZ_4K
void vma_set_pad_pages(struct vm_area_struct *vma,
		       unsigned long nr_pages)
{
	if (!is_pgsize_migration_enabled())
		return;

	/*
	 * Usually, to modify vm_flags we need to take the mmap_lock exclusively,
	 * but here we only hold it in read mode, to avoid all DONTNEED/DONTNEED_LOCKED
	 * calls needing the write lock.
	 *
	 * A race to the flags update can only happen with another MADV_DONTNEED on
	 * the same process and same range (VMA).
	 *
	 * In practice, this specific scenario is not possible because the action that
	 * could cause it is usually performed at most once per VMA and only by the
	 * dynamic linker.
	 *
	 * Forego protection for this case, to avoid penalties in the common cases.
	 */
	__vm_flags_mod(vma, 0, VM_PAD_MASK);
	__vm_flags_mod(vma, nr_pages << VM_PAD_SHIFT, 0);
}

unsigned long vma_pad_pages(struct vm_area_struct *vma)
{
	if (!is_pgsize_migration_enabled())
		return 0;

	return vma->vm_flags >> VM_PAD_SHIFT;
}
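
/*
 * Worked example for the encoding used by vma_set_pad_pages()/vma_pad_pages()
 * (illustrative numbers): the pad page count lives in the VM_PAD_MASK bits of
 * vm_flags. For an ELF segment whose mapping was extended by 3 pages of
 * padding (12kB on a 4kB-page kernel):
 *
 *     vma_set_pad_pages(vma, 3);    stores 3 << VM_PAD_SHIFT into vm_flags
 *     vma_pad_pages(vma);           returns 3
 *
 * VM_PAD_SHIFT, VM_PAD_MASK and VM_TOTAL_PAD_PAGES are assumed to be provided
 * by <linux/pgsize_migration.h>, included at the top of this file.
 */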

static __always_inline bool str_has_suffix(const char *str, const char *suffix)
{
	size_t str_len = strlen(str);
	size_t suffix_len = strlen(suffix);

	if (str_len < suffix_len)
		return false;

	return !strncmp(str + str_len - suffix_len, suffix, suffix_len);
}

/*
 * The dynamic linker, or interpreter, operates within the process context
 * of the binary that necessitated dynamic linking.
 *
 * Consequently, process context identifiers (PID, comm, ...) cannot be used
 * to differentiate whether the execution context belongs to the dynamic
 * linker or not.
 *
 * linker_ctx() deduces whether execution is currently in the dynamic linker's
 * context by correlating the current userspace instruction pointer with the
 * VMAs of the current task.
 *
 * Returns true if in linker context, otherwise false.
 *
 * Caller must hold mmap lock in read mode.
 */
static inline bool linker_ctx(void)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct file *file;

	if (!regs)
		return false;

	vma = find_vma(mm, instruction_pointer(regs));

	/* This is the current execution context, so the VMA must be present */
	BUG_ON(!vma);

	file = vma->vm_file;
	if (!file)
		return false;

	if ((vma->vm_flags & VM_EXEC)) {
		char buf[64];
		const int bufsize = sizeof(buf);
		char *path;

		memset(buf, 0, bufsize);
		path = d_path(&file->f_path, buf, bufsize);
		/* d_path() returns an ERR_PTR() if the path does not fit in buf */
		if (IS_ERR(path))
			return false;

		/*
		 * Depending on the interpreter requested, valid paths could be any of:
		 *   1. /system/bin/bootstrap/linker64
		 *   2. /system/bin/linker64
		 *   3. /apex/com.android.runtime/bin/linker64
		 *
		 * Check the base name (linker64).
		 */
		if (!strcmp(kbasename(path), "linker64"))
			return true;
	}

	return false;
}

/*
 * Saves the number of padding pages for an ELF segment mapping
 * in vm_flags.
 *
 * The number of padding pages is deduced from the madvise DONTNEED range [start, end)
 * if the following conditions are met:
 *    1) The range is enclosed by a single VMA
 *    2) The range ends at the end address of the VMA
 *    3) The range starts at an address greater than the start address of the VMA
 *    4) The number of pages in the range does not exceed VM_TOTAL_PAD_PAGES.
 *    5) The VMA is a file backed VMA.
 *    6) The file backing the VMA is a shared library (*.so)
 *    7) The madvise was requested by bionic's dynamic linker.
 */
void madvise_vma_pad_pages(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
{
	unsigned long nr_pad_pages;

	if (!is_pgsize_migration_enabled())
		return;

	/*
	 * If the madvise range is at the end of the VMA, save the number of
	 * pages in vm_flags (only 4 bits are needed for up to 64kB aligned ELFs).
	 */
	if (start <= vma->vm_start || end != vma->vm_end)
		return;

	nr_pad_pages = (end - start) >> PAGE_SHIFT;

	if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES)
		return;

	/* Only handle this for file backed VMAs */
	if (!vma->vm_file)
		return;

	/* Limit this to only shared libraries (*.so) */
	if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so"))
		return;

	/* Only bionic's dynamic linker needs to hint padding pages. */
	if (!linker_ctx())
		return;

	vma_set_pad_pages(vma, nr_pad_pages);
}
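
/*
 * Illustrative userspace counterpart (hypothetical variable names, deduced
 * from the conditions above): after extending a LOAD segment mapping of a
 * *.so to the next 16kB boundary on a 4kB kernel, Bionic's linker is expected
 * to mark the padding with something like:
 *
 *     madvise(pad_start, pad_len, MADV_DONTNEED);
 *
 * where [pad_start, pad_start + pad_len) is the tail of that segment's VMA,
 * spans at most VM_TOTAL_PAD_PAGES pages, and the call is issued from
 * linker64's own code so that linker_ctx() accepts it.
 */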

static const char *pad_vma_name(struct vm_area_struct *vma)
{
	return "[page size compat]";
}

static const struct vm_operations_struct pad_vma_ops = {
	.name = pad_vma_name,
};

/*
 * Returns a new VMA representing the padding in @vma; if there is no padding
 * in @vma, returns NULL.
 */
struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *pad;

	if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
		return NULL;

	pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
	if (!pad)
		return NULL;

	memcpy(pad, vma, sizeof(struct vm_area_struct));

	/* Remove file */
	pad->vm_file = NULL;

	/* Add vm_ops->name */
	pad->vm_ops = &pad_vma_ops;

	/* Adjust the start to begin at the start of the padding section */
	pad->vm_start = VMA_PAD_START(pad);

	/*
	 * The below modifications to vm_flags don't need the mmap write lock,
	 * since pad does not belong to the VMA tree.
	 */
	/* Make the pad vma PROT_NONE */
	__vm_flags_mod(pad, 0, VM_READ|VM_WRITE|VM_EXEC);
	/* Remove padding bits */
	__vm_flags_mod(pad, 0, VM_PAD_MASK);

	return pad;
}

/*
 * Returns a new VMA excluding the padding from @vma; if there is no padding
 * in @vma, returns @vma.
 */
struct vm_area_struct *get_data_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *data;

	if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
		return vma;

	data = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);

	memcpy(data, vma, sizeof(struct vm_area_struct));

	/* Adjust the end to the start of the padding section */
	data->vm_end = VMA_PAD_START(data);

	return data;
}

/*
 * Calls the show_pad_[s]maps_fn on the @pad VMA, and frees the copies of
 * @vma and @pad.
 */
void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad,
		      struct seq_file *m, void *func, bool smaps)
{
	if (!pad)
		return;

	/*
	 * This cannot happen. @pad is a section of the original VMA.
	 * Therefore @vma cannot be null if @pad is not null.
	 */
	BUG_ON(!vma);

	/*
	 * This cannot happen. If the @pad vma was allocated, the corresponding
	 * @vma should have the VM_PAD_MASK bit(s) set.
	 */
	BUG_ON(!(vma->vm_flags & VM_PAD_MASK));

	if (smaps)
		((show_pad_smaps_fn)func)(m, pad);
	else
		((show_pad_maps_fn)func)(m, pad);

	kfree(pad);
	kfree(vma);
}
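
/*
 * Net effect on /proc/<pid>/[s]maps (illustrative, library name hypothetical):
 * a padded mapping of libfoo.so is reported as two entries, roughly:
 *
 *     ... r-xp ... /path/to/libfoo.so      data part, still file backed
 *     ... ---p ... [page size compat]      padding part, PROT_NONE, no file
 *
 * so the file-backed portion appears unextended, in line with the ABI
 * description near the top of this file.
 */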

/*
 * When splitting a padding VMA there are three cases to handle.
 *
 * Given:
 *
 *     | DDDDPPPP |
 *
 * where:
 *     - D represents 1 page of data;
 *     - P represents 1 page of padding;
 *     - | represents the boundaries (start/end) of the VMA
 *
 *
 * 1) Split exactly at the padding boundary
 *
 *     | DDDDPPPP | --> | DDDD | PPPP |
 *
 *     - Remove padding flags from the first VMA.
 *     - The second VMA is all padding.
 *
 * 2) Split within the padding area
 *
 *     | DDDDPPPP | --> | DDDDPP | PP |
 *
 *     - Subtract the length of the second VMA from the first VMA's padding.
 *     - The second VMA is all padding, adjust its padding length (flags).
 *
 * 3) Split within the data area
 *
 *     | DDDDPPPP | --> | DD | DDPPPP |
 *
 *     - Remove padding flags from the first VMA.
 *     - The second VMA has the same padding as before the split.
 */
void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new,
		   unsigned long addr, int new_below)
{
	unsigned long nr_pad_pages = vma_pad_pages(vma);
	unsigned long nr_vma2_pages;
	struct vm_area_struct *first;
	struct vm_area_struct *second;

	if (!nr_pad_pages)
		return;

	if (new_below) {
		first = new;
		second = vma;
	} else {
		first = vma;
		second = new;
	}

	nr_vma2_pages = vma_pages(second);

	if (nr_vma2_pages >= nr_pad_pages) { 			/* Cases 1 & 3 */
		vm_flags_clear(first, VM_PAD_MASK);
		vma_set_pad_pages(second, nr_pad_pages);
	} else {						/* Case 2 */
		vma_set_pad_pages(first, nr_pad_pages - nr_vma2_pages);
		vma_set_pad_pages(second, nr_vma2_pages);
	}
}
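
/*
 * Worked example (numbers taken from the | DDDDPPPP | diagram above, i.e.
 * 4 data pages and 4 padding pages, so nr_pad_pages == 4):
 *
 *     Case 1: | DDDD | PPPP |  second spans 4 >= 4 pages: first loses its
 *             padding flags, second is set to 4 pad pages.
 *     Case 2: | DDDDPP | PP |  second spans 2 <  4 pages: first keeps
 *             4 - 2 = 2 pad pages, second is set to 2.
 *     Case 3: | DD | DDPPPP |  second spans 6 >= 4 pages: first loses its
 *             padding flags, second is set to 4 pad pages.
 */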

/*
 * Sets the correct padding bits / flags for a VMA split.
 */
unsigned long vma_pad_fixup_flags(struct vm_area_struct *vma,
				  unsigned long newflags)
{
	if (newflags & VM_PAD_MASK)
		return (newflags & ~VM_PAD_MASK) | (vma->vm_flags & VM_PAD_MASK);
	else
		return newflags;
}

/*
 * Merging of padding VMAs is uncommon, as padding is only allowed
 * from the linker context.
 *
 * To simplify the semantics, adjacent VMAs with padding are not
 * allowed to merge.
 */
bool is_mergable_pad_vma(struct vm_area_struct *vma,
			 unsigned long vm_flags)
{
	/* Padding VMAs cannot be merged with other padding or real VMAs */
	return !((vma->vm_flags | vm_flags) & VM_PAD_MASK);
}

unsigned long vma_data_pages(struct vm_area_struct *vma)
{
	return vma_pages(vma) - vma_pad_pages(vma);
}

#endif /* PAGE_SIZE == SZ_4K */
#endif /* CONFIG_64BIT */