1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2009 Red Hat, Inc.
4 */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/mm.h>
9 #include <linux/sched.h>
10 #include <linux/sched/mm.h>
11 #include <linux/sched/coredump.h>
12 #include <linux/sched/numa_balancing.h>
13 #include <linux/highmem.h>
14 #include <linux/hugetlb.h>
15 #include <linux/mmu_notifier.h>
16 #include <linux/rmap.h>
17 #include <linux/swap.h>
18 #include <linux/shrinker.h>
19 #include <linux/mm_inline.h>
20 #include <linux/swapops.h>
21 #include <linux/backing-dev.h>
22 #include <linux/dax.h>
23 #include <linux/khugepaged.h>
24 #include <linux/freezer.h>
25 #include <linux/pfn_t.h>
26 #include <linux/mman.h>
27 #include <linux/memremap.h>
28 #include <linux/pagemap.h>
29 #include <linux/debugfs.h>
30 #include <linux/migrate.h>
31 #include <linux/hashtable.h>
32 #include <linux/userfaultfd_k.h>
33 #include <linux/page_idle.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/oom.h>
36 #include <linux/numa.h>
37 #include <linux/page_owner.h>
38 #include <linux/sched/sysctl.h>
39 #include <linux/memory-tiers.h>
40 #include <linux/compat.h>
41
42 #include <asm/tlb.h>
43 #include <asm/pgalloc.h>
44 #include "internal.h"
45 #include "swap.h"
46
47 #define CREATE_TRACE_POINTS
48 #include <trace/events/thp.h>
49
50 /*
51 * By default, transparent hugepage support is disabled in order to avoid
52 * risking an increased memory footprint for applications that are not
53 * guaranteed to benefit from it. When transparent hugepage support is
54 * enabled, it is for all mappings, and khugepaged scans all mappings.
55 * Defrag is invoked by khugepaged hugepage allocations and by page faults
56 * for all hugepage allocations.
57 */
58 unsigned long transparent_hugepage_flags __read_mostly =
59 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
60 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
61 #endif
62 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
63 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
64 #endif
65 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
66 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
67 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
68
69 static struct shrinker deferred_split_shrinker;
70
71 static atomic_t huge_zero_refcount;
72 struct page *huge_zero_page __read_mostly;
73 unsigned long huge_zero_pfn __read_mostly = ~0UL;
74 unsigned long huge_anon_orders_always __read_mostly;
75 unsigned long huge_anon_orders_madvise __read_mostly;
76 unsigned long huge_anon_orders_inherit __read_mostly;
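/*
 * Each of the three huge_anon_orders_* words above is a bitmap of folio
 * orders: a set bit means anonymous THP of that order is always enabled,
 * enabled only for MADV_HUGEPAGE VMAs, or inherits the global "enabled"
 * setting, respectively. The per-size sysfs "enabled" files below move an
 * order between these bitmaps under huge_anon_orders_lock.
 */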
77
78 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
79 unsigned long vm_flags,
80 unsigned long tva_flags,
81 unsigned long orders)
82 {
83 bool smaps = tva_flags & TVA_SMAPS;
84 bool in_pf = tva_flags & TVA_IN_PF;
85 bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
86 /* Check the intersection of requested and supported orders. */
87 orders &= vma_is_anonymous(vma) ?
88 THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
89 if (!orders)
90 return 0;
91
92 if (!vma->vm_mm) /* vdso */
93 return 0;
94
95 /*
96 * THP can be explicitly disabled through madvise or prctl, and some
97 * architectures may disable THP for certain mappings, for
98 * example, s390 kvm.
99 */
100 if ((vm_flags & VM_NOHUGEPAGE) ||
101 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
102 return 0;
103 /*
104 * The hardware/firmware may have marked hugepage support disabled.
105 */
106 if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
107 return 0;
108
109 /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
110 if (vma_is_dax(vma))
111 return in_pf ? orders : 0;
112
113 /*
114 * Skip special VMAs and hugetlb VMAs that khugepaged must not touch.
115 * Must be checked after dax since some dax mappings may have
116 * VM_MIXEDMAP set.
117 */
118 if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
119 return 0;
120
121 /*
122 * Check alignment for file vma and size for both file and anon vma by
123 * filtering out the unsuitable orders.
124 *
125 * Skip the check for page fault. Huge fault does the check in fault
126 * handlers.
127 */
128 if (!in_pf) {
129 int order = highest_order(orders);
130 unsigned long addr;
131
132 while (orders) {
133 addr = vma->vm_end - (PAGE_SIZE << order);
134 if (thp_vma_suitable_order(vma, addr, order))
135 break;
136 order = next_order(&orders, order);
137 }
138
139 if (!orders)
140 return 0;
141 }
142
143 /*
144 * Enabled via shmem mount options or sysfs settings.
145 * Must be done before hugepage flags check since shmem has its
146 * own flags.
147 */
148 if (!in_pf && shmem_file(vma->vm_file))
149 return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
150 !enforce_sysfs, vma->vm_mm, vm_flags)
151 ? orders : 0;
152
153 if (!vma_is_anonymous(vma)) {
154 /*
155 * Enforce sysfs THP requirements as necessary. Anonymous vmas
156 * were already handled in thp_vma_allowable_orders().
157 */
158 if (enforce_sysfs &&
159 (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
160 !hugepage_global_always())))
161 return 0;
162
163 /*
164 * Trust that ->huge_fault() handlers know what they are doing
165 * in fault path.
166 */
167 if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
168 return orders;
169 /* Only regular file is valid in collapse path */
170 if (((!in_pf || smaps)) && file_thp_enabled(vma))
171 return orders;
172 return 0;
173 }
174
175 if (vma_is_temporary_stack(vma))
176 return 0;
177
178 /*
179 * THPeligible bit of smaps should show 1 for proper VMAs even
180 * though anon_vma is not initialized yet.
181 *
182 * Allow page fault since anon_vma may not be initialized until
183 * the first page fault.
184 */
185 if (!vma->anon_vma)
186 return (smaps || in_pf) ? orders : 0;
187
188 return orders;
189 }
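/*
 * Illustrative only, a sketch rather than a call site copied from the tree:
 * a fault-path caller would typically ask for the allowable anonymous orders
 * and then walk them from largest to smallest with highest_order() and
 * next_order(), e.g.:
 *
 *	unsigned long orders = thp_vma_allowable_orders(vma, vma->vm_flags,
 *					TVA_IN_PF | TVA_ENFORCE_SYSFS,
 *					THP_ORDERS_ALL_ANON);
 *	int order = highest_order(orders);
 *
 *	while (orders) {
 *		if (this order fits the faulting range)
 *			break;
 *		order = next_order(&orders, order);
 *	}
 */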
190
191 static bool get_huge_zero_page(void)
192 {
193 struct page *zero_page;
194 retry:
195 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
196 return true;
197
198 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
199 HPAGE_PMD_ORDER);
200 if (!zero_page) {
201 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
202 return false;
203 }
204 preempt_disable();
205 if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
206 preempt_enable();
207 __free_pages(zero_page, compound_order(zero_page));
208 goto retry;
209 }
210 WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
211
212 /* We take an additional reference here; it will be put back by the shrinker. */
213 atomic_set(&huge_zero_refcount, 2);
214 preempt_enable();
215 count_vm_event(THP_ZERO_PAGE_ALLOC);
216 return true;
217 }
218
219 static void put_huge_zero_page(void)
220 {
221 /*
222 * The counter should never go to zero here; only the shrinker can
223 * put the last reference.
224 */
225 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
226 }
227
228 struct page *mm_get_huge_zero_page(struct mm_struct *mm)
229 {
230 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
231 return READ_ONCE(huge_zero_page);
232
233 if (!get_huge_zero_page())
234 return NULL;
235
236 if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
237 put_huge_zero_page();
238
239 return READ_ONCE(huge_zero_page);
240 }
241
242 void mm_put_huge_zero_page(struct mm_struct *mm)
243 {
244 if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
245 put_huge_zero_page();
246 }
247
248 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
249 struct shrink_control *sc)
250 {
251 /* We can free the zero page only if the cached reference is the last one. */
252 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
253 }
254
255 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
256 struct shrink_control *sc)
257 {
258 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
259 struct page *zero_page = xchg(&huge_zero_page, NULL);
260 BUG_ON(zero_page == NULL);
261 WRITE_ONCE(huge_zero_pfn, ~0UL);
262 __free_pages(zero_page, compound_order(zero_page));
263 return HPAGE_PMD_NR;
264 }
265
266 return 0;
267 }
268
269 static struct shrinker huge_zero_page_shrinker = {
270 .count_objects = shrink_huge_zero_page_count,
271 .scan_objects = shrink_huge_zero_page_scan,
272 .seeks = DEFAULT_SEEKS,
273 };
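/*
 * Lifecycle summary for the huge zero page handled above: the first
 * successful get_huge_zero_page() allocates it with a refcount of 2, one
 * reference for the caller and one cached for the shrinker. Each mm that
 * uses it holds a single extra reference, tracked by its MMF_HUGE_ZERO_PAGE
 * bit, and the shrinker only frees the page once the cached reference is
 * the last one left.
 */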
274
275 #ifdef CONFIG_SYSFS
276 static ssize_t enabled_show(struct kobject *kobj,
277 struct kobj_attribute *attr, char *buf)
278 {
279 const char *output;
280
281 if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
282 output = "[always] madvise never";
283 else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
284 &transparent_hugepage_flags))
285 output = "always [madvise] never";
286 else
287 output = "always madvise [never]";
288
289 return sysfs_emit(buf, "%s\n", output);
290 }
291
292 static ssize_t enabled_store(struct kobject *kobj,
293 struct kobj_attribute *attr,
294 const char *buf, size_t count)
295 {
296 ssize_t ret = count;
297
298 if (sysfs_streq(buf, "always")) {
299 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
300 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
301 } else if (sysfs_streq(buf, "madvise")) {
302 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
303 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
304 } else if (sysfs_streq(buf, "never")) {
305 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
306 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
307 } else
308 ret = -EINVAL;
309
310 if (ret > 0) {
311 int err = start_stop_khugepaged();
312 if (err)
313 ret = err;
314 }
315 return ret;
316 }
317
318 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
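/*
 * The attribute above is exposed as
 * /sys/kernel/mm/transparent_hugepage/enabled (the parent kobject is created
 * from mm_kobj in hugepage_init_sysfs()). Writing "always", "madvise" or
 * "never" flips the corresponding global flag and then starts or stops
 * khugepaged via start_stop_khugepaged().
 */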
319
320 ssize_t single_hugepage_flag_show(struct kobject *kobj,
321 struct kobj_attribute *attr, char *buf,
322 enum transparent_hugepage_flag flag)
323 {
324 return sysfs_emit(buf, "%d\n",
325 !!test_bit(flag, &transparent_hugepage_flags));
326 }
327
328 ssize_t single_hugepage_flag_store(struct kobject *kobj,
329 struct kobj_attribute *attr,
330 const char *buf, size_t count,
331 enum transparent_hugepage_flag flag)
332 {
333 unsigned long value;
334 int ret;
335
336 ret = kstrtoul(buf, 10, &value);
337 if (ret < 0)
338 return ret;
339 if (value > 1)
340 return -EINVAL;
341
342 if (value)
343 set_bit(flag, &transparent_hugepage_flags);
344 else
345 clear_bit(flag, &transparent_hugepage_flags);
346
347 return count;
348 }
349
350 static ssize_t defrag_show(struct kobject *kobj,
351 struct kobj_attribute *attr, char *buf)
352 {
353 const char *output;
354
355 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
356 &transparent_hugepage_flags))
357 output = "[always] defer defer+madvise madvise never";
358 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
359 &transparent_hugepage_flags))
360 output = "always [defer] defer+madvise madvise never";
361 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
362 &transparent_hugepage_flags))
363 output = "always defer [defer+madvise] madvise never";
364 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
365 &transparent_hugepage_flags))
366 output = "always defer defer+madvise [madvise] never";
367 else
368 output = "always defer defer+madvise madvise [never]";
369
370 return sysfs_emit(buf, "%s\n", output);
371 }
372
373 static ssize_t defrag_store(struct kobject *kobj,
374 struct kobj_attribute *attr,
375 const char *buf, size_t count)
376 {
377 if (sysfs_streq(buf, "always")) {
378 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
379 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
380 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
381 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
382 } else if (sysfs_streq(buf, "defer+madvise")) {
383 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
384 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
385 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
386 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
387 } else if (sysfs_streq(buf, "defer")) {
388 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
389 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
390 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
391 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
392 } else if (sysfs_streq(buf, "madvise")) {
393 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
394 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
395 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
396 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
397 } else if (sysfs_streq(buf, "never")) {
398 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
399 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
400 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
401 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
402 } else
403 return -EINVAL;
404
405 return count;
406 }
407 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
408
409 static ssize_t use_zero_page_show(struct kobject *kobj,
410 struct kobj_attribute *attr, char *buf)
411 {
412 return single_hugepage_flag_show(kobj, attr, buf,
413 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
414 }
415 static ssize_t use_zero_page_store(struct kobject *kobj,
416 struct kobj_attribute *attr, const char *buf, size_t count)
417 {
418 return single_hugepage_flag_store(kobj, attr, buf, count,
419 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
420 }
421 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
422
423 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
424 struct kobj_attribute *attr, char *buf)
425 {
426 return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
427 }
428 static struct kobj_attribute hpage_pmd_size_attr =
429 __ATTR_RO(hpage_pmd_size);
430
431 static struct attribute *hugepage_attr[] = {
432 &enabled_attr.attr,
433 &defrag_attr.attr,
434 &use_zero_page_attr.attr,
435 &hpage_pmd_size_attr.attr,
436 #ifdef CONFIG_SHMEM
437 &shmem_enabled_attr.attr,
438 #endif
439 NULL,
440 };
441
442 static const struct attribute_group hugepage_attr_group = {
443 .attrs = hugepage_attr,
444 };
445
446 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
447 static void thpsize_release(struct kobject *kobj);
448 static DEFINE_SPINLOCK(huge_anon_orders_lock);
449 static LIST_HEAD(thpsize_list);
450
451 struct thpsize {
452 struct kobject kobj;
453 struct list_head node;
454 int order;
455 };
456
457 #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
458
459 static ssize_t thpsize_enabled_show(struct kobject *kobj,
460 struct kobj_attribute *attr, char *buf)
461 {
462 int order = to_thpsize(kobj)->order;
463 const char *output;
464
465 if (test_bit(order, &huge_anon_orders_always))
466 output = "[always] inherit madvise never";
467 else if (test_bit(order, &huge_anon_orders_inherit))
468 output = "always [inherit] madvise never";
469 else if (test_bit(order, &huge_anon_orders_madvise))
470 output = "always inherit [madvise] never";
471 else
472 output = "always inherit madvise [never]";
473
474 return sysfs_emit(buf, "%s\n", output);
475 }
476
477 static ssize_t thpsize_enabled_store(struct kobject *kobj,
478 struct kobj_attribute *attr,
479 const char *buf, size_t count)
480 {
481 int order = to_thpsize(kobj)->order;
482 ssize_t ret = count;
483
484 if (sysfs_streq(buf, "always")) {
485 spin_lock(&huge_anon_orders_lock);
486 clear_bit(order, &huge_anon_orders_inherit);
487 clear_bit(order, &huge_anon_orders_madvise);
488 set_bit(order, &huge_anon_orders_always);
489 spin_unlock(&huge_anon_orders_lock);
490 } else if (sysfs_streq(buf, "inherit")) {
491 spin_lock(&huge_anon_orders_lock);
492 clear_bit(order, &huge_anon_orders_always);
493 clear_bit(order, &huge_anon_orders_madvise);
494 set_bit(order, &huge_anon_orders_inherit);
495 spin_unlock(&huge_anon_orders_lock);
496 } else if (sysfs_streq(buf, "madvise")) {
497 spin_lock(&huge_anon_orders_lock);
498 clear_bit(order, &huge_anon_orders_always);
499 clear_bit(order, &huge_anon_orders_inherit);
500 set_bit(order, &huge_anon_orders_madvise);
501 spin_unlock(&huge_anon_orders_lock);
502 } else if (sysfs_streq(buf, "never")) {
503 spin_lock(&huge_anon_orders_lock);
504 clear_bit(order, &huge_anon_orders_always);
505 clear_bit(order, &huge_anon_orders_inherit);
506 clear_bit(order, &huge_anon_orders_madvise);
507 spin_unlock(&huge_anon_orders_lock);
508 } else
509 ret = -EINVAL;
510
511 return ret;
512 }
513
514 static struct kobj_attribute thpsize_enabled_attr =
515 __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
516
517 static struct attribute *thpsize_attrs[] = {
518 &thpsize_enabled_attr.attr,
519 NULL,
520 };
521
522 static const struct attribute_group thpsize_attr_group = {
523 .attrs = thpsize_attrs,
524 };
525
526 static const struct kobj_type thpsize_ktype = {
527 .release = &thpsize_release,
528 .sysfs_ops = &kobj_sysfs_ops,
529 };
530
531 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
532
533 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
534 {
535 unsigned long sum = 0;
536 int cpu;
537
538 for_each_possible_cpu(cpu) {
539 struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
540
541 sum += this->stats[order][item];
542 }
543
544 return sum;
545 }
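/*
 * The per-CPU counters summed above are updated via count_mthp_stat() on
 * the relevant fault and swap-out paths; the total computed here is an
 * unsynchronized snapshot, which is fine for these read-only stats files.
 */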
546
547 #define DEFINE_MTHP_STAT_ATTR(_name, _index) \
548 static ssize_t _name##_show(struct kobject *kobj, \
549 struct kobj_attribute *attr, char *buf) \
550 { \
551 int order = to_thpsize(kobj)->order; \
552 \
553 return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
554 } \
555 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
556
557 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
558 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
559 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
560 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
561 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
562
563 static struct attribute *stats_attrs[] = {
564 &anon_fault_alloc_attr.attr,
565 &anon_fault_fallback_attr.attr,
566 &anon_fault_fallback_charge_attr.attr,
567 &swpout_attr.attr,
568 &swpout_fallback_attr.attr,
569 NULL,
570 };
571
572 static struct attribute_group stats_attr_group = {
573 .name = "stats",
574 .attrs = stats_attrs,
575 };
576
577 static struct thpsize *thpsize_create(int order, struct kobject *parent)
578 {
579 unsigned long size = (PAGE_SIZE << order) / SZ_1K;
580 struct thpsize *thpsize;
581 int ret;
582
583 thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
584 if (!thpsize)
585 return ERR_PTR(-ENOMEM);
586
587 ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
588 "hugepages-%lukB", size);
589 if (ret) {
590 kfree(thpsize);
591 return ERR_PTR(ret);
592 }
593
594 ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
595 if (ret) {
596 kobject_put(&thpsize->kobj);
597 return ERR_PTR(ret);
598 }
599
600 ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
601 if (ret) {
602 kobject_put(&thpsize->kobj);
603 return ERR_PTR(ret);
604 }
605
606 thpsize->order = order;
607 return thpsize;
608 }
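/*
 * Directory naming example (assuming a 4KiB base page size): order 9 gives
 * (4096 << 9) / 1024 = 2048, so the PMD-sized directory is
 * /sys/kernel/mm/transparent_hugepage/hugepages-2048kB, while order 4 shows
 * up as hugepages-64kB.
 */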
609
610 static void thpsize_release(struct kobject *kobj)
611 {
612 kfree(to_thpsize(kobj));
613 }
614
615 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
616 {
617 int err;
618 struct thpsize *thpsize;
619 unsigned long orders;
620 int order;
621
622 /*
623 * Default to setting PMD-sized THP to inherit the global setting and
624 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
625 * constant so we have to do this here.
626 */
627 huge_anon_orders_inherit = BIT(PMD_ORDER);
628
629 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
630 if (unlikely(!*hugepage_kobj)) {
631 pr_err("failed to create transparent hugepage kobject\n");
632 return -ENOMEM;
633 }
634
635 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
636 if (err) {
637 pr_err("failed to register transparent hugepage group\n");
638 goto delete_obj;
639 }
640
641 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
642 if (err) {
643 pr_err("failed to register transparent hugepage group\n");
644 goto remove_hp_group;
645 }
646
647 orders = THP_ORDERS_ALL_ANON;
648 order = highest_order(orders);
649 while (orders) {
650 thpsize = thpsize_create(order, *hugepage_kobj);
651 if (IS_ERR(thpsize)) {
652 pr_err("failed to create thpsize for order %d\n", order);
653 err = PTR_ERR(thpsize);
654 goto remove_all;
655 }
656 list_add(&thpsize->node, &thpsize_list);
657 order = next_order(&orders, order);
658 }
659
660 return 0;
661
662 remove_all:
663 hugepage_exit_sysfs(*hugepage_kobj);
664 return err;
665 remove_hp_group:
666 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
667 delete_obj:
668 kobject_put(*hugepage_kobj);
669 return err;
670 }
671
672 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
673 {
674 struct thpsize *thpsize, *tmp;
675
676 list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
677 list_del(&thpsize->node);
678 kobject_put(&thpsize->kobj);
679 }
680
681 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
682 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
683 kobject_put(hugepage_kobj);
684 }
685 #else
686 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
687 {
688 return 0;
689 }
690
691 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
692 {
693 }
694 #endif /* CONFIG_SYSFS */
695
696 static int __init hugepage_init(void)
697 {
698 int err;
699 struct kobject *hugepage_kobj;
700
701 if (!has_transparent_hugepage()) {
702 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
703 return -EINVAL;
704 }
705
706 /*
707 * hugepages can't be allocated by the buddy allocator above MAX_ORDER
708 */
709 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
710 /*
711 * We use page->mapping and page->index in the second tail page
712 * as a list_head; this assumes THP order >= 2.
713 */
714 MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
715
716 err = hugepage_init_sysfs(&hugepage_kobj);
717 if (err)
718 goto err_sysfs;
719
720 err = khugepaged_init();
721 if (err)
722 goto err_slab;
723
724 err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
725 if (err)
726 goto err_hzp_shrinker;
727 err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
728 if (err)
729 goto err_split_shrinker;
730
731 /*
732 * By default disable transparent hugepages on smaller systems,
733 * where the extra memory used could hurt more than TLB overhead
734 * is likely to save. The admin can still enable it through /sys.
735 */
736 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
737 transparent_hugepage_flags = 0;
738 return 0;
739 }
740
741 err = start_stop_khugepaged();
742 if (err)
743 goto err_khugepaged;
744
745 return 0;
746 err_khugepaged:
747 unregister_shrinker(&deferred_split_shrinker);
748 err_split_shrinker:
749 unregister_shrinker(&huge_zero_page_shrinker);
750 err_hzp_shrinker:
751 khugepaged_destroy();
752 err_slab:
753 hugepage_exit_sysfs(hugepage_kobj);
754 err_sysfs:
755 return err;
756 }
757 subsys_initcall(hugepage_init);
758
759 static int __init setup_transparent_hugepage(char *str)
760 {
761 int ret = 0;
762 if (!str)
763 goto out;
764 if (!strcmp(str, "always")) {
765 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
766 &transparent_hugepage_flags);
767 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
768 &transparent_hugepage_flags);
769 ret = 1;
770 } else if (!strcmp(str, "madvise")) {
771 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
772 &transparent_hugepage_flags);
773 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
774 &transparent_hugepage_flags);
775 ret = 1;
776 } else if (!strcmp(str, "never")) {
777 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
778 &transparent_hugepage_flags);
779 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
780 &transparent_hugepage_flags);
781 ret = 1;
782 }
783 out:
784 if (!ret)
785 pr_warn("transparent_hugepage= cannot parse, ignored\n");
786 return ret;
787 }
788 __setup("transparent_hugepage=", setup_transparent_hugepage);
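/*
 * The early parameter above mirrors the sysfs "enabled" interface; for
 * example, booting with "transparent_hugepage=madvise" on the kernel command
 * line selects TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG before init runs.
 */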
789
790 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
791 {
792 if (likely(vma->vm_flags & VM_WRITE))
793 pmd = pmd_mkwrite(pmd, vma);
794 return pmd;
795 }
796
797 #ifdef CONFIG_MEMCG
798 static inline
799 struct deferred_split *get_deferred_split_queue(struct folio *folio)
800 {
801 struct mem_cgroup *memcg = folio_memcg(folio);
802 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
803
804 if (memcg)
805 return &memcg->deferred_split_queue;
806 else
807 return &pgdat->deferred_split_queue;
808 }
809 #else
810 static inline
811 struct deferred_split *get_deferred_split_queue(struct folio *folio)
812 {
813 struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
814
815 return &pgdat->deferred_split_queue;
816 }
817 #endif
818
819 void folio_prep_large_rmappable(struct folio *folio)
820 {
821 VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
822 INIT_LIST_HEAD(&folio->_deferred_list);
823 folio_set_large_rmappable(folio);
824 }
825
826 static inline bool is_transparent_hugepage(struct folio *folio)
827 {
828 if (!folio_test_large(folio))
829 return false;
830
831 return is_huge_zero_page(&folio->page) ||
832 folio_test_large_rmappable(folio);
833 }
834
835 static unsigned long __thp_get_unmapped_area(struct file *filp,
836 unsigned long addr, unsigned long len,
837 loff_t off, unsigned long flags, unsigned long size)
838 {
839 loff_t off_end = off + len;
840 loff_t off_align = round_up(off, size);
841 unsigned long len_pad, ret;
842
843 if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
844 return 0;
845
846 if (off_end <= off_align || (off_end - off_align) < size)
847 return 0;
848
849 len_pad = len + size;
850 if (len_pad < len || (off + len_pad) < off)
851 return 0;
852
853 ret = current->mm->get_unmapped_area(filp, addr, len_pad,
854 off >> PAGE_SHIFT, flags);
855
856 /*
857 * The failure might be due to length padding. The caller will retry
858 * without the padding.
859 */
860 if (IS_ERR_VALUE(ret))
861 return 0;
862
863 /*
864 * Do not try to align to THP boundary if allocation at the address
865 * hint succeeds.
866 */
867 if (ret == addr)
868 return addr;
869
870 ret += (off - ret) & (size - 1);
871 return ret;
872 }
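/*
 * Worked example for the final adjustment above, using hypothetical numbers
 * and size == PMD_SIZE == 2MiB: with off == 0 and a search result of
 * ret == 0x7f1234567000, (off - ret) & (size - 1) == 0x99000, so the
 * returned address becomes 0x7f1234600000, which is congruent to off modulo
 * PMD_SIZE (here simply PMD-aligned). The extra "size" bytes included in
 * len_pad guarantee that the shifted range still fits in the mapping.
 */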
873
874 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
875 unsigned long len, unsigned long pgoff, unsigned long flags)
876 {
877 unsigned long ret;
878 loff_t off = (loff_t)pgoff << PAGE_SHIFT;
879
880 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
881 if (ret)
882 return ret;
883
884 return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
885 }
886 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
887
888 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
889 struct page *page, gfp_t gfp)
890 {
891 struct vm_area_struct *vma = vmf->vma;
892 struct folio *folio = page_folio(page);
893 pgtable_t pgtable;
894 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
895 vm_fault_t ret = 0;
896
897 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
898
899 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
900 folio_put(folio);
901 count_vm_event(THP_FAULT_FALLBACK);
902 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
903 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
904 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
905 return VM_FAULT_FALLBACK;
906 }
907 folio_throttle_swaprate(folio, gfp);
908
909 pgtable = pte_alloc_one(vma->vm_mm);
910 if (unlikely(!pgtable)) {
911 ret = VM_FAULT_OOM;
912 goto release;
913 }
914
915 clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
916 /*
917 * The memory barrier inside __folio_mark_uptodate makes sure that
918 * clear_huge_page writes become visible before the set_pmd_at()
919 * write.
920 */
921 __folio_mark_uptodate(folio);
922
923 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
924 if (unlikely(!pmd_none(*vmf->pmd))) {
925 goto unlock_release;
926 } else {
927 pmd_t entry;
928
929 ret = check_stable_address_space(vma->vm_mm);
930 if (ret)
931 goto unlock_release;
932
933 /* Deliver the page fault to userland */
934 if (userfaultfd_missing(vma)) {
935 spin_unlock(vmf->ptl);
936 folio_put(folio);
937 pte_free(vma->vm_mm, pgtable);
938 ret = handle_userfault(vmf, VM_UFFD_MISSING);
939 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
940 return ret;
941 }
942
943 entry = mk_huge_pmd(page, vma->vm_page_prot);
944 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
945 folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
946 folio_add_lru_vma(folio, vma);
947 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
948 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
949 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
950 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
951 mm_inc_nr_ptes(vma->vm_mm);
952 spin_unlock(vmf->ptl);
953 count_vm_event(THP_FAULT_ALLOC);
954 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
955 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
956 }
957
958 return 0;
959 unlock_release:
960 spin_unlock(vmf->ptl);
961 release:
962 if (pgtable)
963 pte_free(vma->vm_mm, pgtable);
964 folio_put(folio);
965 return ret;
966
967 }
968
969 /*
970 * always: directly stall for all thp allocations
971 * defer: wake kswapd and fail if not immediately available
972 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
973 * fail if not immediately available
974 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
975 * available
976 * never: never stall for any thp allocation
977 */
978 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
979 {
980 const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
981
982 /* Always do synchronous compaction */
983 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
984 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
985
986 /* Kick kcompactd and fail quickly */
987 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
988 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
989
990 /* Synchronous compaction if madvised, otherwise kick kcompactd */
991 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
992 return GFP_TRANSHUGE_LIGHT |
993 (vma_madvised ? __GFP_DIRECT_RECLAIM :
994 __GFP_KSWAPD_RECLAIM);
995
996 /* Only do synchronous compaction if madvised */
997 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
998 return GFP_TRANSHUGE_LIGHT |
999 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1000
1001 return GFP_TRANSHUGE_LIGHT;
1002 }
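/*
 * Net effect of the checks above, for reference:
 *
 *	always		GFP_TRANSHUGE (plus __GFP_NORETRY unless madvised)
 *	defer		GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM
 *	defer+madvise	GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM if madvised,
 *			otherwise GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM
 *	madvise		GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM if madvised,
 *			otherwise GFP_TRANSHUGE_LIGHT
 *	never		GFP_TRANSHUGE_LIGHT
 */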
1003
1004 /* Caller must hold page table lock. */
1005 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
1006 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1007 struct page *zero_page)
1008 {
1009 pmd_t entry;
1010 if (!pmd_none(*pmd))
1011 return;
1012 entry = mk_pmd(zero_page, vma->vm_page_prot);
1013 entry = pmd_mkhuge(entry);
1014 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1015 set_pmd_at(mm, haddr, pmd, entry);
1016 mm_inc_nr_ptes(mm);
1017 }
1018
1019 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1020 {
1021 struct vm_area_struct *vma = vmf->vma;
1022 gfp_t gfp;
1023 struct folio *folio;
1024 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1025 vm_fault_t ret;
1026
1027 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1028 return VM_FAULT_FALLBACK;
1029 ret = vmf_anon_prepare(vmf);
1030 if (ret)
1031 return ret;
1032 khugepaged_enter_vma(vma, vma->vm_flags);
1033
1034 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1035 !mm_forbids_zeropage(vma->vm_mm) &&
1036 transparent_hugepage_use_zero_page()) {
1037 pgtable_t pgtable;
1038 struct page *zero_page;
1039 vm_fault_t ret;
1040 pgtable = pte_alloc_one(vma->vm_mm);
1041 if (unlikely(!pgtable))
1042 return VM_FAULT_OOM;
1043 zero_page = mm_get_huge_zero_page(vma->vm_mm);
1044 if (unlikely(!zero_page)) {
1045 pte_free(vma->vm_mm, pgtable);
1046 count_vm_event(THP_FAULT_FALLBACK);
1047 return VM_FAULT_FALLBACK;
1048 }
1049 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1050 ret = 0;
1051 if (pmd_none(*vmf->pmd)) {
1052 ret = check_stable_address_space(vma->vm_mm);
1053 if (ret) {
1054 spin_unlock(vmf->ptl);
1055 pte_free(vma->vm_mm, pgtable);
1056 } else if (userfaultfd_missing(vma)) {
1057 spin_unlock(vmf->ptl);
1058 pte_free(vma->vm_mm, pgtable);
1059 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1060 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1061 } else {
1062 set_huge_zero_page(pgtable, vma->vm_mm, vma,
1063 haddr, vmf->pmd, zero_page);
1064 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1065 spin_unlock(vmf->ptl);
1066 }
1067 } else {
1068 spin_unlock(vmf->ptl);
1069 pte_free(vma->vm_mm, pgtable);
1070 }
1071 return ret;
1072 }
1073 gfp = vma_thp_gfp_mask(vma);
1074 folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
1075 if (unlikely(!folio)) {
1076 count_vm_event(THP_FAULT_FALLBACK);
1077 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
1078 return VM_FAULT_FALLBACK;
1079 }
1080 return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
1081 }
1082
1083 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
1084 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
1085 pgtable_t pgtable)
1086 {
1087 struct mm_struct *mm = vma->vm_mm;
1088 pmd_t entry;
1089 spinlock_t *ptl;
1090
1091 ptl = pmd_lock(mm, pmd);
1092 if (!pmd_none(*pmd)) {
1093 if (write) {
1094 if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
1095 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1096 goto out_unlock;
1097 }
1098 entry = pmd_mkyoung(*pmd);
1099 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1100 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1101 update_mmu_cache_pmd(vma, addr, pmd);
1102 }
1103
1104 goto out_unlock;
1105 }
1106
1107 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1108 if (pfn_t_devmap(pfn))
1109 entry = pmd_mkdevmap(entry);
1110 if (write) {
1111 entry = pmd_mkyoung(pmd_mkdirty(entry));
1112 entry = maybe_pmd_mkwrite(entry, vma);
1113 }
1114
1115 if (pgtable) {
1116 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1117 mm_inc_nr_ptes(mm);
1118 pgtable = NULL;
1119 }
1120
1121 set_pmd_at(mm, addr, pmd, entry);
1122 update_mmu_cache_pmd(vma, addr, pmd);
1123
1124 out_unlock:
1125 spin_unlock(ptl);
1126 if (pgtable)
1127 pte_free(mm, pgtable);
1128 }
1129
1130 /**
1131 * vmf_insert_pfn_pmd - insert a pmd size pfn
1132 * @vmf: Structure describing the fault
1133 * @pfn: pfn to insert
1134 * @write: whether it's a write fault
1135 *
1136 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
1137 *
1138 * Return: vm_fault_t value.
1139 */
1140 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
1141 {
1142 unsigned long addr = vmf->address & PMD_MASK;
1143 struct vm_area_struct *vma = vmf->vma;
1144 pgprot_t pgprot = vma->vm_page_prot;
1145 pgtable_t pgtable = NULL;
1146
1147 /*
1148 * If we had pmd_special, we could avoid all these restrictions,
1149 * but we need to be consistent with PTEs and architectures that
1150 * can't support a 'special' bit.
1151 */
1152 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1153 !pfn_t_devmap(pfn));
1154 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1155 (VM_PFNMAP|VM_MIXEDMAP));
1156 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1157
1158 if (addr < vma->vm_start || addr >= vma->vm_end)
1159 return VM_FAULT_SIGBUS;
1160
1161 if (arch_needs_pgtable_deposit()) {
1162 pgtable = pte_alloc_one(vma->vm_mm);
1163 if (!pgtable)
1164 return VM_FAULT_OOM;
1165 }
1166
1167 track_pfn_insert(vma, &pgprot, pfn);
1168
1169 insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
1170 return VM_FAULT_NOPAGE;
1171 }
1172 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
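/*
 * Illustrative only, with a hypothetical helper name: a ->huge_fault()
 * handler that has already resolved a PMD-aligned backing pfn might finish
 * the fault with something like
 *
 *	pfn_t pfn = my_resolve_pmd_pfn(vmf);	(hypothetical helper)
 *
 *	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 *
 * relying on the function to deposit a page table where the architecture
 * needs one and to return VM_FAULT_NOPAGE on success.
 */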
1173
1174 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1175 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1176 {
1177 if (likely(vma->vm_flags & VM_WRITE))
1178 pud = pud_mkwrite(pud);
1179 return pud;
1180 }
1181
1182 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1183 pud_t *pud, pfn_t pfn, bool write)
1184 {
1185 struct mm_struct *mm = vma->vm_mm;
1186 pgprot_t prot = vma->vm_page_prot;
1187 pud_t entry;
1188 spinlock_t *ptl;
1189
1190 ptl = pud_lock(mm, pud);
1191 if (!pud_none(*pud)) {
1192 if (write) {
1193 if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
1194 WARN_ON_ONCE(!is_huge_zero_pud(*pud));
1195 goto out_unlock;
1196 }
1197 entry = pud_mkyoung(*pud);
1198 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1199 if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1200 update_mmu_cache_pud(vma, addr, pud);
1201 }
1202 goto out_unlock;
1203 }
1204
1205 entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1206 if (pfn_t_devmap(pfn))
1207 entry = pud_mkdevmap(entry);
1208 if (write) {
1209 entry = pud_mkyoung(pud_mkdirty(entry));
1210 entry = maybe_pud_mkwrite(entry, vma);
1211 }
1212 set_pud_at(mm, addr, pud, entry);
1213 update_mmu_cache_pud(vma, addr, pud);
1214
1215 out_unlock:
1216 spin_unlock(ptl);
1217 }
1218
1219 /**
1220 * vmf_insert_pfn_pud - insert a pud size pfn
1221 * @vmf: Structure describing the fault
1222 * @pfn: pfn to insert
1223 * @write: whether it's a write fault
1224 *
1225 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1226 *
1227 * Return: vm_fault_t value.
1228 */
1229 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
1230 {
1231 unsigned long addr = vmf->address & PUD_MASK;
1232 struct vm_area_struct *vma = vmf->vma;
1233 pgprot_t pgprot = vma->vm_page_prot;
1234
1235 /*
1236 * If we had pud_special, we could avoid all these restrictions,
1237 * but we need to be consistent with PTEs and architectures that
1238 * can't support a 'special' bit.
1239 */
1240 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1241 !pfn_t_devmap(pfn));
1242 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1243 (VM_PFNMAP|VM_MIXEDMAP));
1244 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1245
1246 if (addr < vma->vm_start || addr >= vma->vm_end)
1247 return VM_FAULT_SIGBUS;
1248
1249 track_pfn_insert(vma, &pgprot, pfn);
1250
1251 insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
1252 return VM_FAULT_NOPAGE;
1253 }
1254 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1255 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1256
1257 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1258 pmd_t *pmd, bool write)
1259 {
1260 pmd_t _pmd;
1261
1262 _pmd = pmd_mkyoung(*pmd);
1263 if (write)
1264 _pmd = pmd_mkdirty(_pmd);
1265 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1266 pmd, _pmd, write))
1267 update_mmu_cache_pmd(vma, addr, pmd);
1268 }
1269
1270 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1271 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
1272 {
1273 unsigned long pfn = pmd_pfn(*pmd);
1274 struct mm_struct *mm = vma->vm_mm;
1275 struct page *page;
1276 int ret;
1277
1278 assert_spin_locked(pmd_lockptr(mm, pmd));
1279
1280 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1281 return NULL;
1282
1283 if (pmd_present(*pmd) && pmd_devmap(*pmd))
1284 /* pass */;
1285 else
1286 return NULL;
1287
1288 if (flags & FOLL_TOUCH)
1289 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
1290
1291 /*
1292 * device mapped pages can only be returned if the
1293 * caller will manage the page reference count.
1294 */
1295 if (!(flags & (FOLL_GET | FOLL_PIN)))
1296 return ERR_PTR(-EEXIST);
1297
1298 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1299 *pgmap = get_dev_pagemap(pfn, *pgmap);
1300 if (!*pgmap)
1301 return ERR_PTR(-EFAULT);
1302 page = pfn_to_page(pfn);
1303 ret = try_grab_folio(page_folio(page), 1, flags);
1304 if (ret)
1305 page = ERR_PTR(ret);
1306
1307 return page;
1308 }
1309
1310 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1311 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1312 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1313 {
1314 spinlock_t *dst_ptl, *src_ptl;
1315 struct page *src_page;
1316 struct folio *src_folio;
1317 pmd_t pmd;
1318 pgtable_t pgtable = NULL;
1319 int ret = -ENOMEM;
1320
1321 /* Skip if it can be refilled on fault. */
1322 if (!vma_is_anonymous(dst_vma))
1323 return 0;
1324
1325 pgtable = pte_alloc_one(dst_mm);
1326 if (unlikely(!pgtable))
1327 goto out;
1328
1329 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1330 src_ptl = pmd_lockptr(src_mm, src_pmd);
1331 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1332
1333 ret = -EAGAIN;
1334 pmd = *src_pmd;
1335
1336 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1337 if (unlikely(is_swap_pmd(pmd))) {
1338 swp_entry_t entry = pmd_to_swp_entry(pmd);
1339
1340 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1341 if (!is_readable_migration_entry(entry)) {
1342 entry = make_readable_migration_entry(
1343 swp_offset(entry));
1344 pmd = swp_entry_to_pmd(entry);
1345 if (pmd_swp_soft_dirty(*src_pmd))
1346 pmd = pmd_swp_mksoft_dirty(pmd);
1347 if (pmd_swp_uffd_wp(*src_pmd))
1348 pmd = pmd_swp_mkuffd_wp(pmd);
1349 set_pmd_at(src_mm, addr, src_pmd, pmd);
1350 }
1351 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1352 mm_inc_nr_ptes(dst_mm);
1353 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1354 if (!userfaultfd_wp(dst_vma))
1355 pmd = pmd_swp_clear_uffd_wp(pmd);
1356 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1357 ret = 0;
1358 goto out_unlock;
1359 }
1360 #endif
1361
1362 if (unlikely(!pmd_trans_huge(pmd))) {
1363 pte_free(dst_mm, pgtable);
1364 goto out_unlock;
1365 }
1366 /*
1367 * While the page table lock is held, the huge zero pmd cannot be
1368 * under splitting, since we never split the page itself, only the
1369 * pmd into a page table.
1370 */
1371 if (is_huge_zero_pmd(pmd)) {
1372 /*
1373 * get_huge_zero_page() will never allocate a new page here,
1374 * since we already have a zero page to copy. It just takes a
1375 * reference.
1376 */
1377 mm_get_huge_zero_page(dst_mm);
1378 goto out_zero_page;
1379 }
1380
1381 src_page = pmd_page(pmd);
1382 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1383 src_folio = page_folio(src_page);
1384
1385 folio_get(src_folio);
1386 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
1387 /* Page may be pinned: split and retry the fault on PTEs. */
1388 folio_put(src_folio);
1389 pte_free(dst_mm, pgtable);
1390 spin_unlock(src_ptl);
1391 spin_unlock(dst_ptl);
1392 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
1393 return -EAGAIN;
1394 }
1395 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1396 out_zero_page:
1397 mm_inc_nr_ptes(dst_mm);
1398 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1399 pmdp_set_wrprotect(src_mm, addr, src_pmd);
1400 if (!userfaultfd_wp(dst_vma))
1401 pmd = pmd_clear_uffd_wp(pmd);
1402 pmd = pmd_mkold(pmd_wrprotect(pmd));
1403 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1404
1405 ret = 0;
1406 out_unlock:
1407 spin_unlock(src_ptl);
1408 spin_unlock(dst_ptl);
1409 out:
1410 return ret;
1411 }
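/*
 * Return value contract of copy_huge_pmd() as implemented above: 0 when the
 * PMD was copied (or deliberately skipped for a non-anonymous VMA), -ENOMEM
 * when the deposit page table could not be allocated, and -EAGAIN when the
 * entry was no longer a huge PMD or had to be split (for example because
 * the page may be pinned), in which case the caller falls back to copying
 * at PTE granularity.
 */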
1412
1413 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1414 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1415 pud_t *pud, bool write)
1416 {
1417 pud_t _pud;
1418
1419 _pud = pud_mkyoung(*pud);
1420 if (write)
1421 _pud = pud_mkdirty(_pud);
1422 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1423 pud, _pud, write))
1424 update_mmu_cache_pud(vma, addr, pud);
1425 }
1426
1427 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1428 pud_t *pud, int flags, struct dev_pagemap **pgmap)
1429 {
1430 unsigned long pfn = pud_pfn(*pud);
1431 struct mm_struct *mm = vma->vm_mm;
1432 struct page *page;
1433 int ret;
1434
1435 assert_spin_locked(pud_lockptr(mm, pud));
1436
1437 if (flags & FOLL_WRITE && !pud_write(*pud))
1438 return NULL;
1439
1440 if (pud_present(*pud) && pud_devmap(*pud))
1441 /* pass */;
1442 else
1443 return NULL;
1444
1445 if (flags & FOLL_TOUCH)
1446 touch_pud(vma, addr, pud, flags & FOLL_WRITE);
1447
1448 /*
1449 * device mapped pages can only be returned if the
1450 * caller will manage the page reference count.
1451 *
1452 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
1453 */
1454 if (!(flags & (FOLL_GET | FOLL_PIN)))
1455 return ERR_PTR(-EEXIST);
1456
1457 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1458 *pgmap = get_dev_pagemap(pfn, *pgmap);
1459 if (!*pgmap)
1460 return ERR_PTR(-EFAULT);
1461 page = pfn_to_page(pfn);
1462
1463 ret = try_grab_folio(page_folio(page), 1, flags);
1464 if (ret)
1465 page = ERR_PTR(ret);
1466
1467 return page;
1468 }
1469
1470 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1471 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1472 struct vm_area_struct *vma)
1473 {
1474 spinlock_t *dst_ptl, *src_ptl;
1475 pud_t pud;
1476 int ret;
1477
1478 dst_ptl = pud_lock(dst_mm, dst_pud);
1479 src_ptl = pud_lockptr(src_mm, src_pud);
1480 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1481
1482 ret = -EAGAIN;
1483 pud = *src_pud;
1484 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1485 goto out_unlock;
1486
1487 /*
1488 * When page table lock is held, the huge zero pud should not be
1489 * under splitting since we don't split the page itself, only pud to
1490 * a page table.
1491 */
1492 if (is_huge_zero_pud(pud)) {
1493 /* No huge zero pud yet */
1494 }
1495
1496 /*
1497 * TODO: once we support anonymous pages, use
1498 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
1499 */
1500 pudp_set_wrprotect(src_mm, addr, src_pud);
1501 pud = pud_mkold(pud_wrprotect(pud));
1502 set_pud_at(dst_mm, addr, dst_pud, pud);
1503
1504 ret = 0;
1505 out_unlock:
1506 spin_unlock(src_ptl);
1507 spin_unlock(dst_ptl);
1508 return ret;
1509 }
1510
1511 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1512 {
1513 bool write = vmf->flags & FAULT_FLAG_WRITE;
1514
1515 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1516 if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1517 goto unlock;
1518
1519 touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1520 unlock:
1521 spin_unlock(vmf->ptl);
1522 }
1523 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1524
1525 void huge_pmd_set_accessed(struct vm_fault *vmf)
1526 {
1527 bool write = vmf->flags & FAULT_FLAG_WRITE;
1528
1529 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1530 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1531 goto unlock;
1532
1533 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1534
1535 unlock:
1536 spin_unlock(vmf->ptl);
1537 }
1538
1539 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1540 {
1541 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1542 struct vm_area_struct *vma = vmf->vma;
1543 struct folio *folio;
1544 struct page *page;
1545 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1546 pmd_t orig_pmd = vmf->orig_pmd;
1547
1548 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1549 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1550
1551 if (is_huge_zero_pmd(orig_pmd))
1552 goto fallback;
1553
1554 spin_lock(vmf->ptl);
1555
1556 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1557 spin_unlock(vmf->ptl);
1558 return 0;
1559 }
1560
1561 page = pmd_page(orig_pmd);
1562 folio = page_folio(page);
1563 VM_BUG_ON_PAGE(!PageHead(page), page);
1564
1565 /* Early check when only holding the PT lock. */
1566 if (PageAnonExclusive(page))
1567 goto reuse;
1568
1569 if (!folio_trylock(folio)) {
1570 folio_get(folio);
1571 spin_unlock(vmf->ptl);
1572 folio_lock(folio);
1573 spin_lock(vmf->ptl);
1574 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1575 spin_unlock(vmf->ptl);
1576 folio_unlock(folio);
1577 folio_put(folio);
1578 return 0;
1579 }
1580 folio_put(folio);
1581 }
1582
1583 /* Recheck after temporarily dropping the PT lock. */
1584 if (PageAnonExclusive(page)) {
1585 folio_unlock(folio);
1586 goto reuse;
1587 }
1588
1589 /*
1590 * See do_wp_page(): we can only reuse the folio exclusively if
1591 * there are no additional references. Note that we always drain
1592 * the LRU cache immediately after adding a THP.
1593 */
1594 if (folio_ref_count(folio) >
1595 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
1596 goto unlock_fallback;
1597 if (folio_test_swapcache(folio))
1598 folio_free_swap(folio);
1599 if (folio_ref_count(folio) == 1) {
1600 pmd_t entry;
1601
1602 folio_move_anon_rmap(folio, vma);
1603 SetPageAnonExclusive(page);
1604 folio_unlock(folio);
1605 reuse:
1606 if (unlikely(unshare)) {
1607 spin_unlock(vmf->ptl);
1608 return 0;
1609 }
1610 entry = pmd_mkyoung(orig_pmd);
1611 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1612 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1613 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1614 spin_unlock(vmf->ptl);
1615 return 0;
1616 }
1617
1618 unlock_fallback:
1619 folio_unlock(folio);
1620 spin_unlock(vmf->ptl);
1621 fallback:
1622 __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
1623 return VM_FAULT_FALLBACK;
1624 }
1625
1626 static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1627 unsigned long addr, pmd_t pmd)
1628 {
1629 struct page *page;
1630
1631 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1632 return false;
1633
1634 /* Don't touch entries that are not even readable (NUMA hinting). */
1635 if (pmd_protnone(pmd))
1636 return false;
1637
1638 /* Do we need write faults for softdirty tracking? */
1639 if (pmd_needs_soft_dirty_wp(vma, pmd))
1640 return false;
1641
1642 /* Do we need write faults for uffd-wp tracking? */
1643 if (userfaultfd_huge_pmd_wp(vma, pmd))
1644 return false;
1645
1646 if (!(vma->vm_flags & VM_SHARED)) {
1647 /* See can_change_pte_writable(). */
1648 page = vm_normal_page_pmd(vma, addr, pmd);
1649 return page && PageAnon(page) && PageAnonExclusive(page);
1650 }
1651
1652 /* See can_change_pte_writable(). */
1653 return pmd_dirty(pmd);
1654 }
1655
1656 /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
1657 static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
1658 struct vm_area_struct *vma,
1659 unsigned int flags)
1660 {
1661 /* If the pmd is writable, we can write to the page. */
1662 if (pmd_write(pmd))
1663 return true;
1664
1665 /* Maybe FOLL_FORCE is set to override it? */
1666 if (!(flags & FOLL_FORCE))
1667 return false;
1668
1669 /* But FOLL_FORCE has no effect on shared mappings */
1670 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
1671 return false;
1672
1673 /* ... or read-only private ones */
1674 if (!(vma->vm_flags & VM_MAYWRITE))
1675 return false;
1676
1677 /* ... or already writable ones that just need to take a write fault */
1678 if (vma->vm_flags & VM_WRITE)
1679 return false;
1680
1681 /*
1682 * See can_change_pte_writable(): we broke COW and could map the page
1683 * writable if we have an exclusive anonymous page ...
1684 */
1685 if (!page || !PageAnon(page) || !PageAnonExclusive(page))
1686 return false;
1687
1688 /* ... and a write-fault isn't required for other reasons. */
1689 if (pmd_needs_soft_dirty_wp(vma, pmd))
1690 return false;
1691 return !userfaultfd_huge_pmd_wp(vma, pmd);
1692 }
1693
1694 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1695 unsigned long addr,
1696 pmd_t *pmd,
1697 unsigned int flags)
1698 {
1699 struct mm_struct *mm = vma->vm_mm;
1700 struct page *page;
1701 int ret;
1702
1703 assert_spin_locked(pmd_lockptr(mm, pmd));
1704
1705 page = pmd_page(*pmd);
1706 VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
1707
1708 if ((flags & FOLL_WRITE) &&
1709 !can_follow_write_pmd(*pmd, page, vma, flags))
1710 return NULL;
1711
1712 /* Avoid dumping huge zero page */
1713 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1714 return ERR_PTR(-EFAULT);
1715
1716 if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
1717 return NULL;
1718
1719 if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
1720 return ERR_PTR(-EMLINK);
1721
1722 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
1723 !PageAnonExclusive(page), page);
1724
1725 ret = try_grab_folio(page_folio(page), 1, flags);
1726 if (ret)
1727 return ERR_PTR(ret);
1728
1729 if (flags & FOLL_TOUCH)
1730 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
1731
1732 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1733 VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
1734
1735 return page;
1736 }
1737
1738 /* NUMA hinting page fault entry point for trans huge pmds */
1739 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
1740 {
1741 struct vm_area_struct *vma = vmf->vma;
1742 pmd_t oldpmd = vmf->orig_pmd;
1743 pmd_t pmd;
1744 struct folio *folio;
1745 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1746 int nid = NUMA_NO_NODE;
1747 int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
1748 bool migrated = false, writable = false;
1749 int flags = 0;
1750
1751 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1752 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1753 spin_unlock(vmf->ptl);
1754 return 0;
1755 }
1756
1757 pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1758
1759 /*
1760 * Detect now whether the PMD could be writable; this information
1761 * is only valid while holding the PT lock.
1762 */
1763 writable = pmd_write(pmd);
1764 if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
1765 can_change_pmd_writable(vma, vmf->address, pmd))
1766 writable = true;
1767
1768 folio = vm_normal_folio_pmd(vma, haddr, pmd);
1769 if (!folio)
1770 goto out_map;
1771
1772 /* See similar comment in do_numa_page for explanation */
1773 if (!writable)
1774 flags |= TNF_NO_GROUP;
1775
1776 nid = folio_nid(folio);
1777 /*
1778 * In memory tiering mode, the cpupid field of a slow-memory page is
1779 * used to record the page access time, so use the default value.
1780 */
1781 if (node_is_toptier(nid))
1782 last_cpupid = page_cpupid_last(&folio->page);
1783 target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
1784 if (target_nid == NUMA_NO_NODE) {
1785 folio_put(folio);
1786 goto out_map;
1787 }
1788
1789 spin_unlock(vmf->ptl);
1790 writable = false;
1791
1792 migrated = migrate_misplaced_folio(folio, vma, target_nid);
1793 if (migrated) {
1794 flags |= TNF_MIGRATED;
1795 nid = target_nid;
1796 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
1797 return 0;
1798 }
1799
1800 flags |= TNF_MIGRATE_FAIL;
1801 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1802 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1803 spin_unlock(vmf->ptl);
1804 return 0;
1805 }
1806 out_map:
1807 /* Restore the PMD */
1808 pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1809 pmd = pmd_mkyoung(pmd);
1810 if (writable)
1811 pmd = pmd_mkwrite(pmd, vma);
1812 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1813 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1814 spin_unlock(vmf->ptl);
1815
1816 if (nid != NUMA_NO_NODE)
1817 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
1818 return 0;
1819 }
1820
1821 /*
1822 * Return true if we do MADV_FREE successfully on the entire pmd page.
1823 * Otherwise, return false.
1824 */
1825 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1826 pmd_t *pmd, unsigned long addr, unsigned long next)
1827 {
1828 spinlock_t *ptl;
1829 pmd_t orig_pmd;
1830 struct folio *folio;
1831 struct mm_struct *mm = tlb->mm;
1832 bool ret = false;
1833
1834 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1835
1836 ptl = pmd_trans_huge_lock(pmd, vma);
1837 if (!ptl)
1838 goto out_unlocked;
1839
1840 orig_pmd = *pmd;
1841 if (is_huge_zero_pmd(orig_pmd))
1842 goto out;
1843
1844 if (unlikely(!pmd_present(orig_pmd))) {
1845 VM_BUG_ON(thp_migration_supported() &&
1846 !is_pmd_migration_entry(orig_pmd));
1847 goto out;
1848 }
1849
1850 folio = pfn_folio(pmd_pfn(orig_pmd));
1851 /*
1852 * If other processes are mapping this folio, we cannot discard
1853 * the folio unless they all do MADV_FREE, so let's skip the folio.
1854 */
1855 if (folio_likely_mapped_shared(folio))
1856 goto out;
1857
1858 if (!folio_trylock(folio))
1859 goto out;
1860
1861 /*
1862 * If the user wants to discard only part of the THP, split it so
1863 * MADV_FREE will deactivate only those pages.
1864 */
1865 if (next - addr != HPAGE_PMD_SIZE) {
1866 folio_get(folio);
1867 spin_unlock(ptl);
1868 split_folio(folio);
1869 folio_unlock(folio);
1870 folio_put(folio);
1871 goto out_unlocked;
1872 }
1873
1874 if (folio_test_dirty(folio))
1875 folio_clear_dirty(folio);
1876 folio_unlock(folio);
1877
1878 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1879 pmdp_invalidate(vma, addr, pmd);
1880 orig_pmd = pmd_mkold(orig_pmd);
1881 orig_pmd = pmd_mkclean(orig_pmd);
1882
1883 set_pmd_at(mm, addr, pmd, orig_pmd);
1884 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1885 }
1886
1887 folio_mark_lazyfree(folio);
1888 ret = true;
1889 out:
1890 spin_unlock(ptl);
1891 out_unlocked:
1892 return ret;
1893 }
1894
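/* Withdraw the page table deposited for a huge PMD and free it. */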
1895 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1896 {
1897 pgtable_t pgtable;
1898
1899 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1900 pte_free(mm, pgtable);
1901 mm_dec_nr_ptes(mm);
1902 }
1903
1904 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1905 pmd_t *pmd, unsigned long addr)
1906 {
1907 pmd_t orig_pmd;
1908 spinlock_t *ptl;
1909
1910 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1911
1912 ptl = __pmd_trans_huge_lock(pmd, vma);
1913 if (!ptl)
1914 return 0;
1915 /*
1916 * For architectures like ppc64, we look at the deposited pgtable
1917 * when calling pmdp_huge_get_and_clear, so do the
1918 * pgtable_trans_huge_withdraw after finishing the pmdp-related
1919 * operations.
1920 */
1921 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
1922 tlb->fullmm);
1923 arch_check_zapped_pmd(vma, orig_pmd);
1924 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1925 if (vma_is_special_huge(vma)) {
1926 if (arch_needs_pgtable_deposit())
1927 zap_deposited_table(tlb->mm, pmd);
1928 spin_unlock(ptl);
1929 } else if (is_huge_zero_pmd(orig_pmd)) {
1930 zap_deposited_table(tlb->mm, pmd);
1931 spin_unlock(ptl);
1932 } else {
1933 struct folio *folio = NULL;
1934 int flush_needed = 1;
1935
1936 if (pmd_present(orig_pmd)) {
1937 struct page *page = pmd_page(orig_pmd);
1938
1939 folio = page_folio(page);
1940 folio_remove_rmap_pmd(folio, page, vma);
1941 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1942 VM_BUG_ON_PAGE(!PageHead(page), page);
1943 } else if (thp_migration_supported()) {
1944 swp_entry_t entry;
1945
1946 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1947 entry = pmd_to_swp_entry(orig_pmd);
1948 folio = pfn_swap_entry_folio(entry);
1949 flush_needed = 0;
1950 } else
1951 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1952
1953 if (folio_test_anon(folio)) {
1954 zap_deposited_table(tlb->mm, pmd);
1955 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1956 } else {
1957 if (arch_needs_pgtable_deposit())
1958 zap_deposited_table(tlb->mm, pmd);
1959 add_mm_counter(tlb->mm, mm_counter_file(folio),
1960 -HPAGE_PMD_NR);
1961 }
1962
1963 spin_unlock(ptl);
1964 if (flush_needed)
1965 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
1966 }
1967 return 1;
1968 }
1969
1970 #ifndef pmd_move_must_withdraw
1971 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1972 spinlock_t *old_pmd_ptl,
1973 struct vm_area_struct *vma)
1974 {
1975 /*
1976 * With split pmd lock we also need to move preallocated
1977 * PTE page table if new_pmd is on different PMD page table.
1978 *
1979 * We also don't deposit and withdraw tables for file pages.
1980 */
1981 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1982 }
1983 #endif
1984
1985 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1986 {
1987 #ifdef CONFIG_MEM_SOFT_DIRTY
1988 if (unlikely(is_pmd_migration_entry(pmd)))
1989 pmd = pmd_swp_mksoft_dirty(pmd);
1990 else if (pmd_present(pmd))
1991 pmd = pmd_mksoft_dirty(pmd);
1992 #endif
1993 return pmd;
1994 }
1995
1996 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1997 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
1998 {
1999 spinlock_t *old_ptl, *new_ptl;
2000 pmd_t pmd;
2001 struct mm_struct *mm = vma->vm_mm;
2002 bool force_flush = false;
2003
2004 /*
2005 * The destination pmd shouldn't be established, free_pgtables()
2006 * should have released it; but move_page_tables() might have already
2007 * inserted a page table, if racing against shmem/file collapse.
2008 */
2009 if (!pmd_none(*new_pmd)) {
2010 VM_BUG_ON(pmd_trans_huge(*new_pmd));
2011 return false;
2012 }
2013
2014 /*
2015 * We don't have to worry about the ordering of src and dst
2016 * ptlocks because exclusive mmap_lock prevents deadlock.
2017 */
2018 old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2019 if (old_ptl) {
2020 new_ptl = pmd_lockptr(mm, new_pmd);
2021 if (new_ptl != old_ptl)
2022 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2023 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2024 if (pmd_present(pmd))
2025 force_flush = true;
2026 VM_BUG_ON(!pmd_none(*new_pmd));
2027
2028 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2029 pgtable_t pgtable;
2030 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2031 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
2032 }
2033 pmd = move_soft_dirty_pmd(pmd);
2034 set_pmd_at(mm, new_addr, new_pmd, pmd);
2035 if (force_flush)
2036 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2037 if (new_ptl != old_ptl)
2038 spin_unlock(new_ptl);
2039 spin_unlock(old_ptl);
2040 return true;
2041 }
2042 return false;
2043 }
2044
2045 /*
2046 * Returns
2047 * - 0 if PMD could not be locked
2048 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2049 * or if prot_numa but THP migration is not supported
2050 * - HPAGE_PMD_NR if protections changed and TLB flush necessary
2051 */
2052 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2053 pmd_t *pmd, unsigned long addr, pgprot_t newprot,
2054 unsigned long cp_flags)
2055 {
2056 struct mm_struct *mm = vma->vm_mm;
2057 spinlock_t *ptl;
2058 pmd_t oldpmd, entry;
2059 bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2060 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2061 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
2062 int ret = 1;
2063
2064 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2065
2066 if (prot_numa && !thp_migration_supported())
2067 return 1;
2068
2069 ptl = __pmd_trans_huge_lock(pmd, vma);
2070 if (!ptl)
2071 return 0;
2072
2073 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2074 if (is_swap_pmd(*pmd)) {
2075 swp_entry_t entry = pmd_to_swp_entry(*pmd);
2076 struct page *page = pfn_swap_entry_to_page(entry);
2077 pmd_t newpmd;
2078
2079 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
2080 if (is_writable_migration_entry(entry)) {
2081 /*
2082 * A protection check is difficult so
2083 * just be safe and disable write
2084 */
2085 if (PageAnon(page))
2086 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
2087 else
2088 entry = make_readable_migration_entry(swp_offset(entry));
2089 newpmd = swp_entry_to_pmd(entry);
2090 if (pmd_swp_soft_dirty(*pmd))
2091 newpmd = pmd_swp_mksoft_dirty(newpmd);
2092 } else {
2093 newpmd = *pmd;
2094 }
2095
2096 if (uffd_wp)
2097 newpmd = pmd_swp_mkuffd_wp(newpmd);
2098 else if (uffd_wp_resolve)
2099 newpmd = pmd_swp_clear_uffd_wp(newpmd);
2100 if (!pmd_same(*pmd, newpmd))
2101 set_pmd_at(mm, addr, pmd, newpmd);
2102 goto unlock;
2103 }
2104 #endif
2105
2106 if (prot_numa) {
2107 struct page *page;
2108 bool toptier;
2109 /*
2110 * Avoid trapping faults against the zero page. The read-only
2111 * data is likely to be read-cached on the local CPU and
2112 * local/remote hits to the zero page are not interesting.
2113 */
2114 if (is_huge_zero_pmd(*pmd))
2115 goto unlock;
2116
2117 if (pmd_protnone(*pmd))
2118 goto unlock;
2119
2120 page = pmd_page(*pmd);
2121 toptier = node_is_toptier(page_to_nid(page));
2122 /*
2123 * Skip scanning top tier node if normal numa
2124 * balancing is disabled
2125 */
2126 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
2127 toptier)
2128 goto unlock;
2129
2130 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
2131 !toptier)
2132 xchg_page_access_time(page, jiffies_to_msecs(jiffies));
2133 }
2134 /*
2135 * In the prot_numa case, we are under mmap_read_lock(mm). It's critical
2136 * not to clear the pmd intermittently, to avoid racing with MADV_DONTNEED,
2137 * which is also run under mmap_read_lock(mm):
2138 *
2139 * CPU0: CPU1:
2140 * change_huge_pmd(prot_numa=1)
2141 * pmdp_huge_get_and_clear_notify()
2142 * madvise_dontneed()
2143 * zap_pmd_range()
2144 * pmd_trans_huge(*pmd) == 0 (without ptl)
2145 * // skip the pmd
2146 * set_pmd_at();
2147 * // pmd is re-established
2148 *
2149 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
2150 * which may break userspace.
2151 *
2152 * pmdp_invalidate_ad() is required to make sure we don't miss
2153 * dirty/young flags set by hardware.
2154 */
2155 oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2156
2157 entry = pmd_modify(oldpmd, newprot);
2158 if (uffd_wp)
2159 entry = pmd_mkuffd_wp(entry);
2160 else if (uffd_wp_resolve)
2161 /*
2162 * Leave the write bit to be handled by the page fault
2163 * handler, so that things like COW can be properly
2164 * handled.
2165 */
2166 entry = pmd_clear_uffd_wp(entry);
2167
2168 /* See change_pte_range(). */
2169 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2170 can_change_pmd_writable(vma, addr, entry))
2171 entry = pmd_mkwrite(entry, vma);
2172
2173 ret = HPAGE_PMD_NR;
2174 set_pmd_at(mm, addr, pmd, entry);
2175
2176 if (huge_pmd_needs_flush(oldpmd, entry))
2177 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
2178 unlock:
2179 spin_unlock(ptl);
2180 return ret;
2181 }
2182
2183 #ifdef CONFIG_USERFAULTFD
2184 /*
2185 * The PT lock for src_pmd and the dst_vma/src_vma (for reading) are held by
2186 * the caller, but this function must return only after releasing the
2187 * page_table_lock. Just move the page from src_pmd to dst_pmd if possible.
2188 * Returns zero if the page was moved, -EAGAIN if the operation needs to be
2189 * repeated by the caller, or another error in case of failure.
2190 */
2191 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2192 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2193 unsigned long dst_addr, unsigned long src_addr)
2194 {
2195 pmd_t _dst_pmd, src_pmdval;
2196 struct page *src_page;
2197 struct folio *src_folio;
2198 struct anon_vma *src_anon_vma;
2199 spinlock_t *src_ptl, *dst_ptl;
2200 pgtable_t src_pgtable;
2201 struct mmu_notifier_range range;
2202 int err = 0;
2203
2204 src_pmdval = *src_pmd;
2205 src_ptl = pmd_lockptr(mm, src_pmd);
2206
2207 lockdep_assert_held(src_ptl);
2208 vma_assert_locked(src_vma);
2209 vma_assert_locked(dst_vma);
2210
2211 /* Sanity checks before the operation */
2212 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2213 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2214 spin_unlock(src_ptl);
2215 return -EINVAL;
2216 }
2217
2218 if (!pmd_trans_huge(src_pmdval)) {
2219 spin_unlock(src_ptl);
2220 if (is_pmd_migration_entry(src_pmdval)) {
2221 pmd_migration_entry_wait(mm, &src_pmdval);
2222 return -EAGAIN;
2223 }
2224 return -ENOENT;
2225 }
2226
2227 src_page = pmd_page(src_pmdval);
2228
2229 if (!is_huge_zero_pmd(src_pmdval)) {
2230 if (unlikely(!PageAnonExclusive(src_page))) {
2231 spin_unlock(src_ptl);
2232 return -EBUSY;
2233 }
2234
2235 src_folio = page_folio(src_page);
2236 folio_get(src_folio);
2237 } else
2238 src_folio = NULL;
2239
2240 spin_unlock(src_ptl);
2241
2242 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2243 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2244 src_addr + HPAGE_PMD_SIZE);
2245 mmu_notifier_invalidate_range_start(&range);
2246
2247 if (src_folio) {
2248 folio_lock(src_folio);
2249
2250 /*
2251 * split_huge_page walks the anon_vma chain without the page
2252 * lock. Serialize against it with the anon_vma lock, the page
2253 * lock is not enough.
2254 */
2255 src_anon_vma = folio_get_anon_vma(src_folio);
2256 if (!src_anon_vma) {
2257 err = -EAGAIN;
2258 goto unlock_folio;
2259 }
2260 anon_vma_lock_write(src_anon_vma);
2261 } else
2262 src_anon_vma = NULL;
2263
2264 dst_ptl = pmd_lockptr(mm, dst_pmd);
2265 double_pt_lock(src_ptl, dst_ptl);
2266 if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2267 !pmd_same(*dst_pmd, dst_pmdval))) {
2268 err = -EAGAIN;
2269 goto unlock_ptls;
2270 }
2271 if (src_folio) {
2272 if (folio_maybe_dma_pinned(src_folio) ||
2273 !PageAnonExclusive(&src_folio->page)) {
2274 err = -EBUSY;
2275 goto unlock_ptls;
2276 }
2277
2278 if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2279 WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2280 err = -EBUSY;
2281 goto unlock_ptls;
2282 }
2283
2284 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2285 /* Folio got pinned from under us. Put it back and fail the move. */
2286 if (folio_maybe_dma_pinned(src_folio)) {
2287 set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2288 err = -EBUSY;
2289 goto unlock_ptls;
2290 }
2291
2292 folio_move_anon_rmap(src_folio, dst_vma);
2293 WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
2294
2295 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
2296 /* Follow mremap() behavior and treat the entry dirty after the move */
2297 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2298 } else {
2299 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2300 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
2301 }
2302 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2303
2304 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2305 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2306 unlock_ptls:
2307 double_pt_unlock(src_ptl, dst_ptl);
2308 if (src_anon_vma) {
2309 anon_vma_unlock_write(src_anon_vma);
2310 put_anon_vma(src_anon_vma);
2311 }
2312 unlock_folio:
2313 /* unblock rmap walks */
2314 if (src_folio)
2315 folio_unlock(src_folio);
2316 mmu_notifier_invalidate_range_end(&range);
2317 if (src_folio)
2318 folio_put(src_folio);
2319 return err;
2320 }
2321 #endif /* CONFIG_USERFAULTFD */
2322
2323 /*
2324 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2325 *
2326 * Note that if it returns the page table lock pointer, this routine returns
2327 * without unlocking the page table lock, so callers must unlock it.
2328 */
2329 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2330 {
2331 spinlock_t *ptl;
2332 ptl = pmd_lock(vma->vm_mm, pmd);
2333 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2334 pmd_devmap(*pmd)))
2335 return ptl;
2336 spin_unlock(ptl);
2337 return NULL;
2338 }
2339 EXPORT_SYMBOL_GPL(__pmd_trans_huge_lock);
2340
2341 /*
2342 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2343 *
2344 * Note that if it returns the page table lock pointer, this routine returns
2345 * without unlocking the page table lock, so callers must unlock it.
2346 */
2347 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2348 {
2349 spinlock_t *ptl;
2350
2351 ptl = pud_lock(vma->vm_mm, pud);
2352 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2353 return ptl;
2354 spin_unlock(ptl);
2355 return NULL;
2356 }
2357
2358 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
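/*
 * Zap a huge PUD mapping. Only special (DAX-style) huge PUDs are supported;
 * anonymous PUD-sized THPs do not exist yet.
 */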
2359 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2360 pud_t *pud, unsigned long addr)
2361 {
2362 spinlock_t *ptl;
2363
2364 ptl = __pud_trans_huge_lock(pud, vma);
2365 if (!ptl)
2366 return 0;
2367
2368 pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2369 tlb_remove_pud_tlb_entry(tlb, pud, addr);
2370 if (vma_is_special_huge(vma)) {
2371 spin_unlock(ptl);
2372 /* No zero page support yet */
2373 } else {
2374 /* No support for anonymous PUD pages yet */
2375 BUG();
2376 }
2377 return 1;
2378 }
2379
2380 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2381 unsigned long haddr)
2382 {
2383 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2384 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2385 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2386 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2387
2388 count_vm_event(THP_SPLIT_PUD);
2389
2390 pudp_huge_clear_flush(vma, haddr, pud);
2391 }
2392
2393 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2394 unsigned long address)
2395 {
2396 spinlock_t *ptl;
2397 struct mmu_notifier_range range;
2398
2399 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2400 address & HPAGE_PUD_MASK,
2401 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2402 mmu_notifier_invalidate_range_start(&range);
2403 ptl = pud_lock(vma->vm_mm, pud);
2404 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2405 goto out;
2406 __split_huge_pud_locked(vma, pud, range.start);
2407
2408 out:
2409 spin_unlock(ptl);
2410 mmu_notifier_invalidate_range_end(&range);
2411 }
2412 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2413
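/*
 * Replace a huge zero page PMD with a page table whose PTEs all map the
 * small zero page, preserving the uffd-wp bit. Called with the PMD lock
 * held.
 */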
2414 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2415 unsigned long haddr, pmd_t *pmd)
2416 {
2417 struct mm_struct *mm = vma->vm_mm;
2418 pgtable_t pgtable;
2419 pmd_t _pmd, old_pmd;
2420 unsigned long addr;
2421 pte_t *pte;
2422 int i;
2423
2424 /*
2425 * Leave the pmd empty until the pte is filled. Note that it is fine to
2426 * delay the notification until mmu_notifier_invalidate_range_end(), as we
2427 * are replacing a write-protected zero huge page with write-protected
2428 * zero small pages.
2429 *
2430 * See Documentation/mm/mmu_notifier.rst
2431 */
2432 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2433
2434 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2435 pmd_populate(mm, &_pmd, pgtable);
2436
2437 pte = pte_offset_map(&_pmd, haddr);
2438 VM_BUG_ON(!pte);
2439 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2440 pte_t entry;
2441
2442 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2443 entry = pte_mkspecial(entry);
2444 if (pmd_uffd_wp(old_pmd))
2445 entry = pte_mkuffd_wp(entry);
2446 VM_BUG_ON(!pte_none(ptep_get(pte)));
2447 set_pte_at(mm, addr, pte, entry);
2448 pte++;
2449 }
2450 pte_unmap(pte - 1);
2451 smp_wmb(); /* make pte visible before pmd */
2452 pmd_populate(mm, pmd, pgtable);
2453 }
2454
2455 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2456 unsigned long haddr, bool freeze)
2457 {
2458 struct mm_struct *mm = vma->vm_mm;
2459 struct folio *folio;
2460 struct page *page;
2461 pgtable_t pgtable;
2462 pmd_t old_pmd, _pmd;
2463 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2464 bool anon_exclusive = false, dirty = false;
2465 unsigned long addr;
2466 pte_t *pte;
2467 int i;
2468
2469 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2470 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2471 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2472 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2473 && !pmd_devmap(*pmd));
2474
2475 count_vm_event(THP_SPLIT_PMD);
2476
2477 if (!vma_is_anonymous(vma)) {
2478 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2479 /*
2480 * We are going to unmap this huge page. So
2481 * just go ahead and zap it.
2482 */
2483 if (arch_needs_pgtable_deposit())
2484 zap_deposited_table(mm, pmd);
2485 if (vma_is_special_huge(vma))
2486 return;
2487 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2488 swp_entry_t entry;
2489
2490 entry = pmd_to_swp_entry(old_pmd);
2491 folio = pfn_swap_entry_folio(entry);
2492 } else {
2493 page = pmd_page(old_pmd);
2494 folio = page_folio(page);
2495 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2496 folio_mark_dirty(folio);
2497 if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2498 folio_set_referenced(folio);
2499 folio_remove_rmap_pmd(folio, page, vma);
2500 folio_put(folio);
2501 }
2502 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2503 return;
2504 }
2505
2506 if (is_huge_zero_pmd(*pmd)) {
2507 /*
2508 * FIXME: Do we want to invalidate secondary mmu by calling
2509 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2510 * inside __split_huge_pmd() ?
2511 *
2512 * We are going from a write-protected huge zero page to
2513 * write-protected small zero pages, so it does not seem useful
2514 * to invalidate the secondary mmu at this time.
2515 */
2516 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2517 }
2518
2519 pmd_migration = is_pmd_migration_entry(*pmd);
2520 if (unlikely(pmd_migration)) {
2521 swp_entry_t entry;
2522
2523 old_pmd = *pmd;
2524 entry = pmd_to_swp_entry(old_pmd);
2525 page = pfn_swap_entry_to_page(entry);
2526 write = is_writable_migration_entry(entry);
2527 if (PageAnon(page))
2528 anon_exclusive = is_readable_exclusive_migration_entry(entry);
2529 young = is_migration_entry_young(entry);
2530 dirty = is_migration_entry_dirty(entry);
2531 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2532 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2533 } else {
2534 /*
2535 * Up to this point the pmd is present and huge and userland has
2536 * the whole access to the hugepage during the split (which
2537 * happens in place). If we overwrite the pmd with the not-huge
2538 * version pointing to the pte here (which of course we could if
2539 * all CPUs were bug free), userland could trigger a small page
2540 * size TLB miss on the small sized TLB while the hugepage TLB
2541 * entry is still established in the huge TLB. Some CPUs don't
2542 * like that. See
2543 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2544 * 383 on page 105. Intel should be safe, but it also warns that
2545 * it's only safe if the permission and cache attributes of the
2546 * two entries loaded in the two TLBs are identical (which should
2547 * be the case here). But it is generally safer to never allow
2548 * small and huge TLB entries for the same virtual address to be
2549 * loaded simultaneously. So instead of doing "pmd_populate();
2550 * flush_pmd_tlb_range();" we first mark the current pmd
2551 * notpresent (atomically because here the pmd_trans_huge must
2552 * remain set at all times on the pmd until the split is
2553 * complete for this pmd), then we flush the SMP TLB and finally
2554 * we write the non-huge version of the pmd entry with
2555 * pmd_populate.
2556 */
2557 old_pmd = pmdp_invalidate(vma, haddr, pmd);
2558 page = pmd_page(old_pmd);
2559 folio = page_folio(page);
2560 if (pmd_dirty(old_pmd)) {
2561 dirty = true;
2562 folio_set_dirty(folio);
2563 }
2564 write = pmd_write(old_pmd);
2565 young = pmd_young(old_pmd);
2566 soft_dirty = pmd_soft_dirty(old_pmd);
2567 uffd_wp = pmd_uffd_wp(old_pmd);
2568
2569 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2570 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2571
2572 /*
2573 * Without "freeze", we'll simply split the PMD, propagating the
2574 * PageAnonExclusive() flag for each PTE by setting it for
2575 * each subpage -- no need to (temporarily) clear.
2576 *
2577 * With "freeze" we want to replace mapped pages by
2578 * migration entries right away. This is only possible if we
2579 * managed to clear PageAnonExclusive() -- see
2580 * set_pmd_migration_entry().
2581 *
2582 * In case we cannot clear PageAnonExclusive(), split the PMD
2583 * only and let try_to_migrate_one() fail later.
2584 *
2585 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
2586 */
2587 anon_exclusive = PageAnonExclusive(page);
2588 if (freeze && anon_exclusive &&
2589 folio_try_share_anon_rmap_pmd(folio, page))
2590 freeze = false;
2591 if (!freeze) {
2592 rmap_t rmap_flags = RMAP_NONE;
2593
2594 folio_ref_add(folio, HPAGE_PMD_NR - 1);
2595 if (anon_exclusive)
2596 rmap_flags |= RMAP_EXCLUSIVE;
2597 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2598 vma, haddr, rmap_flags);
2599 }
2600 }
2601
2602 /*
2603 * Withdraw the table only after we mark the pmd entry invalid.
2604 * This is critical for some architectures (Power).
2605 */
2606 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2607 pmd_populate(mm, &_pmd, pgtable);
2608
2609 pte = pte_offset_map(&_pmd, haddr);
2610 VM_BUG_ON(!pte);
2611
2612 /*
2613 * Note that NUMA hinting access restrictions are not transferred to
2614 * avoid any possibility of altering permissions across VMAs.
2615 */
2616 if (freeze || pmd_migration) {
2617 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2618 pte_t entry;
2619 swp_entry_t swp_entry;
2620
2621 if (write)
2622 swp_entry = make_writable_migration_entry(
2623 page_to_pfn(page + i));
2624 else if (anon_exclusive)
2625 swp_entry = make_readable_exclusive_migration_entry(
2626 page_to_pfn(page + i));
2627 else
2628 swp_entry = make_readable_migration_entry(
2629 page_to_pfn(page + i));
2630 if (young)
2631 swp_entry = make_migration_entry_young(swp_entry);
2632 if (dirty)
2633 swp_entry = make_migration_entry_dirty(swp_entry);
2634 entry = swp_entry_to_pte(swp_entry);
2635 if (soft_dirty)
2636 entry = pte_swp_mksoft_dirty(entry);
2637 if (uffd_wp)
2638 entry = pte_swp_mkuffd_wp(entry);
2639 if (vma->vm_flags & VM_LOCKED)
2640 set_src_usage(page + i, SRC_PAGE_MLOCKED);
2641 else
2642 set_src_usage(page + i, SRC_PAGE_MAPPED);
2643
2644 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2645 set_pte_at(mm, addr, pte + i, entry);
2646 }
2647 } else {
2648 pte_t entry;
2649
2650 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
2651 if (write)
2652 entry = pte_mkwrite(entry, vma);
2653 if (!young)
2654 entry = pte_mkold(entry);
2655 /* NOTE: this may set soft-dirty too on some archs */
2656 if (dirty)
2657 entry = pte_mkdirty(entry);
2658 if (soft_dirty)
2659 entry = pte_mksoft_dirty(entry);
2660 if (uffd_wp)
2661 entry = pte_mkuffd_wp(entry);
2662
2663 for (i = 0; i < HPAGE_PMD_NR; i++)
2664 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2665
2666 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
2667 }
2668 pte_unmap(pte);
2669
2670 if (!pmd_migration)
2671 folio_remove_rmap_pmd(folio, page, vma);
2672 if (freeze)
2673 put_page(page);
2674
2675 smp_wmb(); /* make pte visible before pmd */
2676 pmd_populate(mm, pmd, pgtable);
2677 }
2678
2679 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2680 unsigned long address, bool freeze, struct folio *folio)
2681 {
2682 spinlock_t *ptl;
2683 struct mmu_notifier_range range;
2684
2685 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2686 address & HPAGE_PMD_MASK,
2687 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2688 mmu_notifier_invalidate_range_start(&range);
2689 ptl = pmd_lock(vma->vm_mm, pmd);
2690
2691 /*
2692 * If the caller asks to set up a migration entry, we need a folio to
2693 * check the pmd against. Otherwise we can end up replacing the wrong folio.
2694 */
2695 VM_BUG_ON(freeze && !folio);
2696 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
2697
2698 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
2699 is_pmd_migration_entry(*pmd)) {
2700 /*
2701 * It's safe to call pmd_page when folio is set because it's
2702 * guaranteed that pmd is present.
2703 */
2704 if (folio && folio != page_folio(pmd_page(*pmd)))
2705 goto out;
2706 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2707 }
2708
2709 out:
2710 spin_unlock(ptl);
2711 mmu_notifier_invalidate_range_end(&range);
2712 }
2713
2714 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2715 bool freeze, struct folio *folio)
2716 {
2717 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
2718
2719 if (!pmd)
2720 return;
2721
2722 __split_huge_pmd(vma, pmd, address, freeze, folio);
2723 }
2724
2725 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
2726 {
2727 /*
2728 * If the new address isn't hpage aligned and it could previously
2729 * contain a hugepage: check if we need to split a huge pmd.
2730 */
2731 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2732 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2733 ALIGN(address, HPAGE_PMD_SIZE)))
2734 split_huge_pmd_address(vma, address, false, NULL);
2735 }
2736
2737 void vma_adjust_trans_huge(struct vm_area_struct *vma,
2738 unsigned long start,
2739 unsigned long end,
2740 long adjust_next)
2741 {
2742 /* Check if we need to split start first. */
2743 split_huge_pmd_if_needed(vma, start);
2744
2745 /* Check if we need to split end next. */
2746 split_huge_pmd_if_needed(vma, end);
2747
2748 /*
2749 * If we're also updating the next vma vm_start,
2750 * check if we need to split it.
2751 */
2752 if (adjust_next > 0) {
2753 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
2754 unsigned long nstart = next->vm_start;
2755 nstart += adjust_next;
2756 split_huge_pmd_if_needed(next, nstart);
2757 }
2758 }
2759
2760 static void unmap_folio(struct folio *folio)
2761 {
2762 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
2763 TTU_SYNC;
2764
2765 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2766
2767 /*
2768 * Anon pages need migration entries to preserve them, but file
2769 * pages can simply be left unmapped, then faulted back on demand.
2770 * If that is ever changed (perhaps for mlock), update remap_page().
2771 */
2772 if (folio_test_anon(folio))
2773 try_to_migrate(folio, ttu_flags);
2774 else
2775 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
2776 }
2777
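/*
 * Undo unmap_folio(): restore the migration entries to regular mappings
 * for every folio now covering the original @nr pages.
 */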
2778 static void remap_page(struct folio *folio, unsigned long nr)
2779 {
2780 int i = 0;
2781
2782 /* If unmap_folio() uses try_to_migrate() on file, remove this check */
2783 if (!folio_test_anon(folio))
2784 return;
2785 for (;;) {
2786 remove_migration_ptes(folio, folio, true);
2787 i += folio_nr_pages(folio);
2788 if (i >= nr)
2789 break;
2790 folio = folio_next(folio);
2791 }
2792 }
2793
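/*
 * For a folio that cannot be split in place (see folio_can_split()),
 * preallocate an array with one destination-page slot per subpage in
 * src->_dst_pp; prep_dst_pages() fills it after the folio is frozen.
 */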
2794 static int prep_to_unmap(struct folio *src)
2795 {
2796 int nr_pages = folio_nr_pages(src);
2797
2798 if (folio_can_split(src))
2799 return 0;
2800
2801 WARN_ON_ONCE(src->_dst_pp);
2802
2803 src->_dst_pp = kcalloc(nr_pages, sizeof(struct page *), GFP_ATOMIC);
2804
2805 return src->_dst_pp ? 0 : -ENOMEM;
2806 }
2807
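/*
 * Decide whether subpage @i of @src can be dropped instead of copied: it
 * must be anonymous, not in the swap cache, not mlocked, and either no
 * longer mapped or entirely zero-filled.
 */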
2808 static bool try_to_discard(struct folio *src, int i)
2809 {
2810 int usage;
2811 void *addr;
2812 struct page *page = folio_page(src, i);
2813
2814 if (!folio_test_anon(src))
2815 return false;
2816
2817 if (folio_test_swapcache(src))
2818 return false;
2819
2820 usage = src_page_usage(page);
2821 if (usage & SRC_PAGE_MLOCKED)
2822 return false;
2823
2824 if (!(usage & SRC_PAGE_MAPPED))
2825 return true;
2826
2827 addr = kmap_local_page(page);
2828 if (!memchr_inv(addr, 0, PAGE_SIZE))
2829 set_src_usage(page, SRC_PAGE_CLEAN);
2830 kunmap_local(addr);
2831
2832 return can_discard_src(page);
2833 }
2834
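/*
 * Allocate a destination page for each subpage of a folio that cannot be
 * split in place, skipping subpages that try_to_discard() lets us drop,
 * and copy the source contents over.
 */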
2835 static int prep_dst_pages(struct folio *src)
2836 {
2837 int i;
2838 int nr_pages = folio_nr_pages(src);
2839
2840 if (folio_can_split(src))
2841 return 0;
2842
2843 if (WARN_ON_ONCE(!src->_dst_pp))
2844 return -ENOMEM;
2845
2846 for (i = 0; i < nr_pages; i++) {
2847 struct page *dst = NULL;
2848
2849 if (try_to_discard(src, i)) {
2850 count_vm_event(THP_SHATTER_PAGE_DISCARDED);
2851 continue;
2852 }
2853
2854 do {
2855 int nid = folio_nid(src);
2856 gfp_t gfp = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
2857 GFP_NOWAIT | __GFP_THISNODE;
2858
2859 if (dst)
2860 __free_page(dst);
2861
2862 dst = alloc_pages_node(nid, gfp, 0);
2863 if (!dst)
2864 return -ENOMEM;
2865 } while (!page_ref_freeze(dst, 1));
2866
2867 copy_highpage(dst, folio_page(src, i));
2868 src->_dst_ul[i] |= (unsigned long)dst;
2869
2870 cond_resched();
2871 }
2872
2873 return 0;
2874 }
2875
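/* Release the destination pages if the split is aborted. */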
2876 static void free_dst_pages(struct folio *src)
2877 {
2878 int i;
2879 int nr_pages = folio_nr_pages(src);
2880
2881 if (folio_can_split(src))
2882 return;
2883
2884 for (i = 0; i < nr_pages; i++) {
2885 struct page *dst = folio_dst_page(src, i);
2886
2887 if (!dst)
2888 continue;
2889
2890 page_ref_unfreeze(dst, 1);
2891 __free_page(dst);
2892 }
2893
2894 kfree(src->_dst_pp);
2895 src->_dst_pp = NULL;
2896 }
2897
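/*
 * After the destination pages have replaced a folio that could not be
 * split in place, clear the source folio's mapping/swap-cache state and
 * free the destination-page array.
 */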
2898 static void reset_src_folio(struct folio *src)
2899 {
2900 if (folio_can_split(src))
2901 return;
2902
2903 if (WARN_ON_ONCE(!src->_dst_pp))
2904 return;
2905
2906 if (!folio_mapping_flags(src))
2907 src->mapping = NULL;
2908
2909 if (folio_test_anon(src) && folio_test_swapcache(src)) {
2910 folio_clear_swapcache(src);
2911 src->swap.val = 0;
2912 }
2913
2914 kfree(src->_dst_pp);
2915 src->_dst_pp = NULL;
2916 }
2917
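/*
 * Add a destination page to the LRU next to its source folio. Only used
 * for folios that cannot be split in place; returns false otherwise so
 * lru_add_page_tail() takes the regular path.
 */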
2918 static bool lru_add_dst(struct lruvec *lruvec, struct folio *src, struct folio *dst)
2919 {
2920 if (folio_can_split(src))
2921 return false;
2922
2923 VM_WARN_ON_ONCE_FOLIO(!folio_test_lru(src), src);
2924 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(dst), dst);
2925 VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != folio_lruvec(src), dst);
2926
2927 if (!lru_gen_add_dst(lruvec, dst)) {
2928 enum lru_list lru = folio_lru_list(dst);
2929 int zone = folio_zonenum(dst);
2930 int delta = folio_nr_pages(dst);
2931
2932 if (folio_test_unevictable(dst))
2933 dst->mlock_count = 0;
2934 else
2935 list_add_tail(&dst->lru, &src->lru);
2936 update_lru_size(lruvec, lru, zone, delta);
2937 }
2938
2939 folio_set_lru(dst);
2940
2941 return true;
2942 }
2943
2944 static void lru_add_page_tail(struct page *head, struct page *tail,
2945 struct lruvec *lruvec, struct list_head *list)
2946 {
2947 VM_BUG_ON_PAGE(!PageHead(head), head);
2948 VM_BUG_ON_PAGE(PageCompound(tail), head);
2949 VM_BUG_ON_PAGE(PageLRU(tail), head);
2950 lockdep_assert_held(&lruvec->lru_lock);
2951
2952 if (list) {
2953 /* page reclaim is reclaiming a huge page */
2954 VM_WARN_ON(PageLRU(head));
2955 get_page(tail);
2956 list_add_tail(&tail->lru, list);
2957 } else if (!lru_add_dst(lruvec, page_folio(head), page_folio(tail))) {
2958 /* head is still on lru (and we have it frozen) */
2959 VM_WARN_ON(!PageLRU(head));
2960 if (PageUnevictable(tail))
2961 tail->mlock_count = 0;
2962 else
2963 list_add_tail(&tail->lru, &head->lru);
2964 SetPageLRU(tail);
2965 }
2966 }
2967
2968 static void __split_huge_page_tail(struct folio *folio, int tail,
2969 struct lruvec *lruvec, struct list_head *list)
2970 {
2971 struct page *head = &folio->page;
2972 struct page *page_tail = folio_dst_page(folio, tail);
2973 /*
2974 * Careful: new_folio is not a "real" folio before we cleared PageTail.
2975 * Don't pass it around before clear_compound_head().
2976 */
2977 struct folio *new_folio = (struct folio *)page_tail;
2978
2979 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2980
2981 /*
2982 * Clone page flags before unfreezing refcount.
2983 *
2984 * After successful get_page_unless_zero() might follow flags change,
2985 * for example lock_page() which set PG_waiters.
2986 *
2987 * Note that for mapped sub-pages of an anonymous THP,
2988 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
2989 * the migration entry instead from where remap_page() will restore it.
2990 * We can still have PG_anon_exclusive set on effectively unmapped and
2991 * unreferenced sub-pages of an anonymous THP: we can simply drop
2992 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2993 */
2994 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2995 page_tail->flags |= (head->flags &
2996 ((1L << PG_referenced) |
2997 (1L << PG_swapbacked) |
2998 (1L << PG_swapcache) |
2999 (1L << PG_mlocked) |
3000 (1L << PG_uptodate) |
3001 (1L << PG_active) |
3002 (1L << PG_workingset) |
3003 (1L << PG_locked) |
3004 (1L << PG_unevictable) |
3005 #ifdef CONFIG_ARCH_USES_PG_ARCH_X
3006 (1L << PG_arch_2) |
3007 (1L << PG_arch_3) |
3008 #endif
3009 (1L << PG_dirty) |
3010 LRU_GEN_MASK | LRU_REFS_MASK));
3011
3012 /* ->mapping in first and second tail page is replaced by other uses */
3013 VM_BUG_ON_PAGE(folio_can_split(folio) && tail > 2 &&
3014 page_tail->mapping != TAIL_MAPPING, page_tail);
3015 page_tail->mapping = head->mapping;
3016 page_tail->index = head->index + tail;
3017
3018 /*
3019 * page->private should not be set in tail pages. Fix up and warn once
3020 * if private is unexpectedly set.
3021 */
3022 if (unlikely(page_tail->private)) {
3023 VM_WARN_ON_ONCE_PAGE(true, page_tail);
3024 page_tail->private = 0;
3025 }
3026 if (folio_test_swapcache(folio))
3027 new_folio->swap.val = folio->swap.val + tail;
3028
3029 /* Page flags must be visible before we make the page non-compound. */
3030 smp_wmb();
3031
3032 /*
3033 * Clear PageTail before unfreezing page refcount.
3034 *
3035 * After successful get_page_unless_zero() might follow put_page()
3036 * which needs correct compound_head().
3037 */
3038 clear_compound_head(page_tail);
3039
3040 /* Finally unfreeze refcount. Additional reference from page cache. */
3041 page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
3042 PageSwapCache(head)));
3043
3044 if (page_is_young(head))
3045 set_page_young(page_tail);
3046 if (page_is_idle(head))
3047 set_page_idle(page_tail);
3048
3049 page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
3050
3051 /*
3052 * Always add to the tail because some iterators expect new
3053 * pages to show up after the currently processed elements - e.g.
3054 * migrate_pages().
3055 */
3056 lru_add_page_tail(head, page_tail, lruvec, list);
3057 }
3058
3059 static void __split_huge_page(struct page *page, struct list_head *list,
3060 pgoff_t end)
3061 {
3062 struct folio *folio = page_folio(page);
3063 struct page *head = &folio->page;
3064 struct lruvec *lruvec;
3065 struct address_space *swap_cache = NULL;
3066 unsigned long offset = 0;
3067 unsigned int nr = thp_nr_pages(head);
3068 int i, nr_dropped = 0;
3069 bool can_split = folio_can_split(folio);
3070
3071 /* complete the memcg work before adding pages to the LRU */
3072 if (can_split)
3073 split_page_memcg(head, nr);
3074 else
3075 folio_copy_memcg(folio);
3076
3077 if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
3078 offset = swp_offset(folio->swap);
3079 swap_cache = swap_address_space(folio->swap);
3080 xa_lock(&swap_cache->i_pages);
3081 }
3082
3083 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
3084 lruvec = folio_lruvec_lock(folio);
3085
3086 ClearPageHasHWPoisoned(head);
3087
3088 for (i = nr - 1; i >= can_split; i--) {
3089 struct page *dst = folio_dst_page(folio, i);
3090
3091 if (!dst)
3092 continue;
3093
3094 __split_huge_page_tail(folio, i, lruvec, list);
3095 /* Some pages can be beyond EOF: drop them from page cache */
3096 if (dst->index >= end) {
3097 struct folio *tail = page_folio(dst);
3098
3099 if (shmem_mapping(tail->mapping))
3100 nr_dropped++;
3101 else if (folio_test_clear_dirty(tail))
3102 folio_account_cleaned(tail,
3103 inode_to_wb(tail->mapping->host));
3104 __filemap_remove_folio(tail, NULL);
3105 folio_put(tail);
3106 } else if (!PageAnon(dst)) {
3107 __xa_store(&dst->mapping->i_pages, dst->index, dst, 0);
3108 } else if (swap_cache) {
3109 __xa_store(&swap_cache->i_pages, offset + i, dst, 0);
3110 }
3111 }
3112
3113 if (can_split)
3114 ClearPageCompound(head);
3115 unlock_page_lruvec(lruvec);
3116 /* Caller disabled irqs, so they are still disabled here */
3117
3118 if (can_split)
3119 split_page_owner(head, nr);
3120
3121 /* See comment in __split_huge_page_tail() */
3122 if (PageAnon(head)) {
3123 /* Additional pin to swap cache */
3124 if (PageSwapCache(head)) {
3125 page_ref_add(head, 2 - !can_split);
3126 xa_unlock(&swap_cache->i_pages);
3127 } else {
3128 page_ref_inc(head);
3129 }
3130 } else {
3131 /* Additional pin to page cache */
3132 page_ref_add(head, 2 - !can_split);
3133 xa_unlock(&head->mapping->i_pages);
3134 }
3135 local_irq_enable();
3136
3137 if (nr_dropped)
3138 shmem_uncharge(head->mapping->host, nr_dropped);
3139 remap_page(folio, nr);
3140
3141 for (i = 0; i < nr; i++) {
3142 struct page *subpage = folio_dst_page(folio, i);
3143
3144 if (!subpage || subpage == page)
3145 continue;
3146 unlock_page(subpage);
3147
3148 /*
3149 * Subpages may be freed if there wasn't any mapping,
3150 * e.g. if add_to_swap() is running on an lru page that
3151 * had its mapping zapped. And freeing these pages
3152 * requires taking the lru_lock, so we do the put_page
3153 * of the tail pages after the split is complete.
3154 */
3155 free_page_and_swap_cache(subpage);
3156 }
3157
3158 reset_src_folio(folio);
3159 }
3160
3161 /* Racy check whether the huge page can be split */
3162 static bool can_split_folio(struct folio *folio, int *pextra_pins)
3163 {
3164 int extra_pins;
3165
3166 /* Additional pins from page cache */
3167 if (folio_test_anon(folio))
3168 extra_pins = folio_test_swapcache(folio) ?
3169 folio_nr_pages(folio) : 0;
3170 else
3171 extra_pins = folio_nr_pages(folio);
3172 if (pextra_pins)
3173 *pextra_pins = extra_pins;
3174 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
3175 }
3176
3177 /*
3178 * This function splits a huge page into normal pages. @page can point to any
3179 * subpage of the huge page to split. The split doesn't change the position of @page.
3180 *
3181 * Only the caller may hold a pin on @page, otherwise the split fails with -EBUSY.
3182 * The huge page must be locked.
3183 *
3184 * If @list is null, tail pages will be added to the LRU list, otherwise to @list.
3185 *
3186 * Both the head page and the tail pages will inherit the mapping, flags, and so
3187 * on from the hugepage.
3188 *
3189 * The GUP pin and PG_locked are transferred to @page. The remaining subpages can
3190 * be freed if they are not mapped.
3191 *
3192 * Returns 0 if the hugepage is split successfully.
3193 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
3194 * us.
3195 */
3196 int split_huge_page_to_list(struct page *page, struct list_head *list)
3197 {
3198 struct folio *folio = page_folio(page);
3199 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3200 XA_STATE(xas, &folio->mapping->i_pages, folio->index);
3201 struct anon_vma *anon_vma = NULL;
3202 struct address_space *mapping = NULL;
3203 int extra_pins, ret;
3204 pgoff_t end;
3205 bool is_hzp;
3206
3207 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3208 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3209
3210 is_hzp = is_huge_zero_page(&folio->page);
3211 if (is_hzp) {
3212 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3213 return -EBUSY;
3214 }
3215
3216 if (folio_test_writeback(folio))
3217 return -EBUSY;
3218
3219 if (folio_test_anon(folio)) {
3220 /*
3221 * The caller does not necessarily hold an mmap_lock that would
3222 * prevent the anon_vma disappearing, so we first take a
3223 * reference to it and then lock the anon_vma for write. This
3224 * is similar to folio_lock_anon_vma_read except the write lock
3225 * is taken to serialise against parallel split or collapse
3226 * operations.
3227 */
3228 anon_vma = folio_get_anon_vma(folio);
3229 if (!anon_vma) {
3230 ret = -EBUSY;
3231 goto out;
3232 }
3233 end = -1;
3234 mapping = NULL;
3235 anon_vma_lock_write(anon_vma);
3236 } else {
3237 gfp_t gfp;
3238
3239 mapping = folio->mapping;
3240
3241 /* Truncated ? */
3242 if (!mapping) {
3243 ret = -EBUSY;
3244 goto out;
3245 }
3246
3247 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3248 GFP_RECLAIM_MASK);
3249
3250 if (!filemap_release_folio(folio, gfp)) {
3251 ret = -EBUSY;
3252 goto out;
3253 }
3254
3255 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
3256 if (xas_error(&xas)) {
3257 ret = xas_error(&xas);
3258 goto out;
3259 }
3260
3261 anon_vma = NULL;
3262 i_mmap_lock_read(mapping);
3263
3264 /*
3265 * __split_huge_page() may need to trim off pages beyond EOF:
3266 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3267 * which cannot be nested inside the page tree lock. So note
3268 * end now: i_size itself may be changed at any moment, but
3269 * folio lock is good enough to serialize the trimming.
3270 */
3271 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3272 if (shmem_mapping(mapping))
3273 end = shmem_fallocend(mapping->host, end);
3274 }
3275
3276 /*
3277 * Racy check if we can split the page, before unmap_folio()
3278 * splits the PMDs
3279 */
3280 if (!can_split_folio(folio, &extra_pins)) {
3281 ret = -EAGAIN;
3282 goto out_unlock;
3283 }
3284
3285 ret = prep_to_unmap(folio);
3286 if (ret)
3287 goto out_unlock;
3288
3289 unmap_folio(folio);
3290
3291 if (!folio_ref_freeze(folio, 1 + extra_pins)) {
3292 ret = -EAGAIN;
3293 goto remap;
3294 }
3295
3296 ret = prep_dst_pages(folio);
3297 if (ret)
3298 goto unfreeze;
3299
3300 /* block interrupt reentry in xa_lock and spinlock */
3301 local_irq_disable();
3302 if (mapping) {
3303 /*
3304 * Check if the folio is present in the page cache.
3305 * We assume all tail pages are present too, if the folio is there.
3306 */
3307 xas_lock(&xas);
3308 xas_reset(&xas);
3309 if (xas_load(&xas) != folio) {
3310 ret = -EAGAIN;
3311 goto fail;
3312 }
3313 }
3314
3315 /* Prevent deferred_split_scan() touching ->_refcount */
3316 spin_lock(&ds_queue->split_queue_lock);
3317 if (!list_empty(&folio->_deferred_list)) {
3318 ds_queue->split_queue_len--;
3319 list_del_init(&folio->_deferred_list);
3320 }
3321 spin_unlock(&ds_queue->split_queue_lock);
3322 if (mapping) {
3323 int nr = folio_nr_pages(folio);
3324
3325 xas_split(&xas, folio, folio_order(folio));
3326 if (folio_test_pmd_mappable(folio)) {
3327 if (folio_test_swapbacked(folio)) {
3328 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
3329 } else {
3330 __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
3331 filemap_nr_thps_dec(mapping);
3332 }
3333 }
3334 }
3335
3336 __split_huge_page(page, list, end);
3337 if (ret) {
3338 fail:
3339 if (mapping)
3340 xas_unlock(&xas);
3341 local_irq_enable();
3342 unfreeze:
3343 folio_ref_unfreeze(folio, 1 + extra_pins);
3344 remap:
3345 free_dst_pages(folio);
3346 remap_page(folio, folio_nr_pages(folio));
3347 }
3348
3349 out_unlock:
3350 if (anon_vma) {
3351 anon_vma_unlock_write(anon_vma);
3352 put_anon_vma(anon_vma);
3353 }
3354 if (mapping)
3355 i_mmap_unlock_read(mapping);
3356 out:
3357 xas_destroy(&xas);
3358
3359 if (!folio_can_split(folio)) {
3360 count_vm_event(!ret ? THP_SHATTER_PAGE : THP_SHATTER_PAGE_FAILED);
3361 return ret ? : 1;
3362 }
3363
3364 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3365 return ret;
3366 }
3367
3368 void folio_undo_large_rmappable(struct folio *folio)
3369 {
3370 struct deferred_split *ds_queue;
3371 unsigned long flags;
3372
3373 /*
3374 * At this point, there is no one trying to add the folio to
3375 * deferred_list. If folio is not in deferred_list, it's safe
3376 * to check without acquiring the split_queue_lock.
3377 */
3378 if (data_race(list_empty(&folio->_deferred_list)))
3379 return;
3380
3381 ds_queue = get_deferred_split_queue(folio);
3382 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3383 if (!list_empty(&folio->_deferred_list)) {
3384 ds_queue->split_queue_len--;
3385 list_del(&folio->_deferred_list);
3386 }
3387 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3388 }
3389
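/*
 * Queue a large folio on the deferred split queue so that the shrinker
 * (deferred_split_scan()) can split it under memory pressure.
 */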
3390 void deferred_split_folio(struct folio *folio)
3391 {
3392 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3393 #ifdef CONFIG_MEMCG
3394 struct mem_cgroup *memcg = folio_memcg(folio);
3395 #endif
3396 unsigned long flags;
3397
3398 VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
3399
3400 /*
3401 * The try_to_unmap() in the page reclaim path might reach here too;
3402 * this may cause a race condition that corrupts the deferred split queue.
3403 * And if page reclaim is already handling the same folio, it is
3404 * unnecessary to handle it again in the shrinker.
3405 *
3406 * Check the swapcache flag to determine if the folio is being
3407 * handled by page reclaim since THP swap would add the folio into
3408 * swap cache before calling try_to_unmap().
3409 */
3410 if (folio_test_swapcache(folio))
3411 return;
3412
3413 if (!list_empty(&folio->_deferred_list))
3414 return;
3415
3416 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3417 if (list_empty(&folio->_deferred_list)) {
3418 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
3419 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
3420 ds_queue->split_queue_len++;
3421 #ifdef CONFIG_MEMCG
3422 if (memcg)
3423 set_shrinker_bit(memcg, folio_nid(folio),
3424 deferred_split_shrinker.id);
3425 #endif
3426 }
3427 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3428 }
3429
3430 static unsigned long deferred_split_count(struct shrinker *shrink,
3431 struct shrink_control *sc)
3432 {
3433 struct pglist_data *pgdata = NODE_DATA(sc->nid);
3434 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
3435 bool bypass = false;
3436
3437 trace_android_vh_split_large_folio_bypass(&bypass);
3438 if (bypass)
3439 return 0;
3440 #ifdef CONFIG_MEMCG
3441 if (sc->memcg)
3442 ds_queue = &sc->memcg->deferred_split_queue;
3443 #endif
3444 return READ_ONCE(ds_queue->split_queue_len);
3445 }
3446
3447 static unsigned long deferred_split_scan(struct shrinker *shrink,
3448 struct shrink_control *sc)
3449 {
3450 struct pglist_data *pgdata = NODE_DATA(sc->nid);
3451 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
3452 unsigned long flags;
3453 LIST_HEAD(list);
3454 struct folio *folio, *next;
3455 int split = 0;
3456
3457 #ifdef CONFIG_MEMCG
3458 if (sc->memcg)
3459 ds_queue = &sc->memcg->deferred_split_queue;
3460 #endif
3461
3462 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3463 /* Take pin on all head pages to avoid freeing them under us */
3464 list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
3465 _deferred_list) {
3466 if (folio_try_get(folio)) {
3467 list_move(&folio->_deferred_list, &list);
3468 } else {
3469 /* We lost race with folio_put() */
3470 list_del_init(&folio->_deferred_list);
3471 ds_queue->split_queue_len--;
3472 }
3473 if (!--sc->nr_to_scan)
3474 break;
3475 }
3476 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3477
3478 list_for_each_entry_safe(folio, next, &list, _deferred_list) {
3479 if (!folio_trylock(folio))
3480 goto next;
3481 /* split_huge_page() removes page from list on success */
3482 if (!split_folio(folio))
3483 split++;
3484 folio_unlock(folio);
3485 next:
3486 folio_put(folio);
3487 }
3488
3489 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3490 list_splice_tail(&list, &ds_queue->split_queue);
3491 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3492
3493 /*
3494 * Stop the shrinker if we didn't split any page, but the queue is empty.
3495 * This can happen if pages were freed under us.
3496 */
3497 if (!split && list_empty(&ds_queue->split_queue))
3498 return SHRINK_STOP;
3499 return split;
3500 }
3501
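/*
 * The deferred split shrinker is NUMA- and memcg-aware, so each node (and
 * each memcg, with CONFIG_MEMCG) drains its own deferred split queue.
 * SHRINKER_NONSLAB marks it as tracking non-slab objects, so memcg reclaim
 * can still invoke it when kernel memory accounting is disabled. It is
 * registered with the shrinker core during THP initialisation.
 */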
static struct shrinker deferred_split_shrinker = {
	.count_objects = deferred_split_count,
	.scan_objects = deferred_split_scan,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
		 SHRINKER_NONSLAB,
};

#ifdef CONFIG_DEBUG_FS
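/*
 * Walk every managed zone PFN by PFN and try to split each large LRU folio
 * found (hugetlb folios are skipped). Triggered from debugfs by writing "1"
 * to split_huge_pages; see split_huge_pages_write() below.
 */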
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	struct folio *folio;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_zone(zone) {
		if (!managed_zone(zone))
			continue;
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			int nr_pages;

			page = pfn_to_online_page(pfn);
			if (!page || PageTail(page))
				continue;
			folio = page_folio(page);
			if (!folio_try_get(folio))
				continue;

			if (unlikely(page_folio(page) != folio))
				goto next;

			if (zone != folio_zone(folio))
				goto next;

			if (!folio_test_large(folio)
				|| folio_test_hugetlb(folio)
				|| !folio_test_lru(folio))
				goto next;

			total++;
			folio_lock(folio);
			nr_pages = folio_nr_pages(folio);
			if (!split_folio(folio))
				split++;
			pfn += nr_pages - 1;
			folio_unlock(folio);
next:
			folio_put(folio);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}

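/*
 * VMAs that can never contain splittable THPs: special huge mappings
 * (e.g. DAX), VM_IO mappings and hugetlb mappings.
 */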
static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
	       is_vm_hugetlb_page(vma);
}

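/*
 * Split the THPs mapped in [vaddr_start, vaddr_end) of the process
 * identified by @pid. Addresses are rounded down to page boundaries and
 * VMAs that cannot hold splittable THPs are skipped. Returns 0 on success,
 * or a negative errno if the task or its mm cannot be found.
 */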
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	/* Find the task_struct from pid */
	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (!task) {
		rcu_read_unlock();
		ret = -ESRCH;
		goto out;
	}
	get_task_struct(task);
	rcu_read_unlock();

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * Always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = vma_lookup(mm, addr);
		struct page *page;
		struct folio *folio;

		if (!vma)
			break;

		/* Skip special VMAs and hugetlb VMAs */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		/* FOLL_DUMP to ignore special (like zero) pages */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

		if (IS_ERR_OR_NULL(page))
			continue;

		folio = page_folio(page);
		if (!is_transparent_hugepage(folio))
			goto next;

		total++;
		if (!can_split_folio(folio, NULL))
			goto next;

		if (!folio_trylock(folio))
			goto next;

		if (!split_folio(folio))
			split++;

		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}

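/*
 * Split the file-backed THPs of @file_path that fall in the page cache
 * offset range [off_start, off_end). The file is opened read-only only to
 * reach its address_space; folios that are not large are skipped. Returns
 * 0 on success, or -EINVAL if the path cannot be resolved or opened.
 */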
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;

	for (index = off_start; index < off_end; index += nr_pages) {
		struct folio *folio = filemap_get_folio(mapping, index);

		nr_pages = 1;
		if (IS_ERR(folio))
			continue;

		if (!folio_test_large(folio))
			goto next;

		total++;
		nr_pages = folio_nr_pages(folio);

		if (!folio_trylock(folio))
			goto next;

		if (!split_folio(folio))
			split++;

		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);
	return ret;
}

#define MAX_INPUT_BUF_SZ 255

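/*
 * Debugfs write handler for <debugfs>/split_huge_pages. Based on the
 * parsing below, the accepted input formats are:
 *
 *   echo 1 > split_huge_pages
 *	split every THP in the system (split_huge_pages_all())
 *
 *   echo "<pid>,0x<vaddr_start>,0x<vaddr_end>" > split_huge_pages
 *	split the THPs mapped in that virtual address range of <pid>
 *	(split_huge_pages_pid())
 *
 *   echo "/path/to/file,0x<off_start>,0x<off_end>" > split_huge_pages
 *	split the file-backed THPs in that page cache offset range
 *	(split_huge_pages_in_file())
 *
 * Addresses and offsets are parsed as hexadecimal; see the sscanf()
 * format strings below.
 */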
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppops)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		tok = strsep(&buf, ",");
		if (tok) {
			strcpy(file_path, tok);
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
		if (ret != 2) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;
}

static const struct file_operations split_huge_pages_fops = {
	.owner = THIS_MODULE,
	.write = split_huge_pages_write,
	.llseek = no_llseek,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
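/*
 * Replace a present, PMD-mapped THP with a PMD migration entry, preserving
 * the write/exclusive, young, dirty, soft-dirty and uffd-wp state in the
 * swap-style entry. Returns 0 on success (or when the mapping is not a
 * PMD-mapped THP), or -EBUSY if an exclusive anon page cannot be shared
 * because it may be pinned; in that case the original PMD is restored.
 */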
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		folio_mark_dirty(folio);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	folio_remove_rmap_pmd(folio, page, vma);
	folio_put(folio);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

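/*
 * Restore a PMD migration entry to a present PMD pointing at @new,
 * rebuilding the write, young, dirty, soft-dirty and uffd-wp bits from the
 * migration entry and re-adding the rmap for the anon or file folio.
 */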
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct folio *folio = page_folio(new);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	folio_get(folio);
	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: this may contain setting soft-dirty on some archs */
	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (folio_test_anon(folio)) {
		rmap_t rmap_flags = RMAP_NONE;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
	} else {
		folio_add_file_rmap_pmd(folio, new, vma);
	}
	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif