1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
 * for anonymous memory. For process policy a process counter
 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
55
56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that.
66 */
67
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70 #include <linux/mempolicy.h>
71 #include <linux/pagewalk.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/ptrace.h>
89 #include <linux/swap.h>
90 #include <linux/seq_file.h>
91 #include <linux/proc_fs.h>
92 #include <linux/migrate.h>
93 #include <linux/ksm.h>
94 #include <linux/rmap.h>
95 #include <linux/security.h>
96 #include <linux/syscalls.h>
97 #include <linux/ctype.h>
98 #include <linux/mm_inline.h>
99 #include <linux/mmu_notifier.h>
100 #include <linux/printk.h>
101 #include <linux/swapops.h>
102
103 #include <asm/tlbflush.h>
104 #include <linux/uaccess.h>
105
106 #include "internal.h"
107
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/* Slab cache that mpol_new() allocates policies from and __mpol_put() frees to. */
static struct kmem_cache *policy_cache;
/* NOTE(review): not referenced in this part of the file; purpose to be confirmed. */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

/*
 * Per-node policies, indexed by node id.  get_task_policy() returns the
 * entry for the local node when the task has no policy of its own; an
 * entry whose mode is still zero is treated as not yet initialised.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129
130 /**
131 * numa_map_to_online_node - Find closest online node
132 * @node: Node id to start the search
133 *
134 * Lookup the next closest node by distance if @nid is not online.
135 */
numa_map_to_online_node(int node)136 int numa_map_to_online_node(int node)
137 {
138 int min_dist = INT_MAX, dist, n, min_node;
139
140 if (node == NUMA_NO_NODE || node_online(node))
141 return node;
142
143 min_node = node;
144 for_each_online_node(n) {
145 dist = node_distance(node, n);
146 if (dist < min_dist) {
147 min_dist = dist;
148 min_node = n;
149 }
150 }
151
152 return min_node;
153 }
154 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
155
get_task_policy(struct task_struct * p)156 struct mempolicy *get_task_policy(struct task_struct *p)
157 {
158 struct mempolicy *pol = p->mempolicy;
159 int node;
160
161 if (pol)
162 return pol;
163
164 node = numa_node_id();
165 if (node != NUMA_NO_NODE) {
166 pol = &preferred_node_policy[node];
167 /* preferred_node_policy is not initialised early in boot */
168 if (pol->mode)
169 return pol;
170 }
171
172 return &default_policy;
173 }
174
/*
 * Per-mode operations, looked up as mpol_ops[pol->mode].
 *
 * @create: invoked from mpol_set_nodemask() to store the resolved nodemask
 *          (or NULL for local allocation) in a new policy; returns 0 or a
 *          negative errno.
 * @rebind: invoked from mpol_rebind_policy() to adapt an existing policy
 *          to a new set of nodes.
 *
 * This is only a forward declaration of the table; the initialised
 * definition follows once the callbacks have been defined.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
179
mpol_store_user_nodemask(const struct mempolicy * pol)180 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
181 {
182 return pol->flags & MPOL_MODE_FLAGS;
183 }
184
/*
 * Compute @ret from the user mask @orig relative to @rel: @orig is first
 * folded with nodes_fold() to the weight of @rel, and the folded mask is
 * then mapped onto @rel with nodes_onto().
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t folded;
	int rel_weight = nodes_weight(*rel);

	nodes_fold(folded, *orig, rel_weight);
	nodes_onto(*ret, folded, *rel);
}
192
mpol_new_interleave(struct mempolicy * pol,const nodemask_t * nodes)193 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
194 {
195 if (nodes_empty(*nodes))
196 return -EINVAL;
197 pol->v.nodes = *nodes;
198 return 0;
199 }
200
mpol_new_preferred(struct mempolicy * pol,const nodemask_t * nodes)201 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
202 {
203 if (!nodes)
204 pol->flags |= MPOL_F_LOCAL; /* local allocation */
205 else if (nodes_empty(*nodes))
206 return -EINVAL; /* no allowed nodes */
207 else
208 pol->v.preferred_node = first_node(*nodes);
209 return 0;
210 }
211
mpol_new_bind(struct mempolicy * pol,const nodemask_t * nodes)212 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
213 {
214 if (nodes_empty(*nodes))
215 return -EINVAL;
216 pol->v.nodes = *nodes;
217 return 0;
218 }
219
220 /*
221 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
222 * any, for the new policy. mpol_new() has already validated the nodes
223 * parameter with respect to the policy mode and flags. But, we need to
224 * handle an empty nodemask with MPOL_PREFERRED here.
225 *
226 * Must be called holding task's alloc_lock to protect task's mems_allowed
227 * and mempolicy. May also be called holding the mmap_lock for write.
228 */
mpol_set_nodemask(struct mempolicy * pol,const nodemask_t * nodes,struct nodemask_scratch * nsc)229 static int mpol_set_nodemask(struct mempolicy *pol,
230 const nodemask_t *nodes, struct nodemask_scratch *nsc)
231 {
232 int ret;
233
234 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
235 if (pol == NULL)
236 return 0;
237 /* Check N_MEMORY */
238 nodes_and(nsc->mask1,
239 cpuset_current_mems_allowed, node_states[N_MEMORY]);
240
241 VM_BUG_ON(!nodes);
242 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
243 nodes = NULL; /* explicit local allocation */
244 else {
245 if (pol->flags & MPOL_F_RELATIVE_NODES)
246 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
247 else
248 nodes_and(nsc->mask2, *nodes, nsc->mask1);
249
250 if (mpol_store_user_nodemask(pol))
251 pol->w.user_nodemask = *nodes;
252 else
253 pol->w.cpuset_mems_allowed =
254 cpuset_current_mems_allowed;
255 }
256
257 if (nodes)
258 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259 else
260 ret = mpol_ops[pol->mode].create(pol, NULL);
261 return ret;
262 }
263
264 /*
265 * This function just creates a new policy, does some check and simple
266 * initialization. You must invoke mpol_set_nodemask() to set nodes.
267 */
mpol_new(unsigned short mode,unsigned short flags,nodemask_t * nodes)268 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 nodemask_t *nodes)
270 {
271 struct mempolicy *policy;
272
273 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
274 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
275
276 if (mode == MPOL_DEFAULT) {
277 if (nodes && !nodes_empty(*nodes))
278 return ERR_PTR(-EINVAL);
279 return NULL;
280 }
281 VM_BUG_ON(!nodes);
282
283 /*
284 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
285 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
286 * All other modes require a valid pointer to a non-empty nodemask.
287 */
288 if (mode == MPOL_PREFERRED) {
289 if (nodes_empty(*nodes)) {
290 if (((flags & MPOL_F_STATIC_NODES) ||
291 (flags & MPOL_F_RELATIVE_NODES)))
292 return ERR_PTR(-EINVAL);
293 }
294 } else if (mode == MPOL_LOCAL) {
295 if (!nodes_empty(*nodes) ||
296 (flags & MPOL_F_STATIC_NODES) ||
297 (flags & MPOL_F_RELATIVE_NODES))
298 return ERR_PTR(-EINVAL);
299 mode = MPOL_PREFERRED;
300 } else if (nodes_empty(*nodes))
301 return ERR_PTR(-EINVAL);
302 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
303 if (!policy)
304 return ERR_PTR(-ENOMEM);
305 atomic_set(&policy->refcnt, 1);
306 policy->mode = mode;
307 policy->flags = flags;
308
309 return policy;
310 }
311
312 /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * p)313 void __mpol_put(struct mempolicy *p)
314 {
315 if (!atomic_dec_and_test(&p->refcnt))
316 return;
317 kmem_cache_free(policy_cache, p);
318 }
319
/* ->rebind() for MPOL_DEFAULT: intentionally does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
323
mpol_rebind_nodemask(struct mempolicy * pol,const nodemask_t * nodes)324 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
325 {
326 nodemask_t tmp;
327
328 if (pol->flags & MPOL_F_STATIC_NODES)
329 nodes_and(tmp, pol->w.user_nodemask, *nodes);
330 else if (pol->flags & MPOL_F_RELATIVE_NODES)
331 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332 else {
333 nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
334 *nodes);
335 pol->w.cpuset_mems_allowed = *nodes;
336 }
337
338 if (nodes_empty(tmp))
339 tmp = *nodes;
340
341 pol->v.nodes = tmp;
342 }
343
mpol_rebind_preferred(struct mempolicy * pol,const nodemask_t * nodes)344 static void mpol_rebind_preferred(struct mempolicy *pol,
345 const nodemask_t *nodes)
346 {
347 nodemask_t tmp;
348
349 if (pol->flags & MPOL_F_STATIC_NODES) {
350 int node = first_node(pol->w.user_nodemask);
351
352 if (node_isset(node, *nodes)) {
353 pol->v.preferred_node = node;
354 pol->flags &= ~MPOL_F_LOCAL;
355 } else
356 pol->flags |= MPOL_F_LOCAL;
357 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
358 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
359 pol->v.preferred_node = first_node(tmp);
360 } else if (!(pol->flags & MPOL_F_LOCAL)) {
361 pol->v.preferred_node = node_remap(pol->v.preferred_node,
362 pol->w.cpuset_mems_allowed,
363 *nodes);
364 pol->w.cpuset_mems_allowed = *nodes;
365 }
366 }
367
368 /*
369 * mpol_rebind_policy - Migrate a policy to a different set of nodes
370 *
371 * Per-vma policies are protected by mmap_lock. Allocations using per-task
372 * policies are protected by task->mems_allowed_seq to prevent a premature
373 * OOM/allocation failure due to parallel nodemask modification.
374 */
mpol_rebind_policy(struct mempolicy * pol,const nodemask_t * newmask)375 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
376 {
377 if (!pol)
378 return;
379 if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
380 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
381 return;
382
383 mpol_ops[pol->mode].rebind(pol, newmask);
384 }
385
386 /*
387 * Wrapper for mpol_rebind_policy() that just requires task
388 * pointer, and updates task mempolicy.
389 *
390 * Called with task's alloc_lock held.
391 */
392
mpol_rebind_task(struct task_struct * tsk,const nodemask_t * new)393 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
394 {
395 mpol_rebind_policy(tsk->mempolicy, new);
396 }
397
398 /*
399 * Rebind each vma in mm to new nodemask.
400 *
401 * Call holding a reference to mm. Takes mm->mmap_lock during call.
402 */
403
mpol_rebind_mm(struct mm_struct * mm,nodemask_t * new)404 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
405 {
406 struct vm_area_struct *vma;
407
408 mmap_write_lock(mm);
409 for (vma = mm->mmap; vma; vma = vma->vm_next)
410 mpol_rebind_policy(vma->vm_policy, new);
411 mmap_write_unlock(mm);
412 }
413
/*
 * Per-mode callbacks, indexed by policy mode.
 *
 * MPOL_DEFAULT has no ->create(): mpol_new() returns NULL for that mode
 * and mpol_set_nodemask() returns early for a NULL policy.  There is no
 * MPOL_LOCAL entry because mpol_new() stores such policies as
 * MPOL_PREFERRED.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};
431
/* Defined further down; needed by the page table walk callbacks below. */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* State shared by the queue_pages_* page table walk callbacks. */
struct queue_pages {
	struct list_head *pagelist;	/* list that isolated pages are added to */
	unsigned long flags;		/* MPOL_MF_* flags controlling the walk */
	nodemask_t *nmask;		/* nodes tested by queue_pages_required() */
	unsigned long start;		/* start of the requested range */
	unsigned long end;		/* end of the requested range */
	struct vm_area_struct *first;	/* first vma seen by the walk, NULL until then */
};
443
444 /*
445 * Check if the page's nid is in qp->nmask.
446 *
447 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
448 * in the invert of qp->nmask.
449 */
queue_pages_required(struct page * page,struct queue_pages * qp)450 static inline bool queue_pages_required(struct page *page,
451 struct queue_pages *qp)
452 {
453 int nid = page_to_nid(page);
454 unsigned long flags = qp->flags;
455
456 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
457 }
458
459 /*
460 * queue_pages_pmd() has four possible return values:
461 * 0 - pages are placed on the right node or queued successfully.
462 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
463 * specified.
464 * 2 - THP was split.
465 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
466 * existing page was already on a node that does not follow the
467 * policy.
468 */
queue_pages_pmd(pmd_t * pmd,spinlock_t * ptl,unsigned long addr,unsigned long end,struct mm_walk * walk)469 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
470 unsigned long end, struct mm_walk *walk)
471 __releases(ptl)
472 {
473 int ret = 0;
474 struct page *page;
475 struct queue_pages *qp = walk->private;
476 unsigned long flags;
477
478 if (unlikely(is_pmd_migration_entry(*pmd))) {
479 ret = -EIO;
480 goto unlock;
481 }
482 page = pmd_page(*pmd);
483 if (is_huge_zero_page(page)) {
484 spin_unlock(ptl);
485 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
486 ret = 2;
487 goto out;
488 }
489 if (!queue_pages_required(page, qp))
490 goto unlock;
491
492 flags = qp->flags;
493 /* go to thp migration */
494 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
495 if (!vma_migratable(walk->vma) ||
496 migrate_page_add(page, qp->pagelist, flags)) {
497 ret = 1;
498 goto unlock;
499 }
500 } else
501 ret = -EIO;
502 unlock:
503 spin_unlock(ptl);
504 out:
505 return ret;
506 }
507
/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * Installed as the .pmd_entry callback of queue_pages_walk_ops.  A huge
 * pmd is handed to queue_pages_pmd() first; unless that reports a split
 * (2), its result is returned directly.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* queue_pages_pmd() releases ptl on every path */
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	/* mapped_pte keeps the initial mapping for the unmap; pte advances */
	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range. Still
			 * need migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			/* no MOVE flag: stop early, reported as -EIO below */
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	/* addr != end means the loop was left early on a misplaced page */
	return addr != end ? -EIO : 0;
}
582
/*
 * .hugetlb_entry callback of queue_pages_walk_ops.
 *
 * Returns 0 when the huge page needs no action or was handled, -EIO when
 * only MPOL_MF_STRICT was requested and the page is misplaced, and 1 when
 * a misplaced page could not be queued (vma not migratable, or isolation
 * failed with MPOL_MF_STRICT set).  Without CONFIG_HUGETLB_PAGE this must
 * never be called.
 */
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	const unsigned long flags = qp->flags & MPOL_MF_VALID;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;
	bool may_move;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	/*
	 * STRICT alone means only detecting misplaced page and no
	 * need to further check other vma.
	 */
	if (flags == MPOL_MF_STRICT) {
		ret = -EIO;
		goto unlock;
	}

	/*
	 * Must be STRICT with MOVE*, otherwise .test_walk() have
	 * stopped walking current vma.
	 * Detecting misplaced page but allow migrating pages which
	 * have been queued.
	 */
	if (!vma_migratable(walk->vma)) {
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	may_move = (flags & MPOL_MF_MOVE_ALL) ||
		   ((flags & MPOL_MF_MOVE) && page_mapcount(page) == 1);

	/*
	 * Failed to isolate page but allow migrating pages
	 * which have been queued.
	 */
	if (may_move && !isolate_huge_page(page, qp->pagelist) &&
	    (flags & MPOL_MF_STRICT))
		ret = 1;
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}
641
#ifdef CONFIG_NUMA_BALANCING
/*
 * Mark the range [addr, end) of @vma inaccessible so that a later NUMA
 * hinting fault clears the protection again.  Depending on these faults,
 * pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the value reported by change_protection(), which is also added
 * to the NUMA_PTE_UPDATES event counter when non-zero.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated = change_protection(vma, addr, end, PAGE_NONE,
					   MM_CP_PROT_NUMA);

	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
/* Stub used when NUMA balancing is not configured: nothing is updated. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
670
queue_pages_test_walk(unsigned long start,unsigned long end,struct mm_walk * walk)671 static int queue_pages_test_walk(unsigned long start, unsigned long end,
672 struct mm_walk *walk)
673 {
674 struct vm_area_struct *vma = walk->vma;
675 struct queue_pages *qp = walk->private;
676 unsigned long endvma = vma->vm_end;
677 unsigned long flags = qp->flags;
678
679 /* range check first */
680 VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
681
682 if (!qp->first) {
683 qp->first = vma;
684 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
685 (qp->start < vma->vm_start))
686 /* hole at head side of range */
687 return -EFAULT;
688 }
689 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
690 ((vma->vm_end < qp->end) &&
691 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
692 /* hole at middle or tail of range */
693 return -EFAULT;
694
695 /*
696 * Need check MPOL_MF_STRICT to return -EIO if possible
697 * regardless of vma_migratable
698 */
699 if (!vma_migratable(vma) &&
700 !(flags & MPOL_MF_STRICT))
701 return 1;
702
703 if (endvma > end)
704 endvma = end;
705
706 if (flags & MPOL_MF_LAZY) {
707 /* Similar to task_numa_work, skip inaccessible VMAs */
708 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
709 !(vma->vm_flags & VM_MIXEDMAP))
710 change_prot_numa(vma, start, endvma);
711 return 1;
712 }
713
714 /* queue pages from current vma */
715 if (flags & MPOL_MF_VALID)
716 return 0;
717 return 1;
718 }
719
/*
 * Page walk callbacks that queue_pages_range() passes to
 * walk_page_range(); struct queue_pages is the walk's private data.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry = queue_pages_hugetlb,
	.pmd_entry = queue_pages_pte_range,
	.test_walk = queue_pages_test_walk,
};
725
726 /*
727 * Walk through page tables and collect pages to be migrated.
728 *
729 * If pages found in a given range are on a set of nodes (determined by
730 * @nodes and @flags,) it's isolated and queued to the pagelist which is
731 * passed via @private.
732 *
733 * queue_pages_range() has three possible return values:
734 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
735 * specified.
736 * 0 - queue pages successfully or no misplaced page.
737 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
738 * memory range specified by nodemask and maxnode points outside
739 * your accessible address space (-EFAULT)
740 */
741 static int
queue_pages_range(struct mm_struct * mm,unsigned long start,unsigned long end,nodemask_t * nodes,unsigned long flags,struct list_head * pagelist)742 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
743 nodemask_t *nodes, unsigned long flags,
744 struct list_head *pagelist)
745 {
746 int err;
747 struct queue_pages qp = {
748 .pagelist = pagelist,
749 .flags = flags,
750 .nmask = nodes,
751 .start = start,
752 .end = end,
753 .first = NULL,
754 };
755
756 err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
757
758 if (!qp.first)
759 /* whole range in hole */
760 err = -EFAULT;
761
762 return err;
763 }
764
765 /*
766 * Apply policy to a single VMA
767 * This must be called with the mmap_lock held for writing.
768 */
vma_replace_policy(struct vm_area_struct * vma,struct mempolicy * pol)769 static int vma_replace_policy(struct vm_area_struct *vma,
770 struct mempolicy *pol)
771 {
772 int err;
773 struct mempolicy *old;
774 struct mempolicy *new;
775
776 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
777 vma->vm_start, vma->vm_end, vma->vm_pgoff,
778 vma->vm_ops, vma->vm_file,
779 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
780
781 new = mpol_dup(pol);
782 if (IS_ERR(new))
783 return PTR_ERR(new);
784
785 if (vma->vm_ops && vma->vm_ops->set_policy) {
786 err = vma->vm_ops->set_policy(vma, new);
787 if (err)
788 goto err_out;
789 }
790
791 old = vma->vm_policy;
792 vma->vm_policy = new; /* protected by mmap_lock */
793 mpol_put(old);
794
795 return 0;
796 err_out:
797 mpol_put(new);
798 return err;
799 }
800
/*
 * Step 2: apply policy to a range and do splits.
 *
 * Walks the vmas overlapping [start, end) and makes each overlapping part
 * use @new_pol: first by trying to merge with a neighbour via vma_merge(),
 * otherwise by splitting the vma at the range boundaries and replacing
 * its policy.  Returns 0 on success or the first error from split_vma()
 * or vma_replace_policy().
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	VM_BUG_ON(!vma);

	/* prev is the merge candidate: vma itself when the range starts inside it */
	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		/* clamp the work to the part of this vma inside the range */
		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		/* page offset corresponding to vmstart within this vma */
		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx,
				 vma_anon_name(vma));
		if (prev) {
			/* merged: continue from the merged vma */
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

out:
	return err;
}
861
862 /* Set the process memory policy */
do_set_mempolicy(unsigned short mode,unsigned short flags,nodemask_t * nodes)863 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
864 nodemask_t *nodes)
865 {
866 struct mempolicy *new, *old;
867 NODEMASK_SCRATCH(scratch);
868 int ret;
869
870 if (!scratch)
871 return -ENOMEM;
872
873 new = mpol_new(mode, flags, nodes);
874 if (IS_ERR(new)) {
875 ret = PTR_ERR(new);
876 goto out;
877 }
878
879 ret = mpol_set_nodemask(new, nodes, scratch);
880 if (ret) {
881 mpol_put(new);
882 goto out;
883 }
884 task_lock(current);
885 old = current->mempolicy;
886 current->mempolicy = new;
887 if (new && new->mode == MPOL_INTERLEAVE)
888 current->il_prev = MAX_NUMNODES-1;
889 task_unlock(current);
890 mpol_put(old);
891 ret = 0;
892 out:
893 NODEMASK_SCRATCH_FREE(scratch);
894 return ret;
895 }
896
897 /*
898 * Return nodemask for policy for get_mempolicy() query
899 *
900 * Called with task's alloc_lock held
901 */
get_policy_nodemask(struct mempolicy * p,nodemask_t * nodes)902 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
903 {
904 nodes_clear(*nodes);
905 if (p == &default_policy)
906 return;
907
908 switch (p->mode) {
909 case MPOL_BIND:
910 case MPOL_INTERLEAVE:
911 *nodes = p->v.nodes;
912 break;
913 case MPOL_PREFERRED:
914 if (!(p->flags & MPOL_F_LOCAL))
915 node_set(p->v.preferred_node, *nodes);
916 /* else return empty node mask for local allocation */
917 break;
918 default:
919 BUG();
920 }
921 }
922
lookup_node(struct mm_struct * mm,unsigned long addr)923 static int lookup_node(struct mm_struct *mm, unsigned long addr)
924 {
925 struct page *p = NULL;
926 int err;
927
928 int locked = 1;
929 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
930 if (err > 0) {
931 err = page_to_nid(p);
932 put_page(p);
933 }
934 if (locked)
935 mmap_read_unlock(mm);
936 return err;
937 }
938
/*
 * Retrieve NUMA policy
 *
 * @policy: receives the policy mode (with mode flags), or a node id when
 *          MPOL_F_NODE is set
 * @nmask:  receives the policy's nodemask; may be NULL unless
 *          MPOL_F_MEMS_ALLOWED is requested
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set
 * @flags:  combination of MPOL_F_NODE, MPOL_F_ADDR and MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for an invalid flag/addr combination,
 * -EFAULT when no vma covers @addr, or the error from lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MEMS_ALLOWED cannot be combined with the other flags */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0; /* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy; /* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_lock, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			/* vma == NULL also tells the exit path not to unlock */
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* report the next node after current->il_prev in the policy nodes */
			*policy = next_node_in(current->il_prev, pol->v.nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

out:
	mpol_cond_put(pol);
	/* vma is still set only when the mmap_lock taken above is still held */
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
1035
1036 #ifdef CONFIG_MIGRATION
1037 /*
1038 * page migration, thp tail pages can be passed.
1039 */
migrate_page_add(struct page * page,struct list_head * pagelist,unsigned long flags)1040 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1041 unsigned long flags)
1042 {
1043 struct page *head = compound_head(page);
1044 /*
1045 * Avoid migrating a page that is shared with others.
1046 */
1047 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1048 if (!isolate_lru_page(head)) {
1049 list_add_tail(&head->lru, pagelist);
1050 mod_node_page_state(page_pgdat(head),
1051 NR_ISOLATED_ANON + page_is_file_lru(head),
1052 thp_nr_pages(head));
1053 } else if (flags & MPOL_MF_STRICT) {
1054 /*
1055 * Non-movable page may reach here. And, there may be
1056 * temporary off LRU pages or non-LRU movable pages.
1057 * Treat them as unmovable pages since they can't be
1058 * isolated, so they can't be moved at the moment. It
1059 * should return -EIO for this case too.
1060 */
1061 return -EIO;
1062 }
1063 }
1064
1065 return 0;
1066 }
1067
1068 /*
1069 * Migrate pages from one node to a target node.
1070 * Returns error or the number of pages not migrated.
1071 */
migrate_to_node(struct mm_struct * mm,int source,int dest,int flags)1072 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1073 int flags)
1074 {
1075 nodemask_t nmask;
1076 LIST_HEAD(pagelist);
1077 int err = 0;
1078 struct migration_target_control mtc = {
1079 .nid = dest,
1080 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1081 };
1082
1083 nodes_clear(nmask);
1084 node_set(source, nmask);
1085
1086 /*
1087 * This does not "check" the range but isolates all pages that
1088 * need migration. Between passing in the full user address
1089 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1090 */
1091 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1092 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1093 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1094
1095 if (!list_empty(&pagelist)) {
1096 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1097 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1098 if (err)
1099 putback_movable_pages(&pagelist);
1100 }
1101
1102 return err;
1103 }
1104
1105 /*
1106 * Move pages between the two nodesets so as to preserve the physical
1107 * layout as much as possible.
1108 *
1109 * Returns the number of page that could not be moved.
1110 */
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)1111 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1112 const nodemask_t *to, int flags)
1113 {
1114 int busy = 0;
1115 int err;
1116 nodemask_t tmp;
1117
1118 err = migrate_prep();
1119 if (err)
1120 return err;
1121
1122 mmap_read_lock(mm);
1123
1124 /*
1125 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1126 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1127 * bit in 'tmp', and return that <source, dest> pair for migration.
1128 * The pair of nodemasks 'to' and 'from' define the map.
1129 *
1130 * If no pair of bits is found that way, fallback to picking some
1131 * pair of 'source' and 'dest' bits that are not the same. If the
1132 * 'source' and 'dest' bits are the same, this represents a node
1133 * that will be migrating to itself, so no pages need move.
1134 *
1135 * If no bits are left in 'tmp', or if all remaining bits left
1136 * in 'tmp' correspond to the same bit in 'to', return false
1137 * (nothing left to migrate).
1138 *
1139 * This lets us pick a pair of nodes to migrate between, such that
1140 * if possible the dest node is not already occupied by some other
1141 * source node, minimizing the risk of overloading the memory on a
1142 * node that would happen if we migrated incoming memory to a node
1143 * before migrating outgoing memory source that same node.
1144 *
1145 * A single scan of tmp is sufficient. As we go, we remember the
1146 * most recent <s, d> pair that moved (s != d). If we find a pair
1147 * that not only moved, but what's better, moved to an empty slot
1148 * (d is not set in tmp), then we break out then, with that pair.
1149 * Otherwise when we finish scanning from_tmp, we at least have the
1150 * most recent <s, d> pair that moved. If we get all the way through
1151 * the scan of tmp without finding any node that moved, much less
1152 * moved to an empty node, then there is nothing left worth migrating.
1153 */
1154
1155 tmp = *from;
1156 while (!nodes_empty(tmp)) {
1157 int s,d;
1158 int source = NUMA_NO_NODE;
1159 int dest = 0;
1160
1161 for_each_node_mask(s, tmp) {
1162
1163 /*
1164 * do_migrate_pages() tries to maintain the relative
1165 * node relationship of the pages established between
1166 * threads and memory areas.
1167 *
1168 * However if the number of source nodes is not equal to
1169 * the number of destination nodes we can not preserve
1170 * this node relative relationship. In that case, skip
1171 * copying memory from a node that is in the destination
1172 * mask.
1173 *
1174 * Example: [2,3,4] -> [3,4,5] moves everything.
1175 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1176 */
1177
1178 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1179 (node_isset(s, *to)))
1180 continue;
1181
1182 d = node_remap(s, *from, *to);
1183 if (s == d)
1184 continue;
1185
1186 source = s; /* Node moved. Memorize */
1187 dest = d;
1188
1189 /* dest not in remaining from nodes? */
1190 if (!node_isset(dest, tmp))
1191 break;
1192 }
1193 if (source == NUMA_NO_NODE)
1194 break;
1195
1196 node_clear(source, tmp);
1197 err = migrate_to_node(mm, source, dest, flags);
1198 if (err > 0)
1199 busy += err;
1200 if (err < 0)
1201 break;
1202 }
1203 mmap_read_unlock(mm);
1204 if (err < 0)
1205 return err;
1206 return busy;
1207
1208 }
1209
1210 /*
1211 * Allocate a new page for page migration based on vma policy.
1212 * Start by assuming the page is mapped by the same vma as contains @start.
1213 * Search forward from there, if not. N.B., this assumes that the
1214 * list of pages handed to migrate_pages()--which is how we get here--
1215 * is in virtual address order.
1216 */
new_page(struct page * page,unsigned long start)1217 static struct page *new_page(struct page *page, unsigned long start)
1218 {
1219 struct vm_area_struct *vma;
1220 unsigned long address;
1221
1222 vma = find_vma(current->mm, start);
1223 while (vma) {
1224 address = page_address_in_vma(page, vma);
1225 if (address != -EFAULT)
1226 break;
1227 vma = vma->vm_next;
1228 }
1229
1230 if (PageHuge(page)) {
1231 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1232 vma, address);
1233 } else if (PageTransHuge(page)) {
1234 struct page *thp;
1235
1236 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1237 HPAGE_PMD_ORDER);
1238 if (!thp)
1239 return NULL;
1240 prep_transhuge_page(thp);
1241 return thp;
1242 }
1243 /*
1244 * if !vma, alloc_page_vma() will use task or system default policy
1245 */
1246 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1247 vma, address);
1248 }
1249 #else
1250
/*
 * Stub used when CONFIG_MIGRATION is not set: no page can be queued for
 * migration, so every request fails with -EIO.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}
1256
/*
 * Stub used when CONFIG_MIGRATION is not set: always reports -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1262
/*
 * Stub used when CONFIG_MIGRATION is not set: never provides a target page.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
1267 #endif
1268
/*
 * Apply a memory policy to the address range [start, start + len) of the
 * current process and, when MPOL_MF_MOVE or MPOL_MF_MOVE_ALL is given,
 * migrate the pages of that range that do not conform to it.
 *
 * @start:      page aligned start address
 * @len:        length in bytes, rounded up to a page multiple
 * @mode:       policy mode
 * @mode_flags: optional mode flags
 * @nmask:      nodes the policy refers to
 * @flags:      MPOL_MF_* flags
 *
 * Returns 0 on success or a negative error code.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int nr_failed = 0;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * "new" may legitimately be NULL here (see the default policy
	 * handling right below), so only touch it when it exists.
	 */
	if (new && (flags & MPOL_MF_LAZY))
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* On success the mmap lock stays held past this block. */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);
	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);
	if (err)
		goto up_out;

	if (!list_empty(&pagelist)) {
		WARN_ON_ONCE(flags & MPOL_MF_LAZY);
		nr_failed = migrate_pages(&pagelist, new_page, NULL,
			start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
		if (nr_failed)
			putback_movable_pages(&pagelist);
	}

	/*
	 * A positive result from queue_pages_range(), or pages that failed
	 * to migrate under MPOL_MF_STRICT, is reported as -EIO.
	 */
	if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
		err = -EIO;
	goto unlock;

up_out:
	/* Error path: hand back whatever had already been isolated. */
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
unlock:
	mmap_write_unlock(mm);
mpol_out:
	mpol_put(new);
	return err;
}
1371
1372 /*
1373 * User space interface with variable sized bitmaps for nodelists.
1374 */
1375
1376 /* Copy a node mask from user space. */
get_nodes(nodemask_t * nodes,const unsigned long __user * nmask,unsigned long maxnode)1377 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1378 unsigned long maxnode)
1379 {
1380 unsigned long k;
1381 unsigned long t;
1382 unsigned long nlongs;
1383 unsigned long endmask;
1384
1385 --maxnode;
1386 nodes_clear(*nodes);
1387 if (maxnode == 0 || !nmask)
1388 return 0;
1389 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1390 return -EINVAL;
1391
1392 nlongs = BITS_TO_LONGS(maxnode);
1393 if ((maxnode % BITS_PER_LONG) == 0)
1394 endmask = ~0UL;
1395 else
1396 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1397
1398 /*
1399 * When the user specified more nodes than supported just check
1400 * if the non supported part is all zero.
1401 *
1402 * If maxnode have more longs than MAX_NUMNODES, check
1403 * the bits in that area first. And then go through to
1404 * check the rest bits which equal or bigger than MAX_NUMNODES.
1405 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1406 */
1407 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1408 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1409 if (get_user(t, nmask + k))
1410 return -EFAULT;
1411 if (k == nlongs - 1) {
1412 if (t & endmask)
1413 return -EINVAL;
1414 } else if (t)
1415 return -EINVAL;
1416 }
1417 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1418 endmask = ~0UL;
1419 }
1420
1421 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1422 unsigned long valid_mask = endmask;
1423
1424 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1425 if (get_user(t, nmask + nlongs - 1))
1426 return -EFAULT;
1427 if (t & valid_mask)
1428 return -EINVAL;
1429 }
1430
1431 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1432 return -EFAULT;
1433 nodes_addr(*nodes)[nlongs-1] &= endmask;
1434 return 0;
1435 }
1436
1437 /* Copy a kernel node mask to user space */
copy_nodes_to_user(unsigned long __user * mask,unsigned long maxnode,nodemask_t * nodes)1438 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1439 nodemask_t *nodes)
1440 {
1441 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1442 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1443
1444 if (copy > nbytes) {
1445 if (copy > PAGE_SIZE)
1446 return -EINVAL;
1447 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1448 return -EFAULT;
1449 copy = nbytes;
1450 }
1451 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1452 }
1453
/*
 * Common implementation of the native and compat mbind entry points:
 * splits @mode into policy mode and mode flags, validates both, fetches
 * the user node mask and hands over to do_mbind().
 *
 * Returns the result of do_mbind(), -EINVAL for an unknown mode or for
 * MPOL_F_STATIC_NODES combined with MPOL_F_RELATIVE_NODES, or the error
 * reported by get_nodes().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
	nodemask_t nodes;
	int err;

	mode &= ~MPOL_MODE_FLAGS;
	start = untagged_addr(start);

	if (mode >= MPOL_MAX)
		return -EINVAL;

	/* The two node-numbering flags are mutually exclusive. */
	if (mode_flags & MPOL_F_STATIC_NODES) {
		if (mode_flags & MPOL_F_RELATIVE_NODES)
			return -EINVAL;
	}

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
1475
/* mbind system call: passes its arguments unchanged to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1482
1483 /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1484 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1485 unsigned long maxnode)
1486 {
1487 int err;
1488 nodemask_t nodes;
1489 unsigned short flags;
1490
1491 flags = mode & MPOL_MODE_FLAGS;
1492 mode &= ~MPOL_MODE_FLAGS;
1493 if ((unsigned int)mode >= MPOL_MAX)
1494 return -EINVAL;
1495 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1496 return -EINVAL;
1497 err = get_nodes(&nodes, nmask, maxnode);
1498 if (err)
1499 return err;
1500 return do_set_mempolicy(mode, flags, &nodes);
1501 }
1502
/*
 * set_mempolicy system call: passes its arguments unchanged to
 * kernel_set_mempolicy().
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1508
/*
 * Common implementation of the native and compat migrate_pages entry
 * points: move the pages of the task identified by @pid (the caller
 * itself when @pid is 0) from the nodes in @old_nodes to the nodes in
 * @new_nodes.
 *
 * Returns the result of do_migrate_pages() or a negative error code:
 * -ENOMEM when no scratch space is available, -ESRCH when the task does
 * not exist, -EPERM when the caller lacks the required rights, -EINVAL
 * when no usable target node remains or the task has no mm, or whatever
 * get_nodes() / security_task_movememory() report.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	nodemask_t allowed;
	nodemask_t *from_mask;
	nodemask_t *to_mask;
	int err;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	from_mask = &scratch->mask1;
	to_mask = &scratch->mask2;

	err = get_nodes(from_mask, old_nodes, maxnode);
	if (!err)
		err = get_nodes(to_mask, new_nodes, maxnode);
	if (err)
		goto out;

	/* Look the task up and pin it while still inside the RCU section. */
	rcu_read_lock();
	tsk = pid ? find_task_by_vpid(pid) : current;
	if (!tsk) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(tsk);

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	/* Is the user allowed to access the target nodes? */
	allowed = cpuset_mems_allowed(tsk);
	if (!nodes_subset(*to_mask, allowed) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* Restrict the targets to the nodes the caller itself may use. */
	allowed = cpuset_mems_allowed(current);
	nodes_and(*to_mask, *to_mask, allowed);
	if (nodes_empty(*to_mask)) {
		err = -EINVAL;
		goto out_put;
	}

	err = security_task_movememory(tsk);
	if (err)
		goto out_put;

	/* From here on only the mm reference is needed. */
	mm = get_task_mm(tsk);
	put_task_struct(tsk);
	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, from_mask, to_mask,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
	mmput(mm);
	goto out;

out_put:
	put_task_struct(tsk);
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return err;
}
1596
/*
 * migrate_pages system call: passes its arguments unchanged to
 * kernel_migrate_pages().
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1603
1604
1605 /* Retrieve NUMA policy */
kernel_get_mempolicy(int __user * policy,unsigned long __user * nmask,unsigned long maxnode,unsigned long addr,unsigned long flags)1606 static int kernel_get_mempolicy(int __user *policy,
1607 unsigned long __user *nmask,
1608 unsigned long maxnode,
1609 unsigned long addr,
1610 unsigned long flags)
1611 {
1612 int err;
1613 int pval;
1614 nodemask_t nodes;
1615
1616 if (nmask != NULL && maxnode < nr_node_ids)
1617 return -EINVAL;
1618
1619 addr = untagged_addr(addr);
1620
1621 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1622
1623 if (err)
1624 return err;
1625
1626 if (policy && put_user(pval, policy))
1627 return -EFAULT;
1628
1629 if (nmask)
1630 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1631
1632 return err;
1633 }
1634
/*
 * get_mempolicy system call: passes its arguments unchanged to
 * kernel_get_mempolicy().
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1641
1642 #ifdef CONFIG_COMPAT
1643
/*
 * Compat get_mempolicy system call.
 *
 * The node mask is produced by kernel_get_mempolicy() into a scratch
 * buffer obtained from compat_alloc_user_space() and then written to the
 * caller's bitmap with compat_put_bitmap().
 *
 * Returns 0 on success, the error reported by kernel_get_mempolicy(), or
 * -EFAULT when one of the user copies fails.
 */
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		       compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode,
		       compat_ulong_t, addr, compat_ulong_t, flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;

		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);

		/*
		 * copy_from_user() and clear_user() report a count of bytes
		 * left over, not an errno.  Translate every failure into
		 * -EFAULT rather than OR-ing those counts into the return
		 * value.
		 */
		if (copy_from_user(bm, nm, copy_size))
			return -EFAULT;
		/* ensure entire bitmap is zeroed */
		if (clear_user(nmask, ALIGN(maxnode-1, 8) / 8))
			return -EFAULT;
		if (compat_put_bitmap(nmask, bm, nr_bits))
			return -EFAULT;
	}

	return err;
}
1673
/*
 * Compat set_mempolicy system call.
 *
 * Reads the caller's compat bitmap, stores it in native layout in a
 * buffer obtained from compat_alloc_user_space() and passes that buffer
 * to kernel_set_mempolicy().
 *
 * Returns the result of kernel_set_mempolicy(), or -EFAULT when one of
 * the user copies fails.
 */
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode)
{
	unsigned long __user *native_mask = NULL;
	unsigned long nr_bits, nr_bytes;
	DECLARE_BITMAP(bits, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);

	if (!nmask)
		goto call;

	if (compat_get_bitmap(bits, nmask, nr_bits))
		return -EFAULT;

	nr_bytes = ALIGN(nr_bits, BITS_PER_LONG) / 8;
	native_mask = compat_alloc_user_space(nr_bytes);
	if (copy_to_user(native_mask, bits, nr_bytes))
		return -EFAULT;

call:
	return kernel_set_mempolicy(mode, native_mask, nr_bits+1);
}
1694
/*
 * Compat mbind system call.
 *
 * Reads the caller's compat bitmap, stores it in native layout in a
 * buffer obtained from compat_alloc_user_space() and passes that buffer
 * to kernel_mbind().
 *
 * Returns the result of kernel_mbind(), or -EFAULT when one of the user
 * copies fails.
 */
COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
	unsigned long __user *native_mask = NULL;
	unsigned long nr_bits, nr_bytes;
	nodemask_t scratch;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);

	if (!nmask)
		goto call;

	if (compat_get_bitmap(nodes_addr(scratch), nmask, nr_bits))
		return -EFAULT;

	nr_bytes = ALIGN(nr_bits, BITS_PER_LONG) / 8;
	native_mask = compat_alloc_user_space(nr_bytes);
	if (copy_to_user(native_mask, nodes_addr(scratch), nr_bytes))
		return -EFAULT;

call:
	return kernel_mbind(start, len, mode, native_mask, nr_bits+1, flags);
}
1716
/*
 * Compat migrate_pages system call.
 *
 * Each compat bitmap that is present is read and stored in native layout
 * in space obtained from compat_alloc_user_space().  When both are
 * present a single allocation of twice the size holds them back to back.
 *
 * Returns the result of kernel_migrate_pages(), or -EFAULT when one of
 * the user copies fails.
 */
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
		       compat_ulong_t, maxnode,
		       const compat_ulong_t __user *, old_nodes,
		       const compat_ulong_t __user *, new_nodes)
{
	unsigned long __user *native_old = NULL;
	unsigned long __user *native_new = NULL;
	nodemask_t scratch;
	unsigned long nr_bits;
	unsigned long nr_bytes;

	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
	nr_bytes = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (old_nodes) {
		if (compat_get_bitmap(nodes_addr(scratch), old_nodes, nr_bits))
			return -EFAULT;

		if (new_nodes) {
			/* One allocation: old mask first, new mask behind it. */
			native_old = compat_alloc_user_space(nr_bytes * 2);
			native_new = native_old +
				     nr_bytes / sizeof(unsigned long);
		} else {
			native_old = compat_alloc_user_space(nr_bytes);
		}

		if (copy_to_user(native_old, nodes_addr(scratch), nr_bytes))
			return -EFAULT;
	}

	if (new_nodes) {
		if (compat_get_bitmap(nodes_addr(scratch), new_nodes, nr_bits))
			return -EFAULT;
		if (native_new == NULL)
			native_new = compat_alloc_user_space(nr_bytes);
		if (copy_to_user(native_new, nodes_addr(scratch), nr_bytes))
			return -EFAULT;
	}

	return kernel_migrate_pages(pid, nr_bits + 1, native_old, native_new);
}
1749
1750 #endif /* CONFIG_COMPAT */
1751
vma_migratable(struct vm_area_struct * vma)1752 bool vma_migratable(struct vm_area_struct *vma)
1753 {
1754 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1755 return false;
1756
1757 /*
1758 * DAX device mappings require predictable access latency, so avoid
1759 * incurring periodic faults.
1760 */
1761 if (vma_is_dax(vma))
1762 return false;
1763
1764 if (is_vm_hugetlb_page(vma) &&
1765 !hugepage_migration_supported(hstate_vma(vma)))
1766 return false;
1767
1768 /*
1769 * Migration allocates pages in the highest zone. If we cannot
1770 * do so then migration (at least from node to node) is not
1771 * possible.
1772 */
1773 if (vma->vm_file &&
1774 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1775 < policy_zone)
1776 return false;
1777 return true;
1778 }
1779
__get_vma_policy(struct vm_area_struct * vma,unsigned long addr)1780 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1781 unsigned long addr)
1782 {
1783 struct mempolicy *pol = NULL;
1784
1785 if (vma) {
1786 if (vma->vm_ops && vma->vm_ops->get_policy) {
1787 pol = vma->vm_ops->get_policy(vma, addr);
1788 } else if (vma->vm_policy) {
1789 pol = vma->vm_policy;
1790
1791 /*
1792 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1793 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1794 * count on these policies which will be dropped by
1795 * mpol_cond_put() later
1796 */
1797 if (mpol_needs_cond_ref(pol))
1798 mpol_get(pol);
1799 }
1800 }
1801
1802 return pol;
1803 }
1804
1805 /*
1806 * get_vma_policy(@vma, @addr)
1807 * @vma: virtual memory area whose policy is sought
1808 * @addr: address in @vma for shared policy lookup
1809 *
1810 * Returns effective policy for a VMA at specified address.
1811 * Falls back to current->mempolicy or system default policy, as necessary.
1812 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1813 * count--added by the get_policy() vm_op, as appropriate--to protect against
1814 * freeing by another task. It is the caller's responsibility to free the
1815 * extra reference for shared policies.
1816 */
get_vma_policy(struct vm_area_struct * vma,unsigned long addr)1817 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1818 unsigned long addr)
1819 {
1820 struct mempolicy *pol = __get_vma_policy(vma, addr);
1821
1822 if (!pol)
1823 pol = get_task_policy(current);
1824
1825 return pol;
1826 }
1827
vma_policy_mof(struct vm_area_struct * vma)1828 bool vma_policy_mof(struct vm_area_struct *vma)
1829 {
1830 struct mempolicy *pol;
1831
1832 if (vma->vm_ops && vma->vm_ops->get_policy) {
1833 bool ret = false;
1834
1835 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1836 if (pol && (pol->flags & MPOL_F_MOF))
1837 ret = true;
1838 mpol_cond_put(pol);
1839
1840 return ret;
1841 }
1842
1843 pol = vma->vm_policy;
1844 if (!pol)
1845 pol = get_task_policy(current);
1846
1847 return pol->flags & MPOL_F_MOF;
1848 }
1849
apply_policy_zone(struct mempolicy * policy,enum zone_type zone)1850 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1851 {
1852 enum zone_type dynamic_policy_zone = policy_zone;
1853
1854 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1855
1856 /*
1857 * if policy->v.nodes has movable memory only,
1858 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1859 *
1860 * policy->v.nodes is intersect with node_states[N_MEMORY].
1861 * so if the following test faile, it implies
1862 * policy->v.nodes has movable memory only.
1863 */
1864 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1865 dynamic_policy_zone = ZONE_MOVABLE;
1866
1867 return zone >= dynamic_policy_zone;
1868 }
1869
1870 /*
1871 * Return a nodemask representing a mempolicy for filtering nodes for
1872 * page allocation
1873 */
policy_nodemask(gfp_t gfp,struct mempolicy * policy)1874 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1875 {
1876 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1877 if (unlikely(policy->mode == MPOL_BIND) &&
1878 apply_policy_zone(policy, gfp_zone(gfp)) &&
1879 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1880 return &policy->v.nodes;
1881
1882 return NULL;
1883 }
1884
1885 /* Return the node id preferred by the given mempolicy, or the given id */
policy_node(gfp_t gfp,struct mempolicy * policy,int nd)1886 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1887 {
1888 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1889 nd = policy->v.preferred_node;
1890 else {
1891 /*
1892 * __GFP_THISNODE shouldn't even be used with the bind policy
1893 * because we might easily break the expectation to stay on the
1894 * requested node and not break the policy.
1895 */
1896 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1897 }
1898
1899 return nd;
1900 }
1901
1902 /* Do dynamic interleaving for a process */
interleave_nodes(struct mempolicy * policy)1903 static unsigned interleave_nodes(struct mempolicy *policy)
1904 {
1905 unsigned next;
1906 struct task_struct *me = current;
1907
1908 next = next_node_in(me->il_prev, policy->v.nodes);
1909 if (next < MAX_NUMNODES)
1910 me->il_prev = next;
1911 return next;
1912 }
1913
1914 /*
1915 * Depending on the memory policy provide a node from which to allocate the
1916 * next slab entry.
1917 */
mempolicy_slab_node(void)1918 unsigned int mempolicy_slab_node(void)
1919 {
1920 struct mempolicy *policy;
1921 int node = numa_mem_id();
1922
1923 if (in_interrupt())
1924 return node;
1925
1926 policy = current->mempolicy;
1927 if (!policy || policy->flags & MPOL_F_LOCAL)
1928 return node;
1929
1930 switch (policy->mode) {
1931 case MPOL_PREFERRED:
1932 /*
1933 * handled MPOL_F_LOCAL above
1934 */
1935 return policy->v.preferred_node;
1936
1937 case MPOL_INTERLEAVE:
1938 return interleave_nodes(policy);
1939
1940 case MPOL_BIND: {
1941 struct zoneref *z;
1942
1943 /*
1944 * Follow bind policy behavior and start allocation at the
1945 * first node.
1946 */
1947 struct zonelist *zonelist;
1948 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1949 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1950 z = first_zones_zonelist(zonelist, highest_zoneidx,
1951 &policy->v.nodes);
1952 return z->zone ? zone_to_nid(z->zone) : node;
1953 }
1954
1955 default:
1956 BUG();
1957 }
1958 }
1959
1960 /*
1961 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1962 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1963 * number of present nodes.
1964 */
offset_il_node(struct mempolicy * pol,unsigned long n)1965 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1966 {
1967 unsigned nnodes = nodes_weight(pol->v.nodes);
1968 unsigned target;
1969 int i;
1970 int nid;
1971
1972 if (!nnodes)
1973 return numa_node_id();
1974 target = (unsigned int)n % nnodes;
1975 nid = first_node(pol->v.nodes);
1976 for (i = 0; i < target; i++)
1977 nid = next_node(nid, pol->v.nodes);
1978 return nid;
1979 }
1980
1981 /* Determine a node number for interleave */
interleave_nid(struct mempolicy * pol,struct vm_area_struct * vma,unsigned long addr,int shift)1982 static inline unsigned interleave_nid(struct mempolicy *pol,
1983 struct vm_area_struct *vma, unsigned long addr, int shift)
1984 {
1985 if (vma) {
1986 unsigned long off;
1987
1988 /*
1989 * for small pages, there is no difference between
1990 * shift and PAGE_SHIFT, so the bit-shift is safe.
1991 * for huge pages, since vm_pgoff is in units of small
1992 * pages, we need to shift off the always 0 bits to get
1993 * a useful offset.
1994 */
1995 BUG_ON(shift < PAGE_SHIFT);
1996 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1997 off += (addr - vma->vm_start) >> shift;
1998 return offset_il_node(pol, off);
1999 } else
2000 return interleave_nodes(pol);
2001 }
2002
2003 #ifdef CONFIG_HUGETLBFS
2004 /*
2005 * huge_node(@vma, @addr, @gfp_flags, @mpol)
2006 * @vma: virtual memory area whose policy is sought
2007 * @addr: address in @vma for shared policy lookup and interleave policy
2008 * @gfp_flags: for requested zone
2009 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2010 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2011 *
2012 * Returns a nid suitable for a huge page allocation and a pointer
2013 * to the struct mempolicy for conditional unref after allocation.
2014 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
2015 * @nodemask for filtering the zonelist.
2016 *
2017 * Must be protected by read_mems_allowed_begin()
2018 */
huge_node(struct vm_area_struct * vma,unsigned long addr,gfp_t gfp_flags,struct mempolicy ** mpol,nodemask_t ** nodemask)2019 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2020 struct mempolicy **mpol, nodemask_t **nodemask)
2021 {
2022 int nid;
2023
2024 *mpol = get_vma_policy(vma, addr);
2025 *nodemask = NULL; /* assume !MPOL_BIND */
2026
2027 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2028 nid = interleave_nid(*mpol, vma, addr,
2029 huge_page_shift(hstate_vma(vma)));
2030 } else {
2031 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2032 if ((*mpol)->mode == MPOL_BIND)
2033 *nodemask = &(*mpol)->v.nodes;
2034 }
2035 return nid;
2036 }
2037
2038 /*
2039 * init_nodemask_of_mempolicy
2040 *
2041 * If the current task's mempolicy is "default" [NULL], return 'false'
2042 * to indicate default policy. Otherwise, extract the policy nodemask
2043 * for 'bind' or 'interleave' policy into the argument nodemask, or
2044 * initialize the argument nodemask to contain the single node for
2045 * 'preferred' or 'local' policy and return 'true' to indicate presence
2046 * of non-default mempolicy.
2047 *
2048 * We don't bother with reference counting the mempolicy [mpol_get/put]
2049 * because the current task is examining it's own mempolicy and a task's
2050 * mempolicy is only ever changed by the task itself.
2051 *
2052 * N.B., it is the caller's responsibility to free a returned nodemask.
2053 */
init_nodemask_of_mempolicy(nodemask_t * mask)2054 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2055 {
2056 struct mempolicy *mempolicy;
2057 int nid;
2058
2059 if (!(mask && current->mempolicy))
2060 return false;
2061
2062 task_lock(current);
2063 mempolicy = current->mempolicy;
2064 switch (mempolicy->mode) {
2065 case MPOL_PREFERRED:
2066 if (mempolicy->flags & MPOL_F_LOCAL)
2067 nid = numa_node_id();
2068 else
2069 nid = mempolicy->v.preferred_node;
2070 init_nodemask_of_node(mask, nid);
2071 break;
2072
2073 case MPOL_BIND:
2074 case MPOL_INTERLEAVE:
2075 *mask = mempolicy->v.nodes;
2076 break;
2077
2078 default:
2079 BUG();
2080 }
2081 task_unlock(current);
2082
2083 return true;
2084 }
2085 #endif
2086
2087 /*
2088 * mempolicy_nodemask_intersects
2089 *
2090 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2091 * policy. Otherwise, check for intersection between mask and the policy
2092 * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
2093 * policy, always return true since it may allocate elsewhere on fallback.
2094 *
2095 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2096 */
mempolicy_nodemask_intersects(struct task_struct * tsk,const nodemask_t * mask)2097 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2098 const nodemask_t *mask)
2099 {
2100 struct mempolicy *mempolicy;
2101 bool ret = true;
2102
2103 if (!mask)
2104 return ret;
2105 task_lock(tsk);
2106 mempolicy = tsk->mempolicy;
2107 if (!mempolicy)
2108 goto out;
2109
2110 switch (mempolicy->mode) {
2111 case MPOL_PREFERRED:
2112 /*
2113 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2114 * allocate from, they may fallback to other nodes when oom.
2115 * Thus, it's possible for tsk to have allocated memory from
2116 * nodes in mask.
2117 */
2118 break;
2119 case MPOL_BIND:
2120 case MPOL_INTERLEAVE:
2121 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2122 break;
2123 default:
2124 BUG();
2125 }
2126 out:
2127 task_unlock(tsk);
2128 return ret;
2129 }
2130
2131 /* Allocate a page in interleaved policy.
2132 Own path because it needs to do special accounting. */
alloc_page_interleave(gfp_t gfp,unsigned order,unsigned nid)2133 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2134 unsigned nid)
2135 {
2136 struct page *page;
2137
2138 page = __alloc_pages(gfp, order, nid);
2139 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2140 if (!static_branch_likely(&vm_numa_stat_key))
2141 return page;
2142 if (page && page_to_nid(page) == nid) {
2143 preempt_disable();
2144 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2145 preempt_enable();
2146 }
2147 return page;
2148 }
2149
2150 /**
2151 * alloc_pages_vma - Allocate a page for a VMA.
2152 *
2153 * @gfp:
2154 * %GFP_USER user allocation.
2155 * %GFP_KERNEL kernel allocations,
2156 * %GFP_HIGHMEM highmem/user allocations,
2157 * %GFP_FS allocation should not call back into a file system.
2158 * %GFP_ATOMIC don't sleep.
2159 *
2160 * @order:Order of the GFP allocation.
2161 * @vma: Pointer to VMA or NULL if not available.
2162 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2163 * @node: Which node to prefer for allocation (modulo policy).
2164 * @hugepage: for hugepages try only the preferred node if possible
2165 *
2166 * This function allocates a page from the kernel page pool and applies
2167 * a NUMA policy associated with the VMA or the current process.
2168 * When VMA is not NULL caller must read-lock the mmap_lock of the
2169 * mm_struct of the VMA to prevent it from going away. Should be used for
2170 * all allocations for pages that will be mapped into user space. Returns
2171 * NULL when no page can be allocated.
2172 */
2173 struct page *
alloc_pages_vma(gfp_t gfp,int order,struct vm_area_struct * vma,unsigned long addr,int node,bool hugepage)2174 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2175 unsigned long addr, int node, bool hugepage)
2176 {
2177 struct mempolicy *pol;
2178 struct page *page;
2179 int preferred_nid;
2180 nodemask_t *nmask;
2181
2182 pol = get_vma_policy(vma, addr);
2183
2184 if (pol->mode == MPOL_INTERLEAVE) {
2185 unsigned nid;
2186
2187 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2188 mpol_cond_put(pol);
2189 page = alloc_page_interleave(gfp, order, nid);
2190 goto out;
2191 }
2192
2193 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2194 int hpage_node = node;
2195
2196 /*
2197 * For hugepage allocation and non-interleave policy which
2198 * allows the current node (or other explicitly preferred
2199 * node) we only try to allocate from the current/preferred
2200 * node and don't fall back to other nodes, as the cost of
2201 * remote accesses would likely offset THP benefits.
2202 *
2203 * If the policy is interleave, or does not allow the current
2204 * node in its nodemask, we allocate the standard way.
2205 */
2206 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2207 hpage_node = pol->v.preferred_node;
2208
2209 nmask = policy_nodemask(gfp, pol);
2210 if (!nmask || node_isset(hpage_node, *nmask)) {
2211 mpol_cond_put(pol);
2212 /*
2213 * First, try to allocate THP only on local node, but
2214 * don't reclaim unnecessarily, just compact.
2215 */
2216 page = __alloc_pages_node(hpage_node,
2217 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2218
2219 /*
2220 * If hugepage allocations are configured to always
2221 * synchronous compact or the vma has been madvised
2222 * to prefer hugepage backing, retry allowing remote
2223 * memory with both reclaim and compact as well.
2224 */
2225 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2226 page = __alloc_pages_node(hpage_node,
2227 gfp, order);
2228
2229 goto out;
2230 }
2231 }
2232
2233 nmask = policy_nodemask(gfp, pol);
2234 preferred_nid = policy_node(gfp, pol, node);
2235 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2236 mpol_cond_put(pol);
2237 out:
2238 return page;
2239 }
2240 EXPORT_SYMBOL(alloc_pages_vma);
2241
2242 /**
2243 * alloc_pages_current - Allocate pages.
2244 *
2245 * @gfp:
2246 * %GFP_USER user allocation,
2247 * %GFP_KERNEL kernel allocation,
2248 * %GFP_HIGHMEM highmem allocation,
2249 * %GFP_FS don't call back into a file system.
2250 * %GFP_ATOMIC don't sleep.
2251 * @order: Power of two of allocation size in pages. 0 is a single page.
2252 *
2253 * Allocate a page from the kernel page pool. When not in
2254 * interrupt context and apply the current process NUMA policy.
2255 * Returns NULL when no page can be allocated.
2256 */
alloc_pages_current(gfp_t gfp,unsigned order)2257 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2258 {
2259 struct mempolicy *pol = &default_policy;
2260 struct page *page;
2261
2262 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2263 pol = get_task_policy(current);
2264
2265 /*
2266 * No reference counting needed for current->mempolicy
2267 * nor system default_policy
2268 */
2269 if (pol->mode == MPOL_INTERLEAVE)
2270 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2271 else
2272 page = __alloc_pages_nodemask(gfp, order,
2273 policy_node(gfp, pol, numa_node_id()),
2274 policy_nodemask(gfp, pol));
2275
2276 return page;
2277 }
2278 EXPORT_SYMBOL(alloc_pages_current);
2279
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2280 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2281 {
2282 struct mempolicy *pol = mpol_dup(vma_policy(src));
2283
2284 if (IS_ERR(pol))
2285 return PTR_ERR(pol);
2286 dst->vm_policy = pol;
2287 return 0;
2288 }
2289
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2300
2301 /* Slow path of a mempolicy duplicate */
__mpol_dup(struct mempolicy * old)2302 struct mempolicy *__mpol_dup(struct mempolicy *old)
2303 {
2304 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2305
2306 if (!new)
2307 return ERR_PTR(-ENOMEM);
2308
2309 /* task's mempolicy is protected by alloc_lock */
2310 if (old == current->mempolicy) {
2311 task_lock(current);
2312 *new = *old;
2313 task_unlock(current);
2314 } else
2315 *new = *old;
2316
2317 if (current_cpuset_is_being_rebound()) {
2318 nodemask_t mems = cpuset_mems_allowed(current);
2319 mpol_rebind_policy(new, &mems);
2320 }
2321 atomic_set(&new->refcnt, 1);
2322 return new;
2323 }
2324
2325 /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2326 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2327 {
2328 if (!a || !b)
2329 return false;
2330 if (a->mode != b->mode)
2331 return false;
2332 if (a->flags != b->flags)
2333 return false;
2334 if (mpol_store_user_nodemask(a))
2335 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2336 return false;
2337
2338 switch (a->mode) {
2339 case MPOL_BIND:
2340 case MPOL_INTERLEAVE:
2341 return !!nodes_equal(a->v.nodes, b->v.nodes);
2342 case MPOL_PREFERRED:
2343 /* a's ->flags is the same as b's */
2344 if (a->flags & MPOL_F_LOCAL)
2345 return true;
2346 return a->v.preferred_node == b->v.preferred_node;
2347 default:
2348 BUG();
2349 return false;
2350 }
2351 }
2352
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any access to the tree.
 */
2361
2362 /*
2363 * lookup first element intersecting start-end. Caller holds sp->lock for
2364 * reading or for writing
2365 */
2366 static struct sp_node *
sp_lookup(struct shared_policy * sp,unsigned long start,unsigned long end)2367 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2368 {
2369 struct rb_node *n = sp->root.rb_node;
2370
2371 while (n) {
2372 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2373
2374 if (start >= p->end)
2375 n = n->rb_right;
2376 else if (end <= p->start)
2377 n = n->rb_left;
2378 else
2379 break;
2380 }
2381 if (!n)
2382 return NULL;
2383 for (;;) {
2384 struct sp_node *w = NULL;
2385 struct rb_node *prev = rb_prev(n);
2386 if (!prev)
2387 break;
2388 w = rb_entry(prev, struct sp_node, nd);
2389 if (w->end <= start)
2390 break;
2391 n = prev;
2392 }
2393 return rb_entry(n, struct sp_node, nd);
2394 }
2395
2396 /*
2397 * Insert a new shared policy into the list. Caller holds sp->lock for
2398 * writing.
2399 */
sp_insert(struct shared_policy * sp,struct sp_node * new)2400 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2401 {
2402 struct rb_node **p = &sp->root.rb_node;
2403 struct rb_node *parent = NULL;
2404 struct sp_node *nd;
2405
2406 while (*p) {
2407 parent = *p;
2408 nd = rb_entry(parent, struct sp_node, nd);
2409 if (new->start < nd->start)
2410 p = &(*p)->rb_left;
2411 else if (new->end > nd->end)
2412 p = &(*p)->rb_right;
2413 else
2414 BUG();
2415 }
2416 rb_link_node(&new->nd, parent, p);
2417 rb_insert_color(&new->nd, &sp->root);
2418 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2419 new->policy ? new->policy->mode : 0);
2420 }
2421
2422 /* Find shared policy intersecting idx */
2423 struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy * sp,unsigned long idx)2424 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2425 {
2426 struct mempolicy *pol = NULL;
2427 struct sp_node *sn;
2428
2429 if (!sp->root.rb_node)
2430 return NULL;
2431 read_lock(&sp->lock);
2432 sn = sp_lookup(sp, idx, idx+1);
2433 if (sn) {
2434 mpol_get(sn->policy);
2435 pol = sn->policy;
2436 }
2437 read_unlock(&sp->lock);
2438 return pol;
2439 }
2440
sp_free(struct sp_node * n)2441 static void sp_free(struct sp_node *n)
2442 {
2443 mpol_put(n->policy);
2444 kmem_cache_free(sn_cache, n);
2445 }
2446
2447 /**
2448 * mpol_misplaced - check whether current page node is valid in policy
2449 *
2450 * @page: page to be checked
2451 * @vma: vm area where page mapped
2452 * @addr: virtual address where page mapped
2453 *
2454 * Lookup current policy node id for vma,addr and "compare to" page's
2455 * node id.
2456 *
2457 * Returns:
2458 * -1 - not misplaced, page is in the right node
2459 * node - node id where the page should be
2460 *
2461 * Policy determination "mimics" alloc_page_vma().
2462 * Called from fault path where we know the vma and faulting address.
2463 */
mpol_misplaced(struct page * page,struct vm_area_struct * vma,unsigned long addr)2464 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2465 {
2466 struct mempolicy *pol;
2467 struct zoneref *z;
2468 int curnid = page_to_nid(page);
2469 unsigned long pgoff;
2470 int thiscpu = raw_smp_processor_id();
2471 int thisnid = cpu_to_node(thiscpu);
2472 int polnid = NUMA_NO_NODE;
2473 int ret = -1;
2474
2475 pol = get_vma_policy(vma, addr);
2476 if (!(pol->flags & MPOL_F_MOF))
2477 goto out;
2478
2479 switch (pol->mode) {
2480 case MPOL_INTERLEAVE:
2481 pgoff = vma->vm_pgoff;
2482 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2483 polnid = offset_il_node(pol, pgoff);
2484 break;
2485
2486 case MPOL_PREFERRED:
2487 if (pol->flags & MPOL_F_LOCAL)
2488 polnid = numa_node_id();
2489 else
2490 polnid = pol->v.preferred_node;
2491 break;
2492
2493 case MPOL_BIND:
2494
2495 /*
2496 * allows binding to multiple nodes.
2497 * use current page if in policy nodemask,
2498 * else select nearest allowed node, if any.
2499 * If no allowed nodes, use current [!misplaced].
2500 */
2501 if (node_isset(curnid, pol->v.nodes))
2502 goto out;
2503 z = first_zones_zonelist(
2504 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2505 gfp_zone(GFP_HIGHUSER),
2506 &pol->v.nodes);
2507 polnid = zone_to_nid(z->zone);
2508 break;
2509
2510 default:
2511 BUG();
2512 }
2513
2514 /* Migrate the page towards the node whose CPU is referencing it */
2515 if (pol->flags & MPOL_F_MORON) {
2516 polnid = thisnid;
2517
2518 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2519 goto out;
2520 }
2521
2522 if (curnid != polnid)
2523 ret = polnid;
2524 out:
2525 mpol_cond_put(pol);
2526
2527 return ret;
2528 }
2529
2530 /*
2531 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2532 * dropped after task->mempolicy is set to NULL so that any allocation done as
2533 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2534 * policy.
2535 */
mpol_put_task_policy(struct task_struct * task)2536 void mpol_put_task_policy(struct task_struct *task)
2537 {
2538 struct mempolicy *pol;
2539
2540 task_lock(task);
2541 pol = task->mempolicy;
2542 task->mempolicy = NULL;
2543 task_unlock(task);
2544 mpol_put(pol);
2545 }
2546
sp_delete(struct shared_policy * sp,struct sp_node * n)2547 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2548 {
2549 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2550 rb_erase(&n->nd, &sp->root);
2551 sp_free(n);
2552 }
2553
sp_node_init(struct sp_node * node,unsigned long start,unsigned long end,struct mempolicy * pol)2554 static void sp_node_init(struct sp_node *node, unsigned long start,
2555 unsigned long end, struct mempolicy *pol)
2556 {
2557 node->start = start;
2558 node->end = end;
2559 node->policy = pol;
2560 }
2561
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)2562 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2563 struct mempolicy *pol)
2564 {
2565 struct sp_node *n;
2566 struct mempolicy *newpol;
2567
2568 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2569 if (!n)
2570 return NULL;
2571
2572 newpol = mpol_dup(pol);
2573 if (IS_ERR(newpol)) {
2574 kmem_cache_free(sn_cache, n);
2575 return NULL;
2576 }
2577 newpol->flags |= MPOL_F_SHARED;
2578 sp_node_init(n, start, end, newpol);
2579
2580 return n;
2581 }
2582
2583 /* Replace a policy range. */
shared_policy_replace(struct shared_policy * sp,unsigned long start,unsigned long end,struct sp_node * new)2584 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2585 unsigned long end, struct sp_node *new)
2586 {
2587 struct sp_node *n;
2588 struct sp_node *n_new = NULL;
2589 struct mempolicy *mpol_new = NULL;
2590 int ret = 0;
2591
2592 restart:
2593 write_lock(&sp->lock);
2594 n = sp_lookup(sp, start, end);
2595 /* Take care of old policies in the same range. */
2596 while (n && n->start < end) {
2597 struct rb_node *next = rb_next(&n->nd);
2598 if (n->start >= start) {
2599 if (n->end <= end)
2600 sp_delete(sp, n);
2601 else
2602 n->start = end;
2603 } else {
2604 /* Old policy spanning whole new range. */
2605 if (n->end > end) {
2606 if (!n_new)
2607 goto alloc_new;
2608
2609 *mpol_new = *n->policy;
2610 atomic_set(&mpol_new->refcnt, 1);
2611 sp_node_init(n_new, end, n->end, mpol_new);
2612 n->end = start;
2613 sp_insert(sp, n_new);
2614 n_new = NULL;
2615 mpol_new = NULL;
2616 break;
2617 } else
2618 n->end = start;
2619 }
2620 if (!next)
2621 break;
2622 n = rb_entry(next, struct sp_node, nd);
2623 }
2624 if (new)
2625 sp_insert(sp, new);
2626 write_unlock(&sp->lock);
2627 ret = 0;
2628
2629 err_out:
2630 if (mpol_new)
2631 mpol_put(mpol_new);
2632 if (n_new)
2633 kmem_cache_free(sn_cache, n_new);
2634
2635 return ret;
2636
2637 alloc_new:
2638 write_unlock(&sp->lock);
2639 ret = -ENOMEM;
2640 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2641 if (!n_new)
2642 goto err_out;
2643 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2644 if (!mpol_new)
2645 goto err_out;
2646 goto restart;
2647 }
2648
2649 /**
2650 * mpol_shared_policy_init - initialize shared policy for inode
2651 * @sp: pointer to inode shared policy
2652 * @mpol: struct mempolicy to install
2653 *
2654 * Install non-NULL @mpol in inode's shared policy rb-tree.
2655 * On entry, the current task has a reference on a non-NULL @mpol.
2656 * This must be released on exit.
2657 * This is called at get_inode() calls and we can use GFP_KERNEL.
2658 */
mpol_shared_policy_init(struct shared_policy * sp,struct mempolicy * mpol)2659 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2660 {
2661 int ret;
2662
2663 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2664 rwlock_init(&sp->lock);
2665
2666 if (mpol) {
2667 struct vm_area_struct pvma;
2668 struct mempolicy *new;
2669 NODEMASK_SCRATCH(scratch);
2670
2671 if (!scratch)
2672 goto put_mpol;
2673 /* contextualize the tmpfs mount point mempolicy */
2674 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2675 if (IS_ERR(new))
2676 goto free_scratch; /* no valid nodemask intersection */
2677
2678 task_lock(current);
2679 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2680 task_unlock(current);
2681 if (ret)
2682 goto put_new;
2683
2684 /* Create pseudo-vma that contains just the policy */
2685 vma_init(&pvma, NULL);
2686 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2687 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2688
2689 put_new:
2690 mpol_put(new); /* drop initial ref */
2691 free_scratch:
2692 NODEMASK_SCRATCH_FREE(scratch);
2693 put_mpol:
2694 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2695 }
2696 }
2697
mpol_set_shared_policy(struct shared_policy * info,struct vm_area_struct * vma,struct mempolicy * npol)2698 int mpol_set_shared_policy(struct shared_policy *info,
2699 struct vm_area_struct *vma, struct mempolicy *npol)
2700 {
2701 int err;
2702 struct sp_node *new = NULL;
2703 unsigned long sz = vma_pages(vma);
2704
2705 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2706 vma->vm_pgoff,
2707 sz, npol ? npol->mode : -1,
2708 npol ? npol->flags : -1,
2709 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2710
2711 if (npol) {
2712 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2713 if (!new)
2714 return -ENOMEM;
2715 }
2716 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2717 if (err && new)
2718 sp_free(new);
2719 return err;
2720 }
2721
2722 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * p)2723 void mpol_free_shared_policy(struct shared_policy *p)
2724 {
2725 struct sp_node *n;
2726 struct rb_node *next;
2727
2728 if (!p->root.rb_node)
2729 return;
2730 write_lock(&p->lock);
2731 next = rb_first(&p->root);
2732 while (next) {
2733 n = rb_entry(next, struct sp_node, nd);
2734 next = rb_next(&n->nd);
2735 sp_delete(p, n);
2736 }
2737 write_unlock(&p->lock);
2738 }
2739
2740 #ifdef CONFIG_NUMA_BALANCING
2741 static int __initdata numabalancing_override;
2742
check_numabalancing_enable(void)2743 static void __init check_numabalancing_enable(void)
2744 {
2745 bool numabalancing_default = false;
2746
2747 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2748 numabalancing_default = true;
2749
2750 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2751 if (numabalancing_override)
2752 set_numabalancing_state(numabalancing_override == 1);
2753
2754 if (num_online_nodes() > 1 && !numabalancing_override) {
2755 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2756 numabalancing_default ? "Enabling" : "Disabling");
2757 set_numabalancing_state(numabalancing_default);
2758 }
2759 }
2760
setup_numabalancing(char * str)2761 static int __init setup_numabalancing(char *str)
2762 {
2763 int ret = 0;
2764 if (!str)
2765 goto out;
2766
2767 if (!strcmp(str, "enable")) {
2768 numabalancing_override = 1;
2769 ret = 1;
2770 } else if (!strcmp(str, "disable")) {
2771 numabalancing_override = -1;
2772 ret = 1;
2773 }
2774 out:
2775 if (!ret)
2776 pr_warn("Unable to parse numa_balancing=\n");
2777
2778 return ret;
2779 }
2780 __setup("numa_balancing=", setup_numabalancing);
2781 #else
check_numabalancing_enable(void)2782 static inline void __init check_numabalancing_enable(void)
2783 {
2784 }
2785 #endif /* CONFIG_NUMA_BALANCING */
2786
2787 /* assumes fs == KERNEL_DS */
numa_policy_init(void)2788 void __init numa_policy_init(void)
2789 {
2790 nodemask_t interleave_nodes;
2791 unsigned long largest = 0;
2792 int nid, prefer = 0;
2793
2794 policy_cache = kmem_cache_create("numa_policy",
2795 sizeof(struct mempolicy),
2796 0, SLAB_PANIC, NULL);
2797
2798 sn_cache = kmem_cache_create("shared_policy_node",
2799 sizeof(struct sp_node),
2800 0, SLAB_PANIC, NULL);
2801
2802 for_each_node(nid) {
2803 preferred_node_policy[nid] = (struct mempolicy) {
2804 .refcnt = ATOMIC_INIT(1),
2805 .mode = MPOL_PREFERRED,
2806 .flags = MPOL_F_MOF | MPOL_F_MORON,
2807 .v = { .preferred_node = nid, },
2808 };
2809 }
2810
2811 /*
2812 * Set interleaving policy for system init. Interleaving is only
2813 * enabled across suitably sized nodes (default is >= 16MB), or
2814 * fall back to the largest node if they're all smaller.
2815 */
2816 nodes_clear(interleave_nodes);
2817 for_each_node_state(nid, N_MEMORY) {
2818 unsigned long total_pages = node_present_pages(nid);
2819
2820 /* Preserve the largest node */
2821 if (largest < total_pages) {
2822 largest = total_pages;
2823 prefer = nid;
2824 }
2825
2826 /* Interleave this node? */
2827 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2828 node_set(nid, interleave_nodes);
2829 }
2830
2831 /* All too small, use the largest */
2832 if (unlikely(nodes_empty(interleave_nodes)))
2833 node_set(prefer, interleave_nodes);
2834
2835 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2836 pr_err("%s: interleaving failed\n", __func__);
2837
2838 check_numabalancing_enable();
2839 }
2840
2841 /* Reset policy of current process to default */
numa_default_policy(void)2842 void numa_default_policy(void)
2843 {
2844 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2845 }
2846
2847 /*
2848 * Parse and format mempolicy from/to strings
2849 */
2850
2851 /*
2852 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2853 */
2854 static const char * const policy_modes[] =
2855 {
2856 [MPOL_DEFAULT] = "default",
2857 [MPOL_PREFERRED] = "prefer",
2858 [MPOL_BIND] = "bind",
2859 [MPOL_INTERLEAVE] = "interleave",
2860 [MPOL_LOCAL] = "local",
2861 };
2862
2863
2864 #ifdef CONFIG_TMPFS
2865 /**
2866 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2867 * @str: string containing mempolicy to parse
2868 * @mpol: pointer to struct mempolicy pointer, returned on success.
2869 *
2870 * Format of input:
2871 * <mode>[=<flags>][:<nodelist>]
2872 *
2873 * On success, returns 0, else 1
2874 */
mpol_parse_str(char * str,struct mempolicy ** mpol)2875 int mpol_parse_str(char *str, struct mempolicy **mpol)
2876 {
2877 struct mempolicy *new = NULL;
2878 unsigned short mode_flags;
2879 nodemask_t nodes;
2880 char *nodelist = strchr(str, ':');
2881 char *flags = strchr(str, '=');
2882 int err = 1, mode;
2883
2884 if (flags)
2885 *flags++ = '\0'; /* terminate mode string */
2886
2887 if (nodelist) {
2888 /* NUL-terminate mode or flags string */
2889 *nodelist++ = '\0';
2890 if (nodelist_parse(nodelist, nodes))
2891 goto out;
2892 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2893 goto out;
2894 } else
2895 nodes_clear(nodes);
2896
2897 mode = match_string(policy_modes, MPOL_MAX, str);
2898 if (mode < 0)
2899 goto out;
2900
2901 switch (mode) {
2902 case MPOL_PREFERRED:
2903 /*
2904 * Insist on a nodelist of one node only, although later
2905 * we use first_node(nodes) to grab a single node, so here
2906 * nodelist (or nodes) cannot be empty.
2907 */
2908 if (nodelist) {
2909 char *rest = nodelist;
2910 while (isdigit(*rest))
2911 rest++;
2912 if (*rest)
2913 goto out;
2914 if (nodes_empty(nodes))
2915 goto out;
2916 }
2917 break;
2918 case MPOL_INTERLEAVE:
2919 /*
2920 * Default to online nodes with memory if no nodelist
2921 */
2922 if (!nodelist)
2923 nodes = node_states[N_MEMORY];
2924 break;
2925 case MPOL_LOCAL:
2926 /*
2927 * Don't allow a nodelist; mpol_new() checks flags
2928 */
2929 if (nodelist)
2930 goto out;
2931 mode = MPOL_PREFERRED;
2932 break;
2933 case MPOL_DEFAULT:
2934 /*
2935 * Insist on a empty nodelist
2936 */
2937 if (!nodelist)
2938 err = 0;
2939 goto out;
2940 case MPOL_BIND:
2941 /*
2942 * Insist on a nodelist
2943 */
2944 if (!nodelist)
2945 goto out;
2946 }
2947
2948 mode_flags = 0;
2949 if (flags) {
2950 /*
2951 * Currently, we only support two mutually exclusive
2952 * mode flags.
2953 */
2954 if (!strcmp(flags, "static"))
2955 mode_flags |= MPOL_F_STATIC_NODES;
2956 else if (!strcmp(flags, "relative"))
2957 mode_flags |= MPOL_F_RELATIVE_NODES;
2958 else
2959 goto out;
2960 }
2961
2962 new = mpol_new(mode, mode_flags, &nodes);
2963 if (IS_ERR(new))
2964 goto out;
2965
2966 /*
2967 * Save nodes for mpol_to_str() to show the tmpfs mount options
2968 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2969 */
2970 if (mode != MPOL_PREFERRED)
2971 new->v.nodes = nodes;
2972 else if (nodelist)
2973 new->v.preferred_node = first_node(nodes);
2974 else
2975 new->flags |= MPOL_F_LOCAL;
2976
2977 /*
2978 * Save nodes for contextualization: this will be used to "clone"
2979 * the mempolicy in a specific context [cpuset] at a later time.
2980 */
2981 new->w.user_nodemask = nodes;
2982
2983 err = 0;
2984
2985 out:
2986 /* Restore string for error message */
2987 if (nodelist)
2988 *--nodelist = ':';
2989 if (flags)
2990 *--flags = '=';
2991 if (!err)
2992 *mpol = new;
2993 return err;
2994 }
2995 #endif /* CONFIG_TMPFS */
2996
2997 /**
2998 * mpol_to_str - format a mempolicy structure for printing
2999 * @buffer: to contain formatted mempolicy string
3000 * @maxlen: length of @buffer
3001 * @pol: pointer to mempolicy to be formatted
3002 *
3003 * Convert @pol into a string. If @buffer is too short, truncate the string.
3004 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3005 * longest flag, "relative", and to display at least a few node ids.
3006 */
mpol_to_str(char * buffer,int maxlen,struct mempolicy * pol)3007 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3008 {
3009 char *p = buffer;
3010 nodemask_t nodes = NODE_MASK_NONE;
3011 unsigned short mode = MPOL_DEFAULT;
3012 unsigned short flags = 0;
3013
3014 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3015 mode = pol->mode;
3016 flags = pol->flags;
3017 }
3018
3019 switch (mode) {
3020 case MPOL_DEFAULT:
3021 break;
3022 case MPOL_PREFERRED:
3023 if (flags & MPOL_F_LOCAL)
3024 mode = MPOL_LOCAL;
3025 else
3026 node_set(pol->v.preferred_node, nodes);
3027 break;
3028 case MPOL_BIND:
3029 case MPOL_INTERLEAVE:
3030 nodes = pol->v.nodes;
3031 break;
3032 default:
3033 WARN_ON_ONCE(1);
3034 snprintf(p, maxlen, "unknown");
3035 return;
3036 }
3037
3038 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3039
3040 if (flags & MPOL_MODE_FLAGS) {
3041 p += snprintf(p, buffer + maxlen - p, "=");
3042
3043 /*
3044 * Currently, the only defined flags are mutually exclusive
3045 */
3046 if (flags & MPOL_F_STATIC_NODES)
3047 p += snprintf(p, buffer + maxlen - p, "static");
3048 else if (flags & MPOL_F_RELATIVE_NODES)
3049 p += snprintf(p, buffer + maxlen - p, "relative");
3050 }
3051
3052 if (!nodes_empty(nodes))
3053 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3054 nodemask_pr_args(&nodes));
3055 }
3056