1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to the specified memory nodes instead
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * preferred many Try a set of nodes first before normal fallback. This is
35  *                similar to preferred without the special case.
36  *
37  * default        Allocate on the local node first, or when on a VMA
38  *                use the process policy. This is what Linux always did
39  *		  in a NUMA aware kernel and still does by, ahem, default.
40  *
41  * The process policy is applied for most non-interrupt memory allocations
42  * in that process' context. Interrupts ignore the policies and always
43  * try to allocate on the local CPU. The VMA policy is only applied for memory
44  * allocations for a VMA in the VM.
45  *
46  * Currently there are a few corner cases in swapping where the policy
47  * is not applied, but the majority should be handled. When process policy
48  * is used it is not remembered over swap outs/swap ins.
49  *
50  * Only the highest zone in the zone hierarchy gets policied. Allocations
51  * requesting a lower zone just use default policy. This implies that
52  * on systems with highmem, kernel lowmem allocations don't get policied.
53  * Same with GFP_DMA allocations.
54  *
55  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
56  * all users and remembered even when nobody has memory mapped.
57  */
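
/*
 * Illustrative userspace sketch (not part of this file): the policies above
 * are selected with the set_mempolicy() and mbind() system calls.  A task
 * that wants its future allocations interleaved across nodes 0 and 1 could
 * do roughly the following (node numbers are made up for the example):
 *
 *	#include <numaif.h>		// MPOL_* constants, set_mempolicy()
 *	#include <stdio.h>
 *
 *	unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *			  sizeof(nodemask) * 8) == -1)
 *		perror("set_mempolicy");
 */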
58 
59 /* Notebook:
60    fix mmap readahead to honour policy and enable policy for any page cache
61    object
62    statistics for bigpages
63    global policy for page cache? currently it uses process policy. Requires
64    first item above.
65    handle mremap for shared memory (currently ignored for the policy)
66    grows down?
67    make bind policy root only? It can trigger oom much faster and the
68    kernel is not always graceful with that.
69 */
70 
71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72 
73 #include <linux/mempolicy.h>
74 #include <linux/pagewalk.h>
75 #include <linux/highmem.h>
76 #include <linux/hugetlb.h>
77 #include <linux/kernel.h>
78 #include <linux/sched.h>
79 #include <linux/sched/mm.h>
80 #include <linux/sched/numa_balancing.h>
81 #include <linux/sched/task.h>
82 #include <linux/nodemask.h>
83 #include <linux/cpuset.h>
84 #include <linux/slab.h>
85 #include <linux/string.h>
86 #include <linux/export.h>
87 #include <linux/nsproxy.h>
88 #include <linux/interrupt.h>
89 #include <linux/init.h>
90 #include <linux/compat.h>
91 #include <linux/ptrace.h>
92 #include <linux/swap.h>
93 #include <linux/seq_file.h>
94 #include <linux/proc_fs.h>
95 #include <linux/migrate.h>
96 #include <linux/ksm.h>
97 #include <linux/rmap.h>
98 #include <linux/security.h>
99 #include <linux/syscalls.h>
100 #include <linux/ctype.h>
101 #include <linux/mm_inline.h>
102 #include <linux/mmu_notifier.h>
103 #include <linux/printk.h>
104 #include <linux/swapops.h>
105 
106 #include <asm/tlbflush.h>
107 #include <linux/uaccess.h>
108 
109 #include "internal.h"
110 
111 /* Internal flags */
112 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
113 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
114 
115 static struct kmem_cache *policy_cache;
116 static struct kmem_cache *sn_cache;
117 
118 /* Highest zone. A specific allocation for a zone below that is not
119    policied. */
120 enum zone_type policy_zone = 0;
121 
122 /*
123  * run-time system-wide default policy => local allocation
124  */
125 static struct mempolicy default_policy = {
126 	.refcnt = ATOMIC_INIT(1), /* never free it */
127 	.mode = MPOL_LOCAL,
128 };
129 
130 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
131 
132 /**
133  * numa_map_to_online_node - Find closest online node
134  * @node: Node id to start the search
135  *
136  * Lookup the next closest node by distance if @node is not online.
137  */
138 int numa_map_to_online_node(int node)
139 {
140 	int min_dist = INT_MAX, dist, n, min_node;
141 
142 	if (node == NUMA_NO_NODE || node_online(node))
143 		return node;
144 
145 	min_node = node;
146 	for_each_online_node(n) {
147 		dist = node_distance(node, n);
148 		if (dist < min_dist) {
149 			min_dist = dist;
150 			min_node = n;
151 		}
152 	}
153 
154 	return min_node;
155 }
156 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
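
/*
 * Illustrative in-kernel usage (hypothetical caller, added for clarity):
 * a driver wanting memory near a device whose firmware-reported node may
 * be offline could do
 *
 *	int nid = numa_map_to_online_node(dev_to_node(dev));
 *	struct page *page = alloc_pages_node(nid, GFP_KERNEL, 0);
 *
 * and fall back to the closest online node instead of failing outright.
 */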
157 
158 struct mempolicy *get_task_policy(struct task_struct *p)
159 {
160 	struct mempolicy *pol = p->mempolicy;
161 	int node;
162 
163 	if (pol)
164 		return pol;
165 
166 	node = numa_node_id();
167 	if (node != NUMA_NO_NODE) {
168 		pol = &preferred_node_policy[node];
169 		/* preferred_node_policy is not initialised early in boot */
170 		if (pol->mode)
171 			return pol;
172 	}
173 
174 	return &default_policy;
175 }
176 
177 static const struct mempolicy_operations {
178 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
179 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
180 } mpol_ops[MPOL_MAX];
181 
182 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
183 {
184 	return pol->flags & MPOL_MODE_FLAGS;
185 }
186 
187 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
188 				   const nodemask_t *rel)
189 {
190 	nodemask_t tmp;
191 	nodes_fold(tmp, *orig, nodes_weight(*rel));
192 	nodes_onto(*ret, tmp, *rel);
193 }
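
/*
 * Worked example (added for clarity): with a relative user nodemask of
 * {0,2} and a current cpuset of {4,6,8}, nodes_fold() folds the request
 * modulo the three allowed nodes (still {0,2}) and nodes_onto() then maps
 * bit 0 to the 1st allowed node and bit 2 to the 3rd, so *ret ends up as
 * {4,8}.
 */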
194 
195 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
196 {
197 	if (nodes_empty(*nodes))
198 		return -EINVAL;
199 	pol->nodes = *nodes;
200 	return 0;
201 }
202 
203 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
204 {
205 	if (nodes_empty(*nodes))
206 		return -EINVAL;
207 
208 	nodes_clear(pol->nodes);
209 	node_set(first_node(*nodes), pol->nodes);
210 	return 0;
211 }
212 
213 /*
214  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
215  * any, for the new policy.  mpol_new() has already validated the nodes
216  * parameter with respect to the policy mode and flags.
217  *
218  * Must be called holding task's alloc_lock to protect task's mems_allowed
219  * and mempolicy.  May also be called holding the mmap_lock for write.
220  */
221 static int mpol_set_nodemask(struct mempolicy *pol,
222 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
223 {
224 	int ret;
225 
226 	/*
227 	 * Default (pol==NULL) and local memory policies are not
228 	 * subject to any remapping. They also do not need any special
229 	 * constructor.
230 	 */
231 	if (!pol || pol->mode == MPOL_LOCAL)
232 		return 0;
233 
234 	/* Check N_MEMORY */
235 	nodes_and(nsc->mask1,
236 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
237 
238 	VM_BUG_ON(!nodes);
239 
240 	if (pol->flags & MPOL_F_RELATIVE_NODES)
241 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
242 	else
243 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
244 
245 	if (mpol_store_user_nodemask(pol))
246 		pol->w.user_nodemask = *nodes;
247 	else
248 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
249 
250 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
251 	return ret;
252 }
253 
254 /*
255  * This function just creates a new policy, does some checks and simple
256  * initialization. You must invoke mpol_set_nodemask() to set nodes.
257  */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259 				  nodemask_t *nodes)
260 {
261 	struct mempolicy *policy;
262 
263 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 
266 	if (mode == MPOL_DEFAULT) {
267 		if (nodes && !nodes_empty(*nodes))
268 			return ERR_PTR(-EINVAL);
269 		return NULL;
270 	}
271 	VM_BUG_ON(!nodes);
272 
273 	/*
274 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276 	 * All other modes require a valid pointer to a non-empty nodemask.
277 	 */
278 	if (mode == MPOL_PREFERRED) {
279 		if (nodes_empty(*nodes)) {
280 			if (((flags & MPOL_F_STATIC_NODES) ||
281 			     (flags & MPOL_F_RELATIVE_NODES)))
282 				return ERR_PTR(-EINVAL);
283 
284 			mode = MPOL_LOCAL;
285 		}
286 	} else if (mode == MPOL_LOCAL) {
287 		if (!nodes_empty(*nodes) ||
288 		    (flags & MPOL_F_STATIC_NODES) ||
289 		    (flags & MPOL_F_RELATIVE_NODES))
290 			return ERR_PTR(-EINVAL);
291 	} else if (nodes_empty(*nodes))
292 		return ERR_PTR(-EINVAL);
293 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
294 	if (!policy)
295 		return ERR_PTR(-ENOMEM);
296 	atomic_set(&policy->refcnt, 1);
297 	policy->mode = mode;
298 	policy->flags = flags;
299 
300 	return policy;
301 }
302 
303 /* Slow path of a mpol destructor. */
304 void __mpol_put(struct mempolicy *p)
305 {
306 	if (!atomic_dec_and_test(&p->refcnt))
307 		return;
308 	kmem_cache_free(policy_cache, p);
309 }
310 
311 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
312 {
313 }
314 
315 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
316 {
317 	nodemask_t tmp;
318 
319 	if (pol->flags & MPOL_F_STATIC_NODES)
320 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
321 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
322 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 	else {
324 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
325 								*nodes);
326 		pol->w.cpuset_mems_allowed = *nodes;
327 	}
328 
329 	if (nodes_empty(tmp))
330 		tmp = *nodes;
331 
332 	pol->nodes = tmp;
333 }
334 
335 static void mpol_rebind_preferred(struct mempolicy *pol,
336 						const nodemask_t *nodes)
337 {
338 	pol->w.cpuset_mems_allowed = *nodes;
339 }
340 
341 /*
342  * mpol_rebind_policy - Migrate a policy to a different set of nodes
343  *
344  * Per-vma policies are protected by mmap_lock. Allocations using per-task
345  * policies are protected by task->mems_allowed_seq to prevent a premature
346  * OOM/allocation failure due to parallel nodemask modification.
347  */
348 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
349 {
350 	if (!pol || pol->mode == MPOL_LOCAL)
351 		return;
352 	if (!mpol_store_user_nodemask(pol) &&
353 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
354 		return;
355 
356 	mpol_ops[pol->mode].rebind(pol, newmask);
357 }
358 
359 /*
360  * Wrapper for mpol_rebind_policy() that just requires a task
361  * pointer, and updates the task's mempolicy.
362  *
363  * Called with task's alloc_lock held.
364  */
365 
366 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
367 {
368 	mpol_rebind_policy(tsk->mempolicy, new);
369 }
370 
371 /*
372  * Rebind each vma in mm to new nodemask.
373  *
374  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
375  */
376 
377 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
378 {
379 	struct vm_area_struct *vma;
380 
381 	mmap_write_lock(mm);
382 	for (vma = mm->mmap; vma; vma = vma->vm_next)
383 		mpol_rebind_policy(vma->vm_policy, new);
384 	mmap_write_unlock(mm);
385 }
386 
387 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
388 	[MPOL_DEFAULT] = {
389 		.rebind = mpol_rebind_default,
390 	},
391 	[MPOL_INTERLEAVE] = {
392 		.create = mpol_new_nodemask,
393 		.rebind = mpol_rebind_nodemask,
394 	},
395 	[MPOL_PREFERRED] = {
396 		.create = mpol_new_preferred,
397 		.rebind = mpol_rebind_preferred,
398 	},
399 	[MPOL_BIND] = {
400 		.create = mpol_new_nodemask,
401 		.rebind = mpol_rebind_nodemask,
402 	},
403 	[MPOL_LOCAL] = {
404 		.rebind = mpol_rebind_default,
405 	},
406 	[MPOL_PREFERRED_MANY] = {
407 		.create = mpol_new_nodemask,
408 		.rebind = mpol_rebind_preferred,
409 	},
410 };
411 
412 static int migrate_page_add(struct page *page, struct list_head *pagelist,
413 				unsigned long flags);
414 
415 struct queue_pages {
416 	struct list_head *pagelist;
417 	unsigned long flags;
418 	nodemask_t *nmask;
419 	unsigned long start;
420 	unsigned long end;
421 	struct vm_area_struct *first;
422 };
423 
424 /*
425  * Check if the page's nid is in qp->nmask.
426  *
427  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
428  * in the invert of qp->nmask.
429  */
430 static inline bool queue_pages_required(struct page *page,
431 					struct queue_pages *qp)
432 {
433 	int nid = page_to_nid(page);
434 	unsigned long flags = qp->flags;
435 
436 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
437 }
438 
439 /*
440  * queue_pages_pmd() has four possible return values:
441  * 0 - pages are placed on the right node or queued successfully, or a
442  *     special page is met, i.e. the huge zero page.
443  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
444  *     specified.
445  * 2 - THP was split.
446  * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was specified
447  *        and an existing page was already on a node that does not follow
448  *        the policy.
449  */
450 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
451 				unsigned long end, struct mm_walk *walk)
452 	__releases(ptl)
453 {
454 	int ret = 0;
455 	struct page *page;
456 	struct queue_pages *qp = walk->private;
457 	unsigned long flags;
458 
459 	if (unlikely(is_pmd_migration_entry(*pmd))) {
460 		ret = -EIO;
461 		goto unlock;
462 	}
463 	page = pmd_page(*pmd);
464 	if (is_huge_zero_page(page)) {
465 		spin_unlock(ptl);
466 		walk->action = ACTION_CONTINUE;
467 		goto out;
468 	}
469 	if (!queue_pages_required(page, qp))
470 		goto unlock;
471 
472 	flags = qp->flags;
473 	/* go to thp migration */
474 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
475 		if (!vma_migratable(walk->vma) ||
476 		    migrate_page_add(page, qp->pagelist, flags)) {
477 			ret = 1;
478 			goto unlock;
479 		}
480 	} else
481 		ret = -EIO;
482 unlock:
483 	spin_unlock(ptl);
484 out:
485 	return ret;
486 }
487 
488 /*
489  * Scan through pages checking if pages follow certain conditions,
490  * and move them to the pagelist if they do.
491  *
492  * queue_pages_pte_range() has three possible return values:
493  * 0 - pages are placed on the right node or queued successfully, or a
494  *     special page is met, i.e. the zero page.
495  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
496  *     specified.
497  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
498  *        on a node that does not follow the policy.
499  */
500 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
501 			unsigned long end, struct mm_walk *walk)
502 {
503 	struct vm_area_struct *vma = walk->vma;
504 	struct page *page;
505 	struct queue_pages *qp = walk->private;
506 	unsigned long flags = qp->flags;
507 	int ret;
508 	bool has_unmovable = false;
509 	pte_t *pte, *mapped_pte;
510 	spinlock_t *ptl;
511 
512 	ptl = pmd_trans_huge_lock(pmd, vma);
513 	if (ptl) {
514 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
515 		if (ret != 2)
516 			return ret;
517 	}
518 	/* THP was split, fall through to pte walk */
519 
520 	if (pmd_trans_unstable(pmd))
521 		return 0;
522 
523 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
524 	for (; addr != end; pte++, addr += PAGE_SIZE) {
525 		if (!pte_present(*pte))
526 			continue;
527 		page = vm_normal_page(vma, addr, *pte);
528 		if (!page)
529 			continue;
530 		/*
531 		 * vm_normal_page() filters out zero pages, but there might
532 		 * still be PageReserved pages to skip, perhaps in a VDSO.
533 		 */
534 		if (PageReserved(page))
535 			continue;
536 		if (!queue_pages_required(page, qp))
537 			continue;
538 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
539 			/* MPOL_MF_STRICT must be specified if we get here */
540 			if (!vma_migratable(vma)) {
541 				has_unmovable = true;
542 				break;
543 			}
544 
545 			/*
546 			 * Do not abort immediately since there may be
547 			 * pages temporarily off the LRU in the range.  We still
548 			 * need to migrate the other LRU pages.
549 			 */
550 			if (migrate_page_add(page, qp->pagelist, flags))
551 				has_unmovable = true;
552 		} else
553 			break;
554 	}
555 	pte_unmap_unlock(mapped_pte, ptl);
556 	cond_resched();
557 
558 	if (has_unmovable)
559 		return 1;
560 
561 	return addr != end ? -EIO : 0;
562 }
563 
564 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
565 			       unsigned long addr, unsigned long end,
566 			       struct mm_walk *walk)
567 {
568 	int ret = 0;
569 #ifdef CONFIG_HUGETLB_PAGE
570 	struct queue_pages *qp = walk->private;
571 	unsigned long flags = (qp->flags & MPOL_MF_VALID);
572 	struct page *page;
573 	spinlock_t *ptl;
574 	pte_t entry;
575 
576 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
577 	entry = huge_ptep_get(pte);
578 	if (!pte_present(entry))
579 		goto unlock;
580 	page = pte_page(entry);
581 	if (!queue_pages_required(page, qp))
582 		goto unlock;
583 
584 	if (flags == MPOL_MF_STRICT) {
585 		/*
586 		 * STRICT alone means only detecting misplaced pages and no
587 		 * need to check any further vmas.
588 		 */
589 		ret = -EIO;
590 		goto unlock;
591 	}
592 
593 	if (!vma_migratable(walk->vma)) {
594 		/*
595 		 * Must be STRICT with MOVE*, otherwise .test_walk() would have
596 		 * stopped walking the current vma.
597 		 * Report the misplaced page but allow migrating pages which
598 		 * have already been queued.
599 		 */
600 		ret = 1;
601 		goto unlock;
602 	}
603 
604 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
605 	if (flags & (MPOL_MF_MOVE_ALL) ||
606 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
607 	     !hugetlb_pmd_shared(pte))) {
608 		if (isolate_hugetlb(page, qp->pagelist) &&
609 			(flags & MPOL_MF_STRICT))
610 			/*
611 			 * Failed to isolate the page but allow migrating pages
612 			 * which have already been queued.
613 			 */
614 			ret = 1;
615 	}
616 unlock:
617 	spin_unlock(ptl);
618 #else
619 	BUG();
620 #endif
621 	return ret;
622 }
623 
624 #ifdef CONFIG_NUMA_BALANCING
625 /*
626  * This is used to mark a range of virtual addresses to be inaccessible.
627  * These are later cleared by a NUMA hinting fault. Depending on these
628  * faults, pages may be migrated for better NUMA placement.
629  *
630  * This is assuming that NUMA faults are handled using PROT_NONE. If
631  * an architecture makes a different choice, it will need further
632  * changes to the core.
633  */
634 unsigned long change_prot_numa(struct vm_area_struct *vma,
635 			unsigned long addr, unsigned long end)
636 {
637 	int nr_updated;
638 
639 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
640 	if (nr_updated)
641 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
642 
643 	return nr_updated;
644 }
645 #else
646 static unsigned long change_prot_numa(struct vm_area_struct *vma,
647 			unsigned long addr, unsigned long end)
648 {
649 	return 0;
650 }
651 #endif /* CONFIG_NUMA_BALANCING */
652 
653 static int queue_pages_test_walk(unsigned long start, unsigned long end,
654 				struct mm_walk *walk)
655 {
656 	struct vm_area_struct *vma = walk->vma;
657 	struct queue_pages *qp = walk->private;
658 	unsigned long endvma = vma->vm_end;
659 	unsigned long flags = qp->flags;
660 
661 	/* range check first */
662 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
663 
664 	if (!qp->first) {
665 		qp->first = vma;
666 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
667 			(qp->start < vma->vm_start))
668 			/* hole at head side of range */
669 			return -EFAULT;
670 	}
671 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
672 		((vma->vm_end < qp->end) &&
673 		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
674 		/* hole at middle or tail of range */
675 		return -EFAULT;
676 
677 	/*
678 	 * Need to check MPOL_MF_STRICT so that -EIO can be returned when
679 	 * required, regardless of vma_migratable()
680 	 */
681 	if (!vma_migratable(vma) &&
682 	    !(flags & MPOL_MF_STRICT))
683 		return 1;
684 
685 	if (endvma > end)
686 		endvma = end;
687 
688 	if (flags & MPOL_MF_LAZY) {
689 		/* Similar to task_numa_work, skip inaccessible VMAs */
690 		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
691 			!(vma->vm_flags & VM_MIXEDMAP))
692 			change_prot_numa(vma, start, endvma);
693 		return 1;
694 	}
695 
696 	/* queue pages from current vma */
697 	if (flags & MPOL_MF_VALID)
698 		return 0;
699 	return 1;
700 }
701 
702 static const struct mm_walk_ops queue_pages_walk_ops = {
703 	.hugetlb_entry		= queue_pages_hugetlb,
704 	.pmd_entry		= queue_pages_pte_range,
705 	.test_walk		= queue_pages_test_walk,
706 };
707 
708 /*
709  * Walk through page tables and collect pages to be migrated.
710  *
711  * If pages found in a given range are on a set of nodes (determined by
712  * @nodes and @flags), they are isolated and queued to the pagelist which is
713  * passed via @private.
714  *
715  * queue_pages_range() has three possible return values:
716  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
717  *     specified.
718  * 0 - queue pages successfully or no misplaced page.
719  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
720  *         memory range specified by nodemask and maxnode points outside
721  *         your accessible address space (-EFAULT)
722  */
723 static int
724 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
725 		nodemask_t *nodes, unsigned long flags,
726 		struct list_head *pagelist)
727 {
728 	int err;
729 	struct queue_pages qp = {
730 		.pagelist = pagelist,
731 		.flags = flags,
732 		.nmask = nodes,
733 		.start = start,
734 		.end = end,
735 		.first = NULL,
736 	};
737 
738 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
739 
740 	if (!qp.first)
741 		/* whole range in hole */
742 		err = -EFAULT;
743 
744 	return err;
745 }
746 
747 /*
748  * Apply policy to a single VMA
749  * This must be called with the mmap_lock held for writing.
750  */
751 static int vma_replace_policy(struct vm_area_struct *vma,
752 						struct mempolicy *pol)
753 {
754 	int err;
755 	struct mempolicy *old;
756 	struct mempolicy *new;
757 
758 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
759 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
760 		 vma->vm_ops, vma->vm_file,
761 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
762 
763 	new = mpol_dup(pol);
764 	if (IS_ERR(new))
765 		return PTR_ERR(new);
766 
767 	if (vma->vm_ops && vma->vm_ops->set_policy) {
768 		err = vma->vm_ops->set_policy(vma, new);
769 		if (err)
770 			goto err_out;
771 	}
772 
773 	old = vma->vm_policy;
774 	vma->vm_policy = new; /* protected by mmap_lock */
775 	mpol_put(old);
776 
777 	return 0;
778  err_out:
779 	mpol_put(new);
780 	return err;
781 }
782 
783 /* Step 2: apply policy to a range and do splits. */
784 static int mbind_range(struct mm_struct *mm, unsigned long start,
785 		       unsigned long end, struct mempolicy *new_pol)
786 {
787 	struct vm_area_struct *prev;
788 	struct vm_area_struct *vma;
789 	int err = 0;
790 	pgoff_t pgoff;
791 	unsigned long vmstart;
792 	unsigned long vmend;
793 
794 	vma = find_vma(mm, start);
795 	VM_BUG_ON(!vma);
796 
797 	prev = vma->vm_prev;
798 	if (start > vma->vm_start)
799 		prev = vma;
800 
801 	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
802 		vmstart = max(start, vma->vm_start);
803 		vmend   = min(end, vma->vm_end);
804 
805 		if (mpol_equal(vma_policy(vma), new_pol))
806 			continue;
807 
808 		pgoff = vma->vm_pgoff +
809 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
810 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
811 				 vma->anon_vma, vma->vm_file, pgoff,
812 				 new_pol, vma->vm_userfaultfd_ctx,
813 				 anon_vma_name(vma));
814 		if (prev) {
815 			vma = prev;
816 			goto replace;
817 		}
818 		if (vma->vm_start != vmstart) {
819 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
820 			if (err)
821 				goto out;
822 		}
823 		if (vma->vm_end != vmend) {
824 			err = split_vma(vma->vm_mm, vma, vmend, 0);
825 			if (err)
826 				goto out;
827 		}
828  replace:
829 		err = vma_replace_policy(vma, new_pol);
830 		if (err)
831 			goto out;
832 	}
833 
834  out:
835 	return err;
836 }
837 
838 /* Set the process memory policy */
839 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
840 			     nodemask_t *nodes)
841 {
842 	struct mempolicy *new, *old;
843 	NODEMASK_SCRATCH(scratch);
844 	int ret;
845 
846 	if (!scratch)
847 		return -ENOMEM;
848 
849 	new = mpol_new(mode, flags, nodes);
850 	if (IS_ERR(new)) {
851 		ret = PTR_ERR(new);
852 		goto out;
853 	}
854 
855 	ret = mpol_set_nodemask(new, nodes, scratch);
856 	if (ret) {
857 		mpol_put(new);
858 		goto out;
859 	}
860 	task_lock(current);
861 	old = current->mempolicy;
862 	current->mempolicy = new;
863 	if (new && new->mode == MPOL_INTERLEAVE)
864 		current->il_prev = MAX_NUMNODES-1;
865 	task_unlock(current);
866 	mpol_put(old);
867 	ret = 0;
868 out:
869 	NODEMASK_SCRATCH_FREE(scratch);
870 	return ret;
871 }
872 
873 /*
874  * Return nodemask for policy for get_mempolicy() query
875  *
876  * Called with task's alloc_lock held
877  */
878 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
879 {
880 	nodes_clear(*nodes);
881 	if (p == &default_policy)
882 		return;
883 
884 	switch (p->mode) {
885 	case MPOL_BIND:
886 	case MPOL_INTERLEAVE:
887 	case MPOL_PREFERRED:
888 	case MPOL_PREFERRED_MANY:
889 		*nodes = p->nodes;
890 		break;
891 	case MPOL_LOCAL:
892 		/* return empty node mask for local allocation */
893 		break;
894 	default:
895 		BUG();
896 	}
897 }
898 
899 static int lookup_node(struct mm_struct *mm, unsigned long addr)
900 {
901 	struct page *p = NULL;
902 	int err;
903 
904 	int locked = 1;
905 	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
906 	if (err > 0) {
907 		err = page_to_nid(p);
908 		put_page(p);
909 	}
910 	if (locked)
911 		mmap_read_unlock(mm);
912 	return err;
913 }
914 
915 /* Retrieve NUMA policy */
916 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
917 			     unsigned long addr, unsigned long flags)
918 {
919 	int err;
920 	struct mm_struct *mm = current->mm;
921 	struct vm_area_struct *vma = NULL;
922 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
923 
924 	if (flags &
925 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
926 		return -EINVAL;
927 
928 	if (flags & MPOL_F_MEMS_ALLOWED) {
929 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
930 			return -EINVAL;
931 		*policy = 0;	/* just so it's initialized */
932 		task_lock(current);
933 		*nmask  = cpuset_current_mems_allowed;
934 		task_unlock(current);
935 		return 0;
936 	}
937 
938 	if (flags & MPOL_F_ADDR) {
939 		/*
940 		 * Do NOT fall back to task policy if the
941 		 * vma/shared policy at addr is NULL.  We
942 		 * want to return MPOL_DEFAULT in this case.
943 		 */
944 		mmap_read_lock(mm);
945 		vma = vma_lookup(mm, addr);
946 		if (!vma) {
947 			mmap_read_unlock(mm);
948 			return -EFAULT;
949 		}
950 		if (vma->vm_ops && vma->vm_ops->get_policy)
951 			pol = vma->vm_ops->get_policy(vma, addr);
952 		else
953 			pol = vma->vm_policy;
954 	} else if (addr)
955 		return -EINVAL;
956 
957 	if (!pol)
958 		pol = &default_policy;	/* indicates default behavior */
959 
960 	if (flags & MPOL_F_NODE) {
961 		if (flags & MPOL_F_ADDR) {
962 			/*
963 			 * Take a refcount on the mpol, lookup_node()
964 			 * will drop the mmap_lock, so after calling
965 			 * lookup_node() only "pol" remains valid, "vma"
966 			 * is stale.
967 			 */
968 			pol_refcount = pol;
969 			vma = NULL;
970 			mpol_get(pol);
971 			err = lookup_node(mm, addr);
972 			if (err < 0)
973 				goto out;
974 			*policy = err;
975 		} else if (pol == current->mempolicy &&
976 				pol->mode == MPOL_INTERLEAVE) {
977 			*policy = next_node_in(current->il_prev, pol->nodes);
978 		} else {
979 			err = -EINVAL;
980 			goto out;
981 		}
982 	} else {
983 		*policy = pol == &default_policy ? MPOL_DEFAULT :
984 						pol->mode;
985 		/*
986 		 * Internal mempolicy flags must be masked off before exposing
987 		 * the policy to userspace.
988 		 */
989 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
990 	}
991 
992 	err = 0;
993 	if (nmask) {
994 		if (mpol_store_user_nodemask(pol)) {
995 			*nmask = pol->w.user_nodemask;
996 		} else {
997 			task_lock(current);
998 			get_policy_nodemask(pol, nmask);
999 			task_unlock(current);
1000 		}
1001 	}
1002 
1003  out:
1004 	mpol_cond_put(pol);
1005 	if (vma)
1006 		mmap_read_unlock(mm);
1007 	if (pol_refcount)
1008 		mpol_put(pol_refcount);
1009 	return err;
1010 }
1011 
1012 #ifdef CONFIG_MIGRATION
1013 /*
1014  * page migration, thp tail pages can be passed.
1015  */
1016 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1017 				unsigned long flags)
1018 {
1019 	struct page *head = compound_head(page);
1020 	/*
1021 	 * Avoid migrating a page that is shared with others.
1022 	 */
1023 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1024 		if (!isolate_lru_page(head)) {
1025 			list_add_tail(&head->lru, pagelist);
1026 			mod_node_page_state(page_pgdat(head),
1027 				NR_ISOLATED_ANON + page_is_file_lru(head),
1028 				thp_nr_pages(head));
1029 		} else if (flags & MPOL_MF_STRICT) {
1030 			/*
1031 			 * Non-movable page may reach here.  And, there may be
1032 			 * pages temporarily off the LRU or non-LRU movable pages.
1033 			 * Treat them as unmovable pages since they can't be
1034 			 * isolated, so they can't be moved at the moment.  It
1035 			 * should return -EIO for this case too.
1036 			 */
1037 			return -EIO;
1038 		}
1039 	}
1040 
1041 	return 0;
1042 }
1043 
1044 /*
1045  * Migrate pages from one node to a target node.
1046  * Returns error or the number of pages not migrated.
1047  */
1048 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1049 			   int flags)
1050 {
1051 	nodemask_t nmask;
1052 	LIST_HEAD(pagelist);
1053 	int err = 0;
1054 	struct migration_target_control mtc = {
1055 		.nid = dest,
1056 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1057 	};
1058 
1059 	nodes_clear(nmask);
1060 	node_set(source, nmask);
1061 
1062 	/*
1063 	 * This does not "check" the range but isolates all pages that
1064 	 * need migration.  Between passing in the full user address
1065 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1066 	 */
1067 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1068 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1069 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1070 
1071 	if (!list_empty(&pagelist)) {
1072 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1073 				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1074 		if (err)
1075 			putback_movable_pages(&pagelist);
1076 	}
1077 
1078 	return err;
1079 }
1080 
1081 /*
1082  * Move pages between the two nodesets so as to preserve the physical
1083  * layout as much as possible.
1084  *
1085  * Returns the number of pages that could not be moved.
1086  */
1087 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1088 		     const nodemask_t *to, int flags)
1089 {
1090 	int busy = 0;
1091 	int err = 0;
1092 	nodemask_t tmp;
1093 
1094 	lru_cache_disable();
1095 
1096 	mmap_read_lock(mm);
1097 
1098 	/*
1099 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1100 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1101 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1102 	 * The pair of nodemasks 'to' and 'from' define the map.
1103 	 *
1104 	 * If no pair of bits is found that way, fallback to picking some
1105 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1106 	 * 'source' and 'dest' bits are the same, this represents a node
1107 	 * that will be migrating to itself, so no pages need move.
1108 	 *
1109 	 * If no bits are left in 'tmp', or if all remaining bits left
1110 	 * in 'tmp' correspond to the same bit in 'to', return false
1111 	 * (nothing left to migrate).
1112 	 *
1113 	 * This lets us pick a pair of nodes to migrate between, such that
1114 	 * if possible the dest node is not already occupied by some other
1115 	 * source node, minimizing the risk of overloading the memory on a
1116 	 * node that would happen if we migrated incoming memory to a node
1117 	 * before migrating outgoing memory sourced from that same node.
1118 	 *
1119 	 * A single scan of tmp is sufficient.  As we go, we remember the
1120 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1121 	 * that not only moved, but what's better, moved to an empty slot
1122 	 * (d is not set in tmp), then we break out then, with that pair.
1123 	 * Otherwise when we finish scanning tmp, we at least have the
1124 	 * most recent <s, d> pair that moved.  If we get all the way through
1125 	 * the scan of tmp without finding any node that moved, much less
1126 	 * moved to an empty node, then there is nothing left worth migrating.
1127 	 */
1128 
1129 	tmp = *from;
1130 	while (!nodes_empty(tmp)) {
1131 		int s, d;
1132 		int source = NUMA_NO_NODE;
1133 		int dest = 0;
1134 
1135 		for_each_node_mask(s, tmp) {
1136 
1137 			/*
1138 			 * do_migrate_pages() tries to maintain the relative
1139 			 * node relationship of the pages established between
1140 			 * threads and memory areas.
1141 			 *
1142 			 * However if the number of source nodes is not equal to
1143 			 * the number of destination nodes we cannot preserve
1144 			 * this node relative relationship.  In that case, skip
1145 			 * copying memory from a node that is in the destination
1146 			 * mask.
1147 			 *
1148 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1149 			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1150 			 */
1151 
1152 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1153 						(node_isset(s, *to)))
1154 				continue;
1155 
1156 			d = node_remap(s, *from, *to);
1157 			if (s == d)
1158 				continue;
1159 
1160 			source = s;	/* Node moved. Memorize */
1161 			dest = d;
1162 
1163 			/* dest not in remaining from nodes? */
1164 			if (!node_isset(dest, tmp))
1165 				break;
1166 		}
1167 		if (source == NUMA_NO_NODE)
1168 			break;
1169 
1170 		node_clear(source, tmp);
1171 		err = migrate_to_node(mm, source, dest, flags);
1172 		if (err > 0)
1173 			busy += err;
1174 		if (err < 0)
1175 			break;
1176 	}
1177 	mmap_read_unlock(mm);
1178 
1179 	lru_cache_enable();
1180 	if (err < 0)
1181 		return err;
1182 	return busy;
1183 
1184 }
1185 
1186 /*
1187  * Allocate a new page for page migration based on vma policy.
1188  * Start by assuming the page is mapped by the same vma as contains @start.
1189  * Search forward from there, if not.  N.B., this assumes that the
1190  * list of pages handed to migrate_pages()--which is how we get here--
1191  * is in virtual address order.
1192  */
1193 static struct page *new_page(struct page *page, unsigned long start)
1194 {
1195 	struct vm_area_struct *vma;
1196 	unsigned long address;
1197 
1198 	vma = find_vma(current->mm, start);
1199 	while (vma) {
1200 		address = page_address_in_vma(page, vma);
1201 		if (address != -EFAULT)
1202 			break;
1203 		vma = vma->vm_next;
1204 	}
1205 
1206 	if (PageHuge(page)) {
1207 		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1208 				vma, address);
1209 	} else if (PageTransHuge(page)) {
1210 		struct page *thp;
1211 
1212 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1213 					 HPAGE_PMD_ORDER);
1214 		if (!thp)
1215 			return NULL;
1216 		prep_transhuge_page(thp);
1217 		return thp;
1218 	}
1219 	/*
1220 	 * if !vma, alloc_page_vma() will use task or system default policy
1221 	 */
1222 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1223 			vma, address);
1224 }
1225 #else
1226 
1227 static int migrate_page_add(struct page *page, struct list_head *pagelist,
1228 				unsigned long flags)
1229 {
1230 	return -EIO;
1231 }
1232 
1233 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1234 		     const nodemask_t *to, int flags)
1235 {
1236 	return -ENOSYS;
1237 }
1238 
1239 static struct page *new_page(struct page *page, unsigned long start)
1240 {
1241 	return NULL;
1242 }
1243 #endif
1244 
1245 static long do_mbind(unsigned long start, unsigned long len,
1246 		     unsigned short mode, unsigned short mode_flags,
1247 		     nodemask_t *nmask, unsigned long flags)
1248 {
1249 	struct mm_struct *mm = current->mm;
1250 	struct mempolicy *new;
1251 	unsigned long end;
1252 	int err;
1253 	int ret;
1254 	LIST_HEAD(pagelist);
1255 
1256 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1257 		return -EINVAL;
1258 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1259 		return -EPERM;
1260 
1261 	if (start & ~PAGE_MASK)
1262 		return -EINVAL;
1263 
1264 	if (mode == MPOL_DEFAULT)
1265 		flags &= ~MPOL_MF_STRICT;
1266 
1267 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1268 	end = start + len;
1269 
1270 	if (end < start)
1271 		return -EINVAL;
1272 	if (end == start)
1273 		return 0;
1274 
1275 	new = mpol_new(mode, mode_flags, nmask);
1276 	if (IS_ERR(new))
1277 		return PTR_ERR(new);
1278 
1279 	if (flags & MPOL_MF_LAZY)
1280 		new->flags |= MPOL_F_MOF;
1281 
1282 	/*
1283 	 * If we are using the default policy then operation
1284 	 * on discontinuous address spaces is okay after all
1285 	 */
1286 	if (!new)
1287 		flags |= MPOL_MF_DISCONTIG_OK;
1288 
1289 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1290 		 start, start + len, mode, mode_flags,
1291 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1292 
1293 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1294 
1295 		lru_cache_disable();
1296 	}
1297 	{
1298 		NODEMASK_SCRATCH(scratch);
1299 		if (scratch) {
1300 			mmap_write_lock(mm);
1301 			err = mpol_set_nodemask(new, nmask, scratch);
1302 			if (err)
1303 				mmap_write_unlock(mm);
1304 		} else
1305 			err = -ENOMEM;
1306 		NODEMASK_SCRATCH_FREE(scratch);
1307 	}
1308 	if (err)
1309 		goto mpol_out;
1310 
1311 	ret = queue_pages_range(mm, start, end, nmask,
1312 			  flags | MPOL_MF_INVERT, &pagelist);
1313 
1314 	if (ret < 0) {
1315 		err = ret;
1316 		goto up_out;
1317 	}
1318 
1319 	err = mbind_range(mm, start, end, new);
1320 
1321 	if (!err) {
1322 		int nr_failed = 0;
1323 
1324 		if (!list_empty(&pagelist)) {
1325 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1326 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1327 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1328 			if (nr_failed)
1329 				putback_movable_pages(&pagelist);
1330 		}
1331 
1332 		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1333 			err = -EIO;
1334 	} else {
1335 up_out:
1336 		if (!list_empty(&pagelist))
1337 			putback_movable_pages(&pagelist);
1338 	}
1339 
1340 	mmap_write_unlock(mm);
1341 mpol_out:
1342 	mpol_put(new);
1343 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1344 		lru_cache_enable();
1345 	return err;
1346 }
1347 
1348 /*
1349  * User space interface with variable sized bitmaps for nodelists.
1350  */
1351 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1352 		      unsigned long maxnode)
1353 {
1354 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1355 	int ret;
1356 
1357 	if (in_compat_syscall())
1358 		ret = compat_get_bitmap(mask,
1359 					(const compat_ulong_t __user *)nmask,
1360 					maxnode);
1361 	else
1362 		ret = copy_from_user(mask, nmask,
1363 				     nlongs * sizeof(unsigned long));
1364 
1365 	if (ret)
1366 		return -EFAULT;
1367 
1368 	if (maxnode % BITS_PER_LONG)
1369 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1370 
1371 	return 0;
1372 }
1373 
1374 /* Copy a node mask from user space. */
1375 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1376 		     unsigned long maxnode)
1377 {
1378 	--maxnode;
1379 	nodes_clear(*nodes);
1380 	if (maxnode == 0 || !nmask)
1381 		return 0;
1382 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1383 		return -EINVAL;
1384 
1385 	/*
1386 	 * When the user specified more nodes than supported just check
1387 	 * if the unsupported part is all zero, one word at a time,
1388 	 * starting at the end.
1389 	 */
1390 	while (maxnode > MAX_NUMNODES) {
1391 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1392 		unsigned long t;
1393 
1394 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1395 			return -EFAULT;
1396 
1397 		if (maxnode - bits >= MAX_NUMNODES) {
1398 			maxnode -= bits;
1399 		} else {
1400 			maxnode = MAX_NUMNODES;
1401 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1402 		}
1403 		if (t)
1404 			return -EINVAL;
1405 	}
1406 
1407 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1408 }
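
/*
 * Worked example (added for clarity): on a 64-bit kernel built with
 * MAX_NUMNODES == 64, a caller passing maxnode == 1024 has the fifteen
 * high words of its bitmap checked to be zero, one long at a time from
 * the end; any stray bit above MAX_NUMNODES fails with -EINVAL rather
 * than being silently dropped, and only the low MAX_NUMNODES bits are
 * copied into *nodes.
 */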
1409 
1410 /* Copy a kernel node mask to user space */
1411 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1412 			      nodemask_t *nodes)
1413 {
1414 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1415 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1416 	bool compat = in_compat_syscall();
1417 
1418 	if (compat)
1419 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1420 
1421 	if (copy > nbytes) {
1422 		if (copy > PAGE_SIZE)
1423 			return -EINVAL;
1424 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1425 			return -EFAULT;
1426 		copy = nbytes;
1427 		maxnode = nr_node_ids;
1428 	}
1429 
1430 	if (compat)
1431 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1432 					 nodes_addr(*nodes), maxnode);
1433 
1434 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1435 }
1436 
1437 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1438 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1439 {
1440 	*flags = *mode & MPOL_MODE_FLAGS;
1441 	*mode &= ~MPOL_MODE_FLAGS;
1442 
1443 	if ((unsigned int)(*mode) >= MPOL_MAX)
1444 		return -EINVAL;
1445 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1446 		return -EINVAL;
1447 	if (*flags & MPOL_F_NUMA_BALANCING) {
1448 		if (*mode != MPOL_BIND)
1449 			return -EINVAL;
1450 		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1451 	}
1452 	return 0;
1453 }
1454 
1455 static long kernel_mbind(unsigned long start, unsigned long len,
1456 			 unsigned long mode, const unsigned long __user *nmask,
1457 			 unsigned long maxnode, unsigned int flags)
1458 {
1459 	unsigned short mode_flags;
1460 	nodemask_t nodes;
1461 	int lmode = mode;
1462 	int err;
1463 
1464 	start = untagged_addr(start);
1465 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1466 	if (err)
1467 		return err;
1468 
1469 	err = get_nodes(&nodes, nmask, maxnode);
1470 	if (err)
1471 		return err;
1472 
1473 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1474 }
1475 
1476 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1477 		unsigned long, mode, const unsigned long __user *, nmask,
1478 		unsigned long, maxnode, unsigned int, flags)
1479 {
1480 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1481 }
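
/*
 * Illustrative userspace sketch (not part of this file): bind an existing
 * anonymous mapping to node 0 and migrate pages that were already faulted
 * in elsewhere.  Sizes and node numbers are made up for the example.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 16UL << 20;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long nodemask = 1UL << 0;	// node 0 only
 *
 *	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT) == -1)
 *		perror("mbind");
 */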
1482 
1483 /* Set the process memory policy */
1484 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1485 				 unsigned long maxnode)
1486 {
1487 	unsigned short mode_flags;
1488 	nodemask_t nodes;
1489 	int lmode = mode;
1490 	int err;
1491 
1492 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1493 	if (err)
1494 		return err;
1495 
1496 	err = get_nodes(&nodes, nmask, maxnode);
1497 	if (err)
1498 		return err;
1499 
1500 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1501 }
1502 
1503 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1504 		unsigned long, maxnode)
1505 {
1506 	return kernel_set_mempolicy(mode, nmask, maxnode);
1507 }
1508 
1509 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1510 				const unsigned long __user *old_nodes,
1511 				const unsigned long __user *new_nodes)
1512 {
1513 	struct mm_struct *mm = NULL;
1514 	struct task_struct *task;
1515 	nodemask_t task_nodes;
1516 	int err;
1517 	nodemask_t *old;
1518 	nodemask_t *new;
1519 	NODEMASK_SCRATCH(scratch);
1520 
1521 	if (!scratch)
1522 		return -ENOMEM;
1523 
1524 	old = &scratch->mask1;
1525 	new = &scratch->mask2;
1526 
1527 	err = get_nodes(old, old_nodes, maxnode);
1528 	if (err)
1529 		goto out;
1530 
1531 	err = get_nodes(new, new_nodes, maxnode);
1532 	if (err)
1533 		goto out;
1534 
1535 	/* Find the mm_struct */
1536 	rcu_read_lock();
1537 	task = pid ? find_task_by_vpid(pid) : current;
1538 	if (!task) {
1539 		rcu_read_unlock();
1540 		err = -ESRCH;
1541 		goto out;
1542 	}
1543 	get_task_struct(task);
1544 
1545 	err = -EINVAL;
1546 
1547 	/*
1548 	 * Check if this process has the right to modify the specified process.
1549 	 * Use the regular "ptrace_may_access()" checks.
1550 	 */
1551 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1552 		rcu_read_unlock();
1553 		err = -EPERM;
1554 		goto out_put;
1555 	}
1556 	rcu_read_unlock();
1557 
1558 	task_nodes = cpuset_mems_allowed(task);
1559 	/* Is the user allowed to access the target nodes? */
1560 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1561 		err = -EPERM;
1562 		goto out_put;
1563 	}
1564 
1565 	task_nodes = cpuset_mems_allowed(current);
1566 	nodes_and(*new, *new, task_nodes);
1567 	if (nodes_empty(*new))
1568 		goto out_put;
1569 
1570 	err = security_task_movememory(task);
1571 	if (err)
1572 		goto out_put;
1573 
1574 	mm = get_task_mm(task);
1575 	put_task_struct(task);
1576 
1577 	if (!mm) {
1578 		err = -EINVAL;
1579 		goto out;
1580 	}
1581 
1582 	err = do_migrate_pages(mm, old, new,
1583 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1584 
1585 	mmput(mm);
1586 out:
1587 	NODEMASK_SCRATCH_FREE(scratch);
1588 
1589 	return err;
1590 
1591 out_put:
1592 	put_task_struct(task);
1593 	goto out;
1594 
1595 }
1596 
1597 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1598 		const unsigned long __user *, old_nodes,
1599 		const unsigned long __user *, new_nodes)
1600 {
1601 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1602 }
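
/*
 * Illustrative userspace sketch (not part of this file): ask the kernel to
 * move everything a target task currently has on node 0 over to node 1,
 * subject to the cpuset and ptrace checks above.  The pid is made up for
 * the example.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *	long left = migrate_pages(1234, sizeof(unsigned long) * 8,
 *				  &old_nodes, &new_nodes);
 *	if (left < 0)
 *		perror("migrate_pages");
 *	// left > 0: number of pages that could not be moved
 */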
1603 
1604 
1605 /* Retrieve NUMA policy */
1606 static int kernel_get_mempolicy(int __user *policy,
1607 				unsigned long __user *nmask,
1608 				unsigned long maxnode,
1609 				unsigned long addr,
1610 				unsigned long flags)
1611 {
1612 	int err;
1613 	int pval;
1614 	nodemask_t nodes;
1615 
1616 	if (nmask != NULL && maxnode < nr_node_ids)
1617 		return -EINVAL;
1618 
1619 	addr = untagged_addr(addr);
1620 
1621 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1622 
1623 	if (err)
1624 		return err;
1625 
1626 	if (policy && put_user(pval, policy))
1627 		return -EFAULT;
1628 
1629 	if (nmask)
1630 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1631 
1632 	return err;
1633 }
1634 
1635 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1636 		unsigned long __user *, nmask, unsigned long, maxnode,
1637 		unsigned long, addr, unsigned long, flags)
1638 {
1639 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1640 }
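
/*
 * Illustrative userspace sketch (not part of this file): query which node
 * currently backs an address, using MPOL_F_NODE | MPOL_F_ADDR as handled
 * by do_get_mempolicy() above.  'addr' is assumed to point into a mapped
 * region.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, addr,
 *			  MPOL_F_NODE | MPOL_F_ADDR) == -1)
 *		perror("get_mempolicy");
 *	else
 *		printf("%p is on node %d\n", addr, node);
 */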
1641 
1642 bool vma_migratable(struct vm_area_struct *vma)
1643 {
1644 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1645 		return false;
1646 
1647 	/*
1648 	 * DAX device mappings require predictable access latency, so avoid
1649 	 * incurring periodic faults.
1650 	 */
1651 	if (vma_is_dax(vma))
1652 		return false;
1653 
1654 	if (is_vm_hugetlb_page(vma) &&
1655 		!hugepage_migration_supported(hstate_vma(vma)))
1656 		return false;
1657 
1658 	/*
1659 	 * Migration allocates pages in the highest zone. If we cannot
1660 	 * do so then migration (at least from node to node) is not
1661 	 * possible.
1662 	 */
1663 	if (vma->vm_file &&
1664 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1665 			< policy_zone)
1666 		return false;
1667 	return true;
1668 }
1669 
1670 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1671 						unsigned long addr)
1672 {
1673 	struct mempolicy *pol = NULL;
1674 
1675 	if (vma) {
1676 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1677 			pol = vma->vm_ops->get_policy(vma, addr);
1678 		} else if (vma->vm_policy) {
1679 			pol = vma->vm_policy;
1680 
1681 			/*
1682 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1683 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1684 			 * count on these policies which will be dropped by
1685 			 * mpol_cond_put() later
1686 			 */
1687 			if (mpol_needs_cond_ref(pol))
1688 				mpol_get(pol);
1689 		}
1690 	}
1691 
1692 	return pol;
1693 }
1694 
1695 /*
1696  * get_vma_policy(@vma, @addr)
1697  * @vma: virtual memory area whose policy is sought
1698  * @addr: address in @vma for shared policy lookup
1699  *
1700  * Returns effective policy for a VMA at specified address.
1701  * Falls back to current->mempolicy or system default policy, as necessary.
1702  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1703  * count--added by the get_policy() vm_op, as appropriate--to protect against
1704  * freeing by another task.  It is the caller's responsibility to free the
1705  * extra reference for shared policies.
1706  */
1707 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1708 						unsigned long addr)
1709 {
1710 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1711 
1712 	if (!pol)
1713 		pol = get_task_policy(current);
1714 
1715 	return pol;
1716 }
1717 
1718 bool vma_policy_mof(struct vm_area_struct *vma)
1719 {
1720 	struct mempolicy *pol;
1721 
1722 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1723 		bool ret = false;
1724 
1725 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1726 		if (pol && (pol->flags & MPOL_F_MOF))
1727 			ret = true;
1728 		mpol_cond_put(pol);
1729 
1730 		return ret;
1731 	}
1732 
1733 	pol = vma->vm_policy;
1734 	if (!pol)
1735 		pol = get_task_policy(current);
1736 
1737 	return pol->flags & MPOL_F_MOF;
1738 }
1739 
1740 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1741 {
1742 	enum zone_type dynamic_policy_zone = policy_zone;
1743 
1744 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1745 
1746 	/*
1747 	 * if policy->nodes has movable memory only,
1748 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1749 	 *
1750 	 * policy->nodes is intersected with node_states[N_MEMORY],
1751 	 * so if the following test fails, it implies
1752 	 * policy->nodes has movable memory only.
1753 	 */
1754 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1755 		dynamic_policy_zone = ZONE_MOVABLE;
1756 
1757 	return zone >= dynamic_policy_zone;
1758 }
1759 
1760 /*
1761  * Return a nodemask representing a mempolicy for filtering nodes for
1762  * page allocation
1763  */
1764 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1765 {
1766 	int mode = policy->mode;
1767 
1768 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1769 	if (unlikely(mode == MPOL_BIND) &&
1770 		apply_policy_zone(policy, gfp_zone(gfp)) &&
1771 		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1772 		return &policy->nodes;
1773 
1774 	if (mode == MPOL_PREFERRED_MANY)
1775 		return &policy->nodes;
1776 
1777 	return NULL;
1778 }
1779 
1780 /*
1781  * Return the preferred node id for 'prefer' mempolicy, and return
1782  * the given id for all other policies.
1783  *
1784  * policy_node() is always coupled with policy_nodemask(), which
1785  * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1786  */
1787 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1788 {
1789 	if (policy->mode == MPOL_PREFERRED) {
1790 		nd = first_node(policy->nodes);
1791 	} else {
1792 		/*
1793 		 * __GFP_THISNODE shouldn't even be used with the bind policy
1794 		 * because we might easily break the expectation to stay on the
1795 		 * requested node and not break the policy.
1796 		 */
1797 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1798 	}
1799 
1800 	return nd;
1801 }
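/*
 * Illustrative sketch only: policy_node() and policy_nodemask() are
 * intended to be used together when issuing the actual allocation, e.g.:
 *
 *	page = __alloc_pages(gfp, order,
 *			     policy_node(gfp, pol, numa_node_id()),
 *			     policy_nodemask(gfp, pol));
 *
 * which is the pattern alloc_pages() below uses.
 */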
1802 
1803 /* Do dynamic interleaving for a process */
1804 static unsigned interleave_nodes(struct mempolicy *policy)
1805 {
1806 	unsigned next;
1807 	struct task_struct *me = current;
1808 
1809 	next = next_node_in(me->il_prev, policy->nodes);
1810 	if (next < MAX_NUMNODES)
1811 		me->il_prev = next;
1812 	return next;
1813 }
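/*
 * Worked example (illustrative only): with policy->nodes = {0,2,3} and
 * current->il_prev == 2, next_node_in() yields node 3; on the following
 * call il_prev == 3 wraps around and node 0 is returned.
 */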
1814 
1815 /*
1816  * Depending on the memory policy provide a node from which to allocate the
1817  * next slab entry.
1818  */
1819 unsigned int mempolicy_slab_node(void)
1820 {
1821 	struct mempolicy *policy;
1822 	int node = numa_mem_id();
1823 
1824 	if (!in_task())
1825 		return node;
1826 
1827 	policy = current->mempolicy;
1828 	if (!policy)
1829 		return node;
1830 
1831 	switch (policy->mode) {
1832 	case MPOL_PREFERRED:
1833 		return first_node(policy->nodes);
1834 
1835 	case MPOL_INTERLEAVE:
1836 		return interleave_nodes(policy);
1837 
1838 	case MPOL_BIND:
1839 	case MPOL_PREFERRED_MANY:
1840 	{
1841 		struct zoneref *z;
1842 
1843 		/*
1844 		 * Follow bind policy behavior and start allocation at the
1845 		 * first node.
1846 		 */
1847 		struct zonelist *zonelist;
1848 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1849 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1850 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1851 							&policy->nodes);
1852 		return z->zone ? zone_to_nid(z->zone) : node;
1853 	}
1854 	case MPOL_LOCAL:
1855 		return node;
1856 
1857 	default:
1858 		BUG();
1859 	}
1860 }
1861 
1862 /*
1863  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1864  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1865  * number of present nodes.
1866  */
1867 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1868 {
1869 	nodemask_t nodemask = pol->nodes;
1870 	unsigned int target, nnodes;
1871 	int i;
1872 	int nid;
1873 	/*
1874 	 * The barrier will stabilize the nodemask in a register or on
1875 	 * the stack so that it will stop changing under the code.
1876 	 *
1877 	 * Between first_node() and next_node(), pol->nodes could be changed
1878 	 * by other threads. So we put pol->nodes in a local stack.
1879 	 * by other threads, so we work on a local copy of it instead.
1880 	barrier();
1881 
1882 	nnodes = nodes_weight(nodemask);
1883 	if (!nnodes)
1884 		return numa_node_id();
1885 	target = (unsigned int)n % nnodes;
1886 	nid = first_node(nodemask);
1887 	for (i = 0; i < target; i++)
1888 		nid = next_node(nid, nodemask);
1889 	return nid;
1890 }
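/*
 * Worked example (illustrative only): with pol->nodes = {0,2,5} and
 * n = 7, nnodes = 3 and target = 7 % 3 = 1, so we start at node 0 and
 * take one next_node() step, returning node 2.
 */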
1891 
1892 /* Determine a node number for interleave */
1893 static inline unsigned interleave_nid(struct mempolicy *pol,
1894 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1895 {
1896 	if (vma) {
1897 		unsigned long off;
1898 
1899 		/*
1900 		 * for small pages, there is no difference between
1901 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1902 		 * for huge pages, since vm_pgoff is in units of small
1903 		 * pages, we need to shift off the always 0 bits to get
1904 		 * a useful offset.
1905 		 */
1906 		BUG_ON(shift < PAGE_SHIFT);
1907 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1908 		off += (addr - vma->vm_start) >> shift;
1909 		return offset_il_node(pol, off);
1910 	} else
1911 		return interleave_nodes(pol);
1912 }
1913 
1914 #ifdef CONFIG_HUGETLBFS
1915 /*
1916  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1917  * @vma: virtual memory area whose policy is sought
1918  * @addr: address in @vma for shared policy lookup and interleave policy
1919  * @gfp_flags: for requested zone
1920  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1921  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
1922  *
1923  * Returns a nid suitable for a huge page allocation and a pointer
1924  * to the struct mempolicy for conditional unref after allocation.
1925  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
1926  * to the mempolicy's @nodemask for filtering the zonelist.
1927  *
1928  * Must be protected by read_mems_allowed_begin()
1929  */
1930 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1931 				struct mempolicy **mpol, nodemask_t **nodemask)
1932 {
1933 	int nid;
1934 	int mode;
1935 
1936 	*mpol = get_vma_policy(vma, addr);
1937 	*nodemask = NULL;
1938 	mode = (*mpol)->mode;
1939 
1940 	if (unlikely(mode == MPOL_INTERLEAVE)) {
1941 		nid = interleave_nid(*mpol, vma, addr,
1942 					huge_page_shift(hstate_vma(vma)));
1943 	} else {
1944 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1945 		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
1946 			*nodemask = &(*mpol)->nodes;
1947 	}
1948 	return nid;
1949 }
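/*
 * Illustrative sketch only: a hugetlbfs caller would typically do
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... allocate a huge page from nid, filtered by nodemask ...
 *	mpol_cond_put(mpol);
 *
 * inside a read_mems_allowed_begin()/read_mems_allowed_retry() loop,
 * per the locking comment above.
 */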
1950 
1951 /*
1952  * init_nodemask_of_mempolicy
1953  *
1954  * If the current task's mempolicy is "default" [NULL], return 'false'
1955  * to indicate default policy.  Otherwise, extract the policy nodemask
1956  * for 'bind' or 'interleave' policy into the argument nodemask, or
1957  * initialize the argument nodemask to contain the single node for
1958  * 'preferred' or 'local' policy and return 'true' to indicate presence
1959  * of non-default mempolicy.
1960  *
1961  * We don't bother with reference counting the mempolicy [mpol_get/put]
1962  * because the current task is examining its own mempolicy, and a task's
1963  * mempolicy is only ever changed by the task itself.
1964  *
1965  * N.B., it is the caller's responsibility to free a returned nodemask.
1966  */
1967 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1968 {
1969 	struct mempolicy *mempolicy;
1970 
1971 	if (!(mask && current->mempolicy))
1972 		return false;
1973 
1974 	task_lock(current);
1975 	mempolicy = current->mempolicy;
1976 	switch (mempolicy->mode) {
1977 	case MPOL_PREFERRED:
1978 	case MPOL_PREFERRED_MANY:
1979 	case MPOL_BIND:
1980 	case MPOL_INTERLEAVE:
1981 		*mask = mempolicy->nodes;
1982 		break;
1983 
1984 	case MPOL_LOCAL:
1985 		init_nodemask_of_node(mask, numa_node_id());
1986 		break;
1987 
1988 	default:
1989 		BUG();
1990 	}
1991 	task_unlock(current);
1992 
1993 	return true;
1994 }
1995 #endif
1996 
1997 /*
1998  * mempolicy_in_oom_domain
1999  *
2000  * If tsk's mempolicy is "bind", check for intersection between mask and
2001  * the policy nodemask. Otherwise, return true for all other policies
2002  * including "interleave", as a tsk with "interleave" policy may have
2003  * memory allocated from all nodes in the system.
2004  *
2005  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2006  */
2007 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2008 					const nodemask_t *mask)
2009 {
2010 	struct mempolicy *mempolicy;
2011 	bool ret = true;
2012 
2013 	if (!mask)
2014 		return ret;
2015 
2016 	task_lock(tsk);
2017 	mempolicy = tsk->mempolicy;
2018 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2019 		ret = nodes_intersects(mempolicy->nodes, *mask);
2020 	task_unlock(tsk);
2021 
2022 	return ret;
2023 }
2024 
2025 /* Allocate a page in interleaved policy.
2026    Own path because it needs to do special accounting. */
2027 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2028 					unsigned nid)
2029 {
2030 	struct page *page;
2031 
2032 	page = __alloc_pages(gfp, order, nid, NULL);
2033 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2034 	if (!static_branch_likely(&vm_numa_stat_key))
2035 		return page;
2036 	if (page && page_to_nid(page) == nid) {
2037 		preempt_disable();
2038 		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2039 		preempt_enable();
2040 	}
2041 	return page;
2042 }
2043 
2044 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2045 						int nid, struct mempolicy *pol)
2046 {
2047 	struct page *page;
2048 	gfp_t preferred_gfp;
2049 
2050 	/*
2051 	 * This is a two pass approach. The first pass will only try the
2052 	 * preferred nodes but skip the direct reclaim and allow the
2053 	 * allocation to fail, while the second pass will try all the
2054 	 * nodes in the system.
2055 	 */
2056 	preferred_gfp = gfp | __GFP_NOWARN;
2057 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2058 	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2059 	if (!page)
2060 		page = __alloc_pages(gfp, order, numa_node_id(), NULL);
2061 
2062 	return page;
2063 }
2064 
2065 /**
2066  * alloc_pages_vma - Allocate a page for a VMA.
2067  * @gfp: GFP flags.
2068  * @order: Order of the GFP allocation.
2069  * @vma: Pointer to VMA or NULL if not available.
2070  * @addr: Virtual address of the allocation.  Must be inside @vma.
2071  * @node: Which node to prefer for allocation (modulo policy).
2072  * @hugepage: For hugepages try only the preferred node if possible.
2073  *
2074  * Allocate a page for a specific address in @vma, using the appropriate
2075  * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2076  * of the mm_struct of the VMA to prevent it from going away.  Should be
2077  * used for all allocations for pages that will be mapped into user space.
2078  *
2079  * Return: The page on success or NULL if allocation fails.
2080  */
2081 struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2082 		unsigned long addr, int node, bool hugepage)
2083 {
2084 	struct mempolicy *pol;
2085 	struct page *page;
2086 	int preferred_nid;
2087 	nodemask_t *nmask;
2088 
2089 	pol = get_vma_policy(vma, addr);
2090 
2091 	if (pol->mode == MPOL_INTERLEAVE) {
2092 		unsigned nid;
2093 
2094 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2095 		mpol_cond_put(pol);
2096 		page = alloc_page_interleave(gfp, order, nid);
2097 		goto out;
2098 	}
2099 
2100 	if (pol->mode == MPOL_PREFERRED_MANY) {
2101 		page = alloc_pages_preferred_many(gfp, order, node, pol);
2102 		mpol_cond_put(pol);
2103 		goto out;
2104 	}
2105 
2106 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2107 		int hpage_node = node;
2108 
2109 		/*
2110 		 * For hugepage allocation and non-interleave policy which
2111 		 * allows the current node (or other explicitly preferred
2112 		 * node) we only try to allocate from the current/preferred
2113 		 * node and don't fall back to other nodes, as the cost of
2114 		 * remote accesses would likely offset THP benefits.
2115 		 *
2116 		 * If the policy is interleave or does not allow the current
2117 		 * node in its nodemask, we allocate the standard way.
2118 		 */
2119 		if (pol->mode == MPOL_PREFERRED)
2120 			hpage_node = first_node(pol->nodes);
2121 
2122 		nmask = policy_nodemask(gfp, pol);
2123 		if (!nmask || node_isset(hpage_node, *nmask)) {
2124 			mpol_cond_put(pol);
2125 			/*
2126 			 * First, try to allocate THP only on local node, but
2127 			 * don't reclaim unnecessarily, just compact.
2128 			 */
2129 			page = __alloc_pages_node(hpage_node,
2130 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2131 
2132 			/*
2133 			 * If hugepage allocations are configured to always
2134 			 * use synchronous compaction, or the vma has been
2135 			 * madvised to prefer hugepage backing, retry allowing
2136 			 * remote memory with both reclaim and compaction as well.
2137 			 */
2138 			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2139 				page = __alloc_pages(gfp, order, hpage_node, nmask);
2140 
2141 			goto out;
2142 		}
2143 	}
2144 
2145 	nmask = policy_nodemask(gfp, pol);
2146 	preferred_nid = policy_node(gfp, pol, node);
2147 	page = __alloc_pages(gfp, order, preferred_nid, nmask);
2148 	mpol_cond_put(pol);
2149 out:
2150 	return page;
2151 }
2152 EXPORT_SYMBOL(alloc_pages_vma);
2153 
2154 /**
2155  * alloc_pages - Allocate pages.
2156  * @gfp: GFP flags.
2157  * @order: Power of two of number of pages to allocate.
2158  *
2159  * Allocate 1 << @order contiguous pages.  The physical address of the
2160  * first page is naturally aligned (eg an order-3 allocation will be aligned
2161  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2162  * process is honoured when in process context.
2163  *
2164  * Context: Can be called from any context, providing the appropriate GFP
2165  * flags are used.
2166  * Return: The page on success or NULL if allocation fails.
2167  */
2168 struct page *alloc_pages(gfp_t gfp, unsigned order)
2169 {
2170 	struct mempolicy *pol = &default_policy;
2171 	struct page *page;
2172 
2173 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2174 		pol = get_task_policy(current);
2175 
2176 	/*
2177 	 * No reference counting needed for current->mempolicy
2178 	 * nor system default_policy
2179 	 */
2180 	if (pol->mode == MPOL_INTERLEAVE)
2181 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2182 	else if (pol->mode == MPOL_PREFERRED_MANY)
2183 		page = alloc_pages_preferred_many(gfp, order,
2184 				numa_node_id(), pol);
2185 	else
2186 		page = __alloc_pages(gfp, order,
2187 				policy_node(gfp, pol, numa_node_id()),
2188 				policy_nodemask(gfp, pol));
2189 
2190 	return page;
2191 }
2192 EXPORT_SYMBOL(alloc_pages);
2193 
2194 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2195 {
2196 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2197 
2198 	if (IS_ERR(pol))
2199 		return PTR_ERR(pol);
2200 	dst->vm_policy = pol;
2201 	return 0;
2202 }
2203 
2204 /*
2205  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2206  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2207  * with the mems_allowed returned by cpuset_mems_allowed().  This
2208  * keeps mempolicies cpuset-relative after their cpuset moves.  See
2209  * further kernel/cpuset.c update_nodemask().
2210  *
2211  * current's mempolicy may be rebound by another task (the task that changes
2212  * the cpuset's mems), so we needn't do the rebind work for the current task.
2213  */
2214 
2215 /* Slow path of a mempolicy duplicate */
2216 struct mempolicy *__mpol_dup(struct mempolicy *old)
2217 {
2218 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2219 
2220 	if (!new)
2221 		return ERR_PTR(-ENOMEM);
2222 
2223 	/* task's mempolicy is protected by alloc_lock */
2224 	if (old == current->mempolicy) {
2225 		task_lock(current);
2226 		*new = *old;
2227 		task_unlock(current);
2228 	} else
2229 		*new = *old;
2230 
2231 	if (current_cpuset_is_being_rebound()) {
2232 		nodemask_t mems = cpuset_mems_allowed(current);
2233 		mpol_rebind_policy(new, &mems);
2234 	}
2235 	atomic_set(&new->refcnt, 1);
2236 	return new;
2237 }
2238 
2239 /* Slow path of a mempolicy comparison */
2240 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2241 {
2242 	if (!a || !b)
2243 		return false;
2244 	if (a->mode != b->mode)
2245 		return false;
2246 	if (a->flags != b->flags)
2247 		return false;
2248 	if (mpol_store_user_nodemask(a))
2249 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2250 			return false;
2251 
2252 	switch (a->mode) {
2253 	case MPOL_BIND:
2254 	case MPOL_INTERLEAVE:
2255 	case MPOL_PREFERRED:
2256 	case MPOL_PREFERRED_MANY:
2257 		return !!nodes_equal(a->nodes, b->nodes);
2258 	case MPOL_LOCAL:
2259 		return true;
2260 	default:
2261 		BUG();
2262 		return false;
2263 	}
2264 }
2265 
2266 /*
2267  * Shared memory backing store policy support.
2268  *
2269  * Remember policies even when nobody has shared memory mapped.
2270  * The policies are kept in a Red-Black tree linked from the inode.
2271  * They are protected by the sp->lock rwlock, which should be held
2272  * for any accesses to the tree.
2273  */
2274 
2275 /*
2276  * Look up the first element intersecting start-end.  Caller holds sp->lock for
2277  * reading or for writing
2278  */
2279 static struct sp_node *
2280 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2281 {
2282 	struct rb_node *n = sp->root.rb_node;
2283 
2284 	while (n) {
2285 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2286 
2287 		if (start >= p->end)
2288 			n = n->rb_right;
2289 		else if (end <= p->start)
2290 			n = n->rb_left;
2291 		else
2292 			break;
2293 	}
2294 	if (!n)
2295 		return NULL;
2296 	for (;;) {
2297 		struct sp_node *w = NULL;
2298 		struct rb_node *prev = rb_prev(n);
2299 		if (!prev)
2300 			break;
2301 		w = rb_entry(prev, struct sp_node, nd);
2302 		if (w->end <= start)
2303 			break;
2304 		n = prev;
2305 	}
2306 	return rb_entry(n, struct sp_node, nd);
2307 }
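/*
 * Worked example (illustrative only): with ranges [0,4), [4,8) and
 * [10,12) in the tree, sp_lookup(sp, 3, 11) first finds some intersecting
 * node and then walks back via rb_prev() until the predecessor no longer
 * overlaps, returning the first intersecting range, [0,4).
 */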
2308 
2309 /*
2310  * Insert a new shared policy into the list.  Caller holds sp->lock for
2311  * writing.
2312  */
2313 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2314 {
2315 	struct rb_node **p = &sp->root.rb_node;
2316 	struct rb_node *parent = NULL;
2317 	struct sp_node *nd;
2318 
2319 	while (*p) {
2320 		parent = *p;
2321 		nd = rb_entry(parent, struct sp_node, nd);
2322 		if (new->start < nd->start)
2323 			p = &(*p)->rb_left;
2324 		else if (new->end > nd->end)
2325 			p = &(*p)->rb_right;
2326 		else
2327 			BUG();
2328 	}
2329 	rb_link_node(&new->nd, parent, p);
2330 	rb_insert_color(&new->nd, &sp->root);
2331 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2332 		 new->policy ? new->policy->mode : 0);
2333 }
2334 
2335 /* Find shared policy intersecting idx */
2336 struct mempolicy *
2337 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2338 {
2339 	struct mempolicy *pol = NULL;
2340 	struct sp_node *sn;
2341 
2342 	if (!sp->root.rb_node)
2343 		return NULL;
2344 	read_lock(&sp->lock);
2345 	sn = sp_lookup(sp, idx, idx+1);
2346 	if (sn) {
2347 		mpol_get(sn->policy);
2348 		pol = sn->policy;
2349 	}
2350 	read_unlock(&sp->lock);
2351 	return pol;
2352 }
2353 
2354 static void sp_free(struct sp_node *n)
2355 {
2356 	mpol_put(n->policy);
2357 	kmem_cache_free(sn_cache, n);
2358 }
2359 
2360 /**
2361  * mpol_misplaced - check whether current page node is valid in policy
2362  *
2363  * @page: page to be checked
2364  * @vma: vm area where page mapped
2365  * @addr: virtual address where page mapped
2366  *
2367  * Look up the current policy node id for vma,addr and "compare to" the
2368  * page's node id.  Policy determination "mimics" alloc_page_vma().
2369  * Called from the fault path where we know the vma and faulting address.
2370  *
2371  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2372  * policy, or a suitable node ID to allocate a replacement page from.
2373  */
2374 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2375 {
2376 	struct mempolicy *pol;
2377 	struct zoneref *z;
2378 	int curnid = page_to_nid(page);
2379 	unsigned long pgoff;
2380 	int thiscpu = raw_smp_processor_id();
2381 	int thisnid = cpu_to_node(thiscpu);
2382 	int polnid = NUMA_NO_NODE;
2383 	int ret = NUMA_NO_NODE;
2384 
2385 	pol = get_vma_policy(vma, addr);
2386 	if (!(pol->flags & MPOL_F_MOF))
2387 		goto out;
2388 
2389 	switch (pol->mode) {
2390 	case MPOL_INTERLEAVE:
2391 		pgoff = vma->vm_pgoff;
2392 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2393 		polnid = offset_il_node(pol, pgoff);
2394 		break;
2395 
2396 	case MPOL_PREFERRED:
2397 		if (node_isset(curnid, pol->nodes))
2398 			goto out;
2399 		polnid = first_node(pol->nodes);
2400 		break;
2401 
2402 	case MPOL_LOCAL:
2403 		polnid = numa_node_id();
2404 		break;
2405 
2406 	case MPOL_BIND:
2407 		/* Optimize placement among multiple nodes via NUMA balancing */
2408 		if (pol->flags & MPOL_F_MORON) {
2409 			if (node_isset(thisnid, pol->nodes))
2410 				break;
2411 			goto out;
2412 		}
2413 		fallthrough;
2414 
2415 	case MPOL_PREFERRED_MANY:
2416 		/*
2417 		 * use current page if in policy nodemask,
2418 		 * else select nearest allowed node, if any.
2419 		 * If no allowed nodes, use current [!misplaced].
2420 		 */
2421 		if (node_isset(curnid, pol->nodes))
2422 			goto out;
2423 		z = first_zones_zonelist(
2424 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2425 				gfp_zone(GFP_HIGHUSER),
2426 				&pol->nodes);
2427 		polnid = zone_to_nid(z->zone);
2428 		break;
2429 
2430 	default:
2431 		BUG();
2432 	}
2433 
2434 	/* Migrate the page towards the node whose CPU is referencing it */
2435 	if (pol->flags & MPOL_F_MORON) {
2436 		polnid = thisnid;
2437 
2438 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2439 			goto out;
2440 	}
2441 
2442 	if (curnid != polnid)
2443 		ret = polnid;
2444 out:
2445 	mpol_cond_put(pol);
2446 
2447 	return ret;
2448 }
2449 
2450 /*
2451  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2452  * dropped after task->mempolicy is set to NULL so that any allocation done as
2453  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2454  * policy.
2455  */
2456 void mpol_put_task_policy(struct task_struct *task)
2457 {
2458 	struct mempolicy *pol;
2459 
2460 	task_lock(task);
2461 	pol = task->mempolicy;
2462 	task->mempolicy = NULL;
2463 	task_unlock(task);
2464 	mpol_put(pol);
2465 }
2466 
2467 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2468 {
2469 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2470 	rb_erase(&n->nd, &sp->root);
2471 	sp_free(n);
2472 }
2473 
2474 static void sp_node_init(struct sp_node *node, unsigned long start,
2475 			unsigned long end, struct mempolicy *pol)
2476 {
2477 	node->start = start;
2478 	node->end = end;
2479 	node->policy = pol;
2480 }
2481 
2482 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2483 				struct mempolicy *pol)
2484 {
2485 	struct sp_node *n;
2486 	struct mempolicy *newpol;
2487 
2488 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2489 	if (!n)
2490 		return NULL;
2491 
2492 	newpol = mpol_dup(pol);
2493 	if (IS_ERR(newpol)) {
2494 		kmem_cache_free(sn_cache, n);
2495 		return NULL;
2496 	}
2497 	newpol->flags |= MPOL_F_SHARED;
2498 	sp_node_init(n, start, end, newpol);
2499 
2500 	return n;
2501 }
2502 
2503 /* Replace a policy range. */
2504 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2505 				 unsigned long end, struct sp_node *new)
2506 {
2507 	struct sp_node *n;
2508 	struct sp_node *n_new = NULL;
2509 	struct mempolicy *mpol_new = NULL;
2510 	int ret = 0;
2511 
2512 restart:
2513 	write_lock(&sp->lock);
2514 	n = sp_lookup(sp, start, end);
2515 	/* Take care of old policies in the same range. */
2516 	while (n && n->start < end) {
2517 		struct rb_node *next = rb_next(&n->nd);
2518 		if (n->start >= start) {
2519 			if (n->end <= end)
2520 				sp_delete(sp, n);
2521 			else
2522 				n->start = end;
2523 		} else {
2524 			/* Old policy spanning whole new range. */
2525 			if (n->end > end) {
2526 				if (!n_new)
2527 					goto alloc_new;
2528 
2529 				*mpol_new = *n->policy;
2530 				atomic_set(&mpol_new->refcnt, 1);
2531 				sp_node_init(n_new, end, n->end, mpol_new);
2532 				n->end = start;
2533 				sp_insert(sp, n_new);
2534 				n_new = NULL;
2535 				mpol_new = NULL;
2536 				break;
2537 			} else
2538 				n->end = start;
2539 		}
2540 		if (!next)
2541 			break;
2542 		n = rb_entry(next, struct sp_node, nd);
2543 	}
2544 	if (new)
2545 		sp_insert(sp, new);
2546 	write_unlock(&sp->lock);
2547 	ret = 0;
2548 
2549 err_out:
2550 	if (mpol_new)
2551 		mpol_put(mpol_new);
2552 	if (n_new)
2553 		kmem_cache_free(sn_cache, n_new);
2554 
2555 	return ret;
2556 
2557 alloc_new:
2558 	write_unlock(&sp->lock);
2559 	ret = -ENOMEM;
2560 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2561 	if (!n_new)
2562 		goto err_out;
2563 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2564 	if (!mpol_new)
2565 		goto err_out;
2566 	atomic_set(&mpol_new->refcnt, 1);
2567 	goto restart;
2568 }
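/*
 * Illustrative example only: if the tree holds a single policy for
 * [0,10) and shared_policy_replace() is called for [4,6), the old node
 * is trimmed to [0,4), a copy covering [6,10) is inserted (the "old
 * policy spanning whole new range" case above), and the new [4,6) node
 * is inserted between them.
 */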
2569 
2570 /**
2571  * mpol_shared_policy_init - initialize shared policy for inode
2572  * @sp: pointer to inode shared policy
2573  * @mpol:  struct mempolicy to install
2574  *
2575  * Install non-NULL @mpol in inode's shared policy rb-tree.
2576  * On entry, the current task has a reference on a non-NULL @mpol.
2577  * This must be released on exit.
2578  * This is called at get_inode() time, so we can use GFP_KERNEL.
2579  */
2580 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2581 {
2582 	int ret;
2583 
2584 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2585 	rwlock_init(&sp->lock);
2586 
2587 	if (mpol) {
2588 		struct vm_area_struct pvma;
2589 		struct mempolicy *new;
2590 		NODEMASK_SCRATCH(scratch);
2591 
2592 		if (!scratch)
2593 			goto put_mpol;
2594 		/* contextualize the tmpfs mount point mempolicy */
2595 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2596 		if (IS_ERR(new))
2597 			goto free_scratch; /* no valid nodemask intersection */
2598 
2599 		task_lock(current);
2600 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2601 		task_unlock(current);
2602 		if (ret)
2603 			goto put_new;
2604 
2605 		/* Create pseudo-vma that contains just the policy */
2606 		vma_init(&pvma, NULL);
2607 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2608 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2609 
2610 put_new:
2611 		mpol_put(new);			/* drop initial ref */
2612 free_scratch:
2613 		NODEMASK_SCRATCH_FREE(scratch);
2614 put_mpol:
2615 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2616 	}
2617 }
2618 
2619 int mpol_set_shared_policy(struct shared_policy *info,
2620 			struct vm_area_struct *vma, struct mempolicy *npol)
2621 {
2622 	int err;
2623 	struct sp_node *new = NULL;
2624 	unsigned long sz = vma_pages(vma);
2625 
2626 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2627 		 vma->vm_pgoff,
2628 		 sz, npol ? npol->mode : -1,
2629 		 npol ? npol->flags : -1,
2630 		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2631 
2632 	if (npol) {
2633 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2634 		if (!new)
2635 			return -ENOMEM;
2636 	}
2637 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2638 	if (err && new)
2639 		sp_free(new);
2640 	return err;
2641 }
2642 
2643 /* Free a backing policy store on inode delete. */
2644 void mpol_free_shared_policy(struct shared_policy *p)
2645 {
2646 	struct sp_node *n;
2647 	struct rb_node *next;
2648 
2649 	if (!p->root.rb_node)
2650 		return;
2651 	write_lock(&p->lock);
2652 	next = rb_first(&p->root);
2653 	while (next) {
2654 		n = rb_entry(next, struct sp_node, nd);
2655 		next = rb_next(&n->nd);
2656 		sp_delete(p, n);
2657 	}
2658 	write_unlock(&p->lock);
2659 }
2660 
2661 #ifdef CONFIG_NUMA_BALANCING
2662 static int __initdata numabalancing_override;
2663 
2664 static void __init check_numabalancing_enable(void)
2665 {
2666 	bool numabalancing_default = false;
2667 
2668 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2669 		numabalancing_default = true;
2670 
2671 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2672 	if (numabalancing_override)
2673 		set_numabalancing_state(numabalancing_override == 1);
2674 
2675 	if (num_online_nodes() > 1 && !numabalancing_override) {
2676 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2677 			numabalancing_default ? "Enabling" : "Disabling");
2678 		set_numabalancing_state(numabalancing_default);
2679 	}
2680 }
2681 
2682 static int __init setup_numabalancing(char *str)
2683 {
2684 	int ret = 0;
2685 	if (!str)
2686 		goto out;
2687 
2688 	if (!strcmp(str, "enable")) {
2689 		numabalancing_override = 1;
2690 		ret = 1;
2691 	} else if (!strcmp(str, "disable")) {
2692 		numabalancing_override = -1;
2693 		ret = 1;
2694 	}
2695 out:
2696 	if (!ret)
2697 		pr_warn("Unable to parse numa_balancing=\n");
2698 
2699 	return ret;
2700 }
2701 __setup("numa_balancing=", setup_numabalancing);
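/*
 * Illustrative only: booting with "numa_balancing=enable" or
 * "numa_balancing=disable" on the kernel command line sets the override
 * consumed by check_numabalancing_enable() above, taking precedence over
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED.
 */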
2702 #else
2703 static inline void __init check_numabalancing_enable(void)
2704 {
2705 }
2706 #endif /* CONFIG_NUMA_BALANCING */
2707 
2708 /* assumes fs == KERNEL_DS */
2709 void __init numa_policy_init(void)
2710 {
2711 	nodemask_t interleave_nodes;
2712 	unsigned long largest = 0;
2713 	int nid, prefer = 0;
2714 
2715 	policy_cache = kmem_cache_create("numa_policy",
2716 					 sizeof(struct mempolicy),
2717 					 0, SLAB_PANIC, NULL);
2718 
2719 	sn_cache = kmem_cache_create("shared_policy_node",
2720 				     sizeof(struct sp_node),
2721 				     0, SLAB_PANIC, NULL);
2722 
2723 	for_each_node(nid) {
2724 		preferred_node_policy[nid] = (struct mempolicy) {
2725 			.refcnt = ATOMIC_INIT(1),
2726 			.mode = MPOL_PREFERRED,
2727 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2728 			.nodes = nodemask_of_node(nid),
2729 		};
2730 	}
2731 
2732 	/*
2733 	 * Set interleaving policy for system init. Interleaving is only
2734 	 * enabled across suitably sized nodes (default is >= 16MB), or
2735 	 * fall back to the largest node if they're all smaller.
2736 	 */
2737 	nodes_clear(interleave_nodes);
2738 	for_each_node_state(nid, N_MEMORY) {
2739 		unsigned long total_pages = node_present_pages(nid);
2740 
2741 		/* Preserve the largest node */
2742 		if (largest < total_pages) {
2743 			largest = total_pages;
2744 			prefer = nid;
2745 		}
2746 
2747 		/* Interleave this node? */
2748 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2749 			node_set(nid, interleave_nodes);
2750 	}
2751 
2752 	/* All too small, use the largest */
2753 	if (unlikely(nodes_empty(interleave_nodes)))
2754 		node_set(prefer, interleave_nodes);
2755 
2756 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2757 		pr_err("%s: interleaving failed\n", __func__);
2758 
2759 	check_numabalancing_enable();
2760 }
2761 
2762 /* Reset policy of current process to default */
2763 void numa_default_policy(void)
2764 {
2765 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2766 }
2767 
2768 /*
2769  * Parse and format mempolicy from/to strings
2770  */
2771 
2772 static const char * const policy_modes[] =
2773 {
2774 	[MPOL_DEFAULT]    = "default",
2775 	[MPOL_PREFERRED]  = "prefer",
2776 	[MPOL_BIND]       = "bind",
2777 	[MPOL_INTERLEAVE] = "interleave",
2778 	[MPOL_LOCAL]      = "local",
2779 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
2780 };
2781 
2782 
2783 #ifdef CONFIG_TMPFS
2784 /**
2785  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2786  * @str:  string containing mempolicy to parse
2787  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2788  *
2789  * Format of input:
2790  *	<mode>[=<flags>][:<nodelist>]
2791  *
2792  * On success, returns 0, else 1
2793  */
2794 int mpol_parse_str(char *str, struct mempolicy **mpol)
2795 {
2796 	struct mempolicy *new = NULL;
2797 	unsigned short mode_flags;
2798 	nodemask_t nodes;
2799 	char *nodelist = strchr(str, ':');
2800 	char *flags = strchr(str, '=');
2801 	int err = 1, mode;
2802 
2803 	if (flags)
2804 		*flags++ = '\0';	/* terminate mode string */
2805 
2806 	if (nodelist) {
2807 		/* NUL-terminate mode or flags string */
2808 		*nodelist++ = '\0';
2809 		if (nodelist_parse(nodelist, nodes))
2810 			goto out;
2811 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2812 			goto out;
2813 	} else
2814 		nodes_clear(nodes);
2815 
2816 	mode = match_string(policy_modes, MPOL_MAX, str);
2817 	if (mode < 0)
2818 		goto out;
2819 
2820 	switch (mode) {
2821 	case MPOL_PREFERRED:
2822 		/*
2823 		 * Insist on a nodelist of one node only, although later
2824 		 * we use first_node(nodes) to grab a single node, so here
2825 		 * nodelist (or nodes) cannot be empty.
2826 		 */
2827 		if (nodelist) {
2828 			char *rest = nodelist;
2829 			while (isdigit(*rest))
2830 				rest++;
2831 			if (*rest)
2832 				goto out;
2833 			if (nodes_empty(nodes))
2834 				goto out;
2835 		}
2836 		break;
2837 	case MPOL_INTERLEAVE:
2838 		/*
2839 		 * Default to online nodes with memory if no nodelist
2840 		 */
2841 		if (!nodelist)
2842 			nodes = node_states[N_MEMORY];
2843 		break;
2844 	case MPOL_LOCAL:
2845 		/*
2846 		 * Don't allow a nodelist;  mpol_new() checks flags
2847 		 */
2848 		if (nodelist)
2849 			goto out;
2850 		break;
2851 	case MPOL_DEFAULT:
2852 		/*
2853 		 * Insist on an empty nodelist
2854 		 */
2855 		if (!nodelist)
2856 			err = 0;
2857 		goto out;
2858 	case MPOL_PREFERRED_MANY:
2859 	case MPOL_BIND:
2860 		/*
2861 		 * Insist on a nodelist
2862 		 */
2863 		if (!nodelist)
2864 			goto out;
2865 	}
2866 
2867 	mode_flags = 0;
2868 	if (flags) {
2869 		/*
2870 		 * Currently, we only support two mutually exclusive
2871 		 * mode flags.
2872 		 */
2873 		if (!strcmp(flags, "static"))
2874 			mode_flags |= MPOL_F_STATIC_NODES;
2875 		else if (!strcmp(flags, "relative"))
2876 			mode_flags |= MPOL_F_RELATIVE_NODES;
2877 		else
2878 			goto out;
2879 	}
2880 
2881 	new = mpol_new(mode, mode_flags, &nodes);
2882 	if (IS_ERR(new))
2883 		goto out;
2884 
2885 	/*
2886 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2887 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2888 	 */
2889 	if (mode != MPOL_PREFERRED) {
2890 		new->nodes = nodes;
2891 	} else if (nodelist) {
2892 		nodes_clear(new->nodes);
2893 		node_set(first_node(nodes), new->nodes);
2894 	} else {
2895 		new->mode = MPOL_LOCAL;
2896 	}
2897 
2898 	/*
2899 	 * Save nodes for contextualization: this will be used to "clone"
2900 	 * the mempolicy in a specific context [cpuset] at a later time.
2901 	 */
2902 	new->w.user_nodemask = nodes;
2903 
2904 	err = 0;
2905 
2906 out:
2907 	/* Restore string for error message */
2908 	if (nodelist)
2909 		*--nodelist = ':';
2910 	if (flags)
2911 		*--flags = '=';
2912 	if (!err)
2913 		*mpol = new;
2914 	return err;
2915 }
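/*
 * Illustrative examples only of strings accepted here (the value of the
 * tmpfs "mpol=" mount option):
 *
 *	interleave:0-3		MPOL_INTERLEAVE over nodes 0-3
 *	prefer:1		MPOL_PREFERRED on node 1
 *	bind=static:0,2		MPOL_BIND with MPOL_F_STATIC_NODES
 *	local			MPOL_LOCAL, no nodelist allowed
 *
 * e.g. "mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt".
 */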
2916 #endif /* CONFIG_TMPFS */
2917 
2918 /**
2919  * mpol_to_str - format a mempolicy structure for printing
2920  * @buffer:  to contain formatted mempolicy string
2921  * @maxlen:  length of @buffer
2922  * @pol:  pointer to mempolicy to be formatted
2923  *
2924  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2925  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2926  * longest flag, "relative", and to display at least a few node ids.
2927  */
2928 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2929 {
2930 	char *p = buffer;
2931 	nodemask_t nodes = NODE_MASK_NONE;
2932 	unsigned short mode = MPOL_DEFAULT;
2933 	unsigned short flags = 0;
2934 
2935 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2936 		mode = pol->mode;
2937 		flags = pol->flags;
2938 	}
2939 
2940 	switch (mode) {
2941 	case MPOL_DEFAULT:
2942 	case MPOL_LOCAL:
2943 		break;
2944 	case MPOL_PREFERRED:
2945 	case MPOL_PREFERRED_MANY:
2946 	case MPOL_BIND:
2947 	case MPOL_INTERLEAVE:
2948 		nodes = pol->nodes;
2949 		break;
2950 	default:
2951 		WARN_ON_ONCE(1);
2952 		snprintf(p, maxlen, "unknown");
2953 		return;
2954 	}
2955 
2956 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2957 
2958 	if (flags & MPOL_MODE_FLAGS) {
2959 		p += snprintf(p, buffer + maxlen - p, "=");
2960 
2961 		/*
2962 		 * Currently, the only defined flags are mutually exclusive
2963 		 */
2964 		if (flags & MPOL_F_STATIC_NODES)
2965 			p += snprintf(p, buffer + maxlen - p, "static");
2966 		else if (flags & MPOL_F_RELATIVE_NODES)
2967 			p += snprintf(p, buffer + maxlen - p, "relative");
2968 	}
2969 
2970 	if (!nodes_empty(nodes))
2971 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2972 			       nodemask_pr_args(&nodes));
2973 }
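/*
 * Illustrative examples only of the strings produced above:
 *
 *	"default"			no policy / MPOL_DEFAULT
 *	"local"				MPOL_LOCAL
 *	"prefer:3"			MPOL_PREFERRED on node 3
 *	"bind=static:0,2"		MPOL_BIND | MPOL_F_STATIC_NODES
 *	"interleave=relative:0-3"	MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
 *	"prefer (many):0-1"		MPOL_PREFERRED_MANY on nodes 0-1
 */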
2974 
2975 bool numa_demotion_enabled = false;
2976 
2977 #ifdef CONFIG_SYSFS
2978 static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
2979 					  struct kobj_attribute *attr, char *buf)
2980 {
2981 	return sysfs_emit(buf, "%s\n",
2982 			  numa_demotion_enabled ? "true" : "false");
2983 }
2984 
2985 static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
2986 					   struct kobj_attribute *attr,
2987 					   const char *buf, size_t count)
2988 {
2989 	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
2990 		numa_demotion_enabled = true;
2991 	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
2992 		numa_demotion_enabled = false;
2993 	else
2994 		return -EINVAL;
2995 
2996 	return count;
2997 }
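/*
 * Illustrative only: with the "numa" kobject registered by
 * numa_init_sysfs() below, this knob appears as
 * /sys/kernel/mm/numa/demotion_enabled, so e.g.
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 *
 * enables demotion, and reading the file reports "true" or "false".
 */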
2998 
2999 static struct kobj_attribute numa_demotion_enabled_attr =
3000 	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
3001 	       numa_demotion_enabled_store);
3002 
3003 static struct attribute *numa_attrs[] = {
3004 	&numa_demotion_enabled_attr.attr,
3005 	NULL,
3006 };
3007 
3008 static const struct attribute_group numa_attr_group = {
3009 	.attrs = numa_attrs,
3010 };
3011 
3012 static int __init numa_init_sysfs(void)
3013 {
3014 	int err;
3015 	struct kobject *numa_kobj;
3016 
3017 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
3018 	if (!numa_kobj) {
3019 		pr_err("failed to create numa kobject\n");
3020 		return -ENOMEM;
3021 	}
3022 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
3023 	if (err) {
3024 		pr_err("failed to register numa group\n");
3025 		goto delete_obj;
3026 	}
3027 	return 0;
3028 
3029 delete_obj:
3030 	kobject_put(numa_kobj);
3031 	return err;
3032 }
3033 subsys_initcall(numa_init_sysfs);
3034 #endif
3035