1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a per-process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
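/*
 * For reference, userspace requests these policies through the
 * set_mempolicy() and mbind() system calls implemented later in this
 * file.  A minimal userspace sketch (assuming the MPOL_* constants and
 * syscall wrappers from libnuma's <numaif.h>; addr/length stand in for
 * an existing mapping):
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// interleave this task's future allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// bind an existing mapping to node 0 only, moving misplaced pages
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8,
 *	      MPOL_MF_MOVE);
 */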
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66 */
67 
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97 
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 
101 #include "internal.h"
102 
103 /* Internal flags */
104 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
105 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
106 
107 static struct kmem_cache *policy_cache;
108 static struct kmem_cache *sn_cache;
109 
110 /* Highest zone. A specific allocation for a zone below that is not
111    policied. */
112 enum zone_type policy_zone = 0;
113 
114 /*
115  * run-time system-wide default policy => local allocation
116  */
117 static struct mempolicy default_policy = {
118 	.refcnt = ATOMIC_INIT(1), /* never free it */
119 	.mode = MPOL_PREFERRED,
120 	.flags = MPOL_F_LOCAL,
121 };
122 
123 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
124 
125 struct mempolicy *get_task_policy(struct task_struct *p)
126 {
127 	struct mempolicy *pol = p->mempolicy;
128 	int node;
129 
130 	if (pol)
131 		return pol;
132 
133 	node = numa_node_id();
134 	if (node != NUMA_NO_NODE) {
135 		pol = &preferred_node_policy[node];
136 		/* preferred_node_policy is not initialised early in boot */
137 		if (pol->mode)
138 			return pol;
139 	}
140 
141 	return &default_policy;
142 }
143 
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If the read-side task has no lock to protect task->mempolicy, the
148 	 * write-side task rebinds task->mempolicy in two steps. The first step
149 	 * sets all the new nodes, and the second step clears all the
150 	 * disallowed nodes. This way a reader can never find an empty nodemask
151 	 * to allocate from.
152 	 * If a lock protects task->mempolicy on the read side, we rebind
153 	 * directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE - do rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the new nodes
158 	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
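/*
 * Worked example of the two-step rebind described above, with
 * hypothetical node numbers: a task's cpuset moves from nodes {0,1} to
 * nodes {2,3} while an interleave policy covers {0,1}.
 *
 *	MPOL_REBIND_STEP1: pol->v.nodes becomes {0,1,2,3}  (new nodes added)
 *	MPOL_REBIND_STEP2: pol->v.nodes becomes {2,3}      (old nodes removed)
 *
 * The mask is never empty between the two steps, so a lockless reader
 * always finds at least one node to allocate from.
 */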
163 
164 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
165 {
166 	return pol->flags & MPOL_MODE_FLAGS;
167 }
168 
169 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
170 				   const nodemask_t *rel)
171 {
172 	nodemask_t tmp;
173 	nodes_fold(tmp, *orig, nodes_weight(*rel));
174 	nodes_onto(*ret, tmp, *rel);
175 }
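/*
 * Example of the MPOL_F_RELATIVE_NODES remapping above, with hypothetical
 * node numbers: given a user-relative mask *orig = {0,2,6} and an allowed
 * mask *rel = {4,5,9} (weight 3), nodes_fold() wraps the relative indices
 * modulo 3 into {0,2}, and nodes_onto() maps those indices onto the 0th
 * and 2nd set bits of *rel, so *ret = {4,9}.
 */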
176 
177 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (nodes_empty(*nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184 
185 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
186 {
187 	if (!nodes)
188 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
189 	else if (nodes_empty(*nodes))
190 		return -EINVAL;			/*  no allowed nodes */
191 	else
192 		pol->v.preferred_node = first_node(*nodes);
193 	return 0;
194 }
195 
196 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
197 {
198 	if (nodes_empty(*nodes))
199 		return -EINVAL;
200 	pol->v.nodes = *nodes;
201 	return 0;
202 }
203 
204 /*
205  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
206  * any, for the new policy.  mpol_new() has already validated the nodes
207  * parameter with respect to the policy mode and flags.  But, we need to
208  * handle an empty nodemask with MPOL_PREFERRED here.
209  *
210  * Must be called holding task's alloc_lock to protect task's mems_allowed
211  * and mempolicy.  May also be called holding the mmap_semaphore for write.
212  */
213 static int mpol_set_nodemask(struct mempolicy *pol,
214 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
215 {
216 	int ret;
217 
218 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
219 	if (pol == NULL)
220 		return 0;
221 	/* Check N_MEMORY */
222 	nodes_and(nsc->mask1,
223 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
224 
225 	VM_BUG_ON(!nodes);
226 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
227 		nodes = NULL;	/* explicit local allocation */
228 	else {
229 		if (pol->flags & MPOL_F_RELATIVE_NODES)
230 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
231 		else
232 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
233 
234 		if (mpol_store_user_nodemask(pol))
235 			pol->w.user_nodemask = *nodes;
236 		else
237 			pol->w.cpuset_mems_allowed =
238 						cpuset_current_mems_allowed;
239 	}
240 
241 	if (nodes)
242 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
243 	else
244 		ret = mpol_ops[pol->mode].create(pol, NULL);
245 	return ret;
246 }
247 
248 /*
249  * This function just creates a new policy, does some check and simple
250  * initialization. You must invoke mpol_set_nodemask() to set nodes.
251  */
252 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
253 				  nodemask_t *nodes)
254 {
255 	struct mempolicy *policy;
256 
257 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
258 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
259 
260 	if (mode == MPOL_DEFAULT) {
261 		if (nodes && !nodes_empty(*nodes))
262 			return ERR_PTR(-EINVAL);
263 		return NULL;
264 	}
265 	VM_BUG_ON(!nodes);
266 
267 	/*
268 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
269 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
270 	 * All other modes require a valid pointer to a non-empty nodemask.
271 	 */
272 	if (mode == MPOL_PREFERRED) {
273 		if (nodes_empty(*nodes)) {
274 			if (((flags & MPOL_F_STATIC_NODES) ||
275 			     (flags & MPOL_F_RELATIVE_NODES)))
276 				return ERR_PTR(-EINVAL);
277 		}
278 	} else if (mode == MPOL_LOCAL) {
279 		if (!nodes_empty(*nodes))
280 			return ERR_PTR(-EINVAL);
281 		mode = MPOL_PREFERRED;
282 	} else if (nodes_empty(*nodes))
283 		return ERR_PTR(-EINVAL);
284 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
285 	if (!policy)
286 		return ERR_PTR(-ENOMEM);
287 	atomic_set(&policy->refcnt, 1);
288 	policy->mode = mode;
289 	policy->flags = flags;
290 
291 	return policy;
292 }
293 
294 /* Slow path of a mpol destructor. */
295 void __mpol_put(struct mempolicy *p)
296 {
297 	if (!atomic_dec_and_test(&p->refcnt))
298 		return;
299 	kmem_cache_free(policy_cache, p);
300 }
301 
302 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
303 				enum mpol_rebind_step step)
304 {
305 }
306 
307 /*
308  * step:
309  * 	MPOL_REBIND_ONCE  - do rebind work at once
310  * 	MPOL_REBIND_STEP1 - set all the new nodes
311  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
312  */
313 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
314 				 enum mpol_rebind_step step)
315 {
316 	nodemask_t tmp;
317 
318 	if (pol->flags & MPOL_F_STATIC_NODES)
319 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
320 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
321 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
322 	else {
323 		/*
324 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
325 		 * result
326 		 */
327 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
328 			nodes_remap(tmp, pol->v.nodes,
329 					pol->w.cpuset_mems_allowed, *nodes);
330 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
331 		} else if (step == MPOL_REBIND_STEP2) {
332 			tmp = pol->w.cpuset_mems_allowed;
333 			pol->w.cpuset_mems_allowed = *nodes;
334 		} else
335 			BUG();
336 	}
337 
338 	if (nodes_empty(tmp))
339 		tmp = *nodes;
340 
341 	if (step == MPOL_REBIND_STEP1)
342 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
343 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
344 		pol->v.nodes = tmp;
345 	else
346 		BUG();
347 
348 	if (!node_isset(current->il_next, tmp)) {
349 		current->il_next = next_node_in(current->il_next, tmp);
350 		if (current->il_next >= MAX_NUMNODES)
351 			current->il_next = numa_node_id();
352 	}
353 }
354 
355 static void mpol_rebind_preferred(struct mempolicy *pol,
356 				  const nodemask_t *nodes,
357 				  enum mpol_rebind_step step)
358 {
359 	nodemask_t tmp;
360 
361 	if (pol->flags & MPOL_F_STATIC_NODES) {
362 		int node = first_node(pol->w.user_nodemask);
363 
364 		if (node_isset(node, *nodes)) {
365 			pol->v.preferred_node = node;
366 			pol->flags &= ~MPOL_F_LOCAL;
367 		} else
368 			pol->flags |= MPOL_F_LOCAL;
369 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
370 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
371 		pol->v.preferred_node = first_node(tmp);
372 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
373 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
374 						   pol->w.cpuset_mems_allowed,
375 						   *nodes);
376 		pol->w.cpuset_mems_allowed = *nodes;
377 	}
378 }
379 
380 /*
381  * mpol_rebind_policy - Migrate a policy to a different set of nodes
382  *
383  * If the read-side task has no lock to protect task->mempolicy, the
384  * write-side task rebinds task->mempolicy in two steps. The first step
385  * sets all the new nodes, and the second step clears all the
386  * disallowed nodes. This way a reader can never find an empty nodemask
387  * to allocate from.
388  * If a lock protects task->mempolicy on the read side, we rebind
389  * directly.
390  *
391  * step:
392  * 	MPOL_REBIND_ONCE  - do rebind work at once
393  * 	MPOL_REBIND_STEP1 - set all the new nodes
394  * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
395  */
396 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
397 				enum mpol_rebind_step step)
398 {
399 	if (!pol)
400 		return;
401 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
402 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
403 		return;
404 
405 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
406 		return;
407 
408 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
409 		BUG();
410 
411 	if (step == MPOL_REBIND_STEP1)
412 		pol->flags |= MPOL_F_REBINDING;
413 	else if (step == MPOL_REBIND_STEP2)
414 		pol->flags &= ~MPOL_F_REBINDING;
415 	else if (step >= MPOL_REBIND_NSTEP)
416 		BUG();
417 
418 	mpol_ops[pol->mode].rebind(pol, newmask, step);
419 }
420 
421 /*
422  * Wrapper for mpol_rebind_policy() that just requires task
423  * pointer, and updates task mempolicy.
424  *
425  * Called with task's alloc_lock held.
426  */
427 
428 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
429 			enum mpol_rebind_step step)
430 {
431 	mpol_rebind_policy(tsk->mempolicy, new, step);
432 }
433 
434 /*
435  * Rebind each vma in mm to new nodemask.
436  *
437  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
438  */
439 
440 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
441 {
442 	struct vm_area_struct *vma;
443 
444 	down_write(&mm->mmap_sem);
445 	for (vma = mm->mmap; vma; vma = vma->vm_next)
446 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
447 	up_write(&mm->mmap_sem);
448 }
449 
450 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
451 	[MPOL_DEFAULT] = {
452 		.rebind = mpol_rebind_default,
453 	},
454 	[MPOL_INTERLEAVE] = {
455 		.create = mpol_new_interleave,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 	[MPOL_PREFERRED] = {
459 		.create = mpol_new_preferred,
460 		.rebind = mpol_rebind_preferred,
461 	},
462 	[MPOL_BIND] = {
463 		.create = mpol_new_bind,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 };
467 
468 static void migrate_page_add(struct page *page, struct list_head *pagelist,
469 				unsigned long flags);
470 
471 struct queue_pages {
472 	struct list_head *pagelist;
473 	unsigned long flags;
474 	nodemask_t *nmask;
475 	struct vm_area_struct *prev;
476 };
477 
478 /*
479  * Scan through the pages, checking whether they meet the given conditions,
480  * and move them to the pagelist if they do.
481  */
482 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
483 			unsigned long end, struct mm_walk *walk)
484 {
485 	struct vm_area_struct *vma = walk->vma;
486 	struct page *page;
487 	struct queue_pages *qp = walk->private;
488 	unsigned long flags = qp->flags;
489 	int nid, ret;
490 	pte_t *pte;
491 	spinlock_t *ptl;
492 
493 	if (pmd_trans_huge(*pmd)) {
494 		ptl = pmd_lock(walk->mm, pmd);
495 		if (pmd_trans_huge(*pmd)) {
496 			page = pmd_page(*pmd);
497 			if (is_huge_zero_page(page)) {
498 				spin_unlock(ptl);
499 				split_huge_pmd(vma, pmd, addr);
500 			} else {
501 				get_page(page);
502 				spin_unlock(ptl);
503 				lock_page(page);
504 				ret = split_huge_page(page);
505 				unlock_page(page);
506 				put_page(page);
507 				if (ret)
508 					return 0;
509 			}
510 		} else {
511 			spin_unlock(ptl);
512 		}
513 	}
514 
515 	if (pmd_trans_unstable(pmd))
516 		return 0;
517 retry:
518 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
519 	for (; addr != end; pte++, addr += PAGE_SIZE) {
520 		if (!pte_present(*pte))
521 			continue;
522 		page = vm_normal_page(vma, addr, *pte);
523 		if (!page)
524 			continue;
525 		/*
526 		 * vm_normal_page() filters out zero pages, but there might
527 		 * still be PageReserved pages to skip, perhaps in a VDSO.
528 		 */
529 		if (PageReserved(page))
530 			continue;
531 		nid = page_to_nid(page);
532 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
533 			continue;
534 		if (PageTransCompound(page)) {
535 			get_page(page);
536 			pte_unmap_unlock(pte, ptl);
537 			lock_page(page);
538 			ret = split_huge_page(page);
539 			unlock_page(page);
540 			put_page(page);
541 			/* Failed to split -- skip. */
542 			if (ret) {
543 				pte = pte_offset_map_lock(walk->mm, pmd,
544 						addr, &ptl);
545 				continue;
546 			}
547 			goto retry;
548 		}
549 
550 		migrate_page_add(page, qp->pagelist, flags);
551 	}
552 	pte_unmap_unlock(pte - 1, ptl);
553 	cond_resched();
554 	return 0;
555 }
556 
557 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
558 			       unsigned long addr, unsigned long end,
559 			       struct mm_walk *walk)
560 {
561 #ifdef CONFIG_HUGETLB_PAGE
562 	struct queue_pages *qp = walk->private;
563 	unsigned long flags = qp->flags;
564 	int nid;
565 	struct page *page;
566 	spinlock_t *ptl;
567 	pte_t entry;
568 
569 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
570 	entry = huge_ptep_get(pte);
571 	if (!pte_present(entry))
572 		goto unlock;
573 	page = pte_page(entry);
574 	nid = page_to_nid(page);
575 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
576 		goto unlock;
577 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
578 	if (flags & (MPOL_MF_MOVE_ALL) ||
579 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
580 		isolate_huge_page(page, qp->pagelist);
581 unlock:
582 	spin_unlock(ptl);
583 #else
584 	BUG();
585 #endif
586 	return 0;
587 }
588 
589 #ifdef CONFIG_NUMA_BALANCING
590 /*
591  * This is used to mark a range of virtual addresses to be inaccessible.
592  * These are later cleared by a NUMA hinting fault. Depending on these
593  * faults, pages may be migrated for better NUMA placement.
594  *
595  * This is assuming that NUMA faults are handled using PROT_NONE. If
596  * an architecture makes a different choice, it will need further
597  * changes to the core.
598  */
599 unsigned long change_prot_numa(struct vm_area_struct *vma,
600 			unsigned long addr, unsigned long end)
601 {
602 	int nr_updated;
603 
604 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
605 	if (nr_updated)
606 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
607 
608 	return nr_updated;
609 }
610 #else
611 static unsigned long change_prot_numa(struct vm_area_struct *vma,
612 			unsigned long addr, unsigned long end)
613 {
614 	return 0;
615 }
616 #endif /* CONFIG_NUMA_BALANCING */
617 
618 static int queue_pages_test_walk(unsigned long start, unsigned long end,
619 				struct mm_walk *walk)
620 {
621 	struct vm_area_struct *vma = walk->vma;
622 	struct queue_pages *qp = walk->private;
623 	unsigned long endvma = vma->vm_end;
624 	unsigned long flags = qp->flags;
625 
626 	if (!vma_migratable(vma))
627 		return 1;
628 
629 	if (endvma > end)
630 		endvma = end;
631 	if (vma->vm_start > start)
632 		start = vma->vm_start;
633 
634 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
635 		if (!vma->vm_next && vma->vm_end < end)
636 			return -EFAULT;
637 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
638 			return -EFAULT;
639 	}
640 
641 	qp->prev = vma;
642 
643 	if (flags & MPOL_MF_LAZY) {
644 		/* Similar to task_numa_work, skip inaccessible VMAs */
645 		if (!is_vm_hugetlb_page(vma) &&
646 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
647 			!(vma->vm_flags & VM_MIXEDMAP))
648 			change_prot_numa(vma, start, endvma);
649 		return 1;
650 	}
651 
652 	/* queue pages from current vma */
653 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
654 		return 0;
655 	return 1;
656 }
657 
658 /*
659  * Walk through page tables and collect pages to be migrated.
660  *
661  * If pages found in a given range are on a set of nodes (determined by
662  * @nodes and @flags), they are isolated and queued to the pagelist
663  * passed via @pagelist.
664  */
665 static int
666 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
667 		nodemask_t *nodes, unsigned long flags,
668 		struct list_head *pagelist)
669 {
670 	struct queue_pages qp = {
671 		.pagelist = pagelist,
672 		.flags = flags,
673 		.nmask = nodes,
674 		.prev = NULL,
675 	};
676 	struct mm_walk queue_pages_walk = {
677 		.hugetlb_entry = queue_pages_hugetlb,
678 		.pmd_entry = queue_pages_pte_range,
679 		.test_walk = queue_pages_test_walk,
680 		.mm = mm,
681 		.private = &qp,
682 	};
683 
684 	return walk_page_range(start, end, &queue_pages_walk);
685 }
686 
687 /*
688  * Apply policy to a single VMA
689  * This must be called with the mmap_sem held for writing.
690  */
691 static int vma_replace_policy(struct vm_area_struct *vma,
692 						struct mempolicy *pol)
693 {
694 	int err;
695 	struct mempolicy *old;
696 	struct mempolicy *new;
697 
698 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
699 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
700 		 vma->vm_ops, vma->vm_file,
701 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
702 
703 	new = mpol_dup(pol);
704 	if (IS_ERR(new))
705 		return PTR_ERR(new);
706 
707 	if (vma->vm_ops && vma->vm_ops->set_policy) {
708 		err = vma->vm_ops->set_policy(vma, new);
709 		if (err)
710 			goto err_out;
711 	}
712 
713 	old = vma->vm_policy;
714 	vma->vm_policy = new; /* protected by mmap_sem */
715 	mpol_put(old);
716 
717 	return 0;
718  err_out:
719 	mpol_put(new);
720 	return err;
721 }
722 
723 /* Step 2: apply policy to a range and do splits. */
724 static int mbind_range(struct mm_struct *mm, unsigned long start,
725 		       unsigned long end, struct mempolicy *new_pol)
726 {
727 	struct vm_area_struct *next;
728 	struct vm_area_struct *prev;
729 	struct vm_area_struct *vma;
730 	int err = 0;
731 	pgoff_t pgoff;
732 	unsigned long vmstart;
733 	unsigned long vmend;
734 
735 	vma = find_vma(mm, start);
736 	if (!vma || vma->vm_start > start)
737 		return -EFAULT;
738 
739 	prev = vma->vm_prev;
740 	if (start > vma->vm_start)
741 		prev = vma;
742 
743 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
744 		next = vma->vm_next;
745 		vmstart = max(start, vma->vm_start);
746 		vmend   = min(end, vma->vm_end);
747 
748 		if (mpol_equal(vma_policy(vma), new_pol))
749 			continue;
750 
751 		pgoff = vma->vm_pgoff +
752 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
753 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
754 				 vma->anon_vma, vma->vm_file, pgoff,
755 				 new_pol, vma->vm_userfaultfd_ctx,
756 				 vma_get_anon_name(vma));
757 		if (prev) {
758 			vma = prev;
759 			next = vma->vm_next;
760 			if (mpol_equal(vma_policy(vma), new_pol))
761 				continue;
762 			/* vma_merge() joined vma && vma->next, case 8 */
763 			goto replace;
764 		}
765 		if (vma->vm_start != vmstart) {
766 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
767 			if (err)
768 				goto out;
769 		}
770 		if (vma->vm_end != vmend) {
771 			err = split_vma(vma->vm_mm, vma, vmend, 0);
772 			if (err)
773 				goto out;
774 		}
775  replace:
776 		err = vma_replace_policy(vma, new_pol);
777 		if (err)
778 			goto out;
779 	}
780 
781  out:
782 	return err;
783 }
784 
785 /* Set the process memory policy */
786 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
787 			     nodemask_t *nodes)
788 {
789 	struct mempolicy *new, *old;
790 	NODEMASK_SCRATCH(scratch);
791 	int ret;
792 
793 	if (!scratch)
794 		return -ENOMEM;
795 
796 	new = mpol_new(mode, flags, nodes);
797 	if (IS_ERR(new)) {
798 		ret = PTR_ERR(new);
799 		goto out;
800 	}
801 
802 	task_lock(current);
803 	ret = mpol_set_nodemask(new, nodes, scratch);
804 	if (ret) {
805 		task_unlock(current);
806 		mpol_put(new);
807 		goto out;
808 	}
809 	old = current->mempolicy;
810 	current->mempolicy = new;
811 	if (new && new->mode == MPOL_INTERLEAVE &&
812 	    nodes_weight(new->v.nodes))
813 		current->il_next = first_node(new->v.nodes);
814 	task_unlock(current);
815 	mpol_put(old);
816 	ret = 0;
817 out:
818 	NODEMASK_SCRATCH_FREE(scratch);
819 	return ret;
820 }
821 
822 /*
823  * Return nodemask for policy for get_mempolicy() query
824  *
825  * Called with task's alloc_lock held
826  */
827 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
828 {
829 	nodes_clear(*nodes);
830 	if (p == &default_policy)
831 		return;
832 
833 	switch (p->mode) {
834 	case MPOL_BIND:
835 		/* Fall through */
836 	case MPOL_INTERLEAVE:
837 		*nodes = p->v.nodes;
838 		break;
839 	case MPOL_PREFERRED:
840 		if (!(p->flags & MPOL_F_LOCAL))
841 			node_set(p->v.preferred_node, *nodes);
842 		/* else return empty node mask for local allocation */
843 		break;
844 	default:
845 		BUG();
846 	}
847 }
848 
849 static int lookup_node(unsigned long addr)
850 {
851 	struct page *p;
852 	int err;
853 
854 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
855 	if (err >= 0) {
856 		err = page_to_nid(p);
857 		put_page(p);
858 	}
859 	return err;
860 }
861 
862 /* Retrieve NUMA policy */
863 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
864 			     unsigned long addr, unsigned long flags)
865 {
866 	int err;
867 	struct mm_struct *mm = current->mm;
868 	struct vm_area_struct *vma = NULL;
869 	struct mempolicy *pol = current->mempolicy;
870 
871 	if (flags &
872 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
873 		return -EINVAL;
874 
875 	if (flags & MPOL_F_MEMS_ALLOWED) {
876 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
877 			return -EINVAL;
878 		*policy = 0;	/* just so it's initialized */
879 		task_lock(current);
880 		*nmask  = cpuset_current_mems_allowed;
881 		task_unlock(current);
882 		return 0;
883 	}
884 
885 	if (flags & MPOL_F_ADDR) {
886 		/*
887 		 * Do NOT fall back to task policy if the
888 		 * vma/shared policy at addr is NULL.  We
889 		 * want to return MPOL_DEFAULT in this case.
890 		 */
891 		down_read(&mm->mmap_sem);
892 		vma = find_vma_intersection(mm, addr, addr+1);
893 		if (!vma) {
894 			up_read(&mm->mmap_sem);
895 			return -EFAULT;
896 		}
897 		if (vma->vm_ops && vma->vm_ops->get_policy)
898 			pol = vma->vm_ops->get_policy(vma, addr);
899 		else
900 			pol = vma->vm_policy;
901 	} else if (addr)
902 		return -EINVAL;
903 
904 	if (!pol)
905 		pol = &default_policy;	/* indicates default behavior */
906 
907 	if (flags & MPOL_F_NODE) {
908 		if (flags & MPOL_F_ADDR) {
909 			err = lookup_node(addr);
910 			if (err < 0)
911 				goto out;
912 			*policy = err;
913 		} else if (pol == current->mempolicy &&
914 				pol->mode == MPOL_INTERLEAVE) {
915 			*policy = current->il_next;
916 		} else {
917 			err = -EINVAL;
918 			goto out;
919 		}
920 	} else {
921 		*policy = pol == &default_policy ? MPOL_DEFAULT :
922 						pol->mode;
923 		/*
924 		 * Internal mempolicy flags must be masked off before exposing
925 		 * the policy to userspace.
926 		 */
927 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
928 	}
929 
930 	err = 0;
931 	if (nmask) {
932 		if (mpol_store_user_nodemask(pol)) {
933 			*nmask = pol->w.user_nodemask;
934 		} else {
935 			task_lock(current);
936 			get_policy_nodemask(pol, nmask);
937 			task_unlock(current);
938 		}
939 	}
940 
941  out:
942 	mpol_cond_put(pol);
943 	if (vma)
944 		up_read(&current->mm->mmap_sem);
945 	return err;
946 }
947 
948 #ifdef CONFIG_MIGRATION
949 /*
950  * page migration
951  */
952 static void migrate_page_add(struct page *page, struct list_head *pagelist,
953 				unsigned long flags)
954 {
955 	/*
956 	 * Avoid migrating a page that is shared with others.
957 	 */
958 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
959 		if (!isolate_lru_page(page)) {
960 			list_add_tail(&page->lru, pagelist);
961 			inc_node_page_state(page, NR_ISOLATED_ANON +
962 					    page_is_file_cache(page));
963 		}
964 	}
965 }
966 
967 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
968 {
969 	if (PageHuge(page))
970 		return alloc_huge_page_node(page_hstate(compound_head(page)),
971 					node);
972 	else
973 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
974 						    __GFP_THISNODE, 0);
975 }
976 
977 /*
978  * Migrate pages from one node to a target node.
979  * Returns error or the number of pages not migrated.
980  */
981 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
982 			   int flags)
983 {
984 	nodemask_t nmask;
985 	LIST_HEAD(pagelist);
986 	int err = 0;
987 
988 	nodes_clear(nmask);
989 	node_set(source, nmask);
990 
991 	/*
992 	 * This does not "check" the range but isolates all pages that
993 	 * need migration.  Between passing in the full user address
994 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
995 	 */
996 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
997 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
998 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
999 
1000 	if (!list_empty(&pagelist)) {
1001 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1002 					MIGRATE_SYNC, MR_SYSCALL);
1003 		if (err)
1004 			putback_movable_pages(&pagelist);
1005 	}
1006 
1007 	return err;
1008 }
1009 
1010 /*
1011  * Move pages between the two nodesets so as to preserve the physical
1012  * layout as much as possible.
1013  *
1014  * Returns the number of pages that could not be moved.
1015  */
1016 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1017 		     const nodemask_t *to, int flags)
1018 {
1019 	int busy = 0;
1020 	int err;
1021 	nodemask_t tmp;
1022 
1023 	err = migrate_prep();
1024 	if (err)
1025 		return err;
1026 
1027 	down_read(&mm->mmap_sem);
1028 
1029 	/*
1030 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1031 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1032 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1033 	 * The pair of nodemasks 'to' and 'from' define the map.
1034 	 *
1035 	 * If no pair of bits is found that way, fallback to picking some
1036 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1037 	 * 'source' and 'dest' bits are the same, this represents a node
1038 	 * that will be migrating to itself, so no pages need move.
1039 	 *
1040 	 * If no bits are left in 'tmp', or if all remaining bits left
1041 	 * in 'tmp' correspond to the same bit in 'to', return false
1042 	 * (nothing left to migrate).
1043 	 *
1044 	 * This lets us pick a pair of nodes to migrate between, such that
1045 	 * if possible the dest node is not already occupied by some other
1046 	 * source node, minimizing the risk of overloading the memory on a
1047 	 * node that would happen if we migrated incoming memory to a node
1048 	 * before migrating outgoing memory from that same node.
1049 	 *
1050 	 * A single scan of tmp is sufficient.  As we go, we remember the
1051 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1052 	 * that not only moved, but what's better, moved to an empty slot
1053 	 * (d is not set in tmp), then we break out then, with that pair.
1054 	 * Otherwise, when we finish scanning tmp, we at least have the
1055 	 * most recent <s, d> pair that moved.  If we get all the way through
1056 	 * the scan of tmp without finding any node that moved, much less
1057 	 * moved to an empty node, then there is nothing left worth migrating.
1058 	 */
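	/*
	 * For instance, with hypothetical masks from = {0,1} and to = {1,2}:
	 * the first scan remembers 0 -> 1 but keeps looking because node 1 is
	 * still a source in tmp, then picks 1 -> 2 (an empty slot).  After
	 * node 1 is cleared from tmp, the second scan picks 0 -> 1.  Migrating
	 * in that order never piles two nodes' worth of memory onto one node
	 * at the same time.
	 */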
1059 
1060 	tmp = *from;
1061 	while (!nodes_empty(tmp)) {
1062 		int s,d;
1063 		int source = NUMA_NO_NODE;
1064 		int dest = 0;
1065 
1066 		for_each_node_mask(s, tmp) {
1067 
1068 			/*
1069 			 * do_migrate_pages() tries to maintain the relative
1070 			 * node relationship of the pages established between
1071 			 * threads and memory areas.
1072 			 *
1073 			 * However, if the number of source nodes is not equal to
1074 			 * the number of destination nodes, we cannot preserve
1075 			 * this relative node relationship.  In that case, skip
1076 			 * copying memory from a node that is in the destination
1077 			 * mask.
1078 			 *
1079 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1080 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1081 			 */
1082 
1083 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1084 						(node_isset(s, *to)))
1085 				continue;
1086 
1087 			d = node_remap(s, *from, *to);
1088 			if (s == d)
1089 				continue;
1090 
1091 			source = s;	/* Node moved. Memorize */
1092 			dest = d;
1093 
1094 			/* dest not in remaining from nodes? */
1095 			if (!node_isset(dest, tmp))
1096 				break;
1097 		}
1098 		if (source == NUMA_NO_NODE)
1099 			break;
1100 
1101 		node_clear(source, tmp);
1102 		err = migrate_to_node(mm, source, dest, flags);
1103 		if (err > 0)
1104 			busy += err;
1105 		if (err < 0)
1106 			break;
1107 	}
1108 	up_read(&mm->mmap_sem);
1109 	if (err < 0)
1110 		return err;
1111 	return busy;
1112 
1113 }
1114 
1115 /*
1116  * Allocate a new page for page migration based on vma policy.
1117  * Start by assuming the page is mapped by the same vma as contains @start.
1118  * Search forward from there, if not.  N.B., this assumes that the
1119  * list of pages handed to migrate_pages()--which is how we get here--
1120  * is in virtual address order.
1121  */
1122 static struct page *new_page(struct page *page, unsigned long start, int **x)
1123 {
1124 	struct vm_area_struct *vma;
1125 	unsigned long uninitialized_var(address);
1126 
1127 	vma = find_vma(current->mm, start);
1128 	while (vma) {
1129 		address = page_address_in_vma(page, vma);
1130 		if (address != -EFAULT)
1131 			break;
1132 		vma = vma->vm_next;
1133 	}
1134 
1135 	if (PageHuge(page)) {
1136 		BUG_ON(!vma);
1137 		return alloc_huge_page_noerr(vma, address, 1);
1138 	}
1139 	/*
1140 	 * if !vma, alloc_page_vma() will use task or system default policy
1141 	 */
1142 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1143 }
1144 #else
1145 
1146 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1147 				unsigned long flags)
1148 {
1149 }
1150 
1151 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1152 		     const nodemask_t *to, int flags)
1153 {
1154 	return -ENOSYS;
1155 }
1156 
1157 static struct page *new_page(struct page *page, unsigned long start, int **x)
1158 {
1159 	return NULL;
1160 }
1161 #endif
1162 
1163 static long do_mbind(unsigned long start, unsigned long len,
1164 		     unsigned short mode, unsigned short mode_flags,
1165 		     nodemask_t *nmask, unsigned long flags)
1166 {
1167 	struct mm_struct *mm = current->mm;
1168 	struct mempolicy *new;
1169 	unsigned long end;
1170 	int err;
1171 	LIST_HEAD(pagelist);
1172 
1173 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1174 		return -EINVAL;
1175 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1176 		return -EPERM;
1177 
1178 	if (start & ~PAGE_MASK)
1179 		return -EINVAL;
1180 
1181 	if (mode == MPOL_DEFAULT)
1182 		flags &= ~MPOL_MF_STRICT;
1183 
1184 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1185 	end = start + len;
1186 
1187 	if (end < start)
1188 		return -EINVAL;
1189 	if (end == start)
1190 		return 0;
1191 
1192 	new = mpol_new(mode, mode_flags, nmask);
1193 	if (IS_ERR(new))
1194 		return PTR_ERR(new);
1195 
1196 	if (flags & MPOL_MF_LAZY)
1197 		new->flags |= MPOL_F_MOF;
1198 
1199 	/*
1200 	 * If we are using the default policy then operation
1201 	 * on discontinuous address spaces is okay after all
1202 	 */
1203 	if (!new)
1204 		flags |= MPOL_MF_DISCONTIG_OK;
1205 
1206 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1207 		 start, start + len, mode, mode_flags,
1208 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1209 
1210 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1211 
1212 		err = migrate_prep();
1213 		if (err)
1214 			goto mpol_out;
1215 	}
1216 	{
1217 		NODEMASK_SCRATCH(scratch);
1218 		if (scratch) {
1219 			down_write(&mm->mmap_sem);
1220 			task_lock(current);
1221 			err = mpol_set_nodemask(new, nmask, scratch);
1222 			task_unlock(current);
1223 			if (err)
1224 				up_write(&mm->mmap_sem);
1225 		} else
1226 			err = -ENOMEM;
1227 		NODEMASK_SCRATCH_FREE(scratch);
1228 	}
1229 	if (err)
1230 		goto mpol_out;
1231 
1232 	err = queue_pages_range(mm, start, end, nmask,
1233 			  flags | MPOL_MF_INVERT, &pagelist);
1234 	if (!err)
1235 		err = mbind_range(mm, start, end, new);
1236 
1237 	if (!err) {
1238 		int nr_failed = 0;
1239 
1240 		if (!list_empty(&pagelist)) {
1241 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1242 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1243 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1244 			if (nr_failed)
1245 				putback_movable_pages(&pagelist);
1246 		}
1247 
1248 		if (nr_failed && (flags & MPOL_MF_STRICT))
1249 			err = -EIO;
1250 	} else
1251 		putback_movable_pages(&pagelist);
1252 
1253 	up_write(&mm->mmap_sem);
1254  mpol_out:
1255 	mpol_put(new);
1256 	return err;
1257 }
1258 
1259 /*
1260  * User space interface with variable sized bitmaps for nodelists.
1261  */
1262 
1263 /* Copy a node mask from user space. */
1264 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1265 		     unsigned long maxnode)
1266 {
1267 	unsigned long k;
1268 	unsigned long nlongs;
1269 	unsigned long endmask;
1270 
1271 	--maxnode;
1272 	nodes_clear(*nodes);
1273 	if (maxnode == 0 || !nmask)
1274 		return 0;
1275 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1276 		return -EINVAL;
1277 
1278 	nlongs = BITS_TO_LONGS(maxnode);
1279 	if ((maxnode % BITS_PER_LONG) == 0)
1280 		endmask = ~0UL;
1281 	else
1282 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1283 
1284 	/* When the user specified more nodes than supported, just check
1285 	   that the unsupported part is all zero. */
1286 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1287 		if (nlongs > PAGE_SIZE/sizeof(long))
1288 			return -EINVAL;
1289 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1290 			unsigned long t;
1291 			if (get_user(t, nmask + k))
1292 				return -EFAULT;
1293 			if (k == nlongs - 1) {
1294 				if (t & endmask)
1295 					return -EINVAL;
1296 			} else if (t)
1297 				return -EINVAL;
1298 		}
1299 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1300 		endmask = ~0UL;
1301 	}
1302 
1303 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1304 		return -EFAULT;
1305 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1306 	return 0;
1307 }
1308 
1309 /* Copy a kernel node mask to user space */
1310 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1311 			      nodemask_t *nodes)
1312 {
1313 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1314 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1315 
1316 	if (copy > nbytes) {
1317 		if (copy > PAGE_SIZE)
1318 			return -EINVAL;
1319 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1320 			return -EFAULT;
1321 		copy = nbytes;
1322 	}
1323 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1324 }
1325 
1326 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1327 		unsigned long, mode, const unsigned long __user *, nmask,
1328 		unsigned long, maxnode, unsigned, flags)
1329 {
1330 	nodemask_t nodes;
1331 	int err;
1332 	unsigned short mode_flags;
1333 
1334 	mode_flags = mode & MPOL_MODE_FLAGS;
1335 	mode &= ~MPOL_MODE_FLAGS;
1336 	if (mode >= MPOL_MAX)
1337 		return -EINVAL;
1338 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1339 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1340 		return -EINVAL;
1341 	err = get_nodes(&nodes, nmask, maxnode);
1342 	if (err)
1343 		return err;
1344 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1345 }
1346 
1347 /* Set the process memory policy */
1348 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1349 		unsigned long, maxnode)
1350 {
1351 	int err;
1352 	nodemask_t nodes;
1353 	unsigned short flags;
1354 
1355 	flags = mode & MPOL_MODE_FLAGS;
1356 	mode &= ~MPOL_MODE_FLAGS;
1357 	if ((unsigned int)mode >= MPOL_MAX)
1358 		return -EINVAL;
1359 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1360 		return -EINVAL;
1361 	err = get_nodes(&nodes, nmask, maxnode);
1362 	if (err)
1363 		return err;
1364 	return do_set_mempolicy(mode, flags, &nodes);
1365 }
1366 
1367 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1368 		const unsigned long __user *, old_nodes,
1369 		const unsigned long __user *, new_nodes)
1370 {
1371 	const struct cred *cred = current_cred(), *tcred;
1372 	struct mm_struct *mm = NULL;
1373 	struct task_struct *task;
1374 	nodemask_t task_nodes;
1375 	int err;
1376 	nodemask_t *old;
1377 	nodemask_t *new;
1378 	NODEMASK_SCRATCH(scratch);
1379 
1380 	if (!scratch)
1381 		return -ENOMEM;
1382 
1383 	old = &scratch->mask1;
1384 	new = &scratch->mask2;
1385 
1386 	err = get_nodes(old, old_nodes, maxnode);
1387 	if (err)
1388 		goto out;
1389 
1390 	err = get_nodes(new, new_nodes, maxnode);
1391 	if (err)
1392 		goto out;
1393 
1394 	/* Find the mm_struct */
1395 	rcu_read_lock();
1396 	task = pid ? find_task_by_vpid(pid) : current;
1397 	if (!task) {
1398 		rcu_read_unlock();
1399 		err = -ESRCH;
1400 		goto out;
1401 	}
1402 	get_task_struct(task);
1403 
1404 	err = -EINVAL;
1405 
1406 	/*
1407 	 * Check if this process has the right to modify the specified
1408 	 * process. The right exists if the process has administrative
1409 	 * capabilities, superuser privileges or the same
1410 	 * userid as the target process.
1411 	 */
1412 	tcred = __task_cred(task);
1413 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1414 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1415 	    !capable(CAP_SYS_NICE)) {
1416 		rcu_read_unlock();
1417 		err = -EPERM;
1418 		goto out_put;
1419 	}
1420 	rcu_read_unlock();
1421 
1422 	task_nodes = cpuset_mems_allowed(task);
1423 	/* Is the user allowed to access the target nodes? */
1424 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1425 		err = -EPERM;
1426 		goto out_put;
1427 	}
1428 
1429 	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1430 		err = -EINVAL;
1431 		goto out_put;
1432 	}
1433 
1434 	err = security_task_movememory(task);
1435 	if (err)
1436 		goto out_put;
1437 
1438 	mm = get_task_mm(task);
1439 	put_task_struct(task);
1440 
1441 	if (!mm) {
1442 		err = -EINVAL;
1443 		goto out;
1444 	}
1445 
1446 	err = do_migrate_pages(mm, old, new,
1447 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1448 
1449 	mmput(mm);
1450 out:
1451 	NODEMASK_SCRATCH_FREE(scratch);
1452 
1453 	return err;
1454 
1455 out_put:
1456 	put_task_struct(task);
1457 	goto out;
1458 
1459 }
1460 
1461 
1462 /* Retrieve NUMA policy */
1463 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1464 		unsigned long __user *, nmask, unsigned long, maxnode,
1465 		unsigned long, addr, unsigned long, flags)
1466 {
1467 	int err;
1468 	int uninitialized_var(pval);
1469 	nodemask_t nodes;
1470 
1471 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1472 		return -EINVAL;
1473 
1474 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1475 
1476 	if (err)
1477 		return err;
1478 
1479 	if (policy && put_user(pval, policy))
1480 		return -EFAULT;
1481 
1482 	if (nmask)
1483 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1484 
1485 	return err;
1486 }
1487 
1488 #ifdef CONFIG_COMPAT
1489 
1490 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1491 		       compat_ulong_t __user *, nmask,
1492 		       compat_ulong_t, maxnode,
1493 		       compat_ulong_t, addr, compat_ulong_t, flags)
1494 {
1495 	long err;
1496 	unsigned long __user *nm = NULL;
1497 	unsigned long nr_bits, alloc_size;
1498 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1499 
1500 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1501 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1502 
1503 	if (nmask)
1504 		nm = compat_alloc_user_space(alloc_size);
1505 
1506 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1507 
1508 	if (!err && nmask) {
1509 		unsigned long copy_size;
1510 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1511 		err = copy_from_user(bm, nm, copy_size);
1512 		/* ensure entire bitmap is zeroed */
1513 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1514 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1515 	}
1516 
1517 	return err;
1518 }
1519 
1520 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1521 		       compat_ulong_t, maxnode)
1522 {
1523 	unsigned long __user *nm = NULL;
1524 	unsigned long nr_bits, alloc_size;
1525 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1526 
1527 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1528 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1529 
1530 	if (nmask) {
1531 		if (compat_get_bitmap(bm, nmask, nr_bits))
1532 			return -EFAULT;
1533 		nm = compat_alloc_user_space(alloc_size);
1534 		if (copy_to_user(nm, bm, alloc_size))
1535 			return -EFAULT;
1536 	}
1537 
1538 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1539 }
1540 
1541 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1542 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1543 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1544 {
1545 	unsigned long __user *nm = NULL;
1546 	unsigned long nr_bits, alloc_size;
1547 	nodemask_t bm;
1548 
1549 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1550 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1551 
1552 	if (nmask) {
1553 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1554 			return -EFAULT;
1555 		nm = compat_alloc_user_space(alloc_size);
1556 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1557 			return -EFAULT;
1558 	}
1559 
1560 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1561 }
1562 
1563 #endif
1564 
1565 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1566 						unsigned long addr)
1567 {
1568 	struct mempolicy *pol = NULL;
1569 
1570 	if (vma) {
1571 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1572 			pol = vma->vm_ops->get_policy(vma, addr);
1573 		} else if (vma->vm_policy) {
1574 			pol = vma->vm_policy;
1575 
1576 			/*
1577 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1578 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1579 			 * count on these policies which will be dropped by
1580 			 * mpol_cond_put() later
1581 			 */
1582 			if (mpol_needs_cond_ref(pol))
1583 				mpol_get(pol);
1584 		}
1585 	}
1586 
1587 	return pol;
1588 }
1589 
1590 /*
1591  * get_vma_policy(@vma, @addr)
1592  * @vma: virtual memory area whose policy is sought
1593  * @addr: address in @vma for shared policy lookup
1594  *
1595  * Returns effective policy for a VMA at specified address.
1596  * Falls back to current->mempolicy or system default policy, as necessary.
1597  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1598  * count--added by the get_policy() vm_op, as appropriate--to protect against
1599  * freeing by another task.  It is the caller's responsibility to free the
1600  * extra reference for shared policies.
1601  */
1602 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1603 						unsigned long addr)
1604 {
1605 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1606 
1607 	if (!pol)
1608 		pol = get_task_policy(current);
1609 
1610 	return pol;
1611 }
1612 
1613 bool vma_policy_mof(struct vm_area_struct *vma)
1614 {
1615 	struct mempolicy *pol;
1616 
1617 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1618 		bool ret = false;
1619 
1620 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1621 		if (pol && (pol->flags & MPOL_F_MOF))
1622 			ret = true;
1623 		mpol_cond_put(pol);
1624 
1625 		return ret;
1626 	}
1627 
1628 	pol = vma->vm_policy;
1629 	if (!pol)
1630 		pol = get_task_policy(current);
1631 
1632 	return pol->flags & MPOL_F_MOF;
1633 }
1634 
1635 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1636 {
1637 	enum zone_type dynamic_policy_zone = policy_zone;
1638 
1639 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1640 
1641 	/*
1642 	 * If policy->v.nodes has movable memory only,
1643 	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1644 	 *
1645 	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1646 	 * so if the following test fails, it implies that
1647 	 * policy->v.nodes has movable memory only.
1648 	 */
1649 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1650 		dynamic_policy_zone = ZONE_MOVABLE;
1651 
1652 	return zone >= dynamic_policy_zone;
1653 }
1654 
1655 /*
1656  * Return a nodemask representing a mempolicy for filtering nodes for
1657  * page allocation
1658  */
1659 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1660 {
1661 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1662 	if (unlikely(policy->mode == MPOL_BIND) &&
1663 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1664 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1665 		return &policy->v.nodes;
1666 
1667 	return NULL;
1668 }
1669 
1670 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1671 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1672 	int nd)
1673 {
1674 	switch (policy->mode) {
1675 	case MPOL_PREFERRED:
1676 		if (!(policy->flags & MPOL_F_LOCAL))
1677 			nd = policy->v.preferred_node;
1678 		break;
1679 	case MPOL_BIND:
1680 		/*
1681 		 * Normally, MPOL_BIND allocations are node-local within the
1682 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1683 		 * current node isn't part of the mask, we use the zonelist for
1684 		 * the first node in the mask instead.
1685 		 */
1686 		if (unlikely(gfp & __GFP_THISNODE) &&
1687 				unlikely(!node_isset(nd, policy->v.nodes)))
1688 			nd = first_node(policy->v.nodes);
1689 		break;
1690 	default:
1691 		BUG();
1692 	}
1693 	return node_zonelist(nd, gfp);
1694 }
1695 
1696 /* Do dynamic interleaving for a process */
1697 static unsigned interleave_nodes(struct mempolicy *policy)
1698 {
1699 	unsigned nid, next;
1700 	struct task_struct *me = current;
1701 
1702 	nid = me->il_next;
1703 	next = next_node_in(nid, policy->v.nodes);
1704 	if (next < MAX_NUMNODES)
1705 		me->il_next = next;
1706 	return nid;
1707 }
1708 
1709 /*
1710  * Depending on the memory policy provide a node from which to allocate the
1711  * next slab entry.
1712  */
1713 unsigned int mempolicy_slab_node(void)
1714 {
1715 	struct mempolicy *policy;
1716 	int node = numa_mem_id();
1717 
1718 	if (in_interrupt())
1719 		return node;
1720 
1721 	policy = current->mempolicy;
1722 	if (!policy || policy->flags & MPOL_F_LOCAL)
1723 		return node;
1724 
1725 	switch (policy->mode) {
1726 	case MPOL_PREFERRED:
1727 		/*
1728 		 * handled MPOL_F_LOCAL above
1729 		 */
1730 		return policy->v.preferred_node;
1731 
1732 	case MPOL_INTERLEAVE:
1733 		return interleave_nodes(policy);
1734 
1735 	case MPOL_BIND: {
1736 		struct zoneref *z;
1737 
1738 		/*
1739 		 * Follow bind policy behavior and start allocation at the
1740 		 * first node.
1741 		 */
1742 		struct zonelist *zonelist;
1743 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1744 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1745 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1746 							&policy->v.nodes);
1747 		return z->zone ? z->zone->node : node;
1748 	}
1749 
1750 	default:
1751 		BUG();
1752 	}
1753 }
1754 
1755 /*
1756  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1757  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1758  * number of present nodes.
1759  */
1760 static unsigned offset_il_node(struct mempolicy *pol,
1761 			       struct vm_area_struct *vma, unsigned long n)
1762 {
1763 	unsigned nnodes = nodes_weight(pol->v.nodes);
1764 	unsigned target;
1765 	int i;
1766 	int nid;
1767 
1768 	if (!nnodes)
1769 		return numa_node_id();
1770 	target = (unsigned int)n % nnodes;
1771 	nid = first_node(pol->v.nodes);
1772 	for (i = 0; i < target; i++)
1773 		nid = next_node(nid, pol->v.nodes);
1774 	return nid;
1775 }
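/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} we have
 * nnodes = 3; for n = 7, target = 7 % 3 = 1, so the walk starts at node 0
 * and advances once, returning node 2.
 */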
1776 
1777 /* Determine a node number for interleave */
1778 static inline unsigned interleave_nid(struct mempolicy *pol,
1779 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1780 {
1781 	if (vma) {
1782 		unsigned long off;
1783 
1784 		/*
1785 		 * for small pages, there is no difference between
1786 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1787 		 * for huge pages, since vm_pgoff is in units of small
1788 		 * pages, we need to shift off the always 0 bits to get
1789 		 * a useful offset.
1790 		 */
1791 		BUG_ON(shift < PAGE_SHIFT);
1792 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1793 		off += (addr - vma->vm_start) >> shift;
1794 		return offset_il_node(pol, vma, off);
1795 	} else
1796 		return interleave_nodes(pol);
1797 }
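/*
 * Worked example (illustrative, assuming x86-64 with 2MB huge pages):
 * shift = 21 and PAGE_SHIFT = 12, so off = vm_pgoff >> 9 converts the
 * mapping's start offset from 4KB units into huge-page units, and
 * (addr - vma->vm_start) >> 21 adds the huge-page index within the VMA.
 */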
1798 
1799 #ifdef CONFIG_HUGETLBFS
1800 /*
1801  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1802  * @vma: virtual memory area whose policy is sought
1803  * @addr: address in @vma for shared policy lookup and interleave policy
1804  * @gfp_flags: for requested zone
1805  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1806  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1807  *
1808  * Returns a zonelist suitable for a huge page allocation and a pointer
1809  * to the struct mempolicy for conditional unref after allocation.
1810  * If the effective policy is 'bind', returns a pointer to the mempolicy's
1811  * @nodemask for filtering the zonelist.
1812  *
1813  * Must be protected by read_mems_allowed_begin()
1814  */
1815 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1816 				gfp_t gfp_flags, struct mempolicy **mpol,
1817 				nodemask_t **nodemask)
1818 {
1819 	struct zonelist *zl;
1820 
1821 	*mpol = get_vma_policy(vma, addr);
1822 	*nodemask = NULL;	/* assume !MPOL_BIND */
1823 
1824 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1825 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1826 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1827 	} else {
1828 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1829 		if ((*mpol)->mode == MPOL_BIND)
1830 			*nodemask = &(*mpol)->v.nodes;
1831 	}
1832 	return zl;
1833 }
1834 
1835 /*
1836  * init_nodemask_of_mempolicy
1837  *
1838  * If the current task's mempolicy is "default" [NULL], return 'false'
1839  * to indicate default policy.  Otherwise, extract the policy nodemask
1840  * for 'bind' or 'interleave' policy into the argument nodemask, or
1841  * initialize the argument nodemask to contain the single node for
1842  * 'preferred' or 'local' policy and return 'true' to indicate presence
1843  * of non-default mempolicy.
1844  *
1845  * We don't bother with reference counting the mempolicy [mpol_get/put]
1846  * because the current task is examining its own mempolicy and a task's
1847  * mempolicy is only ever changed by the task itself.
1848  *
1849  * N.B., it is the caller's responsibility to free a returned nodemask.
1850  */
1851 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1852 {
1853 	struct mempolicy *mempolicy;
1854 	int nid;
1855 
1856 	if (!(mask && current->mempolicy))
1857 		return false;
1858 
1859 	task_lock(current);
1860 	mempolicy = current->mempolicy;
1861 	switch (mempolicy->mode) {
1862 	case MPOL_PREFERRED:
1863 		if (mempolicy->flags & MPOL_F_LOCAL)
1864 			nid = numa_node_id();
1865 		else
1866 			nid = mempolicy->v.preferred_node;
1867 		init_nodemask_of_node(mask, nid);
1868 		break;
1869 
1870 	case MPOL_BIND:
1871 		/* Fall through */
1872 	case MPOL_INTERLEAVE:
1873 		*mask = mempolicy->v.nodes;
1874 		break;
1875 
1876 	default:
1877 		BUG();
1878 	}
1879 	task_unlock(current);
1880 
1881 	return true;
1882 }
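/*
 * Illustrative caller sketch (an assumption, not taken from this file):
 * hugetlb-style code could use this helper to decide which nodes to spread
 * persistent huge pages over, roughly:
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];   (default policy)
 *	}
 */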
1883 #endif
1884 
1885 /*
1886  * mempolicy_nodemask_intersects
1887  *
1888  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1889  * policy.  Otherwise, check for intersection between mask and the policy
1890  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1891  * policy, always return true since it may allocate elsewhere on fallback.
1892  *
1893  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1894  */
1895 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1896 					const nodemask_t *mask)
1897 {
1898 	struct mempolicy *mempolicy;
1899 	bool ret = true;
1900 
1901 	if (!mask)
1902 		return ret;
1903 	task_lock(tsk);
1904 	mempolicy = tsk->mempolicy;
1905 	if (!mempolicy)
1906 		goto out;
1907 
1908 	switch (mempolicy->mode) {
1909 	case MPOL_PREFERRED:
1910 		/*
1911 		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1912 		 * to allocate from; the task may fall back to other nodes on OOM.
1913 		 * Thus, it's possible for tsk to have allocated memory from
1914 		 * nodes in mask.
1915 		 */
1916 		break;
1917 	case MPOL_BIND:
1918 	case MPOL_INTERLEAVE:
1919 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1920 		break;
1921 	default:
1922 		BUG();
1923 	}
1924 out:
1925 	task_unlock(tsk);
1926 	return ret;
1927 }
1928 
1929 /* Allocate a page in interleaved policy.
1930    Own path because it needs to do special accounting. */
1931 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1932 					unsigned nid)
1933 {
1934 	struct zonelist *zl;
1935 	struct page *page;
1936 
1937 	zl = node_zonelist(nid, gfp);
1938 	page = __alloc_pages(gfp, order, zl);
1939 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1940 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1941 	return page;
1942 }
1943 
1944 /**
1945  * 	alloc_pages_vma	- Allocate a page for a VMA.
1946  *
1947  * 	@gfp:
1948  *      %GFP_USER    user allocation.
1949  *      %GFP_KERNEL  kernel allocations,
1950  *      %GFP_HIGHMEM highmem/user allocations,
1951  *      %GFP_FS      allocation should not call back into a file system.
1952  *      %GFP_ATOMIC  don't sleep.
1953  *
1954  *	@order: Order of the GFP allocation.
1955  * 	@vma:  Pointer to VMA or NULL if not available.
1956  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1957  *	@node: Which node to prefer for allocation (modulo policy).
1958  *	@hugepage: for hugepages try only the preferred node if possible
1959  *
1960  * 	This function allocates a page from the kernel page pool and applies
1961  *	a NUMA policy associated with the VMA or the current process.
1962  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1963  *	mm_struct of the VMA to prevent it from going away. Should be used for
1964  *	all allocations for pages that will be mapped into user space. Returns
1965  *	NULL when no page can be allocated.
1966  */
1967 struct page *
1968 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1969 		unsigned long addr, int node, bool hugepage)
1970 {
1971 	struct mempolicy *pol;
1972 	struct page *page;
1973 	unsigned int cpuset_mems_cookie;
1974 	struct zonelist *zl;
1975 	nodemask_t *nmask;
1976 
1977 retry_cpuset:
1978 	pol = get_vma_policy(vma, addr);
1979 	cpuset_mems_cookie = read_mems_allowed_begin();
1980 
1981 	if (pol->mode == MPOL_INTERLEAVE) {
1982 		unsigned nid;
1983 
1984 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1985 		mpol_cond_put(pol);
1986 		page = alloc_page_interleave(gfp, order, nid);
1987 		goto out;
1988 	}
1989 
1990 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1991 		int hpage_node = node;
1992 
1993 		/*
1994 		 * For hugepage allocation and non-interleave policy which
1995 		 * allows the current node (or other explicitly preferred
1996 		 * node) we only try to allocate from the current/preferred
1997 		 * node and don't fall back to other nodes, as the cost of
1998 		 * remote accesses would likely offset THP benefits.
1999 		 *
2000 		 * If the policy is interleave, or does not allow the current
2001 		 * node in its nodemask, we allocate the standard way.
2002 		 */
2003 		if (pol->mode == MPOL_PREFERRED &&
2004 						!(pol->flags & MPOL_F_LOCAL))
2005 			hpage_node = pol->v.preferred_node;
2006 
2007 		nmask = policy_nodemask(gfp, pol);
2008 		if (!nmask || node_isset(hpage_node, *nmask)) {
2009 			mpol_cond_put(pol);
2010 			page = __alloc_pages_node(hpage_node,
2011 						gfp | __GFP_THISNODE, order);
2012 			goto out;
2013 		}
2014 	}
2015 
2016 	nmask = policy_nodemask(gfp, pol);
2017 	zl = policy_zonelist(gfp, pol, node);
2018 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2019 	mpol_cond_put(pol);
2020 out:
2021 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2022 		goto retry_cpuset;
2023 	return page;
2024 }
2025 
2026 /**
2027  * 	alloc_pages_current - Allocate pages.
2028  *
2029  *	@gfp:
2030  *		%GFP_USER   user allocation,
2031  *      	%GFP_KERNEL kernel allocation,
2032  *      	%GFP_HIGHMEM highmem allocation,
2033  *      	%GFP_FS     don't call back into a file system.
2034  *      	%GFP_ATOMIC don't sleep.
2035  *	@order: Power of two of allocation size in pages. 0 is a single page.
2036  *
2037 	 *	Allocate a page from the kernel page pool.  When not in
2038 	 *	interrupt context, apply the current process' NUMA policy.
2039  *	Returns NULL when no page can be allocated.
2040  *
2041  *	Don't call cpuset_update_task_memory_state() unless
2042  *	1) it's ok to take cpuset_sem (can WAIT), and
2043  *	2) allocating for current task (not interrupt).
2044  */
2045 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2046 {
2047 	struct mempolicy *pol = &default_policy;
2048 	struct page *page;
2049 	unsigned int cpuset_mems_cookie;
2050 
2051 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2052 		pol = get_task_policy(current);
2053 
2054 retry_cpuset:
2055 	cpuset_mems_cookie = read_mems_allowed_begin();
2056 
2057 	/*
2058 	 * No reference counting needed for current->mempolicy
2059 	 * nor system default_policy
2060 	 */
2061 	if (pol->mode == MPOL_INTERLEAVE)
2062 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2063 	else
2064 		page = __alloc_pages_nodemask(gfp, order,
2065 				policy_zonelist(gfp, pol, numa_node_id()),
2066 				policy_nodemask(gfp, pol));
2067 
2068 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2069 		goto retry_cpuset;
2070 
2071 	return page;
2072 }
2073 EXPORT_SYMBOL(alloc_pages_current);
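/*
 * Usage note (a sketch, assuming the usual CONFIG_NUMA wiring in gfp.h):
 * the generic alloc_pages()/alloc_page() helpers are expected to resolve to
 * alloc_pages_current() on NUMA kernels, so e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * picks up the calling task's mempolicy unless called from interrupt
 * context or with __GFP_THISNODE.
 */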
2074 
2075 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2076 {
2077 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2078 
2079 	if (IS_ERR(pol))
2080 		return PTR_ERR(pol);
2081 	dst->vm_policy = pol;
2082 	return 0;
2083 }
2084 
2085 /*
2086  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2087  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2088  * with the mems_allowed returned by cpuset_mems_allowed().  This
2089  * keeps mempolicies cpuset relative after its cpuset moves.  See
2090  * further kernel/cpuset.c update_nodemask().
2091  *
2092  * current's mempolicy may be rebound by the other task (the task that changes
2093  * the cpuset's mems), so we need not do the rebind work for the current task.
2094  */
2095 
2096 /* Slow path of a mempolicy duplicate */
2097 struct mempolicy *__mpol_dup(struct mempolicy *old)
2098 {
2099 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2100 
2101 	if (!new)
2102 		return ERR_PTR(-ENOMEM);
2103 
2104 	/* task's mempolicy is protected by alloc_lock */
2105 	if (old == current->mempolicy) {
2106 		task_lock(current);
2107 		*new = *old;
2108 		task_unlock(current);
2109 	} else
2110 		*new = *old;
2111 
2112 	if (current_cpuset_is_being_rebound()) {
2113 		nodemask_t mems = cpuset_mems_allowed(current);
2114 		if (new->flags & MPOL_F_REBINDING)
2115 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2116 		else
2117 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2118 	}
2119 	atomic_set(&new->refcnt, 1);
2120 	return new;
2121 }
2122 
2123 /* Slow path of a mempolicy comparison */
2124 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2125 {
2126 	if (!a || !b)
2127 		return false;
2128 	if (a->mode != b->mode)
2129 		return false;
2130 	if (a->flags != b->flags)
2131 		return false;
2132 	if (mpol_store_user_nodemask(a))
2133 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2134 			return false;
2135 
2136 	switch (a->mode) {
2137 	case MPOL_BIND:
2138 		/* Fall through */
2139 	case MPOL_INTERLEAVE:
2140 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2141 	case MPOL_PREFERRED:
2142 		return a->v.preferred_node == b->v.preferred_node;
2143 	default:
2144 		BUG();
2145 		return false;
2146 	}
2147 }
2148 
2149 /*
2150  * Shared memory backing store policy support.
2151  *
2152  * Remember policies even when nobody has shared memory mapped.
2153  * The policies are kept in Red-Black tree linked from the inode.
2154  * They are protected by the sp->lock rwlock, which should be held
2155  * for any accesses to the tree.
2156  */
2157 
2158 /*
2159  * Look up the first element intersecting start-end.  Caller holds sp->lock
2160  * for reading or for writing.
2161  */
2162 static struct sp_node *
2163 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2164 {
2165 	struct rb_node *n = sp->root.rb_node;
2166 
2167 	while (n) {
2168 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2169 
2170 		if (start >= p->end)
2171 			n = n->rb_right;
2172 		else if (end <= p->start)
2173 			n = n->rb_left;
2174 		else
2175 			break;
2176 	}
2177 	if (!n)
2178 		return NULL;
2179 	for (;;) {
2180 		struct sp_node *w = NULL;
2181 		struct rb_node *prev = rb_prev(n);
2182 		if (!prev)
2183 			break;
2184 		w = rb_entry(prev, struct sp_node, nd);
2185 		if (w->end <= start)
2186 			break;
2187 		n = prev;
2188 	}
2189 	return rb_entry(n, struct sp_node, nd);
2190 }
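/*
 * Worked example (illustrative): with stored ranges [0,5), [5,10), [10,15)
 * and a lookup of [3,12), the initial descent may stop on [5,10); the
 * backward walk then steps to [0,5) because its end (5) is still greater
 * than start (3), so the lowest intersecting range is returned.
 */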
2191 
2192 /*
2193  * Insert a new shared policy into the list.  Caller holds sp->lock for
2194  * writing.
2195  */
2196 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2197 {
2198 	struct rb_node **p = &sp->root.rb_node;
2199 	struct rb_node *parent = NULL;
2200 	struct sp_node *nd;
2201 
2202 	while (*p) {
2203 		parent = *p;
2204 		nd = rb_entry(parent, struct sp_node, nd);
2205 		if (new->start < nd->start)
2206 			p = &(*p)->rb_left;
2207 		else if (new->end > nd->end)
2208 			p = &(*p)->rb_right;
2209 		else
2210 			BUG();
2211 	}
2212 	rb_link_node(&new->nd, parent, p);
2213 	rb_insert_color(&new->nd, &sp->root);
2214 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2215 		 new->policy ? new->policy->mode : 0);
2216 }
2217 
2218 /* Find shared policy intersecting idx */
2219 struct mempolicy *
2220 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2221 {
2222 	struct mempolicy *pol = NULL;
2223 	struct sp_node *sn;
2224 
2225 	if (!sp->root.rb_node)
2226 		return NULL;
2227 	read_lock(&sp->lock);
2228 	sn = sp_lookup(sp, idx, idx+1);
2229 	if (sn) {
2230 		mpol_get(sn->policy);
2231 		pol = sn->policy;
2232 	}
2233 	read_unlock(&sp->lock);
2234 	return pol;
2235 }
2236 
2237 static void sp_free(struct sp_node *n)
2238 {
2239 	mpol_put(n->policy);
2240 	kmem_cache_free(sn_cache, n);
2241 }
2242 
2243 /**
2244  * mpol_misplaced - check whether current page node is valid in policy
2245  *
2246  * @page: page to be checked
2247  * @vma: vm area where page mapped
2248  * @addr: virtual address where page mapped
2249  *
2250  * Look up the current policy node id for vma,addr and compare it to the
2251  * page's node id.
2252  *
2253  * Returns:
2254  *	-1	- not misplaced, page is in the right node
2255  *	node	- node id where the page should be
2256  *
2257  * Policy determination "mimics" alloc_page_vma().
2258  * Called from fault path where we know the vma and faulting address.
2259  */
2260 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2261 {
2262 	struct mempolicy *pol;
2263 	struct zoneref *z;
2264 	int curnid = page_to_nid(page);
2265 	unsigned long pgoff;
2266 	int thiscpu = raw_smp_processor_id();
2267 	int thisnid = cpu_to_node(thiscpu);
2268 	int polnid = -1;
2269 	int ret = -1;
2270 
2271 	BUG_ON(!vma);
2272 
2273 	pol = get_vma_policy(vma, addr);
2274 	if (!(pol->flags & MPOL_F_MOF))
2275 		goto out;
2276 
2277 	switch (pol->mode) {
2278 	case MPOL_INTERLEAVE:
2279 		BUG_ON(addr >= vma->vm_end);
2280 		BUG_ON(addr < vma->vm_start);
2281 
2282 		pgoff = vma->vm_pgoff;
2283 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2284 		polnid = offset_il_node(pol, vma, pgoff);
2285 		break;
2286 
2287 	case MPOL_PREFERRED:
2288 		if (pol->flags & MPOL_F_LOCAL)
2289 			polnid = numa_node_id();
2290 		else
2291 			polnid = pol->v.preferred_node;
2292 		break;
2293 
2294 	case MPOL_BIND:
2295 
2296 		/*
2297 		 * MPOL_BIND allows binding to multiple nodes.
2298 		 * Use the current page's node if it is in the policy nodemask,
2299 		 * else select the nearest allowed node, if any.
2300 		 * If there are no allowed nodes, use the current node [!misplaced].
2301 		 */
2302 		if (node_isset(curnid, pol->v.nodes))
2303 			goto out;
2304 		z = first_zones_zonelist(
2305 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2306 				gfp_zone(GFP_HIGHUSER),
2307 				&pol->v.nodes);
2308 		polnid = z->zone->node;
2309 		break;
2310 
2311 	default:
2312 		BUG();
2313 	}
2314 
2315 	/* Migrate the page towards the node whose CPU is referencing it */
2316 	if (pol->flags & MPOL_F_MORON) {
2317 		polnid = thisnid;
2318 
2319 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2320 			goto out;
2321 	}
2322 
2323 	if (curnid != polnid)
2324 		ret = polnid;
2325 out:
2326 	mpol_cond_put(pol);
2327 
2328 	return ret;
2329 }
2330 
2331 /*
2332  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2333  * dropped after task->mempolicy is set to NULL so that any allocation done as
2334  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2335  * policy.
2336  */
2337 void mpol_put_task_policy(struct task_struct *task)
2338 {
2339 	struct mempolicy *pol;
2340 
2341 	task_lock(task);
2342 	pol = task->mempolicy;
2343 	task->mempolicy = NULL;
2344 	task_unlock(task);
2345 	mpol_put(pol);
2346 }
2347 
2348 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2349 {
2350 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2351 	rb_erase(&n->nd, &sp->root);
2352 	sp_free(n);
2353 }
2354 
2355 static void sp_node_init(struct sp_node *node, unsigned long start,
2356 			unsigned long end, struct mempolicy *pol)
2357 {
2358 	node->start = start;
2359 	node->end = end;
2360 	node->policy = pol;
2361 }
2362 
2363 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2364 				struct mempolicy *pol)
2365 {
2366 	struct sp_node *n;
2367 	struct mempolicy *newpol;
2368 
2369 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2370 	if (!n)
2371 		return NULL;
2372 
2373 	newpol = mpol_dup(pol);
2374 	if (IS_ERR(newpol)) {
2375 		kmem_cache_free(sn_cache, n);
2376 		return NULL;
2377 	}
2378 	newpol->flags |= MPOL_F_SHARED;
2379 	sp_node_init(n, start, end, newpol);
2380 
2381 	return n;
2382 }
2383 
2384 /* Replace a policy range. */
2385 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2386 				 unsigned long end, struct sp_node *new)
2387 {
2388 	struct sp_node *n;
2389 	struct sp_node *n_new = NULL;
2390 	struct mempolicy *mpol_new = NULL;
2391 	int ret = 0;
2392 
2393 restart:
2394 	write_lock(&sp->lock);
2395 	n = sp_lookup(sp, start, end);
2396 	/* Take care of old policies in the same range. */
2397 	while (n && n->start < end) {
2398 		struct rb_node *next = rb_next(&n->nd);
2399 		if (n->start >= start) {
2400 			if (n->end <= end)
2401 				sp_delete(sp, n);
2402 			else
2403 				n->start = end;
2404 		} else {
2405 			/* Old policy spanning whole new range. */
2406 			if (n->end > end) {
2407 				if (!n_new)
2408 					goto alloc_new;
2409 
2410 				*mpol_new = *n->policy;
2411 				atomic_set(&mpol_new->refcnt, 1);
2412 				sp_node_init(n_new, end, n->end, mpol_new);
2413 				n->end = start;
2414 				sp_insert(sp, n_new);
2415 				n_new = NULL;
2416 				mpol_new = NULL;
2417 				break;
2418 			} else
2419 				n->end = start;
2420 		}
2421 		if (!next)
2422 			break;
2423 		n = rb_entry(next, struct sp_node, nd);
2424 	}
2425 	if (new)
2426 		sp_insert(sp, new);
2427 	write_unlock(&sp->lock);
2428 	ret = 0;
2429 
2430 err_out:
2431 	if (mpol_new)
2432 		mpol_put(mpol_new);
2433 	if (n_new)
2434 		kmem_cache_free(sn_cache, n_new);
2435 
2436 	return ret;
2437 
2438 alloc_new:
2439 	write_unlock(&sp->lock);
2440 	ret = -ENOMEM;
2441 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2442 	if (!n_new)
2443 		goto err_out;
2444 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2445 	if (!mpol_new)
2446 		goto err_out;
2447 	goto restart;
2448 }
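/*
 * Worked example (illustrative): installing a new policy for pages [4,8)
 * over an existing node covering [0,16) truncates the old node to [0,4),
 * inserts a clone of its policy for [8,16) using the preallocated
 * n_new/mpol_new pair, and then inserts the new [4,8) node.  If the clone
 * has not been preallocated yet, the lock is dropped, the allocations are
 * done, and the whole replace restarts.
 */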
2449 
2450 /**
2451  * mpol_shared_policy_init - initialize shared policy for inode
2452  * @sp: pointer to inode shared policy
2453  * @mpol:  struct mempolicy to install
2454  *
2455  * Install non-NULL @mpol in inode's shared policy rb-tree.
2456  * On entry, the current task has a reference on a non-NULL @mpol.
2457  * This must be released on exit.
2458  * This is called from get_inode() calls, so we can use GFP_KERNEL.
2459  */
2460 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2461 {
2462 	int ret;
2463 
2464 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2465 	rwlock_init(&sp->lock);
2466 
2467 	if (mpol) {
2468 		struct vm_area_struct pvma;
2469 		struct mempolicy *new;
2470 		NODEMASK_SCRATCH(scratch);
2471 
2472 		if (!scratch)
2473 			goto put_mpol;
2474 		/* contextualize the tmpfs mount point mempolicy */
2475 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2476 		if (IS_ERR(new))
2477 			goto free_scratch; /* no valid nodemask intersection */
2478 
2479 		task_lock(current);
2480 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2481 		task_unlock(current);
2482 		if (ret)
2483 			goto put_new;
2484 
2485 		/* Create pseudo-vma that contains just the policy */
2486 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2487 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2488 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2489 
2490 put_new:
2491 		mpol_put(new);			/* drop initial ref */
2492 free_scratch:
2493 		NODEMASK_SCRATCH_FREE(scratch);
2494 put_mpol:
2495 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2496 	}
2497 }
2498 
2499 int mpol_set_shared_policy(struct shared_policy *info,
2500 			struct vm_area_struct *vma, struct mempolicy *npol)
2501 {
2502 	int err;
2503 	struct sp_node *new = NULL;
2504 	unsigned long sz = vma_pages(vma);
2505 
2506 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2507 		 vma->vm_pgoff,
2508 		 sz, npol ? npol->mode : -1,
2509 		 npol ? npol->flags : -1,
2510 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2511 
2512 	if (npol) {
2513 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2514 		if (!new)
2515 			return -ENOMEM;
2516 	}
2517 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2518 	if (err && new)
2519 		sp_free(new);
2520 	return err;
2521 }
2522 
2523 /* Free a backing policy store on inode delete. */
2524 void mpol_free_shared_policy(struct shared_policy *p)
2525 {
2526 	struct sp_node *n;
2527 	struct rb_node *next;
2528 
2529 	if (!p->root.rb_node)
2530 		return;
2531 	write_lock(&p->lock);
2532 	next = rb_first(&p->root);
2533 	while (next) {
2534 		n = rb_entry(next, struct sp_node, nd);
2535 		next = rb_next(&n->nd);
2536 		sp_delete(p, n);
2537 	}
2538 	write_unlock(&p->lock);
2539 }
2540 
2541 #ifdef CONFIG_NUMA_BALANCING
2542 static int __initdata numabalancing_override;
2543 
2544 static void __init check_numabalancing_enable(void)
2545 {
2546 	bool numabalancing_default = false;
2547 
2548 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2549 		numabalancing_default = true;
2550 
2551 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2552 	if (numabalancing_override)
2553 		set_numabalancing_state(numabalancing_override == 1);
2554 
2555 	if (num_online_nodes() > 1 && !numabalancing_override) {
2556 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2557 			numabalancing_default ? "Enabling" : "Disabling");
2558 		set_numabalancing_state(numabalancing_default);
2559 	}
2560 }
2561 
2562 static int __init setup_numabalancing(char *str)
2563 {
2564 	int ret = 0;
2565 	if (!str)
2566 		goto out;
2567 
2568 	if (!strcmp(str, "enable")) {
2569 		numabalancing_override = 1;
2570 		ret = 1;
2571 	} else if (!strcmp(str, "disable")) {
2572 		numabalancing_override = -1;
2573 		ret = 1;
2574 	}
2575 out:
2576 	if (!ret)
2577 		pr_warn("Unable to parse numa_balancing=\n");
2578 
2579 	return ret;
2580 }
2581 __setup("numa_balancing=", setup_numabalancing);
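/*
 * Usage note (derived from the parser above): booting with
 * "numa_balancing=enable" or "numa_balancing=disable" on the kernel
 * command line overrides the CONFIG_NUMA_BALANCING_DEFAULT_ENABLED default
 * applied by check_numabalancing_enable(); the kernel.numa_balancing
 * sysctl mentioned in the pr_info above remains available at runtime.
 */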
2582 #else
2583 static inline void __init check_numabalancing_enable(void)
2584 {
2585 }
2586 #endif /* CONFIG_NUMA_BALANCING */
2587 
2588 /* assumes fs == KERNEL_DS */
2589 void __init numa_policy_init(void)
2590 {
2591 	nodemask_t interleave_nodes;
2592 	unsigned long largest = 0;
2593 	int nid, prefer = 0;
2594 
2595 	policy_cache = kmem_cache_create("numa_policy",
2596 					 sizeof(struct mempolicy),
2597 					 0, SLAB_PANIC, NULL);
2598 
2599 	sn_cache = kmem_cache_create("shared_policy_node",
2600 				     sizeof(struct sp_node),
2601 				     0, SLAB_PANIC, NULL);
2602 
2603 	for_each_node(nid) {
2604 		preferred_node_policy[nid] = (struct mempolicy) {
2605 			.refcnt = ATOMIC_INIT(1),
2606 			.mode = MPOL_PREFERRED,
2607 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2608 			.v = { .preferred_node = nid, },
2609 		};
2610 	}
2611 
2612 	/*
2613 	 * Set interleaving policy for system init. Interleaving is only
2614 	 * enabled across suitably sized nodes (default is >= 16MB), or
2615 	 * fall back to the largest node if they're all smaller.
2616 	 */
2617 	nodes_clear(interleave_nodes);
2618 	for_each_node_state(nid, N_MEMORY) {
2619 		unsigned long total_pages = node_present_pages(nid);
2620 
2621 		/* Preserve the largest node */
2622 		if (largest < total_pages) {
2623 			largest = total_pages;
2624 			prefer = nid;
2625 		}
2626 
2627 		/* Interleave this node? */
2628 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2629 			node_set(nid, interleave_nodes);
2630 	}
2631 
2632 	/* All too small, use the largest */
2633 	if (unlikely(nodes_empty(interleave_nodes)))
2634 		node_set(prefer, interleave_nodes);
2635 
2636 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2637 		pr_err("%s: interleaving failed\n", __func__);
2638 
2639 	check_numabalancing_enable();
2640 }
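/*
 * Worked example (illustrative): the interleave threshold
 * (total_pages << PAGE_SHIFT) >= (16 << 20) admits a node once it has at
 * least 16MB of present memory; with 4KB pages that is 4096 pages.
 */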
2641 
2642 /* Reset policy of current process to default */
2643 void numa_default_policy(void)
2644 {
2645 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2646 }
2647 
2648 /*
2649  * Parse and format mempolicy from/to strings
2650  */
2651 
2652 /*
2653  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2654  */
2655 static const char * const policy_modes[] =
2656 {
2657 	[MPOL_DEFAULT]    = "default",
2658 	[MPOL_PREFERRED]  = "prefer",
2659 	[MPOL_BIND]       = "bind",
2660 	[MPOL_INTERLEAVE] = "interleave",
2661 	[MPOL_LOCAL]      = "local",
2662 };
2663 
2664 
2665 #ifdef CONFIG_TMPFS
2666 /**
2667  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2668  * @str:  string containing mempolicy to parse
2669  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2670  *
2671  * Format of input:
2672  *	<mode>[=<flags>][:<nodelist>]
2673  *
2674  * On success, returns 0, else 1
2675  */
2676 int mpol_parse_str(char *str, struct mempolicy **mpol)
2677 {
2678 	struct mempolicy *new = NULL;
2679 	unsigned short mode;
2680 	unsigned short mode_flags;
2681 	nodemask_t nodes;
2682 	char *nodelist = strchr(str, ':');
2683 	char *flags = strchr(str, '=');
2684 	int err = 1;
2685 
2686 	if (nodelist) {
2687 		/* NUL-terminate mode or flags string */
2688 		*nodelist++ = '\0';
2689 		if (nodelist_parse(nodelist, nodes))
2690 			goto out;
2691 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2692 			goto out;
2693 	} else
2694 		nodes_clear(nodes);
2695 
2696 	if (flags)
2697 		*flags++ = '\0';	/* terminate mode string */
2698 
2699 	for (mode = 0; mode < MPOL_MAX; mode++) {
2700 		if (!strcmp(str, policy_modes[mode])) {
2701 			break;
2702 		}
2703 	}
2704 	if (mode >= MPOL_MAX)
2705 		goto out;
2706 
2707 	switch (mode) {
2708 	case MPOL_PREFERRED:
2709 		/*
2710 		 * Insist on a nodelist of one node only
2711 		 */
2712 		if (nodelist) {
2713 			char *rest = nodelist;
2714 			while (isdigit(*rest))
2715 				rest++;
2716 			if (*rest)
2717 				goto out;
2718 		}
2719 		break;
2720 	case MPOL_INTERLEAVE:
2721 		/*
2722 		 * Default to online nodes with memory if no nodelist
2723 		 */
2724 		if (!nodelist)
2725 			nodes = node_states[N_MEMORY];
2726 		break;
2727 	case MPOL_LOCAL:
2728 		/*
2729 		 * Don't allow a nodelist;  mpol_new() checks flags
2730 		 */
2731 		if (nodelist)
2732 			goto out;
2733 		mode = MPOL_PREFERRED;
2734 		break;
2735 	case MPOL_DEFAULT:
2736 		/*
2737 		 * Insist on an empty nodelist
2738 		 */
2739 		if (!nodelist)
2740 			err = 0;
2741 		goto out;
2742 	case MPOL_BIND:
2743 		/*
2744 		 * Insist on a nodelist
2745 		 */
2746 		if (!nodelist)
2747 			goto out;
2748 	}
2749 
2750 	mode_flags = 0;
2751 	if (flags) {
2752 		/*
2753 		 * Currently, we only support two mutually exclusive
2754 		 * mode flags.
2755 		 */
2756 		if (!strcmp(flags, "static"))
2757 			mode_flags |= MPOL_F_STATIC_NODES;
2758 		else if (!strcmp(flags, "relative"))
2759 			mode_flags |= MPOL_F_RELATIVE_NODES;
2760 		else
2761 			goto out;
2762 	}
2763 
2764 	new = mpol_new(mode, mode_flags, &nodes);
2765 	if (IS_ERR(new))
2766 		goto out;
2767 
2768 	/*
2769 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2770 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2771 	 */
2772 	if (mode != MPOL_PREFERRED)
2773 		new->v.nodes = nodes;
2774 	else if (nodelist)
2775 		new->v.preferred_node = first_node(nodes);
2776 	else
2777 		new->flags |= MPOL_F_LOCAL;
2778 
2779 	/*
2780 	 * Save nodes for contextualization: this will be used to "clone"
2781 	 * the mempolicy in a specific context [cpuset] at a later time.
2782 	 */
2783 	new->w.user_nodemask = nodes;
2784 
2785 	err = 0;
2786 
2787 out:
2788 	/* Restore string for error message */
2789 	if (nodelist)
2790 		*--nodelist = ':';
2791 	if (flags)
2792 		*--flags = '=';
2793 	if (!err)
2794 		*mpol = new;
2795 	return err;
2796 }
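/*
 * Example inputs accepted by the <mode>[=<flags>][:<nodelist>] format above
 * (node numbers are illustrative and must refer to nodes with memory):
 *
 *	"default"		"local"
 *	"prefer:1"		"bind=static:0,2"
 *	"interleave:0-3"	"interleave=relative:0,1"
 */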
2797 #endif /* CONFIG_TMPFS */
2798 
2799 /**
2800  * mpol_to_str - format a mempolicy structure for printing
2801  * @buffer:  to contain formatted mempolicy string
2802  * @maxlen:  length of @buffer
2803  * @pol:  pointer to mempolicy to be formatted
2804  *
2805  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2806  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2807  * longest flag, "relative", and to display at least a few node ids.
2808  */
2809 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2810 {
2811 	char *p = buffer;
2812 	nodemask_t nodes = NODE_MASK_NONE;
2813 	unsigned short mode = MPOL_DEFAULT;
2814 	unsigned short flags = 0;
2815 
2816 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2817 		mode = pol->mode;
2818 		flags = pol->flags;
2819 	}
2820 
2821 	switch (mode) {
2822 	case MPOL_DEFAULT:
2823 		break;
2824 	case MPOL_PREFERRED:
2825 		if (flags & MPOL_F_LOCAL)
2826 			mode = MPOL_LOCAL;
2827 		else
2828 			node_set(pol->v.preferred_node, nodes);
2829 		break;
2830 	case MPOL_BIND:
2831 	case MPOL_INTERLEAVE:
2832 		nodes = pol->v.nodes;
2833 		break;
2834 	default:
2835 		WARN_ON_ONCE(1);
2836 		snprintf(p, maxlen, "unknown");
2837 		return;
2838 	}
2839 
2840 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2841 
2842 	if (flags & MPOL_MODE_FLAGS) {
2843 		p += snprintf(p, buffer + maxlen - p, "=");
2844 
2845 		/*
2846 		 * Currently, the only defined flags are mutually exclusive
2847 		 */
2848 		if (flags & MPOL_F_STATIC_NODES)
2849 			p += snprintf(p, buffer + maxlen - p, "static");
2850 		else if (flags & MPOL_F_RELATIVE_NODES)
2851 			p += snprintf(p, buffer + maxlen - p, "relative");
2852 	}
2853 
2854 	if (!nodes_empty(nodes))
2855 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2856 			       nodemask_pr_args(&nodes));
2857 }
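/*
 * Illustrative usage sketch (the buffer size is an arbitrary example):
 *
 *	char buf[64];
 *	mpol_to_str(buf, sizeof(buf), pol);
 *
 * Typical results mirror mpol_parse_str() input, e.g. "default", "local",
 * "prefer=static:1", "bind:0,2" or "interleave:0-3".
 */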
2858