// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static DEFINE_MUTEX(membarrier_ipi_mutex);

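/*
 * IPI handler for the expedited commands: issue a full memory barrier on
 * the interrupted CPU.
 */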
static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

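/*
 * IPI handler for MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: issue a full
 * memory barrier and serialize the interrupted CPU's instruction stream
 * before it returns to userspace.
 */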
static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses on remote CPUs that occur before
	 * the IPI become visible to membarrier()'s caller -- see
	 * scenario B in the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}

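/*
 * IPI handler for MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: force the rseq
 * critical section (if any) on the interrupted CPU to be restarted before
 * it returns to userspace.
 */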
static void ipi_rseq(void *info)
{
	/*
	 * Ensure that all stores done by the calling thread are visible
	 * to the current task before the current task resumes.  We could
	 * probably optimize this away on most architectures, but by the
	 * time we've already sent an IPI, the cost of the extra smp_mb()
	 * is negligible.
	 */
	smp_mb();
	rseq_preempt(current);
}

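/*
 * IPI handler used at registration time: copy the mm's membarrier state
 * into the runqueue's cached membarrier_state, provided the interrupted
 * CPU is currently running a task that uses this mm.
 */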
static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

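/*
 * Called on exec to reset the mm's membarrier state: a freshly exec'd
 * process starts out unregistered for all expedited commands.
 */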
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

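/*
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED: IPI every online CPU that is currently
 * running a user thread whose process registered for the global expedited
 * command, so a full memory barrier executes on each of them before this
 * system call returns.
 */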
static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	mutex_lock(&membarrier_ipi_mutex);
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	mutex_unlock(&membarrier_ipi_mutex);

	return 0;
}

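/*
 * Private expedited commands: IPI only the CPUs currently running a thread
 * of the calling process (same mm), either a single CPU (when cpu_id >= 0,
 * i.e. MEMBARRIER_CMD_FLAG_CPU was used) or all of them. The IPI handler
 * depends on @flags: plain memory barrier, core-serializing barrier, or
 * rseq critical section restart.
 */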
static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	mutex_lock(&membarrier_ipi_mutex);
	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	if (cpu_id >= 0) {
		/*
		 * smp_call_function_single() will call ipi_func() if cpu_id
		 * is the calling CPU.
		 */
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	} else {
		/*
		 * For regular membarrier, we can save a few cycles by
		 * skipping the current cpu -- we're about to do smp_mb()
		 * below, and if we migrate to a different cpu, this cpu
		 * and the new cpu will execute a full barrier in the
		 * scheduler.
		 *
		 * For SYNC_CORE, we do need a barrier on the current cpu --
		 * otherwise, if we are migrated and replaced by a different
		 * task in the same mm just before, during, or after
		 * membarrier, we will end up with some thread in the mm
		 * running without a core sync.
		 *
		 * For RSEQ, don't rseq_preempt() the caller.  User code
		 * is not supposed to issue syscalls at all from inside an
		 * rseq critical section.
		 */
		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
			preempt_disable();
			smp_call_function_many(tmpmask, ipi_func, NULL, true);
			preempt_enable();
		} else {
			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
		}
	}

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	mutex_unlock(&membarrier_ipi_mutex);

	return 0;
}

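/*
 * Propagate @mm's membarrier state to the runqueue cache of every CPU
 * currently running a thread that uses @mm, so that subsequent expedited
 * commands observe a consistent registration state.
 */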
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the current task's mm matches @mm, ensure
	 * that all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	mutex_lock(&membarrier_ipi_mutex);
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);

	free_cpumask_var(tmpmask);
	cpus_read_unlock();
	mutex_unlock(&membarrier_ipi_mutex);

	return 0;
}

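/*
 * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: mark the mm as a user of the
 * global expedited command and sync the registration state to the
 * runqueues before reporting it as ready.
 */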
static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

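/*
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED{,_SYNC_CORE,_RSEQ}: record
 * which private expedited variants this mm intends to use and sync the
 * registration state to the runqueues before reporting it as ready.
 */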
static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          the RSEQ critical section should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}

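/*
 * Usage sketch (userspace side, assuming a kernel exposing this syscall and
 * <linux/membarrier.h> providing the command definitions): a process first
 * registers for the private expedited command, then issues barriers as
 * needed, e.g.:
 *
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	...
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 */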