// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses on remote CPUs that occur before
	 * the IPI become visible to membarrier()'s caller -- see
	 * scenario B in the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}

static void ipi_rseq(void *info)
{
	/*
	 * Ensure that all stores done by the calling thread are visible
	 * to the current task before the current task resumes.  We could
	 * probably optimize this away on most architectures, but by the
	 * time we've already sent an IPI, the cost of the extra smp_mb()
	 * is negligible.
	 */
	smp_mb();
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	if (cpu_id >= 0) {
		/*
		 * smp_call_function_single() will call ipi_func() if cpu_id
		 * is the calling CPU.
		 */
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	} else {
		/*
		 * For regular membarrier, we can save a few cycles by
		 * skipping the current cpu -- we're about to do smp_mb()
		 * below, and if we migrate to a different cpu, this cpu
		 * and the new cpu will execute a full barrier in the
		 * scheduler.
		 *
		 * For SYNC_CORE, we do need a barrier on the current cpu --
		 * otherwise, if we are migrated and replaced by a different
		 * task in the same mm just before, during, or after
		 * membarrier, we will end up with some thread in the mm
		 * running without a core sync.
		 *
		 * For RSEQ, don't rseq_preempt() the caller.  User code
		 * is not supposed to issue syscalls at all from inside an
		 * rseq critical section.
		 */
		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
			preempt_disable();
			smp_call_function_many(tmpmask, ipi_func, NULL, true);
			preempt_enable();
		} else {
			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
		}
	}

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          the RSEQ CS should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}

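/*
 * Illustrative userspace usage (not part of the kernel build): a minimal
 * sketch of how the private expedited command is typically driven from a
 * process, assuming the usual route of invoking the system call through
 * syscall(2) with the constants from <linux/membarrier.h>; the membarrier()
 * wrapper below is only a local helper name chosen for readability:
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	// Query which commands this kernel supports.
 *	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
 *
 *	if (mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
 *		// Register once per process, then issue barriers as needed.
 *		membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *		membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *	}
 *
 * Registration is required before the expedited command may be issued;
 * issuing it without registering fails with -EPERM, as enforced in
 * membarrier_private_expedited() above.
 */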