// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from a bitwise OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
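
/*
 * Illustrative userspace sketch (not part of this kernel file, not compiled
 * here): MEMBARRIER_CMD_QUERY reports the command mask built above, so a
 * process can probe for a command before depending on it. This assumes the
 * uapi definitions from <linux/membarrier.h> and the raw syscall(2)
 * interface; the membarrier() and have_private_expedited() helper names are
 * made up for the example.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	static int have_private_expedited(void)
 *	{
 *		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
 *
 *		// mask < 0 means the syscall itself is unavailable (ENOSYS).
 *		return mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED);
 *	}
 */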

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses that occur on remote CPUs before the
	 * IPI become visible to membarrier()'s caller -- see scenario B in
	 * the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}

static void ipi_rseq(void *info)
{
	/*
	 * Ensure that all stores done by the calling thread are visible
	 * to the current task before the current task resumes. We could
	 * probably optimize this away on most architectures, but by the
	 * time we've already sent an IPI, the cost of the extra smp_mb()
	 * is negligible.
	 */
	smp_mb();
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	if (cpu_id >= 0) {
		/*
		 * smp_call_function_single() will call ipi_func() if cpu_id
		 * is the calling CPU.
		 */
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	} else {
		/*
		 * For regular membarrier, we can save a few cycles by
		 * skipping the current cpu -- we're about to do smp_mb()
		 * below, and if we migrate to a different cpu, this cpu
		 * and the new cpu will execute a full barrier in the
		 * scheduler.
		 *
		 * For SYNC_CORE, we do need a barrier on the current cpu --
		 * otherwise, if we are migrated and replaced by a different
		 * task in the same mm just before, during, or after
		 * membarrier, we will end up with some thread in the mm
		 * running without a core sync.
		 *
		 * For RSEQ, don't rseq_preempt() the caller. User code
		 * is not supposed to issue syscalls at all from inside an
		 * rseq critical section.
		 */
		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
			preempt_disable();
			smp_call_function_many(tmpmask, ipi_func, NULL, true);
			preempt_enable();
		} else {
			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
		}
	}

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that
	 * all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue
	 * scheduling between threads which are users of @mm has its
	 * membarrier state updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD.)
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}
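
/*
 * Illustrative userspace sketch (not compiled here): the *_READY bits set
 * above are what membarrier_private_expedited() checks before sending IPIs,
 * so a process must register once, typically at startup, before issuing the
 * expedited command. Reuses the hypothetical membarrier() wrapper from the
 * sketch near the top of this file.
 *
 *	// Once, e.g. in main() before spawning worker threads:
 *	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
 *		return -1;
 *
 *	// Later, on the slow side of an asymmetric fence:
 *	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0))
 *		return -1;	// EPERM if the registration above was skipped
 */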

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          RSEQ CS should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb()   sys_membarrier()
 *        barrier()          X           X              O
 *        smp_mb()           X           O              O
 *        sys_membarrier()   O           O              O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
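
/*
 * Illustrative userspace sketch (not compiled here) of the CPU-targeted RSEQ
 * fence documented above, reusing the hypothetical membarrier() wrapper from
 * the sketch near the top of this file. After registering, a thread may
 * restart whatever rseq critical section is currently running on one CPU
 * instead of interrupting every CPU running the process:
 *
 *	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
 *		return -1;	// e.g. EINVAL without CONFIG_RSEQ
 *	...
 *	// Interrupt (restart) rseq critical sections on CPU 3 only:
 *	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
 *		       MEMBARRIER_CMD_FLAG_CPU, 3))
 *		return -1;
 */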