// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
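
/*
 * Illustrative userspace sketch (not compiled as part of this file):
 * MEMBARRIER_CMD_QUERY returns the bitmask built above, so a caller can
 * probe which commands the running kernel supports before using them.
 * Assumes the uapi <linux/membarrier.h> definitions and glibc's
 * syscall(2) wrapper:
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);
 *
 *	if (mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED))
 *		... private expedited membarrier is available ...
 */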

static DEFINE_MUTEX(membarrier_ipi_mutex);

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses performed on remote CPUs before
	 * the IPI become visible to membarrier()'s caller -- see
	 * scenario B in the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}
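
/*
 * Illustrative only: the SYNC_CORE variant is typically used by
 * processes that modify executable code at runtime (e.g. a JIT) and
 * need every running thread of the process to execute a core
 * serializing instruction before running the new code. A hedged
 * userspace sketch, assuming the uapi <linux/membarrier.h> definitions,
 * glibc's syscall(2) wrapper, and a hypothetical patch_code() helper:
 *
 *	// once, at startup:
 *	syscall(__NR_membarrier,
 *		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0);
 *
 *	// after patching code at runtime:
 *	patch_code(buf);	// hypothetical: write new instructions
 *	syscall(__NR_membarrier,
 *		MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0);
 *	// every running thread of this process has now executed a core
 *	// serializing instruction; it is safe to execute buf.
 */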

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	mutex_lock(&membarrier_ipi_mutex);
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		      MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	mutex_unlock(&membarrier_ipi_mutex);

	return 0;
}

static int membarrier_private_expedited(int flags)
{
	int cpu;
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	} else {
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	mutex_lock(&membarrier_ipi_mutex);
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_func, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	mutex_unlock(&membarrier_ipi_mutex);

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure that all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the current task's mm matches @mm, ensure
	 * that all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	mutex_lock(&membarrier_ipi_mutex);
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);

	free_cpumask_var(tmpmask);
	cpus_read_unlock();
	mutex_unlock(&membarrier_ipi_mutex);

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}
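
/*
 * Illustrative userspace sketch (not part of this file): private
 * expedited commands return -EPERM unless the process registered first,
 * so the usual pattern is to register once at startup and then issue
 * the barrier on demand. Assumes the uapi <linux/membarrier.h>
 * definitions and glibc's syscall(2) wrapper:
 *
 *	// once, early in the process lifetime:
 *	syscall(__NR_membarrier,
 *		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0);
 *
 *	// later, whenever a process-wide barrier is needed:
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
 */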

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb() (an illustrative
 * userspace sketch of such a pairing follows the function below):
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
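
/*
 * Illustrative only: the ordering table in the comment above means a hot
 * path can get away with a plain compiler barrier() while the cold path
 * pairs with it by calling sys_membarrier(), since the
 * barrier()/sys_membarrier() combination is ordered (O). A hedged
 * userspace sketch of such an asymmetric pairing, using the
 * unregistered MEMBARRIER_CMD_GLOBAL command (store(), load(), x, y and
 * r1/r2 are hypothetical names):
 *
 *	// hot path, executed frequently by many threads:
 *	store(&x, 1);
 *	barrier();		// compiler barrier only
 *	r1 = load(&y);
 *
 *	// cold path, executed rarely:
 *	store(&y, 1);
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0);
 *				// pairs with barrier() above per the table
 *	r2 = load(&x);
 */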