1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * Read-Copy Update mechanism for mutual exclusion
4 *
5 * Copyright IBM Corporation, 2008
6 *
7 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
8 * Manfred Spraul <manfred@colorfullife.com>
9 * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
10 *
11 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
12 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
13 *
14 * For detailed explanation of Read-Copy Update mechanism see -
15 * Documentation/RCU
16 */
17
18 #define pr_fmt(fmt) "rcu: " fmt
19
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/init.h>
23 #include <linux/spinlock.h>
24 #include <linux/smp.h>
25 #include <linux/rcupdate_wait.h>
26 #include <linux/interrupt.h>
27 #include <linux/sched.h>
28 #include <linux/sched/debug.h>
29 #include <linux/nmi.h>
30 #include <linux/atomic.h>
31 #include <linux/bitops.h>
32 #include <linux/export.h>
33 #include <linux/completion.h>
34 #include <linux/moduleparam.h>
35 #include <linux/percpu.h>
36 #include <linux/notifier.h>
37 #include <linux/cpu.h>
38 #include <linux/mutex.h>
39 #include <linux/time.h>
40 #include <linux/kernel_stat.h>
41 #include <linux/wait.h>
42 #include <linux/kthread.h>
43 #include <uapi/linux/sched/types.h>
44 #include <linux/prefetch.h>
45 #include <linux/delay.h>
46 #include <linux/stop_machine.h>
47 #include <linux/random.h>
48 #include <linux/trace_events.h>
49 #include <linux/suspend.h>
50 #include <linux/ftrace.h>
51 #include <linux/tick.h>
52 #include <linux/sysrq.h>
53 #include <linux/kprobes.h>
54 #include <linux/gfp.h>
55 #include <linux/oom.h>
56 #include <linux/smpboot.h>
57 #include <linux/jiffies.h>
58 #include <linux/sched/isolation.h>
59 #include <linux/sched/clock.h>
60 #include "../time/tick-internal.h"
61
62 #include "tree.h"
63 #include "rcu.h"
64
65 #ifdef MODULE_PARAM_PREFIX
66 #undef MODULE_PARAM_PREFIX
67 #endif
68 #define MODULE_PARAM_PREFIX "rcutree."
69
70 /* Data structures. */
71
72 /*
73 * Steal a bit from the bottom of ->dynticks for idle entry/exit
74 * control. Initially this is for TLB flushing.
75 */
76 #define RCU_DYNTICK_CTRL_MASK 0x1
77 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
78 #ifndef rcu_eqs_special_exit
79 #define rcu_eqs_special_exit() do { } while (0)
80 #endif
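/*
 * Illustrative sketch, not part of the original source: with the two
 * macros above, bit 0 of ->dynticks is the "special action" flag and the
 * counter proper occupies the remaining bits, so the counter is always
 * advanced in steps of RCU_DYNTICK_CTRL_CTR (2).  For example:
 *
 *	0x6 = three counter increments, no special action pending
 *	0x7 = three counter increments, special action (e.g. TLB flush) pending
 *
 * Stepping by 2 therefore never clobbers a concurrently set bit 0, and
 * rcu_dynticks_curr_cpu_in_eqs() below need only test the low counter bit.
 */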
81
82 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
83 .dynticks_nesting = 1,
84 .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
85 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
86 };
87 struct rcu_state rcu_state = {
88 .level = { &rcu_state.node[0] },
89 .gp_state = RCU_GP_IDLE,
90 .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
91 .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
92 .name = RCU_NAME,
93 .abbr = RCU_ABBR,
94 .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
95 .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
96 .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
97 };
98
99 /* Dump rcu_node combining tree at boot to verify correct setup. */
100 static bool dump_tree;
101 module_param(dump_tree, bool, 0444);
102 /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
103 static bool use_softirq = 1;
104 module_param(use_softirq, bool, 0444);
105 /* Control rcu_node-tree auto-balancing at boot time. */
106 static bool rcu_fanout_exact;
107 module_param(rcu_fanout_exact, bool, 0444);
108 /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
109 static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
110 module_param(rcu_fanout_leaf, int, 0444);
111 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
112 /* Number of rcu_nodes at specified level. */
113 int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
114 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
115
116 /*
117 * The rcu_scheduler_active variable is initialized to the value
118  * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
119 * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
120 * RCU can assume that there is but one task, allowing RCU to (for example)
121 * optimize synchronize_rcu() to a simple barrier(). When this variable
122 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
123 * to detect real grace periods. This variable is also used to suppress
124 * boot-time false positives from lockdep-RCU error checking. Finally, it
125 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
126 * is fully initialized, including all of its kthreads having been spawned.
127 */
128 int rcu_scheduler_active __read_mostly;
129 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
130
131 /*
132 * The rcu_scheduler_fully_active variable transitions from zero to one
133 * during the early_initcall() processing, which is after the scheduler
134 * is capable of creating new tasks. So RCU processing (for example,
135 * creating tasks for RCU priority boosting) must be delayed until after
136 * rcu_scheduler_fully_active transitions from zero to one. We also
137 * currently delay invocation of any RCU callbacks until after this point.
138 *
139 * It might later prove better for people registering RCU callbacks during
140 * early boot to take responsibility for these callbacks, but one step at
141 * a time.
142 */
143 static int rcu_scheduler_fully_active __read_mostly;
144
145 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
146 unsigned long gps, unsigned long flags);
147 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
148 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
149 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
150 static void invoke_rcu_core(void);
151 static void rcu_report_exp_rdp(struct rcu_data *rdp);
152 static void sync_sched_exp_online_cleanup(int cpu);
153
154 /* rcuc/rcub kthread realtime priority */
155 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
156 module_param(kthread_prio, int, 0444);
157
158 /* Delay in jiffies for grace-period initialization delays, debug only. */
159
160 static int gp_preinit_delay;
161 module_param(gp_preinit_delay, int, 0444);
162 static int gp_init_delay;
163 module_param(gp_init_delay, int, 0444);
164 static int gp_cleanup_delay;
165 module_param(gp_cleanup_delay, int, 0444);
166
167 /* Retrieve RCU kthreads priority for rcutorture */
168 int rcu_get_gp_kthreads_prio(void)
169 {
170 return kthread_prio;
171 }
172 EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
173
174 /*
175 * Number of grace periods between delays, normalized by the duration of
176 * the delay. The longer the delay, the more the grace periods between
177 * each delay. The reason for this normalization is that it means that,
178 * for non-zero delays, the overall slowdown of grace periods is constant
179 * regardless of the duration of the delay. This arrangement balances
180 * the need for long delays to increase some race probabilities with the
181 * need for fast grace periods to increase other race probabilities.
182 */
183 #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
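/*
 * Worked example of the normalization above (hypothetical numbers):
 * rcu_gp_slow() sleeps for "delay" jiffies only when the grace-period
 * counter is a multiple of rcu_num_nodes * PER_RCU_NODE_PERIOD * delay.
 * With delay = 10 and rcu_num_nodes = 4 the sleep fires once per 120
 * grace periods; with delay = 100 it fires once per 1200.  The fraction
 * of delayed grace periods thus scales as 1/delay, which is what keeps
 * the long-run slowdown per grace period roughly constant.
 */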
184
185 /*
186 * Compute the mask of online CPUs for the specified rcu_node structure.
187 * This will not be stable unless the rcu_node structure's ->lock is
188 * held, but the bit corresponding to the current CPU will be stable
189 * in most contexts.
190 */
191 unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
192 {
193 return READ_ONCE(rnp->qsmaskinitnext);
194 }
195
196 /*
197 * Return true if an RCU grace period is in progress. The READ_ONCE()s
198 * permit this function to be invoked without holding the root rcu_node
199 * structure's ->lock, but of course results can be subject to change.
200 */
201 static int rcu_gp_in_progress(void)
202 {
203 return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
204 }
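/*
 * Encoding note (see the rcu_seq_*() helpers in rcu.h): the low-order
 * RCU_SEQ_CTR_SHIFT bits of ->gp_seq form a state field that is nonzero
 * only while a grace period is in progress, and the remaining bits count
 * grace periods.  This is also why rcu_state.gp_seq is initialized 300
 * grace periods short of counter wrap -- presumably so that the
 * wraparound paths get exercised soon after boot (an inference, not a
 * statement from the original source).
 */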
205
206 /*
207 * Return the number of callbacks queued on the specified CPU.
208 * Handles both the nocbs and normal cases.
209 */
210 static long rcu_get_n_cbs_cpu(int cpu)
211 {
212 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
213
214 if (rcu_segcblist_is_enabled(&rdp->cblist))
215 return rcu_segcblist_n_cbs(&rdp->cblist);
216 return 0;
217 }
218
219 void rcu_softirq_qs(void)
220 {
221 rcu_qs();
222 rcu_preempt_deferred_qs(current);
223 }
224
225 /*
226 * Record entry into an extended quiescent state. This is only to be
227 * called when not already in an extended quiescent state.
228 */
229 static void rcu_dynticks_eqs_enter(void)
230 {
231 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
232 int seq;
233
234 /*
235 * CPUs seeing atomic_add_return() must see prior RCU read-side
236 * critical sections, and we also must force ordering with the
237 * next idle sojourn.
238 */
239 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
240 /* Better be in an extended quiescent state! */
241 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
242 (seq & RCU_DYNTICK_CTRL_CTR));
243 /* Better not have special action (TLB flush) pending! */
244 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
245 (seq & RCU_DYNTICK_CTRL_MASK));
246 }
247
248 /*
249 * Record exit from an extended quiescent state. This is only to be
250 * called from an extended quiescent state.
251 */
252 static void rcu_dynticks_eqs_exit(void)
253 {
254 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
255 int seq;
256
257 /*
258 * CPUs seeing atomic_add_return() must see prior idle sojourns,
259 * and we also must force ordering with the next RCU read-side
260 * critical section.
261 */
262 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
263 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
264 !(seq & RCU_DYNTICK_CTRL_CTR));
265 if (seq & RCU_DYNTICK_CTRL_MASK) {
266 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
267 smp_mb__after_atomic(); /* _exit after clearing mask. */
268 /* Prefer duplicate flushes to losing a flush. */
269 rcu_eqs_special_exit();
270 }
271 }
272
273 /*
274 * Reset the current CPU's ->dynticks counter to indicate that the
275 * newly onlined CPU is no longer in an extended quiescent state.
276 * This will either leave the counter unchanged, or increment it
277 * to the next non-quiescent value.
278 *
279 * The non-atomic test/increment sequence works because the upper bits
280 * of the ->dynticks counter are manipulated only by the corresponding CPU,
281 * or when the corresponding CPU is offline.
282 */
283 static void rcu_dynticks_eqs_online(void)
284 {
285 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
286
287 if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
288 return;
289 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
290 }
291
292 /*
293 * Is the current CPU in an extended quiescent state?
294 *
295 * No ordering, as we are sampling CPU-local information.
296 */
297 bool rcu_dynticks_curr_cpu_in_eqs(void)
298 {
299 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
300
301 return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
302 }
303
304 /*
305 * Snapshot the ->dynticks counter with full ordering so as to allow
306 * stable comparison of this counter with past and future snapshots.
307 */
308 int rcu_dynticks_snap(struct rcu_data *rdp)
309 {
310 int snap = atomic_add_return(0, &rdp->dynticks);
311
312 return snap & ~RCU_DYNTICK_CTRL_MASK;
313 }
314
315 /*
316 * Return true if the snapshot returned from rcu_dynticks_snap()
317 * indicates that RCU is in an extended quiescent state.
318 */
319 static bool rcu_dynticks_in_eqs(int snap)
320 {
321 return !(snap & RCU_DYNTICK_CTRL_CTR);
322 }
323
324 /*
325 * Return true if the CPU corresponding to the specified rcu_data
326 * structure has spent some time in an extended quiescent state since
327 * rcu_dynticks_snap() returned the specified snapshot.
328 */
329 static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
330 {
331 return snap != rcu_dynticks_snap(rdp);
332 }
333
334 /*
335 * Set the special (bottom) bit of the specified CPU so that it
336 * will take special action (such as flushing its TLB) on the
337 * next exit from an extended quiescent state. Returns true if
338 * the bit was successfully set, or false if the CPU was not in
339 * an extended quiescent state.
340 */
341 bool rcu_eqs_special_set(int cpu)
342 {
343 int old;
344 int new;
345 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
346
347 do {
348 old = atomic_read(&rdp->dynticks);
349 if (old & RCU_DYNTICK_CTRL_CTR)
350 return false;
351 new = old | RCU_DYNTICK_CTRL_MASK;
352 } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old);
353 return true;
354 }
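/*
 * Hypothetical usage sketch, not taken from this file: a caller wanting a
 * remote CPU to flush its TLB, but preferring not to IPI an idle CPU,
 * might pair rcu_eqs_special_set() with rcu_eqs_special_exit() like so
 * (do_flush() is a made-up callback name):
 *
 *	if (!rcu_eqs_special_set(cpu))
 *		smp_call_function_single(cpu, do_flush, NULL, 1);
 *
 * If rcu_eqs_special_set() instead returns true, the CPU is idle and
 * rcu_eqs_special_exit() will perform the flush on its next EQS exit.
 */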
355
356 /*
357 * Let the RCU core know that this CPU has gone through the scheduler,
358 * which is a quiescent state. This is called when the need for a
359 * quiescent state is urgent, so we burn an atomic operation and full
360 * memory barriers to let the RCU core know about it, regardless of what
361 * this CPU might (or might not) do in the near future.
362 *
363 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
364 *
365 * The caller must have disabled interrupts and must not be idle.
366 */
367 static void __maybe_unused rcu_momentary_dyntick_idle(void)
368 {
369 int special;
370
371 raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
372 special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
373 &this_cpu_ptr(&rcu_data)->dynticks);
374 /* It is illegal to call this from idle state. */
375 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
376 rcu_preempt_deferred_qs(current);
377 }
378
379 /**
380 * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
381 *
382 * If the current CPU is idle and running at a first-level (not nested)
383 * interrupt from idle, return true. The caller must have at least
384 * disabled preemption.
385 */
386 static int rcu_is_cpu_rrupt_from_idle(void)
387 {
388 /* Called only from within the scheduling-clock interrupt */
389 lockdep_assert_in_irq();
390
391 /* Check for counter underflows */
392 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
393 "RCU dynticks_nesting counter underflow!");
394 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
395 "RCU dynticks_nmi_nesting counter underflow/zero!");
396
397 /* Are we at first interrupt nesting level? */
398 if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
399 return false;
400
401 /* Does CPU appear to be idle from an RCU standpoint? */
402 return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
403 }
404
405 #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
406 #define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
407 static long blimit = DEFAULT_RCU_BLIMIT;
408 #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
409 static long qhimark = DEFAULT_RCU_QHIMARK;
410 #define DEFAULT_RCU_QLOMARK 100   /* Once only this many are pending, use blimit. */
411 static long qlowmark = DEFAULT_RCU_QLOMARK;
412
413 module_param(blimit, long, 0444);
414 module_param(qhimark, long, 0444);
415 module_param(qlowmark, long, 0444);
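/*
 * Because of MODULE_PARAM_PREFIX above, these are set on the kernel
 * command line as, for example (illustrative values only):
 *
 *	rcutree.blimit=20 rcutree.qhimark=20000 rcutree.qlowmark=200
 *
 * The 0444 permissions make them read-only at runtime under
 * /sys/module/rcutree/parameters/.
 */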
416
417 static ulong jiffies_till_first_fqs = ULONG_MAX;
418 static ulong jiffies_till_next_fqs = ULONG_MAX;
419 static bool rcu_kick_kthreads;
420 static int rcu_divisor = 7;
421 module_param(rcu_divisor, int, 0644);
422
423 /* Force an exit from rcu_do_batch() after 3 milliseconds. */
424 static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
425 module_param(rcu_resched_ns, long, 0644);
426
427 /*
428 * How long the grace period must be before we start recruiting
429 * quiescent-state help from rcu_note_context_switch().
430 */
431 static ulong jiffies_till_sched_qs = ULONG_MAX;
432 module_param(jiffies_till_sched_qs, ulong, 0444);
433 static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
434 module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
435
436 /*
437 * Make sure that we give the grace-period kthread time to detect any
438 * idle CPUs before taking active measures to force quiescent states.
439 * However, don't go below 100 milliseconds, adjusted upwards for really
440 * large systems.
441 */
442 static void adjust_jiffies_till_sched_qs(void)
443 {
444 unsigned long j;
445
446 /* If jiffies_till_sched_qs was specified, respect the request. */
447 if (jiffies_till_sched_qs != ULONG_MAX) {
448 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
449 return;
450 }
451 /* Otherwise, set to third fqs scan, but bound below on large system. */
452 j = READ_ONCE(jiffies_till_first_fqs) +
453 2 * READ_ONCE(jiffies_till_next_fqs);
454 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
455 j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
456 pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
457 WRITE_ONCE(jiffies_to_sched_qs, j);
458 }
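/*
 * Worked example with hypothetical numbers: if jiffies_till_first_fqs is
 * 100 and jiffies_till_next_fqs is 100, then j = 100 + 2 * 100 = 300,
 * roughly the time of the third force-quiescent-state scan.  The
 * HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV lower bound matters only
 * when the FQS intervals are very short or the system is very large.
 */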
459
460 static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
461 {
462 ulong j;
463 int ret = kstrtoul(val, 0, &j);
464
465 if (!ret) {
466 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
467 adjust_jiffies_till_sched_qs();
468 }
469 return ret;
470 }
471
472 static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
473 {
474 ulong j;
475 int ret = kstrtoul(val, 0, &j);
476
477 if (!ret) {
478 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
479 adjust_jiffies_till_sched_qs();
480 }
481 return ret;
482 }
483
484 static struct kernel_param_ops first_fqs_jiffies_ops = {
485 .set = param_set_first_fqs_jiffies,
486 .get = param_get_ulong,
487 };
488
489 static struct kernel_param_ops next_fqs_jiffies_ops = {
490 .set = param_set_next_fqs_jiffies,
491 .get = param_get_ulong,
492 };
493
494 module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
495 module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
496 module_param(rcu_kick_kthreads, bool, 0644);
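/*
 * Unlike the 0444 parameters above, these 0644 parameters can be tuned at
 * runtime, for example (illustrative value; the param_set_*_fqs_jiffies()
 * handlers above clamp anything larger than HZ):
 *
 *	echo 50 > /sys/module/rcutree/parameters/jiffies_till_first_fqs
 */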
497
498 static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
499 static int rcu_pending(void);
500
501 /*
502 * Return the number of RCU GPs completed thus far for debug & stats.
503 */
504 unsigned long rcu_get_gp_seq(void)
505 {
506 return READ_ONCE(rcu_state.gp_seq);
507 }
508 EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
509
510 /*
511 * Return the number of RCU expedited batches completed thus far for
512 * debug & stats. Odd numbers mean that a batch is in progress, even
513 * numbers mean idle. The value returned will thus be roughly double
514 * the cumulative batches since boot.
515 */
516 unsigned long rcu_exp_batches_completed(void)
517 {
518 return rcu_state.expedited_sequence;
519 }
520 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
521
522 /*
523 * Return the root node of the rcu_state structure.
524 */
525 static struct rcu_node *rcu_get_root(void)
526 {
527 return &rcu_state.node[0];
528 }
529
530 /*
531 * Convert a ->gp_state value to a character string.
532 */
533 static const char *gp_state_getname(short gs)
534 {
535 if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
536 return "???";
537 return gp_state_names[gs];
538 }
539
540 /*
541 * Send along grace-period-related data for rcutorture diagnostics.
542 */
543 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
544 unsigned long *gp_seq)
545 {
546 switch (test_type) {
547 case RCU_FLAVOR:
548 *flags = READ_ONCE(rcu_state.gp_flags);
549 *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
550 break;
551 default:
552 break;
553 }
554 }
555 EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
556
557 /*
558 * Enter an RCU extended quiescent state, which can be either the
559 * idle loop or adaptive-tickless usermode execution.
560 *
561 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
562 * the possibility of usermode upcalls having messed up our count
563 * of interrupt nesting level during the prior busy period.
564 */
565 static void rcu_eqs_enter(bool user)
566 {
567 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
568
569 WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
570 WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
571 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
572 rdp->dynticks_nesting == 0);
573 if (rdp->dynticks_nesting != 1) {
574 rdp->dynticks_nesting--;
575 return;
576 }
577
578 lockdep_assert_irqs_disabled();
579 trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
580 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
581 rdp = this_cpu_ptr(&rcu_data);
582 rcu_prepare_for_idle();
583 rcu_preempt_deferred_qs(current);
584 WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
585 rcu_dynticks_eqs_enter();
586 rcu_dynticks_task_enter();
587 }
588
589 /**
590 * rcu_idle_enter - inform RCU that current CPU is entering idle
591 *
592 * Enter idle mode, in other words, -leave- the mode in which RCU
593 * read-side critical sections can occur. (Though RCU read-side
594 * critical sections can occur in irq handlers in idle, a possibility
595 * handled by irq_enter() and irq_exit().)
596 *
597 * If you add or remove a call to rcu_idle_enter(), be sure to test with
598 * CONFIG_RCU_EQS_DEBUG=y.
599 */
600 void rcu_idle_enter(void)
601 {
602 lockdep_assert_irqs_disabled();
603 rcu_eqs_enter(false);
604 }
605 EXPORT_SYMBOL_GPL(rcu_idle_enter);
606
607 #ifdef CONFIG_NO_HZ_FULL
608 /**
609 * rcu_user_enter - inform RCU that we are resuming userspace.
610 *
611 * Enter RCU idle mode right before resuming userspace. No use of RCU
612 * is permitted between this call and rcu_user_exit(). This way the
613 * CPU doesn't need to maintain the tick for RCU maintenance purposes
614 * when the CPU runs in userspace.
615 *
616 * If you add or remove a call to rcu_user_enter(), be sure to test with
617 * CONFIG_RCU_EQS_DEBUG=y.
618 */
619 void rcu_user_enter(void)
620 {
621 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
622
623 lockdep_assert_irqs_disabled();
624
625 instrumentation_begin();
626 do_nocb_deferred_wakeup(rdp);
627 instrumentation_end();
628
629 rcu_eqs_enter(true);
630 }
631 #endif /* CONFIG_NO_HZ_FULL */
632
633 /*
634 * If we are returning from the outermost NMI handler that interrupted an
635 * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
636 * to let the RCU grace-period handling know that the CPU is back to
637 * being RCU-idle.
638 *
639 * If you add or remove a call to rcu_nmi_exit_common(), be sure to test
640 * with CONFIG_RCU_EQS_DEBUG=y.
641 */
642 static __always_inline void rcu_nmi_exit_common(bool irq)
643 {
644 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
645
646 /*
647 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
648 * (We are exiting an NMI handler, so RCU better be paying attention
649 * to us!)
650 */
651 WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
652 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
653
654 /*
655 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
656 * leave it in non-RCU-idle state.
657 */
658 if (rdp->dynticks_nmi_nesting != 1) {
659 trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
660 atomic_read(&rdp->dynticks));
661 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
662 rdp->dynticks_nmi_nesting - 2);
663 return;
664 }
665
666 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
667 trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
668 WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
669
670 if (irq)
671 rcu_prepare_for_idle();
672
673 rcu_dynticks_eqs_enter();
674
675 if (irq)
676 rcu_dynticks_task_enter();
677 }
678
679 /**
680 * rcu_nmi_exit - inform RCU of exit from NMI context
681 *
682 * If you add or remove a call to rcu_nmi_exit(), be sure to test
683 * with CONFIG_RCU_EQS_DEBUG=y.
684 */
685 void rcu_nmi_exit(void)
686 {
687 rcu_nmi_exit_common(false);
688 }
689
690 /**
691 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
692 *
693 * Exit from an interrupt handler, which might possibly result in entering
694 * idle mode, in other words, leaving the mode in which read-side critical
695 * sections can occur. The caller must have disabled interrupts.
696 *
697 * This code assumes that the idle loop never does anything that might
698 * result in unbalanced calls to irq_enter() and irq_exit(). If your
699 * architecture's idle loop violates this assumption, RCU will give you what
700 * you deserve, good and hard. But very infrequently and irreproducibly.
701 *
702 * Use things like work queues to work around this limitation.
703 *
704 * You have been warned.
705 *
706 * If you add or remove a call to rcu_irq_exit(), be sure to test with
707 * CONFIG_RCU_EQS_DEBUG=y.
708 */
709 void rcu_irq_exit(void)
710 {
711 lockdep_assert_irqs_disabled();
712 rcu_nmi_exit_common(true);
713 }
714
715 /*
716 * Wrapper for rcu_irq_exit() where interrupts are enabled.
717 *
718 * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
719 * with CONFIG_RCU_EQS_DEBUG=y.
720 */
721 void rcu_irq_exit_irqson(void)
722 {
723 unsigned long flags;
724
725 local_irq_save(flags);
726 rcu_irq_exit();
727 local_irq_restore(flags);
728 }
729
730 /*
731 * Exit an RCU extended quiescent state, which can be either the
732 * idle loop or adaptive-tickless usermode execution.
733 *
734 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
735 * allow for the possibility of usermode upcalls messing up our count of
736 * interrupt nesting level during the busy period that is just now starting.
737 */
738 static void rcu_eqs_exit(bool user)
739 {
740 struct rcu_data *rdp;
741 long oldval;
742
743 lockdep_assert_irqs_disabled();
744 rdp = this_cpu_ptr(&rcu_data);
745 oldval = rdp->dynticks_nesting;
746 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
747 if (oldval) {
748 rdp->dynticks_nesting++;
749 return;
750 }
751 rcu_dynticks_task_exit();
752 rcu_dynticks_eqs_exit();
753 rcu_cleanup_after_idle();
754 trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
755 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
756 WRITE_ONCE(rdp->dynticks_nesting, 1);
757 WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
758 WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
759 }
760
761 /**
762 * rcu_idle_exit - inform RCU that current CPU is leaving idle
763 *
764 * Exit idle mode, in other words, -enter- the mode in which RCU
765 * read-side critical sections can occur.
766 *
767 * If you add or remove a call to rcu_idle_exit(), be sure to test with
768 * CONFIG_RCU_EQS_DEBUG=y.
769 */
770 void rcu_idle_exit(void)
771 {
772 unsigned long flags;
773
774 local_irq_save(flags);
775 rcu_eqs_exit(false);
776 local_irq_restore(flags);
777 }
778 EXPORT_SYMBOL_GPL(rcu_idle_exit);
779
780 #ifdef CONFIG_NO_HZ_FULL
781 /**
782 * rcu_user_exit - inform RCU that we are exiting userspace.
783 *
784 * Exit RCU idle mode while entering the kernel because it can
785  * run an RCU read-side critical section at any time.
786 *
787 * If you add or remove a call to rcu_user_exit(), be sure to test with
788 * CONFIG_RCU_EQS_DEBUG=y.
789 */
790 void rcu_user_exit(void)
791 {
792 rcu_eqs_exit(1);
793 }
794 #endif /* CONFIG_NO_HZ_FULL */
795
796 /**
797 * rcu_nmi_enter_common - inform RCU of entry to NMI context
798 * @irq: Is this call from rcu_irq_enter?
799 *
800 * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
801 * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
802 * that the CPU is active. This implementation permits nested NMIs, as
803 * long as the nesting level does not overflow an int. (You will probably
804 * run out of stack space first.)
805 *
806 * If you add or remove a call to rcu_nmi_enter_common(), be sure to test
807 * with CONFIG_RCU_EQS_DEBUG=y.
808 */
809 static __always_inline void rcu_nmi_enter_common(bool irq)
810 {
811 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
812 long incby = 2;
813
814 /* Complain about underflow. */
815 WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
816
817 /*
818 * If idle from RCU viewpoint, atomically increment ->dynticks
819 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
820 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
821 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
822 * to be in the outermost NMI handler that interrupted an RCU-idle
823 * period (observation due to Andy Lutomirski).
824 */
825 if (rcu_dynticks_curr_cpu_in_eqs()) {
826
827 if (irq)
828 rcu_dynticks_task_exit();
829
830 rcu_dynticks_eqs_exit();
831
832 if (irq)
833 rcu_cleanup_after_idle();
834
835 incby = 1;
836 }
837 trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
838 rdp->dynticks_nmi_nesting,
839 rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
840 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
841 rdp->dynticks_nmi_nesting + incby);
842 barrier();
843 }
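/*
 * Illustrative walk-through of the nesting counter (example values only):
 * starting from idle, ->dynticks_nmi_nesting is 0.  The first NMI or irq
 * adds 1 (counter = 1), a nested NMI adds 2 (counter = 3), and the
 * matching exits in rcu_nmi_exit_common() first subtract 2 (3 -> 1) and
 * finally reset the counter to 0 (1 -> 0).  A value of 1 therefore always
 * identifies the outermost handler that interrupted an RCU-idle period.
 */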
844
845 /**
846 * rcu_nmi_enter - inform RCU of entry to NMI context
847 */
848 void rcu_nmi_enter(void)
849 {
850 rcu_nmi_enter_common(false);
851 }
852 NOKPROBE_SYMBOL(rcu_nmi_enter);
853
854 /**
855 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
856 *
857 * Enter an interrupt handler, which might possibly result in exiting
858 * idle mode, in other words, entering the mode in which read-side critical
859 * sections can occur. The caller must have disabled interrupts.
860 *
861 * Note that the Linux kernel is fully capable of entering an interrupt
862 * handler that it never exits, for example when doing upcalls to user mode!
863 * This code assumes that the idle loop never does upcalls to user mode.
864 * If your architecture's idle loop does do upcalls to user mode (or does
865 * anything else that results in unbalanced calls to the irq_enter() and
866 * irq_exit() functions), RCU will give you what you deserve, good and hard.
867 * But very infrequently and irreproducibly.
868 *
869 * Use things like work queues to work around this limitation.
870 *
871 * You have been warned.
872 *
873 * If you add or remove a call to rcu_irq_enter(), be sure to test with
874 * CONFIG_RCU_EQS_DEBUG=y.
875 */
876 void rcu_irq_enter(void)
877 {
878 lockdep_assert_irqs_disabled();
879 rcu_nmi_enter_common(true);
880 }
881
882 /*
883 * Wrapper for rcu_irq_enter() where interrupts are enabled.
884 *
885 * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
886 * with CONFIG_RCU_EQS_DEBUG=y.
887 */
888 void rcu_irq_enter_irqson(void)
889 {
890 unsigned long flags;
891
892 local_irq_save(flags);
893 rcu_irq_enter();
894 local_irq_restore(flags);
895 }
896
897 /**
898 * rcu_is_watching - see if RCU thinks that the current CPU is not idle
899 *
900 * Return true if RCU is watching the running CPU, which means that this
901 * CPU can safely enter RCU read-side critical sections. In other words,
902 * if the current CPU is not in its idle loop or is in an interrupt or
903 * NMI handler, return true.
904 */
905 bool notrace rcu_is_watching(void)
906 {
907 bool ret;
908
909 preempt_disable_notrace();
910 ret = !rcu_dynticks_curr_cpu_in_eqs();
911 preempt_enable_notrace();
912 return ret;
913 }
914 EXPORT_SYMBOL_GPL(rcu_is_watching);
915
916 /*
917 * If a holdout task is actually running, request an urgent quiescent
918 * state from its CPU. This is unsynchronized, so migrations can cause
919 * the request to go to the wrong CPU. Which is OK, all that will happen
920 * is that the CPU's next context switch will be a bit slower and next
921 * time around this task will generate another request.
922 */
923 void rcu_request_urgent_qs_task(struct task_struct *t)
924 {
925 int cpu;
926
927 barrier();
928 cpu = task_cpu(t);
929 if (!task_curr(t))
930 return; /* This task is not running on that CPU. */
931 smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
932 }
933
934 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
935
936 /*
937 * Is the current CPU online as far as RCU is concerned?
938 *
939 * Disable preemption to avoid false positives that could otherwise
940 * happen due to the current CPU number being sampled, this task being
941 * preempted, its old CPU being taken offline, resuming on some other CPU,
942 * then determining that its old CPU is now offline.
943 *
944 * Disable checking if in an NMI handler because we cannot safely
945 * report errors from NMI handlers anyway. In addition, it is OK to use
946 * RCU on an offline processor during initial boot, hence the check for
947 * rcu_scheduler_fully_active.
948 */
949 bool rcu_lockdep_current_cpu_online(void)
950 {
951 struct rcu_data *rdp;
952 struct rcu_node *rnp;
953 bool ret = false;
954
955 if (in_nmi() || !rcu_scheduler_fully_active)
956 return true;
957 preempt_disable();
958 rdp = this_cpu_ptr(&rcu_data);
959 rnp = rdp->mynode;
960 if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
961 ret = true;
962 preempt_enable();
963 return ret;
964 }
965 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
966
967 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
968
969 /*
970 * We are reporting a quiescent state on behalf of some other CPU, so
971 * it is our responsibility to check for and handle potential overflow
972 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
973 * After all, the CPU might be in deep idle state, and thus executing no
974 * code whatsoever.
975 */
976 static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
977 {
978 raw_lockdep_assert_held_rcu_node(rnp);
979 if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
980 rnp->gp_seq))
981 WRITE_ONCE(rdp->gpwrap, true);
982 if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
983 rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
984 }
985
986 /*
987 * Snapshot the specified CPU's dynticks counter so that we can later
988 * credit them with an implicit quiescent state. Return 1 if this CPU
989 * is in dynticks idle mode, which is an extended quiescent state.
990 */
991 static int dyntick_save_progress_counter(struct rcu_data *rdp)
992 {
993 rdp->dynticks_snap = rcu_dynticks_snap(rdp);
994 if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
995 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
996 rcu_gpnum_ovf(rdp->mynode, rdp);
997 return 1;
998 }
999 return 0;
1000 }
1001
1002 /*
1003 * Return true if the specified CPU has passed through a quiescent
1004  * state by virtue of being in or having passed through a dynticks
1005 * idle state since the last call to dyntick_save_progress_counter()
1006 * for this same CPU, or by virtue of having been offline.
1007 */
1008 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
1009 {
1010 unsigned long jtsq;
1011 bool *rnhqp;
1012 bool *ruqp;
1013 struct rcu_node *rnp = rdp->mynode;
1014
1015 /*
1016 * If the CPU passed through or entered a dynticks idle phase with
1017 * no active irq/NMI handlers, then we can safely pretend that the CPU
1018 * already acknowledged the request to pass through a quiescent
1019 * state. Either way, that CPU cannot possibly be in an RCU
1020 * read-side critical section that started before the beginning
1021 * of the current RCU grace period.
1022 */
1023 if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
1024 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
1025 rcu_gpnum_ovf(rnp, rdp);
1026 return 1;
1027 }
1028
1029 /* If waiting too long on an offline CPU, complain. */
1030 if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) &&
1031 time_after(jiffies, rcu_state.gp_start + HZ)) {
1032 bool onl;
1033 struct rcu_node *rnp1;
1034
1035 WARN_ON(1); /* Offline CPUs are supposed to report QS! */
1036 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
1037 __func__, rnp->grplo, rnp->grphi, rnp->level,
1038 (long)rnp->gp_seq, (long)rnp->completedqs);
1039 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
1040 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
1041 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
1042 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
1043 pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
1044 __func__, rdp->cpu, ".o"[onl],
1045 (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
1046 (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
1047 return 1; /* Break things loose after complaining. */
1048 }
1049
1050 /*
1051 * A CPU running for an extended time within the kernel can
1052 * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
1053 * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
1054 * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the
1055 * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
1056 * variable are safe because the assignments are repeated if this
1057 * CPU failed to pass through a quiescent state. This code
1058 * also checks .jiffies_resched in case jiffies_to_sched_qs
1059 * is set way high.
1060 */
1061 jtsq = READ_ONCE(jiffies_to_sched_qs);
1062 ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
1063 rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
1064 if (!READ_ONCE(*rnhqp) &&
1065 (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
1066 time_after(jiffies, rcu_state.jiffies_resched))) {
1067 WRITE_ONCE(*rnhqp, true);
1068 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1069 smp_store_release(ruqp, true);
1070 } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
1071 WRITE_ONCE(*ruqp, true);
1072 }
1073
1074 /*
1075 * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
1076 * The above code handles this, but only for straight cond_resched().
1077 * And some in-kernel loops check need_resched() before calling
1078 * cond_resched(), which defeats the above code for CPUs that are
1079 * running in-kernel with scheduling-clock interrupts disabled.
1080 * So hit them over the head with the resched_cpu() hammer!
1081 */
1082 if (tick_nohz_full_cpu(rdp->cpu) &&
1083 time_after(jiffies,
1084 READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
1085 resched_cpu(rdp->cpu);
1086 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1087 }
1088
1089 /*
1090 * If more than halfway to RCU CPU stall-warning time, invoke
1091 * resched_cpu() more frequently to try to loosen things up a bit.
1092 * Also check to see if the CPU is getting hammered with interrupts,
1093 * but only once per grace period, just to keep the IPIs down to
1094 * a dull roar.
1095 */
1096 if (time_after(jiffies, rcu_state.jiffies_resched)) {
1097 if (time_after(jiffies,
1098 READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
1099 resched_cpu(rdp->cpu);
1100 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1101 }
1102 if (IS_ENABLED(CONFIG_IRQ_WORK) &&
1103 !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
1104 (rnp->ffmask & rdp->grpmask)) {
1105 init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
1106 rdp->rcu_iw_pending = true;
1107 rdp->rcu_iw_gp_seq = rnp->gp_seq;
1108 irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
1109 }
1110 }
1111
1112 return 0;
1113 }
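/*
 * Timeline sketch of the escalation above (jtsq == jiffies_to_sched_qs,
 * illustrative only): at grace-period age jtsq the CPU's .rcu_urgent_qs
 * flag is set; at age 2 * jtsq (or past .jiffies_resched) .rcu_need_heavy_qs
 * is set as well; a nohz_full CPU additionally gets resched_cpu() once
 * 3 * jtsq have elapsed since its last FQS-time reschedule; and past
 * .jiffies_resched every CPU gets resched_cpu() plus, at most once per
 * grace period, an irq_work poke via rcu_iw_handler().
 */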
1114
1115 /* Trace-event wrapper function for trace_rcu_future_grace_period. */
1116 static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1117 unsigned long gp_seq_req, const char *s)
1118 {
1119 trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req,
1120 rnp->level, rnp->grplo, rnp->grphi, s);
1121 }
1122
1123 /*
1124 * rcu_start_this_gp - Request the start of a particular grace period
1125 * @rnp_start: The leaf node of the CPU from which to start.
1126 * @rdp: The rcu_data corresponding to the CPU from which to start.
1127 * @gp_seq_req: The gp_seq of the grace period to start.
1128 *
1129 * Start the specified grace period, as needed to handle newly arrived
1130 * callbacks. The required future grace periods are recorded in each
1131 * rcu_node structure's ->gp_seq_needed field. Returns true if there
1132 * is reason to awaken the grace-period kthread.
1133 *
1134 * The caller must hold the specified rcu_node structure's ->lock, which
1135 * is why the caller is responsible for waking the grace-period kthread.
1136 *
1137 * Returns true if the GP thread needs to be awakened else false.
1138 */
1139 static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
1140 unsigned long gp_seq_req)
1141 {
1142 bool ret = false;
1143 struct rcu_node *rnp;
1144
1145 /*
1146 * Use funnel locking to either acquire the root rcu_node
1147 * structure's lock or bail out if the need for this grace period
1148 * has already been recorded -- or if that grace period has in
1149 * fact already started. If there is already a grace period in
1150 * progress in a non-leaf node, no recording is needed because the
1151 * end of the grace period will scan the leaf rcu_node structures.
1152 * Note that rnp_start->lock must not be released.
1153 */
1154 raw_lockdep_assert_held_rcu_node(rnp_start);
1155 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
1156 for (rnp = rnp_start; 1; rnp = rnp->parent) {
1157 if (rnp != rnp_start)
1158 raw_spin_lock_rcu_node(rnp);
1159 if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
1160 rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
1161 (rnp != rnp_start &&
1162 rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
1163 trace_rcu_this_gp(rnp, rdp, gp_seq_req,
1164 TPS("Prestarted"));
1165 goto unlock_out;
1166 }
1167 rnp->gp_seq_needed = gp_seq_req;
1168 if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
1169 /*
1170 * We just marked the leaf or internal node, and a
1171 * grace period is in progress, which means that
1172 * rcu_gp_cleanup() will see the marking. Bail to
1173 * reduce contention.
1174 */
1175 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
1176 TPS("Startedleaf"));
1177 goto unlock_out;
1178 }
1179 if (rnp != rnp_start && rnp->parent != NULL)
1180 raw_spin_unlock_rcu_node(rnp);
1181 if (!rnp->parent)
1182 break; /* At root, and perhaps also leaf. */
1183 }
1184
1185 /* If GP already in progress, just leave, otherwise start one. */
1186 if (rcu_gp_in_progress()) {
1187 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
1188 goto unlock_out;
1189 }
1190 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
1191 WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
1192 rcu_state.gp_req_activity = jiffies;
1193 if (!rcu_state.gp_kthread) {
1194 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
1195 goto unlock_out;
1196 }
1197 trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq"));
1198 ret = true; /* Caller must wake GP kthread. */
1199 unlock_out:
1200 /* Push furthest requested GP to leaf node and rcu_data structure. */
1201 if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
1202 rnp_start->gp_seq_needed = rnp->gp_seq_needed;
1203 rdp->gp_seq_needed = rnp->gp_seq_needed;
1204 }
1205 if (rnp != rnp_start)
1206 raw_spin_unlock_rcu_node(rnp);
1207 return ret;
1208 }
1209
1210 /*
1211 * Clean up any old requests for the just-ended grace period. Also return
1212 * whether any additional grace periods have been requested.
1213 */
1214 static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
1215 {
1216 bool needmore;
1217 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1218
1219 needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
1220 if (!needmore)
1221 rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
1222 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
1223 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1224 return needmore;
1225 }
1226
1227 /*
1228 * Awaken the grace-period kthread. Don't do a self-awaken (unless in
1229 * an interrupt or softirq handler), and don't bother awakening when there
1230 * is nothing for the grace-period kthread to do (as in several CPUs raced
1231 * to awaken, and we lost), and finally don't try to awaken a kthread that
1232 * has not yet been created. If all those checks are passed, track some
1233 * debug information and awaken.
1234 *
1235 * So why do the self-wakeup when in an interrupt or softirq handler
1236 * in the grace-period kthread's context? Because the kthread might have
1237 * been interrupted just as it was going to sleep, and just after the final
1238 * pre-sleep check of the awaken condition. In this case, a wakeup really
1239 * is required, and is therefore supplied.
1240 */
1241 static void rcu_gp_kthread_wake(void)
1242 {
1243 if ((current == rcu_state.gp_kthread &&
1244 !in_irq() && !in_serving_softirq()) ||
1245 !READ_ONCE(rcu_state.gp_flags) ||
1246 !rcu_state.gp_kthread)
1247 return;
1248 WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
1249 WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
1250 swake_up_one(&rcu_state.gp_wq);
1251 }
1252
1253 /*
1254 * If there is room, assign a ->gp_seq number to any callbacks on this
1255 * CPU that have not already been assigned. Also accelerate any callbacks
1256 * that were previously assigned a ->gp_seq number that has since proven
1257 * to be too conservative, which can happen if callbacks get assigned a
1258 * ->gp_seq number while RCU is idle, but with reference to a non-root
1259 * rcu_node structure. This function is idempotent, so it does not hurt
1260  * to call it repeatedly. Returns a flag saying that we should awaken
1261 * the RCU grace-period kthread.
1262 *
1263 * The caller must hold rnp->lock with interrupts disabled.
1264 */
1265 static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1266 {
1267 unsigned long gp_seq_req;
1268 bool ret = false;
1269
1270 rcu_lockdep_assert_cblist_protected(rdp);
1271 raw_lockdep_assert_held_rcu_node(rnp);
1272
1273 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1274 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1275 return false;
1276
1277 /*
1278 * Callbacks are often registered with incomplete grace-period
1279 * information. Something about the fact that getting exact
1280 * information requires acquiring a global lock... RCU therefore
1281 * makes a conservative estimate of the grace period number at which
1282 * a given callback will become ready to invoke. The following
1283 * code checks this estimate and improves it when possible, thus
1284 * accelerating callback invocation to an earlier grace-period
1285 * number.
1286 */
1287 gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
1288 if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
1289 ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
1290
1291 /* Trace depending on how much we were able to accelerate. */
1292 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
1293 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB"));
1294 else
1295 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB"));
1296 return ret;
1297 }
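/*
 * Roughly speaking (see rcu_seq_snap() in rcu.h), gp_seq_req above is the
 * ->gp_seq value marking the end of the earliest grace period that is
 * guaranteed to be a full grace period from the caller's point of view:
 * the next one if none is in progress, otherwise the one after the
 * current one.  Callbacks accelerated to that number are safe to invoke
 * once ->gp_seq reaches it.
 */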
1298
1299 /*
1300 * Similar to rcu_accelerate_cbs(), but does not require that the leaf
1301 * rcu_node structure's ->lock be held. It consults the cached value
1302 * of ->gp_seq_needed in the rcu_data structure, and if that indicates
1303 * that a new grace-period request be made, invokes rcu_accelerate_cbs()
1304 * while holding the leaf rcu_node structure's ->lock.
1305 */
1306 static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
1307 struct rcu_data *rdp)
1308 {
1309 unsigned long c;
1310 bool needwake;
1311
1312 rcu_lockdep_assert_cblist_protected(rdp);
1313 c = rcu_seq_snap(&rcu_state.gp_seq);
1314 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
1315 /* Old request still live, so mark recent callbacks. */
1316 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1317 return;
1318 }
1319 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1320 needwake = rcu_accelerate_cbs(rnp, rdp);
1321 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1322 if (needwake)
1323 rcu_gp_kthread_wake();
1324 }
1325
1326 /*
1327 * Move any callbacks whose grace period has completed to the
1328 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1329 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
1330 * sublist. This function is idempotent, so it does not hurt to
1331 * invoke it repeatedly. As long as it is not invoked -too- often...
1332 * Returns true if the RCU grace-period kthread needs to be awakened.
1333 *
1334 * The caller must hold rnp->lock with interrupts disabled.
1335 */
1336 static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1337 {
1338 rcu_lockdep_assert_cblist_protected(rdp);
1339 raw_lockdep_assert_held_rcu_node(rnp);
1340
1341 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1342 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1343 return false;
1344
1345 /*
1346 * Find all callbacks whose ->gp_seq numbers indicate that they
1347 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1348 */
1349 rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
1350
1351 /* Classify any remaining callbacks. */
1352 return rcu_accelerate_cbs(rnp, rdp);
1353 }
1354
1355 /*
1356 * Move and classify callbacks, but only if doing so won't require
1357 * that the RCU grace-period kthread be awakened.
1358 */
1359 static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
1360 struct rcu_data *rdp)
1361 {
1362 rcu_lockdep_assert_cblist_protected(rdp);
1363 if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
1364 return;
1365 // The grace period cannot end while we hold the rcu_node lock.
1366 if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
1367 WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
1368 raw_spin_unlock_rcu_node(rnp);
1369 }
1370
1371 /*
1372 * Update CPU-local rcu_data state to record the beginnings and ends of
1373 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1374 * structure corresponding to the current CPU, and must have irqs disabled.
1375 * Returns true if the grace-period kthread needs to be awakened.
1376 */
1377 static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
1378 {
1379 bool ret = false;
1380 bool need_gp;
1381 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1382 rcu_segcblist_is_offloaded(&rdp->cblist);
1383
1384 raw_lockdep_assert_held_rcu_node(rnp);
1385
1386 if (rdp->gp_seq == rnp->gp_seq)
1387 return false; /* Nothing to do. */
1388
1389 /* Handle the ends of any preceding grace periods first. */
1390 if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
1391 unlikely(READ_ONCE(rdp->gpwrap))) {
1392 if (!offloaded)
1393 ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
1394 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
1395 } else {
1396 if (!offloaded)
1397 ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
1398 }
1399
1400 /* Now handle the beginnings of any new-to-this-CPU grace periods. */
1401 if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
1402 unlikely(READ_ONCE(rdp->gpwrap))) {
1403 /*
1404 * If the current grace period is waiting for this CPU,
1405 * set up to detect a quiescent state, otherwise don't
1406 * go looking for one.
1407 */
1408 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
1409 need_gp = !!(rnp->qsmask & rdp->grpmask);
1410 rdp->cpu_no_qs.b.norm = need_gp;
1411 rdp->core_needs_qs = need_gp;
1412 zero_cpu_stall_ticks(rdp);
1413 }
1414 rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
1415 if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
1416 rdp->gp_seq_needed = rnp->gp_seq_needed;
1417 WRITE_ONCE(rdp->gpwrap, false);
1418 rcu_gpnum_ovf(rnp, rdp);
1419 return ret;
1420 }
1421
1422 static void note_gp_changes(struct rcu_data *rdp)
1423 {
1424 unsigned long flags;
1425 bool needwake;
1426 struct rcu_node *rnp;
1427
1428 local_irq_save(flags);
1429 rnp = rdp->mynode;
1430 if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
1431 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
1432 !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
1433 local_irq_restore(flags);
1434 return;
1435 }
1436 needwake = __note_gp_changes(rnp, rdp);
1437 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1438 if (needwake)
1439 rcu_gp_kthread_wake();
1440 }
1441
1442 static void rcu_gp_slow(int delay)
1443 {
1444 if (delay > 0 &&
1445 !(rcu_seq_ctr(rcu_state.gp_seq) %
1446 (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
1447 schedule_timeout_uninterruptible(delay);
1448 }
1449
1450 /*
1451 * Initialize a new grace period. Return false if no grace period required.
1452 */
1453 static bool rcu_gp_init(void)
1454 {
1455 unsigned long flags;
1456 unsigned long oldmask;
1457 unsigned long mask;
1458 struct rcu_data *rdp;
1459 struct rcu_node *rnp = rcu_get_root();
1460
1461 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1462 raw_spin_lock_irq_rcu_node(rnp);
1463 if (!READ_ONCE(rcu_state.gp_flags)) {
1464 /* Spurious wakeup, tell caller to go back to sleep. */
1465 raw_spin_unlock_irq_rcu_node(rnp);
1466 return false;
1467 }
1468 WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
1469
1470 if (WARN_ON_ONCE(rcu_gp_in_progress())) {
1471 /*
1472 * Grace period already in progress, don't start another.
1473 * Not supposed to be able to happen.
1474 */
1475 raw_spin_unlock_irq_rcu_node(rnp);
1476 return false;
1477 }
1478
1479 /* Advance to a new grace period and initialize state. */
1480 record_gp_stall_check_time();
1481 /* Record GP times before starting GP, hence rcu_seq_start(). */
1482 rcu_seq_start(&rcu_state.gp_seq);
1483 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
1484 raw_spin_unlock_irq_rcu_node(rnp);
1485
1486 /*
1487 * Apply per-leaf buffered online and offline operations to the
1488 * rcu_node tree. Note that this new grace period need not wait
1489 * for subsequent online CPUs, and that quiescent-state forcing
1490 * will handle subsequent offline CPUs.
1491 */
1492 rcu_state.gp_state = RCU_GP_ONOFF;
1493 rcu_for_each_leaf_node(rnp) {
1494 raw_spin_lock(&rcu_state.ofl_lock);
1495 raw_spin_lock_irq_rcu_node(rnp);
1496 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1497 !rnp->wait_blkd_tasks) {
1498 /* Nothing to do on this leaf rcu_node structure. */
1499 raw_spin_unlock_irq_rcu_node(rnp);
1500 raw_spin_unlock(&rcu_state.ofl_lock);
1501 continue;
1502 }
1503
1504 /* Record old state, apply changes to ->qsmaskinit field. */
1505 oldmask = rnp->qsmaskinit;
1506 rnp->qsmaskinit = rnp->qsmaskinitnext;
1507
1508 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1509 if (!oldmask != !rnp->qsmaskinit) {
1510 if (!oldmask) { /* First online CPU for rcu_node. */
1511 if (!rnp->wait_blkd_tasks) /* Ever offline? */
1512 rcu_init_new_rnp(rnp);
1513 } else if (rcu_preempt_has_tasks(rnp)) {
1514 rnp->wait_blkd_tasks = true; /* blocked tasks */
1515 } else { /* Last offline CPU and can propagate. */
1516 rcu_cleanup_dead_rnp(rnp);
1517 }
1518 }
1519
1520 /*
1521 * If all waited-on tasks from prior grace period are
1522 * done, and if all this rcu_node structure's CPUs are
1523 * still offline, propagate up the rcu_node tree and
1524 * clear ->wait_blkd_tasks. Otherwise, if one of this
1525 * rcu_node structure's CPUs has since come back online,
1526 * simply clear ->wait_blkd_tasks.
1527 */
1528 if (rnp->wait_blkd_tasks &&
1529 (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
1530 rnp->wait_blkd_tasks = false;
1531 if (!rnp->qsmaskinit)
1532 rcu_cleanup_dead_rnp(rnp);
1533 }
1534
1535 raw_spin_unlock_irq_rcu_node(rnp);
1536 raw_spin_unlock(&rcu_state.ofl_lock);
1537 }
1538 rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
1539
1540 /*
1541 * Set the quiescent-state-needed bits in all the rcu_node
1542 * structures for all currently online CPUs in breadth-first
1543 * order, starting from the root rcu_node structure, relying on the
1544 * layout of the tree within the rcu_state.node[] array. Note that
1545 * other CPUs will access only the leaves of the hierarchy, thus
1546 * seeing that no grace period is in progress, at least until the
1547 * corresponding leaf node has been initialized.
1548 *
1549 * The grace period cannot complete until the initialization
1550 * process finishes, because this kthread handles both.
1551 */
1552 rcu_state.gp_state = RCU_GP_INIT;
1553 rcu_for_each_node_breadth_first(rnp) {
1554 rcu_gp_slow(gp_init_delay);
1555 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1556 rdp = this_cpu_ptr(&rcu_data);
1557 rcu_preempt_check_blocked_tasks(rnp);
1558 rnp->qsmask = rnp->qsmaskinit;
1559 WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
1560 if (rnp == rdp->mynode)
1561 (void)__note_gp_changes(rnp, rdp);
1562 rcu_preempt_boost_start_gp(rnp);
1563 trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
1564 rnp->level, rnp->grplo,
1565 rnp->grphi, rnp->qsmask);
1566 /* Quiescent states for tasks on any now-offline CPUs. */
1567 mask = rnp->qsmask & ~rnp->qsmaskinitnext;
1568 rnp->rcu_gp_init_mask = mask;
1569 if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
1570 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
1571 else
1572 raw_spin_unlock_irq_rcu_node(rnp);
1573 cond_resched_tasks_rcu_qs();
1574 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1575 }
1576
1577 return true;
1578 }
1579
1580 /*
1581 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1582 * time.
1583 */
1584 static bool rcu_gp_fqs_check_wake(int *gfp)
1585 {
1586 struct rcu_node *rnp = rcu_get_root();
1587
1588 /* Someone like call_rcu() requested a force-quiescent-state scan. */
1589 *gfp = READ_ONCE(rcu_state.gp_flags);
1590 if (*gfp & RCU_GP_FLAG_FQS)
1591 return true;
1592
1593 /* The current grace period has completed. */
1594 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1595 return true;
1596
1597 return false;
1598 }
1599
1600 /*
1601 * Do one round of quiescent-state forcing.
1602 */
1603 static void rcu_gp_fqs(bool first_time)
1604 {
1605 struct rcu_node *rnp = rcu_get_root();
1606
1607 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1608 WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
1609 if (first_time) {
1610 /* Collect dyntick-idle snapshots. */
1611 force_qs_rnp(dyntick_save_progress_counter);
1612 } else {
1613 /* Handle dyntick-idle and offline CPUs. */
1614 force_qs_rnp(rcu_implicit_dynticks_qs);
1615 }
1616 /* Clear flag to prevent immediate re-entry. */
1617 if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
1618 raw_spin_lock_irq_rcu_node(rnp);
1619 WRITE_ONCE(rcu_state.gp_flags,
1620 READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
1621 raw_spin_unlock_irq_rcu_node(rnp);
1622 }
1623 }
1624
1625 /*
1626 * Loop doing repeated quiescent-state forcing until the grace period ends.
1627 */
1628 static void rcu_gp_fqs_loop(void)
1629 {
1630 bool first_gp_fqs;
1631 int gf;
1632 unsigned long j;
1633 int ret;
1634 struct rcu_node *rnp = rcu_get_root();
1635
1636 first_gp_fqs = true;
1637 j = READ_ONCE(jiffies_till_first_fqs);
1638 ret = 0;
1639 for (;;) {
1640 if (!ret) {
1641 rcu_state.jiffies_force_qs = jiffies + j;
1642 WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
1643 jiffies + (j ? 3 * j : 2));
1644 }
1645 trace_rcu_grace_period(rcu_state.name,
1646 READ_ONCE(rcu_state.gp_seq),
1647 TPS("fqswait"));
1648 rcu_state.gp_state = RCU_GP_WAIT_FQS;
1649 ret = swait_event_idle_timeout_exclusive(
1650 rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
1651 rcu_state.gp_state = RCU_GP_DOING_FQS;
1652 /* Locking provides needed memory barriers. */
1653 /* If grace period done, leave loop. */
1654 if (!READ_ONCE(rnp->qsmask) &&
1655 !rcu_preempt_blocked_readers_cgp(rnp))
1656 break;
1657 /* If time for quiescent-state forcing, do it. */
1658 if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
1659 (gf & RCU_GP_FLAG_FQS)) {
1660 trace_rcu_grace_period(rcu_state.name,
1661 READ_ONCE(rcu_state.gp_seq),
1662 TPS("fqsstart"));
1663 rcu_gp_fqs(first_gp_fqs);
1664 first_gp_fqs = false;
1665 trace_rcu_grace_period(rcu_state.name,
1666 READ_ONCE(rcu_state.gp_seq),
1667 TPS("fqsend"));
1668 cond_resched_tasks_rcu_qs();
1669 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1670 ret = 0; /* Force full wait till next FQS. */
1671 j = READ_ONCE(jiffies_till_next_fqs);
1672 } else {
1673 /* Deal with stray signal. */
1674 cond_resched_tasks_rcu_qs();
1675 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1676 WARN_ON(signal_pending(current));
1677 trace_rcu_grace_period(rcu_state.name,
1678 READ_ONCE(rcu_state.gp_seq),
1679 TPS("fqswaitsig"));
1680 ret = 1; /* Keep old FQS timing. */
1681 j = jiffies;
1682 if (time_after(jiffies, rcu_state.jiffies_force_qs))
1683 j = 1;
1684 else
1685 j = rcu_state.jiffies_force_qs - j;
1686 }
1687 }
1688 }
1689
1690 /*
1691 * Clean up after the old grace period.
1692 */
1693 static void rcu_gp_cleanup(void)
1694 {
1695 unsigned long gp_duration;
1696 bool needgp = false;
1697 unsigned long new_gp_seq;
1698 bool offloaded;
1699 struct rcu_data *rdp;
1700 struct rcu_node *rnp = rcu_get_root();
1701 struct swait_queue_head *sq;
1702
1703 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1704 raw_spin_lock_irq_rcu_node(rnp);
1705 rcu_state.gp_end = jiffies;
1706 gp_duration = rcu_state.gp_end - rcu_state.gp_start;
1707 if (gp_duration > rcu_state.gp_max)
1708 rcu_state.gp_max = gp_duration;
1709
1710 /*
1711 * We know the grace period is complete, but to everyone else
1712 * it appears to still be ongoing. But it is also the case
1713 * that to everyone else it looks like there is nothing that
1714 * they can do to advance the grace period. It is therefore
1715 * safe for us to drop the lock in order to mark the grace
1716 * period as completed in all of the rcu_node structures.
1717 */
1718 raw_spin_unlock_irq_rcu_node(rnp);
1719
1720 /*
1721 * Propagate new ->gp_seq value to rcu_node structures so that
1722 * other CPUs don't have to wait until the start of the next grace
1723 * period to process their callbacks. This also avoids some nasty
1724 * RCU grace-period initialization races by forcing the end of
1725 * the current grace period to be completely recorded in all of
1726 * the rcu_node structures before the beginning of the next grace
1727 * period is recorded in any of the rcu_node structures.
1728 */
1729 new_gp_seq = rcu_state.gp_seq;
1730 rcu_seq_end(&new_gp_seq);
1731 rcu_for_each_node_breadth_first(rnp) {
1732 raw_spin_lock_irq_rcu_node(rnp);
1733 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
1734 dump_blkd_tasks(rnp, 10);
1735 WARN_ON_ONCE(rnp->qsmask);
1736 WRITE_ONCE(rnp->gp_seq, new_gp_seq);
1737 rdp = this_cpu_ptr(&rcu_data);
1738 if (rnp == rdp->mynode)
1739 needgp = __note_gp_changes(rnp, rdp) || needgp;
1740 /* smp_mb() provided by prior unlock-lock pair. */
1741 needgp = rcu_future_gp_cleanup(rnp) || needgp;
1742 sq = rcu_nocb_gp_get(rnp);
1743 raw_spin_unlock_irq_rcu_node(rnp);
1744 rcu_nocb_gp_cleanup(sq);
1745 cond_resched_tasks_rcu_qs();
1746 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1747 rcu_gp_slow(gp_cleanup_delay);
1748 }
1749 rnp = rcu_get_root();
1750 raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
1751
1752 /* Declare grace period done, trace first to use old GP number. */
1753 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
1754 rcu_seq_end(&rcu_state.gp_seq);
1755 rcu_state.gp_state = RCU_GP_IDLE;
1756 /* Check for GP requests since above loop. */
1757 rdp = this_cpu_ptr(&rcu_data);
1758 if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
1759 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
1760 TPS("CleanupMore"));
1761 needgp = true;
1762 }
1763 /* Advance CBs to reduce false positives below. */
1764 offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1765 rcu_segcblist_is_offloaded(&rdp->cblist);
1766 if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
1767 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
1768 rcu_state.gp_req_activity = jiffies;
1769 trace_rcu_grace_period(rcu_state.name,
1770 READ_ONCE(rcu_state.gp_seq),
1771 TPS("newreq"));
1772 } else {
1773 WRITE_ONCE(rcu_state.gp_flags,
1774 rcu_state.gp_flags & RCU_GP_FLAG_INIT);
1775 }
1776 raw_spin_unlock_irq_rcu_node(rnp);
1777 }
1778
1779 /*
1780 * Body of kthread that handles grace periods.
1781 */
1782 static int __noreturn rcu_gp_kthread(void *unused)
1783 {
1784 rcu_bind_gp_kthread();
1785 for (;;) {
1786
1787 /* Handle grace-period start. */
1788 for (;;) {
1789 trace_rcu_grace_period(rcu_state.name,
1790 READ_ONCE(rcu_state.gp_seq),
1791 TPS("reqwait"));
1792 rcu_state.gp_state = RCU_GP_WAIT_GPS;
1793 swait_event_idle_exclusive(rcu_state.gp_wq,
1794 READ_ONCE(rcu_state.gp_flags) &
1795 RCU_GP_FLAG_INIT);
1796 rcu_state.gp_state = RCU_GP_DONE_GPS;
1797 /* Locking provides needed memory barrier. */
1798 if (rcu_gp_init())
1799 break;
1800 cond_resched_tasks_rcu_qs();
1801 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1802 WARN_ON(signal_pending(current));
1803 trace_rcu_grace_period(rcu_state.name,
1804 READ_ONCE(rcu_state.gp_seq),
1805 TPS("reqwaitsig"));
1806 }
1807
1808 /* Handle quiescent-state forcing. */
1809 rcu_gp_fqs_loop();
1810
1811 /* Handle grace-period end. */
1812 rcu_state.gp_state = RCU_GP_CLEANUP;
1813 rcu_gp_cleanup();
1814 rcu_state.gp_state = RCU_GP_CLEANED;
1815 }
1816 }
1817
1818 /*
1819 * Report a full set of quiescent states to the rcu_state data structure.
1820 * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
1821 * another grace period is required. Whether we wake the grace-period
1822 * kthread or it awakens itself for the next round of quiescent-state
1823 * forcing, that kthread will clean up after the just-completed grace
1824 * period. Note that the caller must hold rnp->lock, which is released
1825 * before return.
1826 */
1827 static void rcu_report_qs_rsp(unsigned long flags)
1828 __releases(rcu_get_root()->lock)
1829 {
1830 raw_lockdep_assert_held_rcu_node(rcu_get_root());
1831 WARN_ON_ONCE(!rcu_gp_in_progress());
1832 WRITE_ONCE(rcu_state.gp_flags,
1833 READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
1834 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
1835 rcu_gp_kthread_wake();
1836 }
1837
1838 /*
1839 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
1840 * Allows quiescent states for a group of CPUs to be reported at one go
1841 * to the specified rcu_node structure, though all the CPUs in the group
1842 * must be represented by the same rcu_node structure (which need not be a
1843 * leaf rcu_node structure, though it often will be). The gps parameter
1844 * is the grace-period snapshot, which means that the quiescent states
1845 * are valid only if rnp->gp_seq is equal to gps. That structure's lock
1846 * must be held upon entry, and it is released before return.
1847 *
1848 * As a special case, if mask is zero, the bit-already-cleared check is
1849 * disabled. This allows propagating quiescent state due to resumed tasks
1850 * during grace-period initialization.
1851 */
1852 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
1853 unsigned long gps, unsigned long flags)
1854 __releases(rnp->lock)
1855 {
1856 unsigned long oldmask = 0;
1857 struct rcu_node *rnp_c;
1858
1859 raw_lockdep_assert_held_rcu_node(rnp);
1860
1861 /* Walk up the rcu_node hierarchy. */
1862 for (;;) {
1863 if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
1864
1865 /*
1866 * Our bit has already been cleared, or the
1867 * relevant grace period is already over, so done.
1868 */
1869 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1870 return;
1871 }
1872 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
1873 WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
1874 rcu_preempt_blocked_readers_cgp(rnp));
1875 rnp->qsmask &= ~mask;
1876 trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
1877 mask, rnp->qsmask, rnp->level,
1878 rnp->grplo, rnp->grphi,
1879 !!rnp->gp_tasks);
1880 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
1881
1882 /* Other bits still set at this level, so done. */
1883 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1884 return;
1885 }
1886 rnp->completedqs = rnp->gp_seq;
1887 mask = rnp->grpmask;
1888 if (rnp->parent == NULL) {
1889
1890 /* No more levels. Exit loop holding root lock. */
1891
1892 break;
1893 }
1894 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1895 rnp_c = rnp;
1896 rnp = rnp->parent;
1897 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1898 oldmask = rnp_c->qsmask;
1899 }
1900
1901 /*
1902 * Get here if we are the last CPU to pass through a quiescent
1903 * state for this grace period. Invoke rcu_report_qs_rsp()
1904 * to clean up and start the next grace period if one is needed.
1905 */
1906 rcu_report_qs_rsp(flags); /* releases rnp->lock. */
1907 }
1908
1909 /*
1910 * Record a quiescent state for all tasks that were previously queued
1911 * on the specified rcu_node structure and that were blocking the current
1912 * RCU grace period. The caller must hold the corresponding rnp->lock with
1913 * irqs disabled, and this lock is released upon return, but irqs remain
1914 * disabled.
1915 */
1916 static void __maybe_unused
1917 rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
1918 __releases(rnp->lock)
1919 {
1920 unsigned long gps;
1921 unsigned long mask;
1922 struct rcu_node *rnp_p;
1923
1924 raw_lockdep_assert_held_rcu_node(rnp);
1925 if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
1926 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
1927 rnp->qsmask != 0) {
1928 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1929 return; /* Still need more quiescent states! */
1930 }
1931
1932 rnp->completedqs = rnp->gp_seq;
1933 rnp_p = rnp->parent;
1934 if (rnp_p == NULL) {
1935 /*
1936 * Only one rcu_node structure in the tree, so don't
1937 * try to report up to its nonexistent parent!
1938 */
1939 rcu_report_qs_rsp(flags);
1940 return;
1941 }
1942
1943 /* Report up the rest of the hierarchy, tracking current ->gp_seq. */
1944 gps = rnp->gp_seq;
1945 mask = rnp->grpmask;
1946 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1947 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
1948 rcu_report_qs_rnp(mask, rnp_p, gps, flags);
1949 }
1950
1951 /*
1952 * Record a quiescent state for the specified CPU to that CPU's rcu_data
1953 * structure. This must be called from the specified CPU.
1954 */
1955 static void
1956 rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
1957 {
1958 unsigned long flags;
1959 unsigned long mask;
1960 bool needwake = false;
1961 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1962 rcu_segcblist_is_offloaded(&rdp->cblist);
1963 struct rcu_node *rnp;
1964
1965 rnp = rdp->mynode;
1966 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1967 if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
1968 rdp->gpwrap) {
1969
1970 /*
1971 * The grace period in which this quiescent state was
1972 * recorded has ended, so don't report it upwards.
1973 * We will instead need a new quiescent state that lies
1974 * within the current grace period.
1975 */
1976 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
1977 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1978 return;
1979 }
1980 mask = rdp->grpmask;
1981 rdp->core_needs_qs = false;
1982 if ((rnp->qsmask & mask) == 0) {
1983 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1984 } else {
1985 /*
1986 * This GP can't end until cpu checks in, so all of our
1987 * callbacks can be processed during the next GP.
1988 */
1989 if (!offloaded)
1990 needwake = rcu_accelerate_cbs(rnp, rdp);
1991
1992 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
1993 /* ^^^ Released rnp->lock */
1994 if (needwake)
1995 rcu_gp_kthread_wake();
1996 }
1997 }
1998
1999 /*
2000 * Check to see if there is a new grace period of which this CPU
2001 * is not yet aware, and if so, set up local rcu_data state for it.
2002 * Otherwise, see if this CPU has just passed through its first
2003 * quiescent state for this grace period, and record that fact if so.
2004 */
2005 static void
2006 rcu_check_quiescent_state(struct rcu_data *rdp)
2007 {
2008 /* Check for grace-period ends and beginnings. */
2009 note_gp_changes(rdp);
2010
2011 /*
2012 * Does this CPU still need to do its part for current grace period?
2013 * If no, return and let the other CPUs do their part as well.
2014 */
2015 if (!rdp->core_needs_qs)
2016 return;
2017
2018 /*
2019 * Was there a quiescent state since the beginning of the grace
2020 * period? If no, then exit and wait for the next call.
2021 */
2022 if (rdp->cpu_no_qs.b.norm)
2023 return;
2024
2025 /*
2026 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
2027 * judge of that).
2028 */
2029 rcu_report_qs_rdp(rdp->cpu, rdp);
2030 }
2031
2032 /*
2033 * Near the end of the offline process. Trace the fact that this CPU
2034 * is going offline.
2035 */
2036 int rcutree_dying_cpu(unsigned int cpu)
2037 {
2038 bool blkd;
2039 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
2040 struct rcu_node *rnp = rdp->mynode;
2041
2042 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2043 return 0;
2044
2045 blkd = !!(rnp->qsmask & rdp->grpmask);
2046 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
2047 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
2048 return 0;
2049 }
2050
2051 /*
2052 * All CPUs for the specified rcu_node structure have gone offline,
2053 * and all tasks that were preempted within an RCU read-side critical
2054 * section while running on one of those CPUs have since exited their RCU
2055 * read-side critical section. Some other CPU is reporting this fact with
2056 * the specified rcu_node structure's ->lock held and interrupts disabled.
2057 * This function therefore goes up the tree of rcu_node structures,
2058 * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2059 * the leaf rcu_node structure's ->qsmaskinit field has already been
2060 * updated.
2061 *
2062 * This function does check that the specified rcu_node structure has
2063 * all CPUs offline and no blocked tasks, so it is OK to invoke it
2064 * prematurely. That said, invoking it after the fact will cost you
2065 * a needless lock acquisition. So once it has done its work, don't
2066 * invoke it again.
2067 */
2068 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2069 {
2070 long mask;
2071 struct rcu_node *rnp = rnp_leaf;
2072
2073 raw_lockdep_assert_held_rcu_node(rnp_leaf);
2074 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2075 WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
2076 WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
2077 return;
2078 for (;;) {
2079 mask = rnp->grpmask;
2080 rnp = rnp->parent;
2081 if (!rnp)
2082 break;
2083 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
2084 rnp->qsmaskinit &= ~mask;
2085 /* Between grace periods, so better already be zero! */
2086 WARN_ON_ONCE(rnp->qsmask);
2087 if (rnp->qsmaskinit) {
2088 raw_spin_unlock_rcu_node(rnp);
2089 /* irqs remain disabled. */
2090 return;
2091 }
2092 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2093 }
2094 }
2095
2096 /*
2097 * The CPU has been completely removed, and some other CPU is reporting
2098 * this fact from process context. Do the remainder of the cleanup.
2099 * There can only be one CPU hotplug operation at a time, so no need for
2100 * explicit locking.
2101 */
2102 int rcutree_dead_cpu(unsigned int cpu)
2103 {
2104 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2105 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2106
2107 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2108 return 0;
2109
2110 /* Adjust any no-longer-needed kthreads. */
2111 rcu_boost_kthread_setaffinity(rnp, -1);
2112 /* Do any needed no-CB deferred wakeups from this CPU. */
2113 do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
2114 return 0;
2115 }
2116
2117 /*
2118 * Invoke any RCU callbacks that have made it to the end of their grace
2119 * period. Throttle as specified by rdp->blimit.
2120 */
2121 static void rcu_do_batch(struct rcu_data *rdp)
2122 {
2123 unsigned long flags;
2124 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2125 rcu_segcblist_is_offloaded(&rdp->cblist);
2126 struct rcu_head *rhp;
2127 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2128 long bl, count;
2129 long pending, tlimit = 0;
2130
2131 /* If no callbacks are ready, just return. */
2132 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
2133 trace_rcu_batch_start(rcu_state.name,
2134 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2135 rcu_segcblist_n_cbs(&rdp->cblist), 0);
2136 trace_rcu_batch_end(rcu_state.name, 0,
2137 !rcu_segcblist_empty(&rdp->cblist),
2138 need_resched(), is_idle_task(current),
2139 rcu_is_callbacks_kthread());
2140 return;
2141 }
2142
2143 /*
2144 * Extract the list of ready callbacks, disabling interrupts to prevent
2145 * races with call_rcu() from interrupt handlers. Leave the
2146 * callback counts, as rcu_barrier() needs to be conservative.
2147 */
2148 local_irq_save(flags);
2149 rcu_nocb_lock(rdp);
2150 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2151 pending = rcu_segcblist_n_cbs(&rdp->cblist);
2152 bl = max(rdp->blimit, pending >> rcu_divisor);
2153 if (unlikely(bl > 100))
2154 tlimit = local_clock() + rcu_resched_ns;
2155 trace_rcu_batch_start(rcu_state.name,
2156 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2157 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2158 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2159 if (offloaded)
2160 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2161 rcu_nocb_unlock_irqrestore(rdp, flags);
2162
2163 /* Invoke callbacks. */
2164 rhp = rcu_cblist_dequeue(&rcl);
2165 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2166 debug_rcu_head_unqueue(rhp);
2167 if (__rcu_reclaim(rcu_state.name, rhp))
2168 rcu_cblist_dequeued_lazy(&rcl);
2169 /*
2170 * Stop only if limit reached and CPU has something to do.
2171 * Note: The rcl structure counts down from zero.
2172 */
2173 if (-rcl.len >= bl && !offloaded &&
2174 (need_resched() ||
2175 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2176 break;
2177 if (unlikely(tlimit)) {
2178 /* only call local_clock() every 32 callbacks */
2179 if (likely((-rcl.len & 31) || local_clock() < tlimit))
2180 continue;
2181 /* Exceeded the time limit, so leave. */
2182 break;
2183 }
2184 if (offloaded) {
2185 WARN_ON_ONCE(in_serving_softirq());
2186 local_bh_enable();
2187 lockdep_assert_irqs_enabled();
2188 cond_resched_tasks_rcu_qs();
2189 lockdep_assert_irqs_enabled();
2190 local_bh_disable();
2191 }
2192 }
2193
2194 local_irq_save(flags);
2195 rcu_nocb_lock(rdp);
2196 count = -rcl.len;
2197 trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
2198 is_idle_task(current), rcu_is_callbacks_kthread());
2199
2200 /* Update counts and requeue any remaining callbacks. */
2201 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2202 smp_mb(); /* List handling before counting for rcu_barrier(). */
2203 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2204
2205 /* Reinstate batch limit if we have worked down the excess. */
2206 count = rcu_segcblist_n_cbs(&rdp->cblist);
2207 if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
2208 rdp->blimit = blimit;
2209
2210 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2211 if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2212 rdp->qlen_last_fqs_check = 0;
2213 rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
2214 } else if (count < rdp->qlen_last_fqs_check - qhimark)
2215 rdp->qlen_last_fqs_check = count;
2216
2217 /*
2218 * The following usually indicates a double call_rcu(). To track
2219 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2220 */
2221 WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
2222 WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2223 count != 0 && rcu_segcblist_empty(&rdp->cblist));
2224
2225 rcu_nocb_unlock_irqrestore(rdp, flags);
2226
2227 /* Re-invoke RCU core processing if there are callbacks remaining. */
2228 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
2229 invoke_rcu_core();
2230 }
2231
2232 /*
2233 * This function is invoked from each scheduling-clock interrupt,
2234 * and checks to see if this CPU is in a non-context-switch quiescent
2235 * state, for example, user mode or idle loop. It also schedules RCU
2236 * core processing. If the current grace period has gone on too long,
2237 * it will ask the scheduler to manufacture a context switch for the sole
2238 * purpose of providing the needed quiescent state.
2239 */
2240 void rcu_sched_clock_irq(int user)
2241 {
2242 trace_rcu_utilization(TPS("Start scheduler-tick"));
2243 raw_cpu_inc(rcu_data.ticks_this_gp);
2244 /* The load-acquire pairs with the store-release setting to true. */
2245 if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
2246 /* Idle and userspace execution already are quiescent states. */
2247 if (!rcu_is_cpu_rrupt_from_idle() && !user) {
2248 set_tsk_need_resched(current);
2249 set_preempt_need_resched();
2250 }
2251 __this_cpu_write(rcu_data.rcu_urgent_qs, false);
2252 }
2253 rcu_flavor_sched_clock_irq(user);
2254 if (rcu_pending())
2255 invoke_rcu_core();
2256
2257 trace_rcu_utilization(TPS("End scheduler-tick"));
2258 }
2259
2260 /*
2261 * Scan the leaf rcu_node structures. For each structure on which all
2262 * CPUs have reported a quiescent state and on which there are tasks
2263 * blocking the current grace period, initiate RCU priority boosting.
2264 * Otherwise, invoke the specified function to check dyntick state for
2265 * each CPU that has not yet reported a quiescent state.
2266 */
2267 static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
2268 {
2269 int cpu;
2270 unsigned long flags;
2271 unsigned long mask;
2272 struct rcu_node *rnp;
2273
2274 rcu_for_each_leaf_node(rnp) {
2275 cond_resched_tasks_rcu_qs();
2276 mask = 0;
2277 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2278 if (rnp->qsmask == 0) {
2279 if (!IS_ENABLED(CONFIG_PREEMPTION) ||
2280 rcu_preempt_blocked_readers_cgp(rnp)) {
2281 /*
2282 * No point in scanning bits because they
2283 * are all zero. But we might need to
2284 * priority-boost blocked readers.
2285 */
2286 rcu_initiate_boost(rnp, flags);
2287 /* rcu_initiate_boost() releases rnp->lock */
2288 continue;
2289 }
2290 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2291 continue;
2292 }
2293 for_each_leaf_node_possible_cpu(rnp, cpu) {
2294 unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
2295 if ((rnp->qsmask & bit) != 0) {
2296 if (f(per_cpu_ptr(&rcu_data, cpu)))
2297 mask |= bit;
2298 }
2299 }
2300 if (mask != 0) {
2301 /* Idle/offline CPUs, report (releases rnp->lock). */
2302 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
2303 } else {
2304 /* Nothing to do here, so just drop the lock. */
2305 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2306 }
2307 }
2308 }
2309
2310 /*
2311 * Force quiescent states on reluctant CPUs, and also detect which
2312 * CPUs are in dyntick-idle mode.
2313 */
2314 void rcu_force_quiescent_state(void)
2315 {
2316 unsigned long flags;
2317 bool ret;
2318 struct rcu_node *rnp;
2319 struct rcu_node *rnp_old = NULL;
2320
2321 /* Funnel through hierarchy to reduce memory contention. */
2322 rnp = raw_cpu_read(rcu_data.mynode);
2323 for (; rnp != NULL; rnp = rnp->parent) {
2324 ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
2325 !raw_spin_trylock(&rnp->fqslock);
2326 if (rnp_old != NULL)
2327 raw_spin_unlock(&rnp_old->fqslock);
2328 if (ret)
2329 return;
2330 rnp_old = rnp;
2331 }
2332 /* rnp_old == rcu_get_root(), rnp == NULL. */
2333
2334 /* Reached the root of the rcu_node tree, acquire lock. */
2335 raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2336 raw_spin_unlock(&rnp_old->fqslock);
2337 if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
2338 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2339 return; /* Someone beat us to it. */
2340 }
2341 WRITE_ONCE(rcu_state.gp_flags,
2342 READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
2343 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2344 rcu_gp_kthread_wake();
2345 }
2346 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2347
2348 /* Perform RCU core processing work for the current CPU. */
2349 static __latent_entropy void rcu_core(void)
2350 {
2351 unsigned long flags;
2352 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2353 struct rcu_node *rnp = rdp->mynode;
2354 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2355 rcu_segcblist_is_offloaded(&rdp->cblist);
2356
2357 if (cpu_is_offline(smp_processor_id()))
2358 return;
2359 trace_rcu_utilization(TPS("Start RCU core"));
2360 WARN_ON_ONCE(!rdp->beenonline);
2361
2362 /* Report any deferred quiescent states if preemption enabled. */
2363 if (!(preempt_count() & PREEMPT_MASK)) {
2364 rcu_preempt_deferred_qs(current);
2365 } else if (rcu_preempt_need_deferred_qs(current)) {
2366 set_tsk_need_resched(current);
2367 set_preempt_need_resched();
2368 }
2369
2370 /* Update RCU state based on any recent quiescent states. */
2371 rcu_check_quiescent_state(rdp);
2372
2373 /* No grace period and unregistered callbacks? */
2374 if (!rcu_gp_in_progress() &&
2375 rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
2376 local_irq_save(flags);
2377 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2378 rcu_accelerate_cbs_unlocked(rnp, rdp);
2379 local_irq_restore(flags);
2380 }
2381
2382 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
2383
2384 /* If there are callbacks ready, invoke them. */
2385 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
2386 likely(READ_ONCE(rcu_scheduler_fully_active)))
2387 rcu_do_batch(rdp);
2388
2389 /* Do any needed deferred wakeups of rcuo kthreads. */
2390 do_nocb_deferred_wakeup(rdp);
2391 trace_rcu_utilization(TPS("End RCU core"));
2392 }
2393
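/* RCU_SOFTIRQ handler: do RCU core processing for this CPU. */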
2394 static void rcu_core_si(struct softirq_action *h)
2395 {
2396 rcu_core();
2397 }
2398
2399 static void rcu_wake_cond(struct task_struct *t, int status)
2400 {
2401 /*
2402 * If the thread is yielding, only wake it when this
2403 * is invoked from idle.
2404 */
2405 if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
2406 wake_up_process(t);
2407 }
2408
2409 static void invoke_rcu_core_kthread(void)
2410 {
2411 struct task_struct *t;
2412 unsigned long flags;
2413
2414 local_irq_save(flags);
2415 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
2416 t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2417 if (t != NULL && t != current)
2418 rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2419 local_irq_restore(flags);
2420 }
2421
2422 /*
2423 * Wake up this CPU's rcuc kthread to do RCU core processing.
2424 */
2425 static void invoke_rcu_core(void)
2426 {
2427 if (!cpu_online(smp_processor_id()))
2428 return;
2429 if (use_softirq)
2430 raise_softirq(RCU_SOFTIRQ);
2431 else
2432 invoke_rcu_core_kthread();
2433 }
2434
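/* smpboot park callback: mark this CPU's rcuc kthread as going off-CPU. */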
2435 static void rcu_cpu_kthread_park(unsigned int cpu)
2436 {
2437 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2438 }
2439
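/* smpboot callback: is there RCU core work pending for this CPU's rcuc kthread? */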
2440 static int rcu_cpu_kthread_should_run(unsigned int cpu)
2441 {
2442 return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2443 }
2444
2445 /*
2446 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
2447 * the RCU softirq used in configurations of RCU that do not support RCU
2448 * priority boosting.
2449 */
2450 static void rcu_cpu_kthread(unsigned int cpu)
2451 {
2452 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2453 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2454 int spincnt;
2455
2456 for (spincnt = 0; spincnt < 10; spincnt++) {
2457 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
2458 local_bh_disable();
2459 *statusp = RCU_KTHREAD_RUNNING;
2460 local_irq_disable();
2461 work = *workp;
2462 *workp = 0;
2463 local_irq_enable();
2464 if (work)
2465 rcu_core();
2466 local_bh_enable();
2467 if (*workp == 0) {
2468 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2469 *statusp = RCU_KTHREAD_WAITING;
2470 return;
2471 }
2472 }
2473 *statusp = RCU_KTHREAD_YIELDING;
2474 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2475 schedule_timeout_interruptible(2);
2476 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2477 *statusp = RCU_KTHREAD_WAITING;
2478 }
2479
2480 static struct smp_hotplug_thread rcu_cpu_thread_spec = {
2481 .store = &rcu_data.rcu_cpu_kthread_task,
2482 .thread_should_run = rcu_cpu_kthread_should_run,
2483 .thread_fn = rcu_cpu_kthread,
2484 .thread_comm = "rcuc/%u",
2485 .setup = rcu_cpu_kthread_setup,
2486 .park = rcu_cpu_kthread_park,
2487 };
2488
2489 /*
2490 * Spawn per-CPU RCU core processing kthreads.
2491 */
2492 static int __init rcu_spawn_core_kthreads(void)
2493 {
2494 int cpu;
2495
2496 for_each_possible_cpu(cpu)
2497 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
2498 if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
2499 return 0;
2500 WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2501 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2502 return 0;
2503 }
2504
2505 /*
2506 * Handle any core-RCU processing required by a call_rcu() invocation.
2507 */
2508 static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
2509 unsigned long flags)
2510 {
2511 /*
2512 * If called from an extended quiescent state, invoke the RCU
2513 * core in order to force a re-evaluation of RCU's idleness.
2514 */
2515 if (!rcu_is_watching())
2516 invoke_rcu_core();
2517
2518 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
2519 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
2520 return;
2521
2522 /*
2523 * Force the grace period if too many callbacks or too long waiting.
2524 * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
2525 * if some other CPU has recently done so. Also, don't bother
2526 * invoking rcu_force_quiescent_state() if the newly enqueued callback
2527 * is the only one waiting for a grace period to complete.
2528 */
2529 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
2530 rdp->qlen_last_fqs_check + qhimark)) {
2531
2532 /* Are we ignoring a completed grace period? */
2533 note_gp_changes(rdp);
2534
2535 /* Start a new grace period if one not already started. */
2536 if (!rcu_gp_in_progress()) {
2537 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
2538 } else {
2539 /* Give the grace period a kick. */
2540 rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2541 if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
2542 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
2543 rcu_force_quiescent_state();
2544 rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
2545 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2546 }
2547 }
2548 }
2549
2550 /*
2551 * RCU callback function to leak a callback.
2552 */
2553 static void rcu_leak_callback(struct rcu_head *rhp)
2554 {
2555 }
2556
2557 /*
2558 * Helper function for call_rcu() and friends.  Queues the specified
2559 * callback on the current CPU's rcu_data structure.  The lazy argument
2560 * indicates a callback that does nothing but free memory (as is the
2561 * case for kfree_rcu()), which permits its invocation to be deferred.
2562 */
2563 static void
2564 __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
2565 {
2566 unsigned long flags;
2567 struct rcu_data *rdp;
2568 bool was_alldone;
2569
2570 /* Misaligned rcu_head! */
2571 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
2572
2573 if (debug_rcu_head_queue(head)) {
2574 /*
2575 * Probable double call_rcu(), so leak the callback.
2576 * Use rcu:rcu_callback trace event to find the previous
2577 * time callback was passed to __call_rcu().
2578 */
2579 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
2580 head, head->func);
2581 WRITE_ONCE(head->func, rcu_leak_callback);
2582 return;
2583 }
2584 head->func = func;
2585 head->next = NULL;
2586 local_irq_save(flags);
2587 rdp = this_cpu_ptr(&rcu_data);
2588
2589 /* Add the callback to our list. */
2590 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
2591 // This can trigger due to call_rcu() from offline CPU:
2592 WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
2593 WARN_ON_ONCE(!rcu_is_watching());
2594 // Very early boot, before rcu_init(). Initialize if needed
2595 // and then drop through to queue the callback.
2596 if (rcu_segcblist_empty(&rdp->cblist))
2597 rcu_segcblist_init(&rdp->cblist);
2598 }
2599 if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
2600 return; // Enqueued onto ->nocb_bypass, so just leave.
2601 /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
2602 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
2603 if (__is_kfree_rcu_offset((unsigned long)func))
2604 trace_rcu_kfree_callback(rcu_state.name, head,
2605 (unsigned long)func,
2606 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2607 rcu_segcblist_n_cbs(&rdp->cblist));
2608 else
2609 trace_rcu_callback(rcu_state.name, head,
2610 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2611 rcu_segcblist_n_cbs(&rdp->cblist));
2612
2613 /* Go handle any RCU core processing required. */
2614 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2615 unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
2616 __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
2617 } else {
2618 __call_rcu_core(rdp, head, flags);
2619 local_irq_restore(flags);
2620 }
2621 }
2622
2623 /**
2624 * call_rcu() - Queue an RCU callback for invocation after a grace period.
2625 * @head: structure to be used for queueing the RCU updates.
2626 * @func: actual callback function to be invoked after the grace period
2627 *
2628 * The callback function will be invoked some time after a full grace
2629 * period elapses, in other words after all pre-existing RCU read-side
2630 * critical sections have completed. However, the callback function
2631 * might well execute concurrently with RCU read-side critical sections
2632 * that started after call_rcu() was invoked. RCU read-side critical
2633 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
2634 * may be nested. In addition, regions of code across which interrupts,
2635 * preemption, or softirqs have been disabled also serve as RCU read-side
2636 * critical sections. This includes hardware interrupt handlers, softirq
2637 * handlers, and NMI handlers.
2638 *
2639 * Note that all CPUs must agree that the grace period extended beyond
2640 * all pre-existing RCU read-side critical sections. On systems with more
2641 * than one CPU, this means that when "func()" is invoked, each CPU is
2642 * guaranteed to have executed a full memory barrier since the end of its
2643 * last RCU read-side critical section whose beginning preceded the call
2644 * to call_rcu(). It also means that each CPU executing an RCU read-side
2645 * critical section that continues beyond the start of "func()" must have
2646 * executed a memory barrier after the call_rcu() but before the beginning
2647 * of that RCU read-side critical section. Note that these guarantees
2648 * include CPUs that are offline, idle, or executing in user mode, as
2649 * well as CPUs that are executing in the kernel.
2650 *
2651 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
2652 * resulting RCU callback function "func()", then both CPU A and CPU B are
2653 * guaranteed to execute a full memory barrier during the time interval
2654 * between the call to call_rcu() and the invocation of "func()" -- even
2655 * if CPU A and CPU B are the same CPU (but again only if the system has
2656 * more than one CPU).
2657 */
2658 void call_rcu(struct rcu_head *head, rcu_callback_t func)
2659 {
2660 __call_rcu(head, func, 0);
2661 }
2662 EXPORT_SYMBOL_GPL(call_rcu);
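/*
 * Illustrative (non-kernel-doc) usage sketch for call_rcu().  The names
 * "struct foo", "foo_head", and "foo_reclaim()" below are hypothetical and
 * do not appear elsewhere in this file; the pattern of embedding an
 * rcu_head in the protected structure and freeing it from the callback is
 * the standard one.
 *
 *	struct foo {
 *		int key;
 *		struct rcu_head foo_head;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *rhp)
 *	{
 *		struct foo *fp = container_of(rhp, struct foo, foo_head);
 *
 *		kfree(fp);
 *	}
 *
 *	// Updater: after unlinking fp from all RCU-protected structures.
 *	call_rcu(&fp->foo_head, foo_reclaim);
 */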
2663
2664 /*
2665 * Queue an RCU callback for lazy invocation after a grace period.
2666 * This will likely be later named something like "call_rcu_lazy()",
2667 * but this change will require some way of tagging the lazy RCU
2668 * callbacks in the list of pending callbacks. Until then, this
2669 * function may only be called from __kfree_rcu().
2670 */
2671 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
2672 {
2673 __call_rcu(head, func, 1);
2674 }
2675 EXPORT_SYMBOL_GPL(kfree_call_rcu);
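/*
 * Callers normally reach kfree_call_rcu() through the kfree_rcu() macro
 * rather than invoking it directly.  A minimal sketch, reusing the
 * hypothetical "struct foo" from the call_rcu() example above:
 *
 *	// Equivalent to call_rcu() with a callback that just kfree()s fp,
 *	// but marked lazy so its invocation may be deferred.
 *	kfree_rcu(fp, foo_head);
 */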
2676
2677 /*
2678 * During early boot, any blocking grace-period wait automatically
2679 * implies a grace period. Later on, this is never the case for PREEMPT.
2680 *
2681 * However, because a context switch is a grace period for !PREEMPT, any
2682 * blocking grace-period wait automatically implies a grace period if
2683 * there is only one CPU online at any point in time during execution of
2684 * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
2685 * occasionally incorrectly indicate that there are multiple CPUs online
2686 * when there was in fact only one the whole time, as this just adds some
2687 * overhead: RCU still operates correctly.
2688 */
2689 static int rcu_blocking_is_gp(void)
2690 {
2691 int ret;
2692
2693 if (IS_ENABLED(CONFIG_PREEMPTION))
2694 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
2695 might_sleep(); /* Check for RCU read-side critical section. */
2696 preempt_disable();
2697 ret = num_online_cpus() <= 1;
2698 preempt_enable();
2699 return ret;
2700 }
2701
2702 /**
2703 * synchronize_rcu - wait until a grace period has elapsed.
2704 *
2705 * Control will return to the caller some time after a full grace
2706 * period has elapsed, in other words after all currently executing RCU
2707 * read-side critical sections have completed. Note, however, that
2708 * upon return from synchronize_rcu(), the caller might well be executing
2709 * concurrently with new RCU read-side critical sections that began while
2710 * synchronize_rcu() was waiting. RCU read-side critical sections are
2711 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
2712 * In addition, regions of code across which interrupts, preemption, or
2713 * softirqs have been disabled also serve as RCU read-side critical
2714 * sections. This includes hardware interrupt handlers, softirq handlers,
2715 * and NMI handlers.
2716 *
2717 * Note that this guarantee implies further memory-ordering guarantees.
2718 * On systems with more than one CPU, when synchronize_rcu() returns,
2719 * each CPU is guaranteed to have executed a full memory barrier since
2720 * the end of its last RCU read-side critical section whose beginning
2721 * preceded the call to synchronize_rcu(). In addition, each CPU having
2722 * an RCU read-side critical section that extends beyond the return from
2723 * synchronize_rcu() is guaranteed to have executed a full memory barrier
2724 * after the beginning of synchronize_rcu() and before the beginning of
2725 * that RCU read-side critical section. Note that these guarantees include
2726 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2727 * that are executing in the kernel.
2728 *
2729 * Furthermore, if CPU A invoked synchronize_rcu(), which returned
2730 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2731 * to have executed a full memory barrier during the execution of
2732 * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
2733 * again only if the system has more than one CPU).
2734 */
2735 void synchronize_rcu(void)
2736 {
2737 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
2738 lock_is_held(&rcu_lock_map) ||
2739 lock_is_held(&rcu_sched_lock_map),
2740 "Illegal synchronize_rcu() in RCU read-side critical section");
2741 if (rcu_blocking_is_gp())
2742 return;
2743 if (rcu_gp_is_expedited())
2744 synchronize_rcu_expedited();
2745 else
2746 wait_rcu_gp(call_rcu);
2747 }
2748 EXPORT_SYMBOL_GPL(synchronize_rcu);
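/*
 * Typical (illustrative) update-side pattern built on synchronize_rcu():
 * unpublish the item, wait for a grace period, then free it.  The
 * "foo_lock", "fp", and list field below are hypothetical.
 *
 *	spin_lock(&foo_lock);
 *	list_del_rcu(&fp->list);
 *	spin_unlock(&foo_lock);
 *
 *	synchronize_rcu();	// All pre-existing readers have finished.
 *	kfree(fp);
 */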
2749
2750 /**
2751 * get_state_synchronize_rcu - Snapshot current RCU state
2752 *
2753 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
2754 * to determine whether or not a full grace period has elapsed in the
2755 * meantime.
2756 */
2757 unsigned long get_state_synchronize_rcu(void)
2758 {
2759 /*
2760 * Any prior manipulation of RCU-protected data must happen
2761 * before the load from ->gp_seq.
2762 */
2763 smp_mb(); /* ^^^ */
2764 return rcu_seq_snap(&rcu_state.gp_seq);
2765 }
2766 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2767
2768 /**
2769 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
2770 *
2771 * @oldstate: return value from earlier call to get_state_synchronize_rcu()
2772 *
2773 * If a full RCU grace period has elapsed since the earlier call to
2774 * get_state_synchronize_rcu(), just return. Otherwise, invoke
2775 * synchronize_rcu() to wait for a full grace period.
2776 *
2777 * Yes, this function does not take counter wrap into account. But
2778 * counter wrap is harmless. If the counter wraps, we have waited for
2779 * more than 2 billion grace periods (and way more on a 64-bit system!),
2780 * so waiting for one additional grace period should be just fine.
2781 */
2782 void cond_synchronize_rcu(unsigned long oldstate)
2783 {
2784 if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
2785 synchronize_rcu();
2786 else
2787 smp_mb(); /* Ensure GP ends before subsequent accesses. */
2788 }
2789 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
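/*
 * Illustrative pairing of get_state_synchronize_rcu() and
 * cond_synchronize_rcu(): snapshot the grace-period state, do other work
 * that might itself span a grace period, then wait only if needed.  The
 * "do_something_lengthy()" helper and "old_fp" pointer are hypothetical.
 *
 *	unsigned long cookie = get_state_synchronize_rcu();
 *
 *	do_something_lengthy();
 *	cond_synchronize_rcu(cookie);	// No-op if a full GP already elapsed.
 *	kfree(old_fp);
 */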
2790
2791 /*
2792 * Check to see if there is any immediate RCU-related work to be done by
2793 * the current CPU, returning 1 if so and zero otherwise. The checks are
2794 * in order of increasing expense: checks that can be carried out against
2795 * CPU-local state are performed first. However, we must check for CPU
2796 * stalls first, else we might not get a chance.
2797 */
2798 static int rcu_pending(void)
2799 {
2800 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
2801 struct rcu_node *rnp = rdp->mynode;
2802
2803 /* Check for CPU stalls, if enabled. */
2804 check_cpu_stall(rdp);
2805
2806 /* Does this CPU need a deferred NOCB wakeup? */
2807 if (rcu_nocb_need_deferred_wakeup(rdp))
2808 return 1;
2809
2810 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
2811 if (rcu_nohz_full_cpu())
2812 return 0;
2813
2814 /* Is the RCU core waiting for a quiescent state from this CPU? */
2815 if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
2816 return 1;
2817
2818 /* Does this CPU have callbacks ready to invoke? */
2819 if (rcu_segcblist_ready_cbs(&rdp->cblist))
2820 return 1;
2821
2822 /* Has RCU gone idle with this CPU needing another grace period? */
2823 if (!rcu_gp_in_progress() &&
2824 rcu_segcblist_is_enabled(&rdp->cblist) &&
2825 (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
2826 !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
2827 !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2828 return 1;
2829
2830 /* Have RCU grace period completed or started? */
2831 if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
2832 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
2833 return 1;
2834
2835 /* nothing to do */
2836 return 0;
2837 }
2838
2839 /*
2840 * Helper function for rcu_barrier() tracing. If tracing is disabled,
2841 * the compiler is expected to optimize this away.
2842 */
2843 static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
2844 {
2845 trace_rcu_barrier(rcu_state.name, s, cpu,
2846 atomic_read(&rcu_state.barrier_cpu_count), done);
2847 }
2848
2849 /*
2850 * RCU callback function for rcu_barrier(). If we are last, wake
2851 * up the task executing rcu_barrier().
2852 */
2853 static void rcu_barrier_callback(struct rcu_head *rhp)
2854 {
2855 if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
2856 rcu_barrier_trace(TPS("LastCB"), -1,
2857 rcu_state.barrier_sequence);
2858 complete(&rcu_state.barrier_completion);
2859 } else {
2860 rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
2861 }
2862 }
2863
2864 /*
2865 * Called with preemption disabled, and from cross-cpu IRQ context.
2866 */
2867 static void rcu_barrier_func(void *unused)
2868 {
2869 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2870
2871 rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
2872 rdp->barrier_head.func = rcu_barrier_callback;
2873 debug_rcu_head_queue(&rdp->barrier_head);
2874 rcu_nocb_lock(rdp);
2875 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
2876 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
2877 atomic_inc(&rcu_state.barrier_cpu_count);
2878 } else {
2879 debug_rcu_head_unqueue(&rdp->barrier_head);
2880 rcu_barrier_trace(TPS("IRQNQ"), -1,
2881 rcu_state.barrier_sequence);
2882 }
2883 rcu_nocb_unlock(rdp);
2884 }
2885
2886 /**
2887 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
2888 *
2889 * Note that this primitive does not necessarily wait for an RCU grace period
2890 * to complete. For example, if there are no RCU callbacks queued anywhere
2891 * in the system, then rcu_barrier() is within its rights to return
2892 * immediately, without waiting for anything, much less an RCU grace period.
2893 */
2894 void rcu_barrier(void)
2895 {
2896 int cpu;
2897 struct rcu_data *rdp;
2898 unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
2899
2900 rcu_barrier_trace(TPS("Begin"), -1, s);
2901
2902 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2903 mutex_lock(&rcu_state.barrier_mutex);
2904
2905 /* Did someone else do our work for us? */
2906 if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
2907 rcu_barrier_trace(TPS("EarlyExit"), -1,
2908 rcu_state.barrier_sequence);
2909 smp_mb(); /* caller's subsequent code after above check. */
2910 mutex_unlock(&rcu_state.barrier_mutex);
2911 return;
2912 }
2913
2914 /* Mark the start of the barrier operation. */
2915 rcu_seq_start(&rcu_state.barrier_sequence);
2916 rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
2917
2918 /*
2919 * Initialize the count to one rather than to zero in order to
2920 * avoid a too-soon return to zero in case of a short grace period
2921 * (or preemption of this task). Exclude CPU-hotplug operations
2922 * to ensure that no offline CPU has callbacks queued.
2923 */
2924 init_completion(&rcu_state.barrier_completion);
2925 atomic_set(&rcu_state.barrier_cpu_count, 1);
2926 get_online_cpus();
2927
2928 /*
2929 * Force each CPU with callbacks to register a new callback.
2930 * When that callback is invoked, we will know that all of the
2931 * corresponding CPU's preceding callbacks have been invoked.
2932 */
2933 for_each_possible_cpu(cpu) {
2934 rdp = per_cpu_ptr(&rcu_data, cpu);
2935 if (!cpu_online(cpu) &&
2936 !rcu_segcblist_is_offloaded(&rdp->cblist))
2937 continue;
2938 if (rcu_segcblist_n_cbs(&rdp->cblist)) {
2939 rcu_barrier_trace(TPS("OnlineQ"), cpu,
2940 rcu_state.barrier_sequence);
2941 smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
2942 } else {
2943 rcu_barrier_trace(TPS("OnlineNQ"), cpu,
2944 rcu_state.barrier_sequence);
2945 }
2946 }
2947 put_online_cpus();
2948
2949 /*
2950 * Now that we have an rcu_barrier_callback() callback on each
2951 * CPU, and thus each counted, remove the initial count.
2952 */
2953 if (atomic_dec_and_test(&rcu_state.barrier_cpu_count))
2954 complete(&rcu_state.barrier_completion);
2955
2956 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2957 wait_for_completion(&rcu_state.barrier_completion);
2958
2959 /* Mark the end of the barrier operation. */
2960 rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
2961 rcu_seq_end(&rcu_state.barrier_sequence);
2962
2963 /* Other rcu_barrier() invocations can now safely proceed. */
2964 mutex_unlock(&rcu_state.barrier_mutex);
2965 }
2966 EXPORT_SYMBOL_GPL(rcu_barrier);
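/*
 * Illustrative rcu_barrier() use from a hypothetical module's exit path:
 * make sure all of the module's outstanding call_rcu() callbacks have been
 * invoked before the module's text and data disappear.  The helpers and
 * cache named below are hypothetical.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		foo_unregister_everything();	// Stops new call_rcu() invocations.
 *		rcu_barrier();			// Waits for in-flight foo_reclaim() callbacks.
 *		kmem_cache_destroy(foo_cache);
 *	}
 */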
2967
2968 /*
2969 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
2970 * first CPU in a given leaf rcu_node structure coming online. The caller
2971 * must hold the corresponding leaf rcu_node ->lock with interrupts
2972 * disabled.
2973 */
2974 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
2975 {
2976 long mask;
2977 long oldmask;
2978 struct rcu_node *rnp = rnp_leaf;
2979
2980 raw_lockdep_assert_held_rcu_node(rnp_leaf);
2981 WARN_ON_ONCE(rnp->wait_blkd_tasks);
2982 for (;;) {
2983 mask = rnp->grpmask;
2984 rnp = rnp->parent;
2985 if (rnp == NULL)
2986 return;
2987 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
2988 oldmask = rnp->qsmaskinit;
2989 rnp->qsmaskinit |= mask;
2990 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
2991 if (oldmask)
2992 return;
2993 }
2994 }
2995
2996 /*
2997 * Do boot-time initialization of a CPU's per-CPU RCU data.
2998 */
2999 static void __init
3000 rcu_boot_init_percpu_data(int cpu)
3001 {
3002 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3003
3004 /* Set up local state, ensuring consistent view of global state. */
3005 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
3006 WARN_ON_ONCE(rdp->dynticks_nesting != 1);
3007 WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
3008 rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
3009 rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
3010 rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
3011 rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
3012 rdp->cpu = cpu;
3013 rcu_boot_init_nocb_percpu_data(rdp);
3014 }
3015
3016 /*
3017 * Invoked early in the CPU-online process, when pretty much all services
3018 * are available. The incoming CPU is not present.
3019 *
3020 * Initializes a CPU's per-CPU RCU data. Note that only one online or
3021 * offline event can be happening at a given time. Note also that we can
3022 * accept some slop in the rsp->gp_seq access due to the fact that this
3023 * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
3024 * And any offloaded callbacks are being numbered elsewhere.
3025 */
3026 int rcutree_prepare_cpu(unsigned int cpu)
3027 {
3028 unsigned long flags;
3029 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3030 struct rcu_node *rnp = rcu_get_root();
3031
3032 /* Set up local state, ensuring consistent view of global state. */
3033 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3034 rdp->qlen_last_fqs_check = 0;
3035 rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
3036 rdp->blimit = blimit;
3037 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
3038 !rcu_segcblist_is_offloaded(&rdp->cblist))
3039 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
3040 rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
3041 rcu_dynticks_eqs_online();
3042 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
3043
3044 /*
3045 * Add CPU to leaf rcu_node pending-online bitmask. Any needed
3046 * propagation up the rcu_node tree will happen at the beginning
3047 * of the next grace period.
3048 */
3049 rnp = rdp->mynode;
3050 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
3051 rdp->beenonline = true; /* We have now been online. */
3052 rdp->gp_seq = rnp->gp_seq;
3053 rdp->gp_seq_needed = rnp->gp_seq;
3054 rdp->cpu_no_qs.b.norm = true;
3055 rdp->core_needs_qs = false;
3056 rdp->rcu_iw_pending = false;
3057 rdp->rcu_iw_gp_seq = rnp->gp_seq - 1;
3058 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
3059 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3060 rcu_prepare_kthreads(cpu);
3061 rcu_spawn_cpu_nocb_kthread(cpu);
3062
3063 return 0;
3064 }
3065
3066 /*
3067 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
3068 */
3069 static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3070 {
3071 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3072
3073 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3074 }
3075
3076 /*
3077 * Near the end of the CPU-online process. Pretty much all services
3078 * enabled, and the CPU is now very much alive.
3079 */
3080 int rcutree_online_cpu(unsigned int cpu)
3081 {
3082 unsigned long flags;
3083 struct rcu_data *rdp;
3084 struct rcu_node *rnp;
3085
3086 rdp = per_cpu_ptr(&rcu_data, cpu);
3087 rnp = rdp->mynode;
3088 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3089 rnp->ffmask |= rdp->grpmask;
3090 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3091 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
3092 return 0; /* Too early in boot for scheduler work. */
3093 sync_sched_exp_online_cleanup(cpu);
3094 rcutree_affinity_setting(cpu, -1);
3095 return 0;
3096 }
3097
3098 /*
3099 * Near the beginning of the CPU-offline process. The CPU is still very
3100 * much alive with pretty much all services enabled.
3100 * with pretty much all services enabled.
3101 */
3102 int rcutree_offline_cpu(unsigned int cpu)
3103 {
3104 unsigned long flags;
3105 struct rcu_data *rdp;
3106 struct rcu_node *rnp;
3107
3108 rdp = per_cpu_ptr(&rcu_data, cpu);
3109 rnp = rdp->mynode;
3110 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3111 rnp->ffmask &= ~rdp->grpmask;
3112 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3113
3114 rcutree_affinity_setting(cpu, cpu);
3115 return 0;
3116 }
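
/*
 * For comparison (an illustrative sketch, not code from this file):
 * rcutree_online_cpu() and rcutree_offline_cpu() are registered in the
 * core cpuhp state table, but an ordinary subsystem would hook the same
 * online/offline phase at run time with cpuhp_setup_state().  The
 * my_online() and my_offline() handlers below are hypothetical.
 *
 *	static int my_online(unsigned int cpu)
 *	{
 *		pr_info("mysubsys: CPU %u is up\n", cpu);
 *		return 0;
 *	}
 *
 *	static int my_offline(unsigned int cpu)
 *	{
 *		pr_info("mysubsys: CPU %u is going down\n", cpu);
 *		return 0;
 *	}
 *
 *	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mysubsys:online",
 *				my_online, my_offline);
 */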
3117
3118 static DEFINE_PER_CPU(int, rcu_cpu_started);
3119
3120 /*
3121 * Mark the specified CPU as being online so that subsequent grace periods
3122 * (both expedited and normal) will wait on it. Note that this means that
3123 * incoming CPUs are not allowed to use RCU read-side critical sections
3124 * until this function is called. Failing to observe this restriction
3125 * will result in lockdep splats.
3126 *
3127 * Note that this function is special in that it is invoked directly
3128 * from the incoming CPU rather than from the cpuhp_step mechanism.
3129 * This is because this function must be invoked at a precise location.
3130 */
3131 void rcu_cpu_starting(unsigned int cpu)
3132 {
3133 unsigned long flags;
3134 unsigned long mask;
3135 int nbits;
3136 unsigned long oldmask;
3137 struct rcu_data *rdp;
3138 struct rcu_node *rnp;
3139
3140 if (per_cpu(rcu_cpu_started, cpu))
3141 return;
3142
3143 per_cpu(rcu_cpu_started, cpu) = 1;
3144
3145 rdp = per_cpu_ptr(&rcu_data, cpu);
3146 rnp = rdp->mynode;
3147 mask = rdp->grpmask;
3148 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3149 rnp->qsmaskinitnext |= mask;
3150 oldmask = rnp->expmaskinitnext;
3151 rnp->expmaskinitnext |= mask;
3152 oldmask ^= rnp->expmaskinitnext;
3153 nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
3154 /* Allow lockless access for expedited grace periods. */
3155 smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
3156 rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
3157 rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
3158 rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
3159 if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
3160 /* Report QS -after- changing ->qsmaskinitnext! */
3161 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
3162 } else {
3163 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3164 }
3165 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
3166 }
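
/*
 * Worked example for the ->expmaskinitnext accounting above (illustrative
 * only): the first time a given CPU comes online, its bit is newly set in
 * ->expmaskinitnext, so oldmask ^ ->expmaskinitnext has exactly one bit
 * set, bitmap_weight() returns 1, and rcu_state.ncpus advances by one.
 * If that CPU later goes offline and comes back, rcu_report_dead() will
 * have cleared only ->qsmaskinitnext, so the ->expmaskinitnext bit is
 * still set, the XOR is zero, nbits is 0, and ncpus is not incremented a
 * second time.
 */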
3167
3168 /*
3169 * The outgoing CPU has no further need of RCU, so remove it from
3170 * the rcu_node tree's ->qsmaskinitnext bit masks.
3171 *
3172 * Note that this function is special in that it is invoked directly
3173 * from the outgoing CPU rather than from the cpuhp_step mechanism.
3174 * This is because this function must be invoked at a precise location.
3175 */
3176 void rcu_report_dead(unsigned int cpu)
3177 {
3178 unsigned long flags;
3179 unsigned long mask;
3180 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3181 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
3182
3183 /* QS for any half-done expedited grace period. */
3184 preempt_disable();
3185 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
3186 preempt_enable();
3187 rcu_preempt_deferred_qs(current);
3188
3189 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
3190 mask = rdp->grpmask;
3191 raw_spin_lock(&rcu_state.ofl_lock);
3192 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
3193 rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
3194 rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
3195 if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
3196 /* Report quiescent state -before- changing ->qsmaskinitnext! */
3197 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
3198 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3199 }
3200 rnp->qsmaskinitnext &= ~mask;
3201 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3202 raw_spin_unlock(&rcu_state.ofl_lock);
3203
3204 per_cpu(rcu_cpu_started, cpu) = 0;
3205 }
3206
3207 #ifdef CONFIG_HOTPLUG_CPU
3208 /*
3209 * The outgoing CPU has just passed through the dying-idle state, and we
3210 * are being invoked from the CPU that was IPIed to continue the offline
3211 * operation. Migrate the outgoing CPU's callbacks to the current CPU.
3212 */
3213 void rcutree_migrate_callbacks(int cpu)
3214 {
3215 unsigned long flags;
3216 struct rcu_data *my_rdp;
3217 struct rcu_node *my_rnp;
3218 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3219 bool needwake;
3220
3221 if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
3222 rcu_segcblist_empty(&rdp->cblist))
3223 return; /* No callbacks to migrate. */
3224
3225 local_irq_save(flags);
3226 my_rdp = this_cpu_ptr(&rcu_data);
3227 my_rnp = my_rdp->mynode;
3228 rcu_nocb_lock(my_rdp); /* irqs already disabled. */
3229 WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
3230 raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
3231 /* Leverage recent GPs and set GP for new callbacks. */
3232 needwake = rcu_advance_cbs(my_rnp, rdp) ||
3233 rcu_advance_cbs(my_rnp, my_rdp);
3234 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
3235 needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
3236 rcu_segcblist_disable(&rdp->cblist);
3237 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
3238 !rcu_segcblist_n_cbs(&my_rdp->cblist));
3239 if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
3240 raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
3241 __call_rcu_nocb_wake(my_rdp, true, flags);
3242 } else {
3243 rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
3244 raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
3245 }
3246 if (needwake)
3247 rcu_gp_kthread_wake();
3248 lockdep_assert_irqs_enabled();
3249 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
3250 !rcu_segcblist_empty(&rdp->cblist),
3251 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
3252 cpu, rcu_segcblist_n_cbs(&rdp->cblist),
3253 rcu_segcblist_first_cb(&rdp->cblist));
3254 }
3255 #endif
3256
3257 /*
3258 * On non-huge systems, use expedited RCU grace periods to make suspend
3259 * and hibernation run faster.
3260 */
3261 static int rcu_pm_notify(struct notifier_block *self,
3262 unsigned long action, void *hcpu)
3263 {
3264 switch (action) {
3265 case PM_HIBERNATION_PREPARE:
3266 case PM_SUSPEND_PREPARE:
3267 rcu_expedite_gp();
3268 break;
3269 case PM_POST_HIBERNATION:
3270 case PM_POST_SUSPEND:
3271 rcu_unexpedite_gp();
3272 break;
3273 default:
3274 break;
3275 }
3276 return NOTIFY_OK;
3277 }
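
/*
 * Illustrative sketch: rcu_init() below registers this callback with
 * pm_notifier(rcu_pm_notify, 0), which is roughly equivalent to the
 * following open-coded registration (the pm_notifier() macro in
 * <linux/suspend.h> generates its own static notifier_block, so the
 * rcu_pm_notify_nb name here is only for illustration):
 *
 *	static struct notifier_block rcu_pm_notify_nb = {
 *		.notifier_call	= rcu_pm_notify,
 *		.priority	= 0,
 *	};
 *
 *	register_pm_notifier(&rcu_pm_notify_nb);
 */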
3278
3279 /*
3280 * Spawn the kthreads that handle RCU's grace periods.
3281 */
3282 static int __init rcu_spawn_gp_kthread(void)
3283 {
3284 unsigned long flags;
3285 int kthread_prio_in = kthread_prio;
3286 struct rcu_node *rnp;
3287 struct sched_param sp;
3288 struct task_struct *t;
3289
3290 /* Force priority into range. */
3291 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
3292 && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
3293 kthread_prio = 2;
3294 else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
3295 kthread_prio = 1;
3296 else if (kthread_prio < 0)
3297 kthread_prio = 0;
3298 else if (kthread_prio > 99)
3299 kthread_prio = 99;
3300
3301 if (kthread_prio != kthread_prio_in)
3302 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
3303 kthread_prio, kthread_prio_in);
3304
3305 rcu_scheduler_fully_active = 1;
3306 t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
3307 if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
3308 return 0;
3309 if (kthread_prio) {
3310 sp.sched_priority = kthread_prio;
3311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
3312 }
3313 rnp = rcu_get_root();
3314 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3315 rcu_state.gp_kthread = t;
3316 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3317 wake_up_process(t);
3318 rcu_spawn_nocb_kthreads();
3319 rcu_spawn_boost_kthreads();
3320 rcu_spawn_core_kthreads();
3321 return 0;
3322 }
3323 early_initcall(rcu_spawn_gp_kthread);
3324
3325 /*
3326 * This function is invoked towards the end of the scheduler's
3327 * initialization process. Before this is called, the idle task might
3328 * invoke synchronous grace-period primitives (during which time, this idle
3329 * task is booting the system, and such primitives are no-ops). After this
3330 * function is called, any synchronous grace-period primitives are run as
3331 * expedited, with the requesting task driving the grace period forward.
3332 * A later core_initcall() rcu_set_runtime_mode() will switch to full
3333 * runtime RCU functionality.
3334 */
3335 void rcu_scheduler_starting(void)
3336 {
3337 WARN_ON(num_online_cpus() != 1);
3338 WARN_ON(nr_context_switches() > 0);
3339 rcu_test_sync_prims();
3340 rcu_scheduler_active = RCU_SCHEDULER_INIT;
3341 rcu_test_sync_prims();
3342 }
3343
3344 /*
3345 * Helper function for rcu_init() that initializes the rcu_state structure.
3346 */
3347 static void __init rcu_init_one(void)
3348 {
3349 static const char * const buf[] = RCU_NODE_NAME_INIT;
3350 static const char * const fqs[] = RCU_FQS_NAME_INIT;
3351 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
3352 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
3353
3354 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
3355 int cpustride = 1;
3356 int i;
3357 int j;
3358 struct rcu_node *rnp;
3359
3360 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
3361
3362 /* Silence gcc 4.8 false positive about array index out of range. */
3363 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
3364 panic("rcu_init_one: rcu_num_lvls out of range");
3365
3366 /* Initialize the level-tracking arrays. */
3367
3368 for (i = 1; i < rcu_num_lvls; i++)
3369 rcu_state.level[i] =
3370 rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
3371 rcu_init_levelspread(levelspread, num_rcu_lvl);
3372
3373 /* Initialize the elements themselves, starting from the leaves. */
3374
3375 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3376 cpustride *= levelspread[i];
3377 rnp = rcu_state.level[i];
3378 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
3379 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
3380 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
3381 &rcu_node_class[i], buf[i]);
3382 raw_spin_lock_init(&rnp->fqslock);
3383 lockdep_set_class_and_name(&rnp->fqslock,
3384 &rcu_fqs_class[i], fqs[i]);
3385 rnp->gp_seq = rcu_state.gp_seq;
3386 rnp->gp_seq_needed = rcu_state.gp_seq;
3387 rnp->completedqs = rcu_state.gp_seq;
3388 rnp->qsmask = 0;
3389 rnp->qsmaskinit = 0;
3390 rnp->grplo = j * cpustride;
3391 rnp->grphi = (j + 1) * cpustride - 1;
3392 if (rnp->grphi >= nr_cpu_ids)
3393 rnp->grphi = nr_cpu_ids - 1;
3394 if (i == 0) {
3395 rnp->grpnum = 0;
3396 rnp->grpmask = 0;
3397 rnp->parent = NULL;
3398 } else {
3399 rnp->grpnum = j % levelspread[i - 1];
3400 rnp->grpmask = BIT(rnp->grpnum);
3401 rnp->parent = rcu_state.level[i - 1] +
3402 j / levelspread[i - 1];
3403 }
3404 rnp->level = i;
3405 INIT_LIST_HEAD(&rnp->blkd_tasks);
3406 rcu_init_one_nocb(rnp);
3407 init_waitqueue_head(&rnp->exp_wq[0]);
3408 init_waitqueue_head(&rnp->exp_wq[1]);
3409 init_waitqueue_head(&rnp->exp_wq[2]);
3410 init_waitqueue_head(&rnp->exp_wq[3]);
3411 spin_lock_init(&rnp->exp_lock);
3412 }
3413 }
3414
3415 init_swait_queue_head(&rcu_state.gp_wq);
3416 init_swait_queue_head(&rcu_state.expedited_wq);
3417 rnp = rcu_first_leaf_node();
3418 for_each_possible_cpu(i) {
3419 while (i > rnp->grphi)
3420 rnp++;
3421 per_cpu_ptr(&rcu_data, i)->mynode = rnp;
3422 rcu_boot_init_percpu_data(i);
3423 }
3424 }
3425
3426 /*
3427 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3428 * replace the definitions in tree.h because those are needed to size
3429 * the ->node array in the rcu_state structure.
3430 */
3431 void rcu_init_geometry(void)
3432 {
3433 ulong d;
3434 int i;
3435 static unsigned long old_nr_cpu_ids;
3436 int rcu_capacity[RCU_NUM_LVLS];
3437 static bool initialized;
3438
3439 if (initialized) {
3440 /*
3441 * Warn if setup_nr_cpu_ids() had not yet been invoked,
3442 * unless nr_cpu_ids == NR_CPUS, in which case who cares?
3443 */
3444 WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
3445 return;
3446 }
3447
3448 old_nr_cpu_ids = nr_cpu_ids;
3449 initialized = true;
3450
3451 /*
3452 * Initialize any unspecified boot parameters.
3453 * The default values of jiffies_till_first_fqs and
3454 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
3455 * value, which is a function of HZ, then adding one for each
3456 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
3457 */
3458 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
3459 if (jiffies_till_first_fqs == ULONG_MAX)
3460 jiffies_till_first_fqs = d;
3461 if (jiffies_till_next_fqs == ULONG_MAX)
3462 jiffies_till_next_fqs = d;
3463 adjust_jiffies_till_sched_qs();
3464
3465 /* If the compile-time values are accurate, just leave. */
3466 if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
3467 nr_cpu_ids == NR_CPUS)
3468 return;
3469 pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
3470 rcu_fanout_leaf, nr_cpu_ids);
3471
3472 /*
3473 * The boot-time rcu_fanout_leaf parameter must be at least two
3474 * and cannot exceed the number of bits in the rcu_node masks.
3475 * Complain and fall back to the compile-time values if this
3476 * limit is exceeded.
3477 */
3478 if (rcu_fanout_leaf < 2 ||
3479 rcu_fanout_leaf > sizeof(unsigned long) * 8) {
3480 rcu_fanout_leaf = RCU_FANOUT_LEAF;
3481 WARN_ON(1);
3482 return;
3483 }
3484
3485 /*
3486 * Compute the number of CPUs that can be handled by an rcu_node tree
3487 * with the given number of levels.
3488 */
3489 rcu_capacity[0] = rcu_fanout_leaf;
3490 for (i = 1; i < RCU_NUM_LVLS; i++)
3491 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
3492
3493 /*
3494 * The tree must be able to accommodate the configured number of CPUs.
3495 * If this limit is exceeded, fall back to the compile-time values.
3496 */
3497 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
3498 rcu_fanout_leaf = RCU_FANOUT_LEAF;
3499 WARN_ON(1);
3500 return;
3501 }
3502
3503 /* Calculate the number of levels in the tree. */
3504 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
3505 }
3506 rcu_num_lvls = i + 1;
3507
3508 /* Calculate the number of rcu_nodes at each level of the tree. */
3509 for (i = 0; i < rcu_num_lvls; i++) {
3510 int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
3511 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
3512 }
3513
3514 /* Calculate the total number of rcu_node structures. */
3515 rcu_num_nodes = 0;
3516 for (i = 0; i < rcu_num_lvls; i++)
3517 rcu_num_nodes += num_rcu_lvl[i];
3518 }
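
/*
 * Worked example (illustrative only, assuming the common 64-bit defaults
 * RCU_FANOUT_LEAF=16 and RCU_FANOUT=64, with nr_cpu_ids=100):
 *
 *	rcu_capacity[0] = 16	(CPUs per leaf)
 *	rcu_capacity[1] = 1024	(16 * 64)
 *
 * Since 100 > 16 but 100 <= 1024, the level-counting loop exits with
 * i == 1, so rcu_num_lvls = 2.  Then:
 *
 *	num_rcu_lvl[0] = DIV_ROUND_UP(100, 1024) = 1	(root)
 *	num_rcu_lvl[1] = DIV_ROUND_UP(100, 16)   = 7	(leaves)
 *
 * giving rcu_num_nodes = 8.
 */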
3519
3520 /*
3521 * Dump out the structure of the rcu_node combining tree associated
3522 * with the rcu_state structure.
3523 */
3524 static void __init rcu_dump_rcu_node_tree(void)
3525 {
3526 int level = 0;
3527 struct rcu_node *rnp;
3528
3529 pr_info("rcu_node tree layout dump\n");
3530 pr_info(" ");
3531 rcu_for_each_node_breadth_first(rnp) {
3532 if (rnp->level != level) {
3533 pr_cont("\n");
3534 pr_info(" ");
3535 level = rnp->level;
3536 }
3537 pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
3538 }
3539 pr_cont("\n");
3540 }
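
/*
 * With the 100-CPU, two-level geometry sketched above, and assuming
 * rcu_fanout_exact is set so that each leaf spans exactly 16 CPUs, the
 * dump_tree output would look roughly like this (one line per level,
 * format grplo:grphi ^grpnum):
 *
 *	rcu: rcu_node tree layout dump
 *	rcu:  0:99 ^0
 *	rcu:  0:15 ^0 16:31 ^1 32:47 ^2 48:63 ^3 64:79 ^4 80:95 ^5 96:99 ^6
 */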
3541
3542 struct workqueue_struct *rcu_gp_wq;
3543 struct workqueue_struct *rcu_par_gp_wq;
3544
3545 void __init rcu_init(void)
3546 {
3547 int cpu;
3548
3549 rcu_early_boot_tests();
3550
3551 rcu_bootup_announce();
3552 rcu_init_geometry();
3553 rcu_init_one();
3554 if (dump_tree)
3555 rcu_dump_rcu_node_tree();
3556 if (use_softirq)
3557 open_softirq(RCU_SOFTIRQ, rcu_core_si);
3558
3559 /*
3560 * We don't need protection against CPU-hotplug here because
3561 * this is called early in boot, before either interrupts
3562 * or the scheduler are operational.
3563 */
3564 pm_notifier(rcu_pm_notify, 0);
3565 for_each_online_cpu(cpu) {
3566 rcutree_prepare_cpu(cpu);
3567 rcu_cpu_starting(cpu);
3568 rcutree_online_cpu(cpu);
3569 }
3570
3571 /* Create workqueue for expedited GPs and for Tree SRCU. */
3572 rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
3573 WARN_ON(!rcu_gp_wq);
3574 rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
3575 WARN_ON(!rcu_par_gp_wq);
3576 srcu_init();
3577 }
3578
3579 #include "tree_stall.h"
3580 #include "tree_exp.h"
3581 #include "tree_plugin.h"
3582