1 /*
2  * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  *
18  * Copyright IBM Corporation, 2006
19  *
20  * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21  *		With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22  *		for pushing me away from locks and towards counters, and
23  *		to Suparna Bhattacharya for pushing me completely away
24  *		from atomic instructions on the read side.
25  *
26  *  - Added handling of Dynamic Ticks
27  *      Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com>
28  *                     - Steven Rostedt <srostedt@redhat.com>
29  *
30  * Papers:  http://www.rdrop.com/users/paulmck/RCU
31  *
32  * Design Document: http://lwn.net/Articles/253651/
33  *
34  * For detailed explanation of Read-Copy Update mechanism see -
35  * 		Documentation/RCU/ *.txt
36  *
37  */
38 #include <linux/types.h>
39 #include <linux/kernel.h>
40 #include <linux/init.h>
41 #include <linux/spinlock.h>
42 #include <linux/smp.h>
43 #include <linux/rcupdate.h>
44 #include <linux/interrupt.h>
45 #include <linux/sched.h>
46 #include <asm/atomic.h>
47 #include <linux/bitops.h>
48 #include <linux/module.h>
49 #include <linux/kthread.h>
50 #include <linux/completion.h>
51 #include <linux/moduleparam.h>
52 #include <linux/percpu.h>
53 #include <linux/notifier.h>
54 #include <linux/cpu.h>
55 #include <linux/random.h>
56 #include <linux/delay.h>
57 #include <linux/cpumask.h>
58 #include <linux/rcupreempt_trace.h>
59 #include <asm/byteorder.h>
60 
61 /*
62  * PREEMPT_RCU data structures.
63  */
64 
65 /*
66  * GP_STAGES specifies the number of times the state machine has
67  * to go through all the rcu_try_flip_states (see below)
68  * in a single Grace Period.
69  *
70  * GP in GP_STAGES stands for Grace Period ;)
71  */
72 #define GP_STAGES    2
73 struct rcu_data {
74 	spinlock_t	lock;		/* Protect rcu_data fields. */
75 	long		completed;	/* Number of last completed batch. */
76 	int		waitlistcount;
77 	struct rcu_head *nextlist;
78 	struct rcu_head **nexttail;
79 	struct rcu_head *waitlist[GP_STAGES];
80 	struct rcu_head **waittail[GP_STAGES];
81 	struct rcu_head *donelist;	/* from waitlist & waitschedlist */
82 	struct rcu_head **donetail;
83 	long rcu_flipctr[2];
84 	struct rcu_head *nextschedlist;
85 	struct rcu_head **nextschedtail;
86 	struct rcu_head *waitschedlist;
87 	struct rcu_head **waitschedtail;
88 	int rcu_sched_sleeping;
89 #ifdef CONFIG_RCU_TRACE
90 	struct rcupreempt_trace trace;
91 #endif /* #ifdef CONFIG_RCU_TRACE */
92 };
93 
94 /*
95  * States for rcu_try_flip() and friends.
96  */
97 
98 enum rcu_try_flip_states {
99 
100 	/*
101 	 * Stay here if nothing is happening. Flip the counter if something
102 	 * starts happening. Denoted by "I"
103 	 */
104 	rcu_try_flip_idle_state,
105 
106 	/*
107 	 * Wait here for all CPUs to notice that the counter has flipped. This
108 	 * prevents the old set of counters from ever being incremented once
109 	 * we leave this state, which in turn is necessary because we cannot
110 	 * test any individual counter for zero -- we can only check the sum.
111 	 * Denoted by "A".
112 	 */
113 	rcu_try_flip_waitack_state,
114 
115 	/*
116 	 * Wait here for the sum of the old per-CPU counters to reach zero.
117 	 * Denoted by "Z".
118 	 */
119 	rcu_try_flip_waitzero_state,
120 
121 	/*
122 	 * Wait here for each of the other CPUs to execute a memory barrier.
123 	 * This is necessary to ensure that these other CPUs really have
124 	 * completed executing their RCU read-side critical sections, despite
125 	 * their CPUs wildly reordering memory. Denoted by "M".
126 	 */
127 	rcu_try_flip_waitmb_state,
128 };
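
The four states above cycle idle -> waitack -> waitzero -> waitmb and back to idle; each completed cycle bumps rcu_ctrlblk.completed by one, and a callback becomes invocable only after GP_STAGES such flips. The following user-space sketch (hypothetical names, not part of this file) models that cycle to show why a callback queued in the idle state waits through two flips when GP_STAGES == 2:

/* Illustrative user-space model of the flip cycle; not kernel code. */
#include <stdio.h>

enum flip_state { IDLE, WAITACK, WAITZERO, WAITMB };
#define GP_STAGES 2

int main(void)
{
	enum flip_state state = IDLE;
	long completed = 0;
	long queued_at = 0;	/* value of "completed" when a callback was queued */
	int step;

	for (step = 0; step < 2 * 4; step++) {	/* two full cycles of the machine */
		switch (state) {
		case IDLE:	completed++; state = WAITACK; break;	/* counter flip */
		case WAITACK:	state = WAITZERO; break;	/* all CPUs acked the flip */
		case WAITZERO:	state = WAITMB; break;		/* old counters summed to zero */
		case WAITMB:	state = IDLE; break;		/* all CPUs executed smp_mb() */
		}
	}
	printf("flips since the callback was queued: %ld (grace period needs %d)\n",
	       completed - queued_at, GP_STAGES);
	return 0;
}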
129 
130 /*
131  * States for rcu_ctrlblk.rcu_sched_sleep.
132  */
133 
134 enum rcu_sched_sleep_states {
135 	rcu_sched_not_sleeping,	/* Not sleeping, callbacks need GP.  */
136 	rcu_sched_sleep_prep,	/* Thinking of sleeping, rechecking. */
137 	rcu_sched_sleeping,	/* Sleeping, awaken if GP needed. */
138 };
139 
140 struct rcu_ctrlblk {
141 	spinlock_t	fliplock;	/* Protect state-machine transitions. */
142 	long		completed;	/* Number of last completed batch. */
143 	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 							the rcu state machine */
145 	spinlock_t	schedlock;	/* Protect rcu_sched sleep state. */
146 	enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 	wait_queue_head_t sched_wq;	/* Place for rcu_sched to sleep. */
148 };
149 
150 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
151 static struct rcu_ctrlblk rcu_ctrlblk = {
152 	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
153 	.completed = 0,
154 	.rcu_try_flip_state = rcu_try_flip_idle_state,
155 	.schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
156 	.sched_sleep = rcu_sched_not_sleeping,
157 	.sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
158 };
159 
160 static struct task_struct *rcu_sched_grace_period_task;
161 
162 #ifdef CONFIG_RCU_TRACE
163 static char *rcu_try_flip_state_names[] =
164 	{ "idle", "waitack", "waitzero", "waitmb" };
165 #endif /* #ifdef CONFIG_RCU_TRACE */
166 
167 static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
168 	= CPU_BITS_NONE;
169 
170 /*
171  * Enum and per-CPU flag to determine when each CPU has seen
172  * the most recent counter flip.
173  */
174 
175 enum rcu_flip_flag_values {
176 	rcu_flip_seen,		/* Steady/initial state, last flip seen. */
177 				/* Only GP detector can update. */
178 	rcu_flipped		/* Flip just completed, need confirmation. */
179 				/* Only corresponding CPU can update. */
180 };
181 static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
182 								= rcu_flip_seen;
183 
184 /*
185  * Enum and per-CPU flag to determine when each CPU has executed the
186  * needed memory barrier to fence in memory references from its last RCU
187  * read-side critical section in the just-completed grace period.
188  */
189 
190 enum rcu_mb_flag_values {
191 	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
192 				/* Only GP detector can update. */
193 	rcu_mb_needed		/* Flip just completed, need an mb(). */
194 				/* Only corresponding CPU can update. */
195 };
196 static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
197 								= rcu_mb_done;
198 
199 /*
200  * RCU_DATA_ME: find the current CPU's rcu_data structure.
201  * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
202  */
203 #define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
204 #define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))
205 
206 /*
207  * Helper macro for tracing when the appropriate rcu_data is not
208  * cached in a local variable, but where the CPU number is so cached.
209  */
210 #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
211 
212 /*
213  * Helper macro for tracing when the appropriate rcu_data is not
214  * cached in a local variable.
215  */
216 #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
217 
218 /*
219  * Helper macro for tracing when the appropriate rcu_data is pointed
220  * to by a local variable.
221  */
222 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
223 
224 #define RCU_SCHED_BATCH_TIME (HZ / 50)
225 
226 /*
227  * Return the number of RCU batches processed thus far.  Useful
228  * for debug and statistics.
229  */
230 long rcu_batches_completed(void)
231 {
232 	return rcu_ctrlblk.completed;
233 }
234 EXPORT_SYMBOL_GPL(rcu_batches_completed);
235 
236 void __rcu_read_lock(void)
237 {
238 	int idx;
239 	struct task_struct *t = current;
240 	int nesting;
241 
242 	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
243 	if (nesting != 0) {
244 
245 		/* An earlier rcu_read_lock() covers us, just count it. */
246 
247 		t->rcu_read_lock_nesting = nesting + 1;
248 
249 	} else {
250 		unsigned long flags;
251 
252 		/*
253 		 * We disable interrupts for the following reasons:
254 		 * - If we get scheduling clock interrupt here, and we
255 		 *   end up acking the counter flip, it's like a promise
256 		 *   that we will never increment the old counter again.
257 		 *   Thus we will break that promise if that
258 		 *   scheduling clock interrupt happens between the time
259 		 *   we pick the .completed field and the time that we
260 		 *   increment our counter.
261 		 *
262 		 * - We don't want to be preempted out here.
263 		 *
264 		 * NMIs can still occur, of course, and might themselves
265 		 * contain rcu_read_lock().
266 		 */
267 
268 		local_irq_save(flags);
269 
270 		/*
271 		 * Outermost nesting of rcu_read_lock(), so increment
272 		 * the current counter for the current CPU.  Use volatile
273 		 * casts to prevent the compiler from reordering.
274 		 */
275 
276 		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
277 		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
278 
279 		/*
280 		 * Now that the per-CPU counter has been incremented, we
281 		 * are protected from races with rcu_read_lock() invoked
282 		 * from NMI handlers on this CPU.  We can therefore safely
283 		 * increment the nesting counter, relieving further NMIs
284 		 * of the need to increment the per-CPU counter.
285 		 */
286 
287 		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
288 
289 		/*
290 		 * Now that we have prevented any NMIs from storing
291 		 * to the ->rcu_flipctr_idx, we can safely use it to
292 		 * remember which counter to decrement in the matching
293 		 * rcu_read_unlock().
294 		 */
295 
296 		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
297 		local_irq_restore(flags);
298 	}
299 }
300 EXPORT_SYMBOL_GPL(__rcu_read_lock);
301 
302 void __rcu_read_unlock(void)
303 {
304 	int idx;
305 	struct task_struct *t = current;
306 	int nesting;
307 
308 	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
309 	if (nesting > 1) {
310 
311 		/*
312 		 * We are still protected by the enclosing rcu_read_lock(),
313 		 * so simply decrement the counter.
314 		 */
315 
316 		t->rcu_read_lock_nesting = nesting - 1;
317 
318 	} else {
319 		unsigned long flags;
320 
321 		/*
322 		 * Disable local interrupts to prevent the grace-period
323 		 * detection state machine from seeing us half-done.
324 		 * NMIs can still occur, of course, and might themselves
325 		 * contain rcu_read_lock() and rcu_read_unlock().
326 		 */
327 
328 		local_irq_save(flags);
329 
330 		/*
331 		 * Outermost nesting of rcu_read_unlock(), so we must
332 		 * decrement the current counter for the current CPU.
333 		 * This must be done carefully, because NMIs can
334 		 * occur at any point in this code, and any rcu_read_lock()
335 		 * and rcu_read_unlock() pairs in the NMI handlers
336 		 * must interact non-destructively with this code.
337 		 * Lots of volatile casts, and -very- careful ordering.
338 		 *
339 		 * Changes to this code, including this one, must be
340 		 * inspected, validated, and tested extremely carefully!!!
341 		 */
342 
343 		/*
344 		 * First, pick up the index.
345 		 */
346 
347 		idx = ACCESS_ONCE(t->rcu_flipctr_idx);
348 
349 		/*
350 		 * Now that we have fetched the counter index, it is
351 		 * safe to decrement the per-task RCU nesting counter.
352 		 * After this, any interrupts or NMIs will increment and
353 		 * decrement the per-CPU counters.
354 		 */
355 		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
356 
357 		/*
358 		 * It is now safe to decrement this task's nesting count.
359 		 * NMIs that occur after this statement will route their
360 		 * rcu_read_lock() calls through this "else" clause, and
361 		 * will thus start incrementing the per-CPU counter on
362 		 * their own.  They will also clobber ->rcu_flipctr_idx,
363 		 * but that is OK, since we have already fetched it.
364 		 */
365 
366 		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
367 		local_irq_restore(flags);
368 	}
369 }
370 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
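
Together, __rcu_read_lock() and __rcu_read_unlock() maintain a pair of per-CPU counters selected by the low bit of rcu_ctrlblk.completed; the chosen index is stashed in the task so the unlock decrements exactly the counter the lock incremented, even if a counter flip happens inside the critical section. A minimal single-threaded user-space sketch of that bookkeeping (names hypothetical, interrupts and NMIs ignored):

/* User-space model of the rcu_flipctr[2] bookkeeping; not kernel code. */
#include <assert.h>
#include <stdio.h>

static long completed;		/* stands in for rcu_ctrlblk.completed */
static long flipctr[2];		/* stands in for one CPU's rcu_flipctr[] */
static int nesting;		/* stands in for t->rcu_read_lock_nesting */
static int flipctr_idx;		/* stands in for t->rcu_flipctr_idx */

static void read_lock(void)
{
	if (nesting++ == 0) {
		flipctr_idx = completed & 0x1;	/* pick the "current" counter */
		flipctr[flipctr_idx]++;
	}
}

static void read_unlock(void)
{
	if (--nesting == 0)
		flipctr[flipctr_idx]--;		/* same index, even after a flip */
}

int main(void)
{
	read_lock();
	completed++;		/* a counter flip occurs mid-critical-section */
	read_unlock();
	assert(flipctr[0] == 0 && flipctr[1] == 0);
	printf("both counters drained: [%ld, %ld]\n", flipctr[0], flipctr[1]);
	return 0;
}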
371 
372 /*
373  * If a global counter flip has occurred since the last time that we
374  * advanced callbacks, advance them.  Hardware interrupts must be
375  * disabled when calling this function.
376  */
377 static void __rcu_advance_callbacks(struct rcu_data *rdp)
378 {
379 	int cpu;
380 	int i;
381 	int wlc = 0;
382 
383 	if (rdp->completed != rcu_ctrlblk.completed) {
384 		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
385 			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
386 			rdp->donetail = rdp->waittail[GP_STAGES - 1];
387 			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
388 		}
389 		for (i = GP_STAGES - 2; i >= 0; i--) {
390 			if (rdp->waitlist[i] != NULL) {
391 				rdp->waitlist[i + 1] = rdp->waitlist[i];
392 				rdp->waittail[i + 1] = rdp->waittail[i];
393 				wlc++;
394 			} else {
395 				rdp->waitlist[i + 1] = NULL;
396 				rdp->waittail[i + 1] =
397 					&rdp->waitlist[i + 1];
398 			}
399 		}
400 		if (rdp->nextlist != NULL) {
401 			rdp->waitlist[0] = rdp->nextlist;
402 			rdp->waittail[0] = rdp->nexttail;
403 			wlc++;
404 			rdp->nextlist = NULL;
405 			rdp->nexttail = &rdp->nextlist;
406 			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
407 		} else {
408 			rdp->waitlist[0] = NULL;
409 			rdp->waittail[0] = &rdp->waitlist[0];
410 		}
411 		rdp->waitlistcount = wlc;
412 		rdp->completed = rcu_ctrlblk.completed;
413 	}
414 
415 	/*
416 	 * Check to see if this CPU needs to report that it has seen
417 	 * the most recent counter flip, thereby declaring that all
418 	 * subsequent rcu_read_lock() invocations will respect this flip.
419 	 */
420 
421 	cpu = raw_smp_processor_id();
422 	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
423 		smp_mb();  /* Subsequent counter accesses must see new value */
424 		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
425 		smp_mb();  /* Subsequent RCU read-side critical sections */
426 			   /*  seen -after- acknowledgement. */
427 	}
428 }
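
Every list in struct rcu_data is kept as a head pointer plus a pointer to the final ->next field, so __rcu_advance_callbacks() can splice a whole stage onto the next one in constant time while preserving order. A stand-alone sketch of that head/tail-pointer splice (hypothetical struct, not part of this file):

/* User-space demonstration of the head/**tail splice used for the waitlists. */
#include <stdio.h>

struct node { struct node *next; int id; };

int main(void)
{
	struct node a = { NULL, 1 }, b = { NULL, 2 };
	struct node *src = &a, **srctail;
	struct node *dst = NULL, **dsttail = &dst;
	struct node *p;

	a.next = &b;
	srctail = &b.next;	/* points at the last ->next field of src */

	*dsttail = src;		/* append the whole src list to dst ... */
	dsttail = srctail;	/* ... and adopt its tail */
	src = NULL;		/* src is now empty */
	srctail = &src;

	for (p = dst; p != NULL; p = p->next)
		printf("node %d\n", p->id);	/* order preserved: 1 then 2 */
	return 0;
}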
429 
430 DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
431 	.dynticks = 1,
432 };
433 
434 #ifdef CONFIG_NO_HZ
435 static DEFINE_PER_CPU(int, rcu_update_flag);
436 
437 /**
438  * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
439  *
440  * If the CPU was idle with dynamic ticks active, this updates the
441  * rcu_dyntick_sched.dynticks to let the RCU handling know that the
442  * CPU is active.
443  */
444 void rcu_irq_enter(void)
445 {
446 	int cpu = smp_processor_id();
447 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
448 
449 	if (per_cpu(rcu_update_flag, cpu))
450 		per_cpu(rcu_update_flag, cpu)++;
451 
452 	/*
453 	 * Only update if we are coming from a stopped ticks mode
454 	 * (rcu_dyntick_sched.dynticks is even).
455 	 */
456 	if (!in_interrupt() &&
457 	    (rdssp->dynticks & 0x1) == 0) {
458 		/*
459 		 * The following might seem like we could have a race
460 		 * with NMI/SMIs. But this really isn't a problem.
461 		 * Here we do a read/modify/write, and the race happens
462 		 * when an NMI/SMI comes in after the read and before
463 		 * the write. But NMI/SMIs will increment this counter
464 		 * twice before returning, so the zero bit will not
465 		 * be corrupted by the NMI/SMI which is the most important
466 		 * part.
467 		 *
468 		 * The only thing is that we would bring back the counter
469 		 * to a position that it was in during the NMI/SMI.
470 		 * But the zero bit would be set, so the rest of the
471 		 * counter would again be ignored.
472 		 *
473 		 * On return from the IRQ, the counter may have the zero
474 		 * bit be 0 and the counter the same as the return from
475 		 * the NMI/SMI. If the state machine was so unlucky to
476 		 * see that, it still doesn't matter, since all
477 		 * RCU read-side critical sections on this CPU would
478 		 * have already completed.
479 		 */
480 		rdssp->dynticks++;
481 		/*
482 		 * The following memory barrier ensures that any
483 		 * rcu_read_lock() primitives in the irq handler
484 		 * are seen by other CPUs to follow the above
485 		 * increment to rcu_dyntick_sched.dynticks. This is
486 		 * required in order for other CPUs to correctly
487 		 * determine when it is safe to advance the RCU
488 		 * grace-period state machine.
489 		 */
490 		smp_mb(); /* see above block comment. */
491 		/*
492 		 * Since we can't determine the dynamic tick mode from
493 		 * the rcu_dyntick_sched.dynticks after this routine,
494 		 * we use a second flag to acknowledge that we came
495 		 * from an idle state with ticks stopped.
496 		 */
497 		per_cpu(rcu_update_flag, cpu)++;
498 		/*
499 		 * If we take an NMI/SMI now, they will also increment
500 		 * the rcu_update_flag, and will not update the
501 		 * rcu_dyntick_sched.dynticks on exit. That is for
502 		 * this IRQ to do.
503 		 */
504 	}
505 }
506 
507 /**
508  * rcu_irq_exit - Called from exiting Hard irq context.
509  *
510  * If the CPU was idle with dynamic ticks active, update the
511  * rcu_dyntick_sched.dynticks to let the RCU handling be
512  * aware that the CPU is going back to idle with no ticks.
513  */
514 void rcu_irq_exit(void)
515 {
516 	int cpu = smp_processor_id();
517 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
518 
519 	/*
520 	 * rcu_update_flag is set if we interrupted the CPU
521 	 * when it was idle with ticks stopped.
522 	 * Once this occurs, we keep track of interrupt nesting
523 	 * because a NMI/SMI could also come in, and we still
524 	 * only want the IRQ that started the increment of the
525 	 * rcu_dyntick_sched.dynticks to be the one that modifies
526 	 * it on exit.
527 	 */
528 	if (per_cpu(rcu_update_flag, cpu)) {
529 		if (--per_cpu(rcu_update_flag, cpu))
530 			return;
531 
532 		/* This must match the interrupt nesting */
533 		WARN_ON(in_interrupt());
534 
535 		/*
536 		 * If an NMI/SMI happens now we are still
537 		 * protected by the rcu_dyntick_sched.dynticks being odd.
538 		 */
539 
540 		/*
541 		 * The following memory barrier ensures that any
542 		 * rcu_read_unlock() primitives in the irq handler
543 		 * are seen by other CPUs to precede the following
544 		 * increment to rcu_dyntick_sched.dynticks. This
545 		 * is required in order for other CPUs to determine
546 		 * when it is safe to advance the RCU grace-period
547 		 * state machine.
548 		 */
549 		smp_mb(); /* see above block comment. */
550 		rdssp->dynticks++;
551 		WARN_ON(rdssp->dynticks & 0x1);
552 	}
553 }
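
rcu_irq_enter() and rcu_irq_exit() keep rcu_dyntick_sched.dynticks odd while a dynticks-idle CPU is inside an irq/NMI handler and even while it is genuinely idle, so the grace-period machinery can later tell the two apart. A tiny user-space model of that parity protocol (hypothetical names, the in_interrupt() test omitted):

/* User-space model of the even/odd dynticks convention; not kernel code. */
#include <assert.h>
#include <stdio.h>

static long dynticks = 1;	/* odd: the CPU is "active" at boot */
static int update_flag;		/* stands in for per-CPU rcu_update_flag */

static void irq_enter(void)
{
	if (update_flag)
		update_flag++;		/* nested NMI/SMI: just track nesting */
	if ((dynticks & 0x1) == 0) {	/* coming out of dynticks-idle */
		dynticks++;		/* now odd: handlers may use rcu_read_lock() */
		update_flag++;
	}
}

static void irq_exit(void)
{
	if (update_flag && --update_flag == 0) {
		dynticks++;		/* back to even: the CPU is idle again */
		assert((dynticks & 0x1) == 0);
	}
}

int main(void)
{
	dynticks++;			/* CPU enters dynticks-idle: counter becomes even */
	irq_enter();			/* an irq arrives ... */
	irq_enter();			/* ... and an NMI nests inside it */
	irq_exit();
	irq_exit();
	printf("dynticks = %ld (even => this CPU may be ignored)\n", dynticks);
	return 0;
}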
554 
555 void rcu_nmi_enter(void)
556 {
557 	rcu_irq_enter();
558 }
559 
560 void rcu_nmi_exit(void)
561 {
562 	rcu_irq_exit();
563 }
564 
565 static void dyntick_save_progress_counter(int cpu)
566 {
567 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
568 
569 	rdssp->dynticks_snap = rdssp->dynticks;
570 }
571 
572 static inline int
573 rcu_try_flip_waitack_needed(int cpu)
574 {
575 	long curr;
576 	long snap;
577 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
578 
579 	curr = rdssp->dynticks;
580 	snap = rdssp->dynticks_snap;
581 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
582 
583 	/*
584 	 * If the CPU remained in dynticks mode for the entire time
585 	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
586 	 * then it cannot be in the middle of an rcu_read_lock(), so
587 	 * the next rcu_read_lock() it executes must use the new value
588 	 * of the counter.  So we can safely pretend that this CPU
589 	 * already acknowledged the counter.
590 	 */
591 
592 	if ((curr == snap) && ((curr & 0x1) == 0))
593 		return 0;
594 
595 	/*
596 	 * If the CPU passed through or entered a dynticks idle phase with
597 	 * no active irq handlers, then, as above, we can safely pretend
598 	 * that this CPU already acknowledged the counter.
599 	 */
600 
601 	if ((curr - snap) > 2 || (curr & 0x1) == 0)
602 		return 0;
603 
604 	/* We need this CPU to explicitly acknowledge the counter flip. */
605 
606 	return 1;
607 }
608 
609 static inline int
610 rcu_try_flip_waitmb_needed(int cpu)
611 {
612 	long curr;
613 	long snap;
614 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
615 
616 	curr = rdssp->dynticks;
617 	snap = rdssp->dynticks_snap;
618 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
619 
620 	/*
621 	 * If the CPU remained in dynticks mode for the entire time
622 	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
623 	 * then it cannot have executed an RCU read-side critical section
624 	 * during that time, so there is no need for it to execute a
625 	 * memory barrier.
626 	 */
627 
628 	if ((curr == snap) && ((curr & 0x1) == 0))
629 		return 0;
630 
631 	/*
632 	 * If the CPU either entered or exited an outermost interrupt,
633 	 * SMI, NMI, or whatever handler, then we know that it executed
634 	 * a memory barrier when doing so.  So we don't need another one.
635 	 */
636 	if (curr != snap)
637 		return 0;
638 
639 	/* We need the CPU to execute a memory barrier. */
640 
641 	return 1;
642 }
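
Both helpers above apply the same trick: snapshot the dynticks counter when the flip is requested, then compare the current value against the snapshot later. A small stand-alone illustration of the two "this CPU can be skipped" tests (hypothetical helper names, simplified from the checks above):

/* User-space sketch of the dynticks snapshot tests; not kernel code. */
#include <stdio.h>

/* CPU stayed in dynticks-idle the whole time: counter unchanged and even. */
static int stayed_idle(long curr, long snap)
{
	return curr == snap && (curr & 0x1) == 0;
}

/* CPU passed through (or entered) dynticks-idle since the snapshot. */
static int passed_through_idle(long curr, long snap)
{
	return (curr - snap) > 2 || (curr & 0x1) == 0;
}

int main(void)
{
	printf("idle throughout:              %d\n", stayed_idle(8, 8));		/* 1 */
	printf("took an irq, went idle again: %d\n", passed_through_idle(10, 7));	/* 1 */
	printf("still inside an irq handler:  %d\n", passed_through_idle(9, 9));	/* 0: must ack */
	return 0;
}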
643 
644 static void dyntick_save_progress_counter_sched(int cpu)
645 {
646 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
647 
648 	rdssp->sched_dynticks_snap = rdssp->dynticks;
649 }
650 
651 static int rcu_qsctr_inc_needed_dyntick(int cpu)
652 {
653 	long curr;
654 	long snap;
655 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
656 
657 	curr = rdssp->dynticks;
658 	snap = rdssp->sched_dynticks_snap;
659 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
660 
661 	/*
662 	 * If the CPU remained in dynticks mode for the entire time
663 	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
664 	 * then it cannot be in the middle of an rcu_read_lock(), so
665 	 * the next rcu_read_lock() it executes must use the new value
666 	 * of the counter.  Therefore, this CPU has been in a quiescent
667 	 * state the entire time, and we don't need to wait for it.
668 	 */
669 
670 	if ((curr == snap) && ((curr & 0x1) == 0))
671 		return 0;
672 
673 	/*
674 	 * If the CPU passed through or entered a dynticks idle phase with
675 	 * no active irq handlers, then, as above, this CPU has already
676 	 * passed through a quiescent state.
677 	 */
678 
679 	if ((curr - snap) > 2 || (snap & 0x1) == 0)
680 		return 0;
681 
682 	/* We need this CPU to go through a quiescent state. */
683 
684 	return 1;
685 }
686 
687 #else /* !CONFIG_NO_HZ */
688 
689 # define dyntick_save_progress_counter(cpu)		do { } while (0)
690 # define rcu_try_flip_waitack_needed(cpu)		(1)
691 # define rcu_try_flip_waitmb_needed(cpu)		(1)
692 
693 # define dyntick_save_progress_counter_sched(cpu)	do { } while (0)
694 # define rcu_qsctr_inc_needed_dyntick(cpu)		(1)
695 
696 #endif /* CONFIG_NO_HZ */
697 
698 static void save_qsctr_sched(int cpu)
699 {
700 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
701 
702 	rdssp->sched_qs_snap = rdssp->sched_qs;
703 }
704 
705 static inline int rcu_qsctr_inc_needed(int cpu)
706 {
707 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
708 
709 	/*
710 	 * If there has been a quiescent state, no more need to wait
711 	 * on this CPU.
712 	 */
713 
714 	if (rdssp->sched_qs != rdssp->sched_qs_snap) {
715 		smp_mb(); /* force ordering with cpu entering schedule(). */
716 		return 0;
717 	}
718 
719 	/* We need this CPU to go through a quiescent state. */
720 
721 	return 1;
722 }
723 
724 /*
725  * Get here when RCU is idle.  Decide whether we need to
726  * move out of idle state, and return non-zero if so.
727  * "Straightforward" approach for the moment, might later
728  * use callback-list lengths, grace-period duration, or
729  * some such to determine when to exit idle state.
730  * Might also need a pre-idle test that does not acquire
731  * the lock, but let's get the simple case working first...
732  */
733 
734 static int
735 rcu_try_flip_idle(void)
736 {
737 	int cpu;
738 
739 	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
740 	if (!rcu_pending(smp_processor_id())) {
741 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
742 		return 0;
743 	}
744 
745 	/*
746 	 * Do the flip.
747 	 */
748 
749 	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
750 	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
751 
752 	/*
753 	 * Need a memory barrier so that other CPUs see the new
754 	 * counter value before they see the subsequent change of all
755 	 * the rcu_flip_flag instances to rcu_flipped.
756 	 */
757 
758 	smp_mb();	/* see above block comment. */
759 
760 	/* Now ask each CPU for acknowledgement of the flip. */
761 
762 	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
763 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
764 		dyntick_save_progress_counter(cpu);
765 	}
766 
767 	return 1;
768 }
769 
770 /*
771  * Wait for CPUs to acknowledge the flip.
772  */
773 
774 static int
775 rcu_try_flip_waitack(void)
776 {
777 	int cpu;
778 
779 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
780 	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
781 		if (rcu_try_flip_waitack_needed(cpu) &&
782 		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
783 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
784 			return 0;
785 		}
786 
787 	/*
788 	 * Make sure our checks above don't bleed into subsequent
789 	 * waiting for the sum of the counters to reach zero.
790 	 */
791 
792 	smp_mb();	/* see above block comment. */
793 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
794 	return 1;
795 }
796 
797 /*
798  * Wait for collective ``last'' counter to reach zero,
799  * then tell all CPUs to do an end-of-grace-period memory barrier.
800  */
801 
802 static int
803 rcu_try_flip_waitzero(void)
804 {
805 	int cpu;
806 	int lastidx = !(rcu_ctrlblk.completed & 0x1);
807 	int sum = 0;
808 
809 	/* Check to see if the sum of the "last" counters is zero. */
810 
811 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
812 	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
813 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
814 	if (sum != 0) {
815 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
816 		return 0;
817 	}
818 
819 	/*
820 	 * This ensures that the other CPUs see the call for
821 	 * memory barriers -after- the sum to zero has been
822 	 * detected here
823 	 */
824 	smp_mb();  /*  ^^^^^^^^^^^^ */
825 
826 	/* Call for a memory barrier from each CPU. */
827 	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
828 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
829 		dyntick_save_progress_counter(cpu);
830 	}
831 
832 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
833 	return 1;
834 }
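
A preemptible reader may enter rcu_read_lock() on one CPU and leave via rcu_read_unlock() on another, so an individual CPU's "last" counter can legitimately be negative; only the sum across all online CPUs is meaningful, which is why this state waits for that sum to hit zero. A toy illustration (hypothetical numbers):

/* Toy example: only the sum of the per-CPU old counters matters. */
#include <stdio.h>

int main(void)
{
	long old_ctr[4] = { +1, -1, 0, 0 };	/* locked on CPU 0, unlocked on CPU 1 */
	long sum = 0;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		sum += old_ctr[cpu];
	printf("sum = %ld -> %s\n", sum,
	       sum == 0 ? "grace-period stage may advance" : "keep waiting");
	return 0;
}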
835 
836 /*
837  * Wait for all CPUs to do their end-of-grace-period memory barrier.
838  * Return 0 once all CPUs have done so.
839  */
840 
841 static int
842 rcu_try_flip_waitmb(void)
843 {
844 	int cpu;
845 
846 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
847 	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
848 		if (rcu_try_flip_waitmb_needed(cpu) &&
849 		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
850 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
851 			return 0;
852 		}
853 
854 	smp_mb(); /* Ensure that the above checks precede any following flip. */
855 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
856 	return 1;
857 }
858 
859 /*
860  * Attempt a single flip of the counters.  Remember, a single flip does
861  * -not- constitute a grace period.  Instead, the interval between
862  * at least GP_STAGES consecutive flips is a grace period.
863  *
864  * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
865  * on a large SMP, they might want to use a hierarchical organization of
866  * the per-CPU-counter pairs.
867  */
868 static void rcu_try_flip(void)
869 {
870 	unsigned long flags;
871 
872 	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
873 	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
874 		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
875 		return;
876 	}
877 
878 	/*
879 	 * Take the next transition(s) through the RCU grace-period
880 	 * flip-counter state machine.
881 	 */
882 
883 	switch (rcu_ctrlblk.rcu_try_flip_state) {
884 	case rcu_try_flip_idle_state:
885 		if (rcu_try_flip_idle())
886 			rcu_ctrlblk.rcu_try_flip_state =
887 				rcu_try_flip_waitack_state;
888 		break;
889 	case rcu_try_flip_waitack_state:
890 		if (rcu_try_flip_waitack())
891 			rcu_ctrlblk.rcu_try_flip_state =
892 				rcu_try_flip_waitzero_state;
893 		break;
894 	case rcu_try_flip_waitzero_state:
895 		if (rcu_try_flip_waitzero())
896 			rcu_ctrlblk.rcu_try_flip_state =
897 				rcu_try_flip_waitmb_state;
898 		break;
899 	case rcu_try_flip_waitmb_state:
900 		if (rcu_try_flip_waitmb())
901 			rcu_ctrlblk.rcu_try_flip_state =
902 				rcu_try_flip_idle_state;
903 	}
904 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
905 }
906 
907 /*
908  * Check to see if this CPU needs to do a memory barrier in order to
909  * ensure that any prior RCU read-side critical sections have committed
910  * their counter manipulations and critical-section memory references
911  * before declaring the grace period to be completed.
912  */
913 static void rcu_check_mb(int cpu)
914 {
915 	if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
916 		smp_mb();  /* Ensure RCU read-side accesses are visible. */
917 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
918 	}
919 }
920 
921 void rcu_check_callbacks(int cpu, int user)
922 {
923 	unsigned long flags;
924 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
925 
926 	/*
927 	 * If this CPU took its interrupt from user mode or from the
928 	 * idle loop, and this is not a nested interrupt, then
929 	 * this CPU has to have exited all prior preempt-disable
930 	 * sections of code.  So increment the counter to note this.
931 	 *
932 	 * The memory barrier is needed to handle the case where
933 	 * writes from a preempt-disable section of code get reordered
934 	 * into schedule() by this CPU's write buffer.  So the memory
935 	 * barrier makes sure that the rcu_qsctr_inc() is seen by other
936 	 * CPUs to happen after any such write.
937 	 */
938 
939 	if (user ||
940 	    (idle_cpu(cpu) && !in_softirq() &&
941 	     hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
942 		smp_mb();	/* Guard against aggressive schedule(). */
943 	     	rcu_qsctr_inc(cpu);
944 	}
945 
946 	rcu_check_mb(cpu);
947 	if (rcu_ctrlblk.completed == rdp->completed)
948 		rcu_try_flip();
949 	spin_lock_irqsave(&rdp->lock, flags);
950 	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
951 	__rcu_advance_callbacks(rdp);
952 	if (rdp->donelist == NULL) {
953 		spin_unlock_irqrestore(&rdp->lock, flags);
954 	} else {
955 		spin_unlock_irqrestore(&rdp->lock, flags);
956 		raise_softirq(RCU_SOFTIRQ);
957 	}
958 }
959 
960 /*
961  * Needed by dynticks, to make sure all RCU processing has finished
962  * when we go idle:
963  */
964 void rcu_advance_callbacks(int cpu, int user)
965 {
966 	unsigned long flags;
967 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
968 
969 	if (rcu_ctrlblk.completed == rdp->completed) {
970 		rcu_try_flip();
971 		if (rcu_ctrlblk.completed == rdp->completed)
972 			return;
973 	}
974 	spin_lock_irqsave(&rdp->lock, flags);
975 	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
976 	__rcu_advance_callbacks(rdp);
977 	spin_unlock_irqrestore(&rdp->lock, flags);
978 }
979 
980 #ifdef CONFIG_HOTPLUG_CPU
981 #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
982 		*dsttail = srclist; \
983 		if (srclist != NULL) { \
984 			dsttail = srctail; \
985 			srclist = NULL; \
986 			srctail = &srclist;\
987 		} \
988 	} while (0)
989 
990 void rcu_offline_cpu(int cpu)
991 {
992 	int i;
993 	struct rcu_head *list = NULL;
994 	unsigned long flags;
995 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
996 	struct rcu_head *schedlist = NULL;
997 	struct rcu_head **schedtail = &schedlist;
998 	struct rcu_head **tail = &list;
999 
1000 	/*
1001 	 * Remove all callbacks from the newly dead CPU, retaining order.
1002 	 * Otherwise rcu_barrier() will fail.
1003 	 */
1004 
1005 	spin_lock_irqsave(&rdp->lock, flags);
1006 	rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1007 	for (i = GP_STAGES - 1; i >= 0; i--)
1008 		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1009 						list, tail);
1010 	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1011 	rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1012 				schedlist, schedtail);
1013 	rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1014 				schedlist, schedtail);
1015 	rdp->rcu_sched_sleeping = 0;
1016 	spin_unlock_irqrestore(&rdp->lock, flags);
1017 	rdp->waitlistcount = 0;
1018 
1019 	/* Disengage the newly dead CPU from the grace-period computation. */
1020 
1021 	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1022 	rcu_check_mb(cpu);
1023 	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1024 		smp_mb();  /* Subsequent counter accesses must see new value */
1025 		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1026 		smp_mb();  /* Subsequent RCU read-side critical sections */
1027 			   /*  seen -after- acknowledgement. */
1028 	}
1029 
1030 	RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1031 	RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1032 
1033 	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1034 	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1035 
1036 	cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1037 
1038 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1039 
1040 	/*
1041 	 * Place the removed callbacks on the current CPU's queue.
1042 	 * Make them all start a new grace period: simple approach,
1043 	 * in theory could starve a given set of callbacks, but
1044 	 * you would need to be doing some serious CPU hotplugging
1045 	 * to make this happen.  If this becomes a problem, adding
1046 	 * a synchronize_rcu() to the hotplug path would be a simple
1047 	 * fix.
1048 	 */
1049 
1050 	local_irq_save(flags);  /* disable preempt till we know what lock. */
1051 	rdp = RCU_DATA_ME();
1052 	spin_lock(&rdp->lock);
1053 	*rdp->nexttail = list;
1054 	if (list)
1055 		rdp->nexttail = tail;
1056 	*rdp->nextschedtail = schedlist;
1057 	if (schedlist)
1058 		rdp->nextschedtail = schedtail;
1059 	spin_unlock_irqrestore(&rdp->lock, flags);
1060 }
1061 
1062 #else /* #ifdef CONFIG_HOTPLUG_CPU */
1063 
1064 void rcu_offline_cpu(int cpu)
1065 {
1066 }
1067 
1068 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1069 
1070 void __cpuinit rcu_online_cpu(int cpu)
1071 {
1072 	unsigned long flags;
1073 	struct rcu_data *rdp;
1074 
1075 	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1076 	cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1078 
1079 	/*
1080 	 * The rcu_sched grace-period processing might have bypassed
1081 	 * this CPU, given that it was not in the rcu_cpu_online_map
1082 	 * when the grace-period scan started.  This means that the
1083 	 * grace-period task might sleep.  So make sure that if this
1084 	 * should happen, the first callback posted to this CPU will
1085 	 * wake up the grace-period task if need be.
1086 	 */
1087 
1088 	rdp = RCU_DATA_CPU(cpu);
1089 	spin_lock_irqsave(&rdp->lock, flags);
1090 	rdp->rcu_sched_sleeping = 1;
1091 	spin_unlock_irqrestore(&rdp->lock, flags);
1092 }
1093 
1094 static void rcu_process_callbacks(struct softirq_action *unused)
1095 {
1096 	unsigned long flags;
1097 	struct rcu_head *next, *list;
1098 	struct rcu_data *rdp;
1099 
1100 	local_irq_save(flags);
1101 	rdp = RCU_DATA_ME();
1102 	spin_lock(&rdp->lock);
1103 	list = rdp->donelist;
1104 	if (list == NULL) {
1105 		spin_unlock_irqrestore(&rdp->lock, flags);
1106 		return;
1107 	}
1108 	rdp->donelist = NULL;
1109 	rdp->donetail = &rdp->donelist;
1110 	RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1111 	spin_unlock_irqrestore(&rdp->lock, flags);
1112 	while (list) {
1113 		next = list->next;
1114 		list->func(list);
1115 		list = next;
1116 		RCU_TRACE_ME(rcupreempt_trace_invoke);
1117 	}
1118 }
1119 
1120 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1121 {
1122 	unsigned long flags;
1123 	struct rcu_data *rdp;
1124 
1125 	head->func = func;
1126 	head->next = NULL;
1127 	local_irq_save(flags);
1128 	rdp = RCU_DATA_ME();
1129 	spin_lock(&rdp->lock);
1130 	__rcu_advance_callbacks(rdp);
1131 	*rdp->nexttail = head;
1132 	rdp->nexttail = &head->next;
1133 	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1134 	spin_unlock_irqrestore(&rdp->lock, flags);
1135 }
1136 EXPORT_SYMBOL_GPL(call_rcu);
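
For reference, callers typically embed a struct rcu_head in the protected object and free the object from the callback once a grace period has elapsed. A hedged sketch of that pattern (kernel-module context with the usual headers assumed; the foo names and the updater's locking scheme are hypothetical):

/* Typical call_rcu() usage; illustrative only, not part of this file. */
struct foo {
	int data;
	struct rcu_head rcu;
};

static struct foo *global_foo;		/* readers dereference under rcu_read_lock() */

static void foo_reclaim(struct rcu_head *head)
{
	struct foo *fp = container_of(head, struct foo, rcu);

	kfree(fp);			/* safe: all pre-existing readers are done */
}

/* Caller is assumed to hold the update-side lock protecting global_foo. */
static void foo_update(int new_data)
{
	struct foo *new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
	struct foo *old_fp = global_foo;

	if (!new_fp)
		return;
	new_fp->data = new_data;
	rcu_assign_pointer(global_foo, new_fp);
	if (old_fp)
		call_rcu(&old_fp->rcu, foo_reclaim);	/* reclaim after a grace period */
}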
1137 
1138 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1139 {
1140 	unsigned long flags;
1141 	struct rcu_data *rdp;
1142 	int wake_gp = 0;
1143 
1144 	head->func = func;
1145 	head->next = NULL;
1146 	local_irq_save(flags);
1147 	rdp = RCU_DATA_ME();
1148 	spin_lock(&rdp->lock);
1149 	*rdp->nextschedtail = head;
1150 	rdp->nextschedtail = &head->next;
1151 	if (rdp->rcu_sched_sleeping) {
1152 
1153 		/* Grace-period processing might be sleeping... */
1154 
1155 		rdp->rcu_sched_sleeping = 0;
1156 		wake_gp = 1;
1157 	}
1158 	spin_unlock_irqrestore(&rdp->lock, flags);
1159 	if (wake_gp) {
1160 
1161 		/* Wake up grace-period processing, unless someone beat us. */
1162 
1163 		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1164 		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1165 			wake_gp = 0;
1166 		rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1167 		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1168 		if (wake_gp)
1169 			wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1170 	}
1171 }
1172 EXPORT_SYMBOL_GPL(call_rcu_sched);
1173 
1174 /*
1175  * Wait until all currently running preempt_disable() code segments
1176  * (including hardware-irq-disable segments) complete.  Note that
1177  * in -rt this does -not- necessarily result in all currently executing
1178  * interrupt -handlers- having completed.
1179  */
1180 void __synchronize_sched(void)
1181 {
1182 	struct rcu_synchronize rcu;
1183 
1184 	if (num_online_cpus() == 1)
1185 		return;  /* blocking is gp if only one CPU! */
1186 
1187 	init_completion(&rcu.completion);
1188 	/* Will wake me after RCU finished. */
1189 	call_rcu_sched(&rcu.head, wakeme_after_rcu);
1190 	/* Wait for it. */
1191 	wait_for_completion(&rcu.completion);
1192 }
1193 EXPORT_SYMBOL_GPL(__synchronize_sched);
1194 
1195 /*
1196  * kthread function that manages call_rcu_sched grace periods.
1197  */
1198 static int rcu_sched_grace_period(void *arg)
1199 {
1200 	int couldsleep;		/* might sleep after current pass. */
1201 	int couldsleepnext = 0; /* might sleep after next pass. */
1202 	int cpu;
1203 	unsigned long flags;
1204 	struct rcu_data *rdp;
1205 	int ret;
1206 
1207 	/*
1208 	 * Each pass through the following loop handles one
1209 	 * rcu_sched grace period cycle.
1210 	 */
1211 	do {
1212 		/* Save each CPU's current state. */
1213 
1214 		for_each_online_cpu(cpu) {
1215 			dyntick_save_progress_counter_sched(cpu);
1216 			save_qsctr_sched(cpu);
1217 		}
1218 
1219 		/*
1220 		 * Sleep for about an RCU grace-period's worth to
1221 		 * allow better batching and to consume less CPU.
1222 		 */
1223 		schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1224 
1225 		/*
1226 		 * If there was nothing to do last time, prepare to
1227 		 * sleep at the end of the current grace period cycle.
1228 		 */
1229 		couldsleep = couldsleepnext;
1230 		couldsleepnext = 1;
1231 		if (couldsleep) {
1232 			spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1233 			rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1234 			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1235 		}
1236 
1237 		/*
1238 		 * Wait on each CPU in turn to have either visited
1239 		 * a quiescent state or been in dynticks-idle mode.
1240 		 */
1241 		for_each_online_cpu(cpu) {
1242 			while (rcu_qsctr_inc_needed(cpu) &&
1243 			       rcu_qsctr_inc_needed_dyntick(cpu)) {
1244 				/* resched_cpu(cpu); @@@ */
1245 				schedule_timeout_interruptible(1);
1246 			}
1247 		}
1248 
1249 		/* Advance callbacks for each CPU.  */
1250 
1251 		for_each_online_cpu(cpu) {
1252 
1253 			rdp = RCU_DATA_CPU(cpu);
1254 			spin_lock_irqsave(&rdp->lock, flags);
1255 
1256 			/*
1257 			 * We are running on this CPU irq-disabled, so no
1258 			 * CPU can go offline until we re-enable irqs.
1259 			 * The current CPU might have already gone
1260 			 * offline (between the for_each_online_cpu and
1261 			 * the spin_lock_irqsave), but in that case all its
1262 			 * callback lists will be empty, so no harm done.
1263 			 *
1264 			 * Advance the callbacks!  We share normal RCU's
1265 			 * donelist, since callbacks are invoked the
1266 			 * same way in either case.
1267 			 */
1268 			if (rdp->waitschedlist != NULL) {
1269 				*rdp->donetail = rdp->waitschedlist;
1270 				rdp->donetail = rdp->waitschedtail;
1271 
1272 				/*
1273 				 * Next rcu_check_callbacks() will
1274 				 * do the required raise_softirq().
1275 				 */
1276 			}
1277 			if (rdp->nextschedlist != NULL) {
1278 				rdp->waitschedlist = rdp->nextschedlist;
1279 				rdp->waitschedtail = rdp->nextschedtail;
1280 				couldsleep = 0;
1281 				couldsleepnext = 0;
1282 			} else {
1283 				rdp->waitschedlist = NULL;
1284 				rdp->waitschedtail = &rdp->waitschedlist;
1285 			}
1286 			rdp->nextschedlist = NULL;
1287 			rdp->nextschedtail = &rdp->nextschedlist;
1288 
1289 			/* Mark sleep intention. */
1290 
1291 			rdp->rcu_sched_sleeping = couldsleep;
1292 
1293 			spin_unlock_irqrestore(&rdp->lock, flags);
1294 		}
1295 
1296 		/* If we saw callbacks on the last scan, go deal with them. */
1297 
1298 		if (!couldsleep)
1299 			continue;
1300 
1301 		/* Attempt to block... */
1302 
1303 		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1304 		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1305 
1306 			/*
1307 			 * Someone posted a callback after we scanned.
1308 			 * Go take care of it.
1309 			 */
1310 			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1311 			couldsleepnext = 0;
1312 			continue;
1313 		}
1314 
1315 		/* Block until the next person posts a callback. */
1316 
1317 		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1318 		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1319 		ret = 0;
1320 		__wait_event_interruptible(rcu_ctrlblk.sched_wq,
1321 			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1322 			ret);
1323 
1324 		/*
1325 		 * Signals would prevent us from sleeping, and we cannot
1326 		 * do much with them in any case.  So flush them.
1327 		 */
1328 		if (ret)
1329 			flush_signals(current);
1330 		couldsleepnext = 0;
1331 
1332 	} while (!kthread_should_stop());
1333 
1334 	return (0);
1335 }
1336 
1337 /*
1338  * Check to see if any future RCU-related work will need to be done
1339  * by the current CPU, even if none need be done immediately, returning
1340  * 1 if so.  Assumes that notifiers would take care of handling any
1341  * outstanding requests from the RCU core.
1342  *
1343  * This function is part of the RCU implementation; it is -not-
1344  * an exported member of the RCU API.
1345  */
1346 int rcu_needs_cpu(int cpu)
1347 {
1348 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1349 
1350 	return (rdp->donelist != NULL ||
1351 		!!rdp->waitlistcount ||
1352 		rdp->nextlist != NULL ||
1353 		rdp->nextschedlist != NULL ||
1354 		rdp->waitschedlist != NULL);
1355 }
1356 
1357 int rcu_pending(int cpu)
1358 {
1359 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1360 
1361 	/* The CPU has at least one callback queued somewhere. */
1362 
1363 	if (rdp->donelist != NULL ||
1364 	    !!rdp->waitlistcount ||
1365 	    rdp->nextlist != NULL ||
1366 	    rdp->nextschedlist != NULL ||
1367 	    rdp->waitschedlist != NULL)
1368 		return 1;
1369 
1370 	/* The RCU core needs an acknowledgement from this CPU. */
1371 
1372 	if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1373 	    (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1374 		return 1;
1375 
1376 	/* This CPU has fallen behind the global grace-period number. */
1377 
1378 	if (rdp->completed != rcu_ctrlblk.completed)
1379 		return 1;
1380 
1381 	/* Nothing needed from this CPU. */
1382 
1383 	return 0;
1384 }
1385 
1386 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1387 				unsigned long action, void *hcpu)
1388 {
1389 	long cpu = (long)hcpu;
1390 
1391 	switch (action) {
1392 	case CPU_UP_PREPARE:
1393 	case CPU_UP_PREPARE_FROZEN:
1394 		rcu_online_cpu(cpu);
1395 		break;
1396 	case CPU_UP_CANCELED:
1397 	case CPU_UP_CANCELED_FROZEN:
1398 	case CPU_DEAD:
1399 	case CPU_DEAD_FROZEN:
1400 		rcu_offline_cpu(cpu);
1401 		break;
1402 	default:
1403 		break;
1404 	}
1405 	return NOTIFY_OK;
1406 }
1407 
1408 static struct notifier_block __cpuinitdata rcu_nb = {
1409 	.notifier_call = rcu_cpu_notify,
1410 };
1411 
1412 void __init __rcu_init(void)
1413 {
1414 	int cpu;
1415 	int i;
1416 	struct rcu_data *rdp;
1417 
1418 	printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1419 	for_each_possible_cpu(cpu) {
1420 		rdp = RCU_DATA_CPU(cpu);
1421 		spin_lock_init(&rdp->lock);
1422 		rdp->completed = 0;
1423 		rdp->waitlistcount = 0;
1424 		rdp->nextlist = NULL;
1425 		rdp->nexttail = &rdp->nextlist;
1426 		for (i = 0; i < GP_STAGES; i++) {
1427 			rdp->waitlist[i] = NULL;
1428 			rdp->waittail[i] = &rdp->waitlist[i];
1429 		}
1430 		rdp->donelist = NULL;
1431 		rdp->donetail = &rdp->donelist;
1432 		rdp->rcu_flipctr[0] = 0;
1433 		rdp->rcu_flipctr[1] = 0;
1434 		rdp->nextschedlist = NULL;
1435 		rdp->nextschedtail = &rdp->nextschedlist;
1436 		rdp->waitschedlist = NULL;
1437 		rdp->waitschedtail = &rdp->waitschedlist;
1438 		rdp->rcu_sched_sleeping = 0;
1439 	}
1440 	register_cpu_notifier(&rcu_nb);
1441 
1442 	/*
1443 	 * We don't need protection against CPU-Hotplug here
1444 	 * since
1445 	 * a) If a CPU comes online while we are iterating over the
1446 	 *    cpu_online_mask below, we would only end up making a
1447 	 *    duplicate call to rcu_online_cpu() which sets the corresponding
1448 	 *    CPU's mask in the rcu_cpu_online_map.
1449 	 *
1450 	 * b) A CPU cannot go offline at this point in time since the user
1451 	 *    does not have access to the sysfs interface, nor do we
1452 	 *    suspend the system.
1453 	 */
1454 	for_each_online_cpu(cpu)
1455 		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
1456 
1457 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1458 }
1459 
1460 /*
1461  * Late-boot-time RCU initialization that must wait until after scheduler
1462  * has been initialized.
1463  */
1464 void __init rcu_init_sched(void)
1465 {
1466 	rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1467 						  NULL,
1468 						  "rcu_sched_grace_period");
1469 	WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1470 }
1471 
1472 #ifdef CONFIG_RCU_TRACE
1473 long *rcupreempt_flipctr(int cpu)
1474 {
1475 	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1476 }
1477 EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1478 
1479 int rcupreempt_flip_flag(int cpu)
1480 {
1481 	return per_cpu(rcu_flip_flag, cpu);
1482 }
1483 EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1484 
1485 int rcupreempt_mb_flag(int cpu)
1486 {
1487 	return per_cpu(rcu_mb_flag, cpu);
1488 }
1489 EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1490 
1491 char *rcupreempt_try_flip_state_name(void)
1492 {
1493 	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1494 }
1495 EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1496 
1497 struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1498 {
1499 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1500 
1501 	return &rdp->trace;
1502 }
1503 EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1504 
1505 #endif /* #ifdef CONFIG_RCU_TRACE */
1506