1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
4  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
5  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
6  *
7  *  NOHZ implementation for low and high resolution timers
8  *
9  *  Started by: Thomas Gleixner and Ingo Molnar
10  */
11 #include <linux/compiler.h>
12 #include <linux/cpu.h>
13 #include <linux/err.h>
14 #include <linux/hrtimer.h>
15 #include <linux/interrupt.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/percpu.h>
18 #include <linux/nmi.h>
19 #include <linux/profile.h>
20 #include <linux/sched/signal.h>
21 #include <linux/sched/clock.h>
22 #include <linux/sched/stat.h>
23 #include <linux/sched/nohz.h>
24 #include <linux/sched/loadavg.h>
25 #include <linux/module.h>
26 #include <linux/irq_work.h>
27 #include <linux/posix-timers.h>
28 #include <linux/context_tracking.h>
29 #include <linux/mm.h>
30 #include <trace/hooks/sched.h>
31 
32 #include <asm/irq_regs.h>
33 
34 #include "tick-internal.h"
35 
36 #include <trace/events/timer.h>
37 
38 /*
39  * Per-CPU nohz control structure
40  */
41 static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
42 
43 struct tick_sched *tick_get_tick_sched(int cpu)
44 {
45 	return &per_cpu(tick_cpu_sched, cpu);
46 }
47 
48 /*
49  * The time when the last jiffy update happened. Write access must hold
50  * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
51  * consistent view of jiffies and last_jiffies_update.
52  */
53 static ktime_t last_jiffies_update;
54 
55 /*
56  * Must be called with interrupts disabled !
57  */
58 static void tick_do_update_jiffies64(ktime_t now)
59 {
60 	unsigned long ticks = 1;
61 	ktime_t delta, nextp;
62 
63 	/*
64 	 * 64-bit can do a quick check without holding the jiffies lock and
65 	 * without looking at the sequence count. The smp_load_acquire()
66 	 * pairs with the update done later in this function.
67 	 *
68 	 * 32-bit cannot do that because the store of 'tick_next_period'
69 	 * consists of two 32-bit stores, and the first store could be
70 	 * moved by the CPU to a random point in the future.
71 	 */
72 	if (IS_ENABLED(CONFIG_64BIT)) {
73 		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
74 			return;
75 	} else {
76 		unsigned int seq;
77 
78 		/*
79 		 * Avoid contention on 'jiffies_lock' and protect the quick
80 		 * check with the sequence count.
81 		 */
82 		do {
83 			seq = read_seqcount_begin(&jiffies_seq);
84 			nextp = tick_next_period;
85 		} while (read_seqcount_retry(&jiffies_seq, seq));
86 
87 		if (ktime_before(now, nextp))
88 			return;
89 	}
90 
91 	/* Quick check failed, i.e. update is required. */
92 	raw_spin_lock(&jiffies_lock);
93 	/*
94 	 * Re-evaluate with the lock held. Another CPU might have done the
95 	 * update already.
96 	 */
97 	if (ktime_before(now, tick_next_period)) {
98 		raw_spin_unlock(&jiffies_lock);
99 		return;
100 	}
101 
102 	write_seqcount_begin(&jiffies_seq);
103 
104 	delta = ktime_sub(now, tick_next_period);
105 	if (unlikely(delta >= TICK_NSEC)) {
106 		/* Slow path for long idle sleep times */
107 		s64 incr = TICK_NSEC;
108 
109 		ticks += ktime_divns(delta, incr);
110 
111 		last_jiffies_update = ktime_add_ns(last_jiffies_update,
112 						   incr * ticks);
113 	} else {
114 		last_jiffies_update = ktime_add_ns(last_jiffies_update,
115 						   TICK_NSEC);
116 	}
117 
118 	/* Advance jiffies to complete the 'jiffies_seq' protected job */
119 	jiffies_64 += ticks;
120 
121 	/* Keep the tick_next_period variable up to date */
122 	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
123 
124 	if (IS_ENABLED(CONFIG_64BIT)) {
125 		/*
126 		 * Pairs with smp_load_acquire() in the lockless quick
127 		 * check above, and ensures that the update to 'jiffies_64' is
128 		 * not reordered vs. the store to 'tick_next_period', neither
129 		 * by the compiler nor by the CPU.
130 		 */
131 		smp_store_release(&tick_next_period, nextp);
132 	} else {
133 		/*
134 		 * A plain store is good enough on 32-bit, as the quick check
135 		 * above is protected by the sequence count.
136 		 */
137 		tick_next_period = nextp;
138 	}
139 
140 	/*
141 	 * Release the sequence count. calc_global_load() below is not
142 	 * protected by it, but 'jiffies_lock' needs to be held to prevent
143 	 * concurrent invocations.
144 	 */
145 	write_seqcount_end(&jiffies_seq);
146 
147 	calc_global_load();
148 
149 	raw_spin_unlock(&jiffies_lock);
150 	update_wall_time();
151 }
152 
153 /*
154  * Initialize and return the jiffies update.
155  */
156 static ktime_t tick_init_jiffy_update(void)
157 {
158 	ktime_t period;
159 
160 	raw_spin_lock(&jiffies_lock);
161 	write_seqcount_begin(&jiffies_seq);
162 
163 	/* Have we started the jiffies update yet ? */
164 	if (last_jiffies_update == 0) {
165 		u32 rem;
166 
167 		/*
168 		 * Ensure that the tick is aligned to a multiple of
169 		 * TICK_NSEC.
170 		 */
171 		div_u64_rem(tick_next_period, TICK_NSEC, &rem);
172 		if (rem)
173 			tick_next_period += TICK_NSEC - rem;
174 
175 		last_jiffies_update = tick_next_period;
176 	}
177 	period = last_jiffies_update;
178 
179 	write_seqcount_end(&jiffies_seq);
180 	raw_spin_unlock(&jiffies_lock);
181 
182 	return period;
183 }
184 
185 static inline int tick_sched_flag_test(struct tick_sched *ts,
186 				       unsigned long flag)
187 {
188 	return !!(ts->flags & flag);
189 }
190 
191 static inline void tick_sched_flag_set(struct tick_sched *ts,
192 				       unsigned long flag)
193 {
194 	lockdep_assert_irqs_disabled();
195 	ts->flags |= flag;
196 }
197 
198 static inline void tick_sched_flag_clear(struct tick_sched *ts,
199 					 unsigned long flag)
200 {
201 	lockdep_assert_irqs_disabled();
202 	ts->flags &= ~flag;
203 }
204 
205 #define MAX_STALLED_JIFFIES 5
206 
207 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
208 {
209 	int tick_cpu, cpu = smp_processor_id();
210 
211 	/*
212 	 * Check if the do_timer duty was dropped. We don't care about
213 	 * concurrency: This happens only when the CPU in charge went
214 	 * into a long sleep. If two CPUs happen to assign themselves to
215 	 * this duty, then the jiffies update is still serialized by
216 	 * 'jiffies_lock'.
217 	 *
218 	 * If nohz_full is enabled, this should not happen because the
219 	 * 'tick_do_timer_cpu' CPU never relinquishes this duty.
220 	 */
221 	tick_cpu = READ_ONCE(tick_do_timer_cpu);
222 
223 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
224 #ifdef CONFIG_NO_HZ_FULL
225 		WARN_ON_ONCE(tick_nohz_full_running);
226 #endif
227 		WRITE_ONCE(tick_do_timer_cpu, cpu);
228 		tick_cpu = cpu;
229 	}
230 
231 	/* Check if jiffies need an update */
232 	if (tick_cpu == cpu) {
233 		tick_do_update_jiffies64(now);
234 		trace_android_vh_jiffies_update(NULL);
235 	}
236 
237 	/*
238 	 * If the jiffies update stalled for too long (timekeeper in stop_machine()
239 	 * or VMEXIT'ed for several msecs), force an update.
240 	 */
241 	if (ts->last_tick_jiffies != jiffies) {
242 		ts->stalled_jiffies = 0;
243 		ts->last_tick_jiffies = READ_ONCE(jiffies);
244 	} else {
245 		if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
246 			tick_do_update_jiffies64(now);
247 			ts->stalled_jiffies = 0;
248 			ts->last_tick_jiffies = READ_ONCE(jiffies);
249 		}
250 	}
251 
252 	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
253 		ts->got_idle_tick = 1;
254 }
255 
256 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
257 {
258 	/*
259 	 * When we are idle and the tick is stopped, we have to touch
260 	 * the watchdog as we might not schedule for a really long
261 	 * time. This happens on completely idle SMP systems while
262 	 * waiting on the login prompt. We also increment the "start of
263 	 * idle" jiffy stamp so the idle accounting adjustment we do
264 	 * when we go busy again does not account too many ticks.
265 	 */
266 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
267 	    tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
268 		touch_softlockup_watchdog_sched();
269 		if (is_idle_task(current))
270 			ts->idle_jiffies++;
271 		/*
272 		 * In case the current tick fired too early past its expected
273 		 * expiration, make sure we don't bypass the next clock reprogramming
274 		 * to the same deadline.
275 		 */
276 		ts->next_tick = 0;
277 	}
278 
279 	update_process_times(user_mode(regs));
280 	profile_tick(CPU_PROFILING);
281 }
282 
283 /*
284  * We rearm the timer until we get disabled by the idle code.
285  * Called with interrupts disabled.
286  */
287 static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
288 {
289 	struct tick_sched *ts =	container_of(timer, struct tick_sched, sched_timer);
290 	struct pt_regs *regs = get_irq_regs();
291 	ktime_t now = ktime_get();
292 
293 	tick_sched_do_timer(ts, now);
294 
295 	/*
296 	 * Do not call when we are not in IRQ context and have
297 	 * no valid 'regs' pointer
298 	 */
299 	if (regs)
300 		tick_sched_handle(ts, regs);
301 	else
302 		ts->next_tick = 0;
303 
304 	/*
305 	 * In dynticks mode, tick reprogram is deferred:
306 	 * - to the idle task if in dynticks-idle
307 	 * - to IRQ exit if in full-dynticks.
308 	 */
309 	if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
310 		return HRTIMER_NORESTART;
311 
312 	hrtimer_forward(timer, now, TICK_NSEC);
313 
314 	return HRTIMER_RESTART;
315 }
316 
317 static void tick_sched_timer_cancel(struct tick_sched *ts)
318 {
319 	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
320 		hrtimer_cancel(&ts->sched_timer);
321 	else if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
322 		tick_program_event(KTIME_MAX, 1);
323 }
324 
325 #ifdef CONFIG_NO_HZ_FULL
326 cpumask_var_t tick_nohz_full_mask;
327 EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
328 bool tick_nohz_full_running;
329 EXPORT_SYMBOL_GPL(tick_nohz_full_running);
330 static atomic_t tick_dep_mask;
331 
332 static bool check_tick_dependency(atomic_t *dep)
333 {
334 	int val = atomic_read(dep);
335 
336 	if (val & TICK_DEP_MASK_POSIX_TIMER) {
337 		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
338 		return true;
339 	}
340 
341 	if (val & TICK_DEP_MASK_PERF_EVENTS) {
342 		trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
343 		return true;
344 	}
345 
346 	if (val & TICK_DEP_MASK_SCHED) {
347 		trace_tick_stop(0, TICK_DEP_MASK_SCHED);
348 		return true;
349 	}
350 
351 	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
352 		trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
353 		return true;
354 	}
355 
356 	if (val & TICK_DEP_MASK_RCU) {
357 		trace_tick_stop(0, TICK_DEP_MASK_RCU);
358 		return true;
359 	}
360 
361 	if (val & TICK_DEP_MASK_RCU_EXP) {
362 		trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
363 		return true;
364 	}
365 
366 	return false;
367 }
368 
369 static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
370 {
371 	lockdep_assert_irqs_disabled();
372 
373 	if (unlikely(!cpu_online(cpu)))
374 		return false;
375 
376 	if (check_tick_dependency(&tick_dep_mask))
377 		return false;
378 
379 	if (check_tick_dependency(&ts->tick_dep_mask))
380 		return false;
381 
382 	if (check_tick_dependency(&current->tick_dep_mask))
383 		return false;
384 
385 	if (check_tick_dependency(&current->signal->tick_dep_mask))
386 		return false;
387 
388 	return true;
389 }
390 
391 static void nohz_full_kick_func(struct irq_work *work)
392 {
393 	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
394 }
395 
396 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
397 	IRQ_WORK_INIT_HARD(nohz_full_kick_func);
398 
399 /*
400  * Kick this CPU if it's full dynticks in order to force it to
401  * re-evaluate its dependency on the tick and restart it if necessary.
402  * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
403  * is NMI safe.
404  */
405 static void tick_nohz_full_kick(void)
406 {
407 	if (!tick_nohz_full_cpu(smp_processor_id()))
408 		return;
409 
410 	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
411 }
412 
413 /*
414  * Kick the CPU if it's full dynticks in order to force it to
415  * re-evaluate its dependency on the tick and restart it if necessary.
416  */
417 void tick_nohz_full_kick_cpu(int cpu)
418 {
419 	if (!tick_nohz_full_cpu(cpu))
420 		return;
421 
422 	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
423 }
424 
425 static void tick_nohz_kick_task(struct task_struct *tsk)
426 {
427 	int cpu;
428 
429 	/*
430 	 * If the task is not running, run_posix_cpu_timers()
431 	 * has nothing to elapse, and an IPI can then be optimized out.
432 	 *
433 	 * activate_task()                      STORE p->tick_dep_mask
434 	 *   STORE p->on_rq
435 	 * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
436 	 *   LOCK rq->lock                      LOAD p->on_rq
437 	 *   smp_mb__after_spin_lock()
438 	 *   tick_nohz_task_switch()
439 	 *     LOAD p->tick_dep_mask
440 	 *
441 	 * XXX given a task picks up the dependency on schedule(), should we
442 	 * only care about tasks that are currently on the CPU instead of all
443 	 * that are on the runqueue?
444 	 *
445 	 * That is, does this want to be: task_on_cpu() / task_curr()?
446 	 */
447 	if (!sched_task_on_rq(tsk))
448 		return;
449 
450 	/*
451 	 * If the task concurrently migrates to another CPU,
452 	 * we guarantee it sees the new tick dependency upon
453 	 * schedule.
454 	 *
455 	 * set_task_cpu(p, cpu);
456 	 *   STORE p->cpu = @cpu
457 	 * __schedule() (switch to task 'p')
458 	 *   LOCK rq->lock
459 	 *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
460 	 *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
461 	 *      LOAD p->tick_dep_mask           LOAD p->cpu
462 	 */
463 	cpu = task_cpu(tsk);
464 
465 	preempt_disable();
466 	if (cpu_online(cpu))
467 		tick_nohz_full_kick_cpu(cpu);
468 	preempt_enable();
469 }
470 
471 /*
472  * Kick all full dynticks CPUs in order to force these to re-evaluate
473  * their dependency on the tick and restart it if necessary.
474  */
475 static void tick_nohz_full_kick_all(void)
476 {
477 	int cpu;
478 
479 	if (!tick_nohz_full_running)
480 		return;
481 
482 	preempt_disable();
483 	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
484 		tick_nohz_full_kick_cpu(cpu);
485 	preempt_enable();
486 }
487 
488 static void tick_nohz_dep_set_all(atomic_t *dep,
489 				  enum tick_dep_bits bit)
490 {
491 	int prev;
492 
493 	prev = atomic_fetch_or(BIT(bit), dep);
494 	if (!prev)
495 		tick_nohz_full_kick_all();
496 }
497 
498 /*
499  * Set a global tick dependency. Used by perf events that rely on freq and
500  * unstable clocks.
501  */
502 void tick_nohz_dep_set(enum tick_dep_bits bit)
503 {
504 	tick_nohz_dep_set_all(&tick_dep_mask, bit);
505 }
506 
507 void tick_nohz_dep_clear(enum tick_dep_bits bit)
508 {
509 	atomic_andnot(BIT(bit), &tick_dep_mask);
510 }
511 
512 /*
513  * Set per-CPU tick dependency. Used by scheduler and perf events in order to
514  * manage event-throttling.
515  */
516 void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
517 {
518 	int prev;
519 	struct tick_sched *ts;
520 
521 	ts = per_cpu_ptr(&tick_cpu_sched, cpu);
522 
523 	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
524 	if (!prev) {
525 		preempt_disable();
526 		/* Perf needs local kick that is NMI safe */
527 		if (cpu == smp_processor_id()) {
528 			tick_nohz_full_kick();
529 		} else {
530 			/* Remote IRQ work not NMI-safe */
531 			if (!WARN_ON_ONCE(in_nmi()))
532 				tick_nohz_full_kick_cpu(cpu);
533 		}
534 		preempt_enable();
535 	}
536 }
537 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
538 
539 void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
540 {
541 	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
542 
543 	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
544 }
545 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
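/*
 * Usage sketch (illustrative only, not called anywhere in this file): a
 * subsystem that needs the tick to keep running on a particular CPU for a
 * while can pin it with the per-CPU dependency API and drop the dependency
 * once done. The TICK_DEP_BIT_* values come from enum tick_dep_bits:
 *
 *	tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_PERF_EVENTS);
 *	...	work that relies on a periodic tick on @cpu	...
 *	tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_PERF_EVENTS);
 */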
546 
547 /*
548  * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
549  * in order to elapse per task timers.
550  */
551 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
552 {
553 	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
554 		tick_nohz_kick_task(tsk);
555 }
556 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
557 
558 void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
559 {
560 	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
561 }
562 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
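/*
 * Usage sketch (illustrative only): posix-cpu-timers style code can pin the
 * tick for a single task while a per-task timer is armed, and release it
 * again when the timer is disarmed, assuming it holds a reference on @tsk:
 *
 *	tick_nohz_dep_set_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 *	...	the per-task timer is queued and may expire	...
 *	tick_nohz_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 */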
563 
564 /*
565  * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
566  * per process timers.
567  */
568 void tick_nohz_dep_set_signal(struct task_struct *tsk,
569 			      enum tick_dep_bits bit)
570 {
571 	int prev;
572 	struct signal_struct *sig = tsk->signal;
573 
574 	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
575 	if (!prev) {
576 		struct task_struct *t;
577 
578 		lockdep_assert_held(&tsk->sighand->siglock);
579 		__for_each_thread(sig, t)
580 			tick_nohz_kick_task(t);
581 	}
582 }
583 
584 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
585 {
586 	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
587 }
588 
589 /*
590  * Re-evaluate the need for the tick as we switch the current task.
591  * It might need the tick due to per task/process properties:
592  * perf events, posix CPU timers, ...
593  */
594 void __tick_nohz_task_switch(void)
595 {
596 	struct tick_sched *ts;
597 
598 	if (!tick_nohz_full_cpu(smp_processor_id()))
599 		return;
600 
601 	ts = this_cpu_ptr(&tick_cpu_sched);
602 
603 	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
604 		if (atomic_read(&current->tick_dep_mask) ||
605 		    atomic_read(&current->signal->tick_dep_mask))
606 			tick_nohz_full_kick();
607 	}
608 }
609 
610 /* Get the boot-time nohz CPU list from the kernel parameters. */
611 void __init tick_nohz_full_setup(cpumask_var_t cpumask)
612 {
613 	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
614 	cpumask_copy(tick_nohz_full_mask, cpumask);
615 	tick_nohz_full_running = true;
616 }
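/*
 * The cpumask handed in here is typically parsed from the "nohz_full="
 * kernel parameter. For example, booting with:
 *
 *	nohz_full=1-7
 *
 * runs CPUs 1-7 in full dynticks mode while CPU 0 stays as the
 * housekeeping/timekeeping CPU (assuming CONFIG_NO_HZ_FULL is enabled).
 */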
617 
618 bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
619 {
620 	/*
621 	 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
622 	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
623 	 * CPUs. It must remain online when nohz full is enabled.
624 	 */
625 	if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
626 		return false;
627 	return true;
628 }
629 
630 static int tick_nohz_cpu_down(unsigned int cpu)
631 {
632 	return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
633 }
634 
635 void __init tick_nohz_init(void)
636 {
637 	int cpu, ret;
638 
639 	if (!tick_nohz_full_running)
640 		return;
641 
642 	/*
643 	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
644 	 * locking contexts. But then we need IRQ work to raise its own
645 	 * interrupts to avoid circular dependency on the tick.
646 	 */
647 	if (!arch_irq_work_has_interrupt()) {
648 		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
649 		cpumask_clear(tick_nohz_full_mask);
650 		tick_nohz_full_running = false;
651 		return;
652 	}
653 
654 	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
655 			!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
656 		cpu = smp_processor_id();
657 
658 		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
659 			pr_warn("NO_HZ: Clearing %d from nohz_full range "
660 				"for timekeeping\n", cpu);
661 			cpumask_clear_cpu(cpu, tick_nohz_full_mask);
662 		}
663 	}
664 
665 	for_each_cpu(cpu, tick_nohz_full_mask)
666 		ct_cpu_track_user(cpu);
667 
668 	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
669 					"kernel/nohz:predown", NULL,
670 					tick_nohz_cpu_down);
671 	WARN_ON(ret < 0);
672 	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
673 		cpumask_pr_args(tick_nohz_full_mask));
674 }
675 #endif /* #ifdef CONFIG_NO_HZ_FULL */
676 
677 /*
678  * NOHZ - aka dynamic tick functionality
679  */
680 #ifdef CONFIG_NO_HZ_COMMON
681 /*
682  * NO HZ enabled ?
683  */
684 bool tick_nohz_enabled __read_mostly  = true;
685 unsigned long tick_nohz_active  __read_mostly;
686 /*
687  * Enable / Disable tickless mode
688  */
689 static int __init setup_tick_nohz(char *str)
690 {
691 	return (kstrtobool(str, &tick_nohz_enabled) == 0);
692 }
693 
694 __setup("nohz=", setup_tick_nohz);
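/*
 * Example: booting with "nohz=off" keeps the periodic tick and disables
 * the dynticks idle mode, while "nohz=on" (the default, since
 * tick_nohz_enabled is initialized to true) allows the tick to be stopped
 * on idle CPUs.
 */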
695 
696 bool tick_nohz_tick_stopped(void)
697 {
698 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
699 
700 	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
701 }
702 
703 bool tick_nohz_tick_stopped_cpu(int cpu)
704 {
705 	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
706 
707 	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
708 }
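/*
 * Usage sketch (illustrative only): a scheduler-style caller could check
 * whether a remote CPU currently has its tick stopped before deciding
 * whether kicking it is worthwhile:
 *
 *	if (tick_nohz_tick_stopped_cpu(cpu))
 *		...	@cpu is in dynticks mode, consider sending an IPI	...
 */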
709 
710 /**
711  * tick_nohz_update_jiffies - update jiffies when idle was interrupted
712  * @now: current ktime_t
713  *
714  * Called from interrupt entry when the CPU was idle
715  *
716  * In case the sched_tick was stopped on this CPU, we have to check if jiffies
717  * must be updated. Otherwise an interrupt handler could use a stale jiffy
718  * value. We do this unconditionally on any CPU, as we don't know whether the
719  * CPU, which has the update task assigned, is in a long sleep.
720  */
721 static void tick_nohz_update_jiffies(ktime_t now)
722 {
723 	unsigned long flags;
724 
725 	__this_cpu_write(tick_cpu_sched.idle_waketime, now);
726 
727 	local_irq_save(flags);
728 	tick_do_update_jiffies64(now);
729 	local_irq_restore(flags);
730 
731 	touch_softlockup_watchdog_sched();
732 }
733 
734 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
735 {
736 	ktime_t delta;
737 
738 	if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
739 		return;
740 
741 	delta = ktime_sub(now, ts->idle_entrytime);
742 
743 	write_seqcount_begin(&ts->idle_sleeptime_seq);
744 	if (nr_iowait_cpu(smp_processor_id()) > 0)
745 		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
746 	else
747 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
748 
749 	ts->idle_entrytime = now;
750 	tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
751 	write_seqcount_end(&ts->idle_sleeptime_seq);
752 
753 	sched_clock_idle_wakeup_event();
754 }
755 
756 static void tick_nohz_start_idle(struct tick_sched *ts)
757 {
758 	write_seqcount_begin(&ts->idle_sleeptime_seq);
759 	ts->idle_entrytime = ktime_get();
760 	tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
761 	write_seqcount_end(&ts->idle_sleeptime_seq);
762 
763 	sched_clock_idle_sleep_event();
764 }
765 
766 static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
767 				 bool compute_delta, u64 *last_update_time)
768 {
769 	ktime_t now, idle;
770 	unsigned int seq;
771 
772 	if (!tick_nohz_active)
773 		return -1;
774 
775 	now = ktime_get();
776 	if (last_update_time)
777 		*last_update_time = ktime_to_us(now);
778 
779 	do {
780 		seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
781 
782 		if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
783 			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
784 
785 			idle = ktime_add(*sleeptime, delta);
786 		} else {
787 			idle = *sleeptime;
788 		}
789 	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
790 
791 	return ktime_to_us(idle);
792 
793 }
794 
795 /**
796  * get_cpu_idle_time_us - get the total idle time of a CPU
797  * @cpu: CPU number to query
798  * @last_update_time: variable to store update time in. Do not update
799  * counters if NULL.
800  *
801  * Return the cumulative idle time (since boot) for a given
802  * CPU, in microseconds. Note that this is partially broken due to
803  * the counter of iowait tasks that can be remotely updated without
804  * any synchronization. Therefore it is possible to observe backward
805  * values within two consecutive reads.
806  *
807  * This time is measured via accounting rather than sampling,
808  * and is as accurate as ktime_get() is.
809  *
810  * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
811  */
812 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
813 {
814 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
815 
816 	return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
817 				     !nr_iowait_cpu(cpu), last_update_time);
818 }
819 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
820 
821 /**
822  * get_cpu_iowait_time_us - get the total iowait time of a CPU
823  * @cpu: CPU number to query
824  * @last_update_time: variable to store update time in. Do not update
825  * counters if NULL.
826  *
827  * Return the cumulative iowait time (since boot) for a given
828  * CPU, in microseconds. Note this is partially broken due to
829  * the counter of iowait tasks that can be remotely updated without
830  * any synchronization. Therefore it is possible to observe backward
831  * values within two consecutive reads.
832  *
833  * This time is measured via accounting rather than sampling,
834  * and is as accurate as ktime_get() is.
835  *
836  * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
837  */
838 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
839 {
840 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
841 
842 	return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
843 				     nr_iowait_cpu(cpu), last_update_time);
844 }
845 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
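/*
 * Usage sketch (illustrative only): a cpufreq-governor style caller can
 * sample the accumulated idle and iowait time of a CPU in microseconds,
 * passing NULL because it does not need the update timestamp:
 *
 *	u64 idle_us   = get_cpu_idle_time_us(cpu, NULL);
 *	u64 iowait_us = get_cpu_iowait_time_us(cpu, NULL);
 *
 * Both return (u64)-1 when NOHZ is not active, which callers must handle.
 */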
846 
847 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
848 {
849 	hrtimer_cancel(&ts->sched_timer);
850 	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
851 
852 	/* Forward the time to expire in the future */
853 	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
854 
855 	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
856 		hrtimer_start_expires(&ts->sched_timer,
857 				      HRTIMER_MODE_ABS_PINNED_HARD);
858 	} else {
859 		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
860 	}
861 
862 	/*
863 	 * Reset to make sure the next tick stop doesn't get fooled by past
864 	 * cached clock deadline.
865 	 */
866 	ts->next_tick = 0;
867 }
868 
869 static inline bool local_timer_softirq_pending(void)
870 {
871 	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
872 }
873 
874 /*
875  * Read jiffies and the time when jiffies were updated last
876  */
877 u64 get_jiffies_update(unsigned long *basej)
878 {
879 	unsigned long basejiff;
880 	unsigned int seq;
881 	u64 basemono;
882 
883 	do {
884 		seq = read_seqcount_begin(&jiffies_seq);
885 		basemono = last_jiffies_update;
886 		basejiff = jiffies;
887 	} while (read_seqcount_retry(&jiffies_seq, seq));
888 	*basej = basejiff;
889 	return basemono;
890 }
891 
892 /**
893  * tick_nohz_next_event() - return the clock monotonic based next event
894  * @ts:		pointer to tick_sched struct
895  * @cpu:	CPU number
896  *
897  * Return:
898  * *%0		- When the next event is a maximum of TICK_NSEC in the future
899  *		  and the tick is not stopped yet
900  * *%next_event	- Next event based on clock monotonic
901  */
902 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
903 {
904 	u64 basemono, next_tick, delta, expires;
905 	unsigned long basejiff;
906 	int tick_cpu;
907 
908 	basemono = get_jiffies_update(&basejiff);
909 	ts->last_jiffies = basejiff;
910 	ts->timer_expires_base = basemono;
911 
912 	/*
913 	 * Keep the periodic tick, when RCU, architecture or irq_work
914 	 * requests it.
915 	 * Aside of that, check whether the local timer softirq is
916 	 * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
917 	 * because there is an already expired timer, so it will request
918 	 * immediate expiry, which rearms the hardware timer with a
919 	 * minimal delta, which brings us back to this place
920 	 * immediately. Lather, rinse and repeat...
921 	 */
922 	if (rcu_needs_cpu() || arch_needs_cpu() ||
923 	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
924 		next_tick = basemono + TICK_NSEC;
925 	} else {
926 		/*
927 		 * Get the next pending timer. If high resolution
928 		 * timers are enabled this only takes the timer wheel
929 		 * timers into account. If high resolution timers are
930 		 * disabled this also looks at the next expiring
931 		 * hrtimer.
932 		 */
933 		next_tick = get_next_timer_interrupt(basejiff, basemono);
934 		ts->next_timer = next_tick;
935 	}
936 
937 	/* Make sure next_tick is never before basemono! */
938 	if (WARN_ON_ONCE(basemono > next_tick))
939 		next_tick = basemono;
940 
941 	/*
942 	 * If the tick is due in the next period, keep it ticking or
943 	 * force prod the timer.
944 	 */
945 	delta = next_tick - basemono;
946 	if (delta <= (u64)TICK_NSEC) {
947 		/*
948 		 * We've not stopped the tick yet, and there's a timer in the
949 		 * next period, so no point in stopping it either, bail.
950 		 */
951 		if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
952 			ts->timer_expires = 0;
953 			goto out;
954 		}
955 	}
956 
957 	/*
958 	 * If this CPU is the one which had the do_timer() duty last, we limit
959 	 * the sleep time to the timekeeping 'max_deferment' value.
960 	 * Otherwise we can sleep as long as we want.
961 	 */
962 	delta = timekeeping_max_deferment();
963 	tick_cpu = READ_ONCE(tick_do_timer_cpu);
964 	if (tick_cpu != cpu &&
965 	    (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
966 		delta = KTIME_MAX;
967 
968 	/* Calculate the next expiry time */
969 	if (delta < (KTIME_MAX - basemono))
970 		expires = basemono + delta;
971 	else
972 		expires = KTIME_MAX;
973 
974 	ts->timer_expires = min_t(u64, expires, next_tick);
975 
976 out:
977 	return ts->timer_expires;
978 }
979 
980 static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
981 {
982 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
983 	unsigned long basejiff = ts->last_jiffies;
984 	u64 basemono = ts->timer_expires_base;
985 	bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
986 	int tick_cpu;
987 	u64 expires;
988 
989 	/* Make sure we won't be trying to stop it twice in a row. */
990 	ts->timer_expires_base = 0;
991 
992 	/*
993 	 * Now the tick should be stopped definitely - so the timer base needs
994 	 * to be marked idle as well to not miss a newly queued timer.
995 	 */
996 	expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
997 	if (expires > ts->timer_expires) {
998 		/*
999 		 * This path could only happen when the first timer was removed
1000 		 * between calculating the possible sleep length and now (when
1001 		 * high resolution mode is not active, timer could also be a
1002 		 * hrtimer).
1003 		 *
1004 		 * We have to stick to the original calculated expiry value to
1005 		 * not stop the tick for too long with a shallow C-state (which
1006 		 * was programmed by cpuidle because of an early next expiration
1007 		 * value).
1008 		 */
1009 		expires = ts->timer_expires;
1010 	}
1011 
1012 	/* If the timer base is not idle, retain the not yet stopped tick. */
1013 	if (!timer_idle)
1014 		return;
1015 
1016 	/*
1017 	 * If this CPU is the one which updates jiffies, then give up
1018 	 * the assignment and let it be taken by the CPU which runs
1019 	 * the tick timer next, which might be this CPU as well. If we
1020 	 * don't drop this here, the jiffies might be stale and
1021 	 * do_timer() never gets invoked. Keep track of the fact that it
1022 	 * was the one which had the do_timer() duty last.
1023 	 */
1024 	tick_cpu = READ_ONCE(tick_do_timer_cpu);
1025 	if (tick_cpu == cpu) {
1026 		WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
1027 		tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
1028 	} else if (tick_cpu != TICK_DO_TIMER_NONE) {
1029 		tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
1030 	}
1031 
1032 	/* Skip reprogram of event if it's not changed */
1033 	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
1034 		/* Sanity check: make sure clockevent is actually programmed */
1035 		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
1036 			return;
1037 
1038 		WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
1039 			  "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
1040 			  dev->next_event, hrtimer_active(&ts->sched_timer),
1041 			  hrtimer_get_expires(&ts->sched_timer));
1042 	}
1043 
1044 	/*
1045 	 * tick_nohz_stop_tick() can be called several times before
1046 	 * tick_nohz_restart_sched_tick() is called. This happens when
1047 	 * interrupts arrive which do not cause a reschedule. In the first
1048 	 * call we save the current tick time, so we can restart the
1049 	 * scheduler tick in tick_nohz_restart_sched_tick().
1050 	 */
1051 	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1052 		calc_load_nohz_start();
1053 		quiet_vmstat();
1054 
1055 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
1056 		tick_sched_flag_set(ts, TS_FLAG_STOPPED);
1057 		trace_tick_stop(1, TICK_DEP_MASK_NONE);
1058 	}
1059 
1060 	ts->next_tick = expires;
1061 
1062 	/*
1063 	 * If the expiration time == KTIME_MAX, then we simply stop
1064 	 * the tick timer.
1065 	 */
1066 	if (unlikely(expires == KTIME_MAX)) {
1067 		tick_sched_timer_cancel(ts);
1068 		return;
1069 	}
1070 
1071 	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
1072 		hrtimer_start(&ts->sched_timer, expires,
1073 			      HRTIMER_MODE_ABS_PINNED_HARD);
1074 	} else {
1075 		hrtimer_set_expires(&ts->sched_timer, expires);
1076 		tick_program_event(expires, 1);
1077 	}
1078 }
1079 
1080 static void tick_nohz_retain_tick(struct tick_sched *ts)
1081 {
1082 	ts->timer_expires_base = 0;
1083 }
1084 
1085 #ifdef CONFIG_NO_HZ_FULL
1086 static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
1087 {
1088 	if (tick_nohz_next_event(ts, cpu))
1089 		tick_nohz_stop_tick(ts, cpu);
1090 	else
1091 		tick_nohz_retain_tick(ts);
1092 }
1093 #endif /* CONFIG_NO_HZ_FULL */
1094 
1095 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
1096 {
1097 	/* Update jiffies first */
1098 	tick_do_update_jiffies64(now);
1099 
1100 	/*
1101 	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
1102 	 * the clock forward checks in the enqueue path:
1103 	 */
1104 	timer_clear_idle();
1105 
1106 	calc_load_nohz_stop();
1107 	touch_softlockup_watchdog_sched();
1108 
1109 	/* Cancel the scheduled timer and restore the tick: */
1110 	tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
1111 	tick_nohz_restart(ts, now);
1112 }
1113 
1114 static void __tick_nohz_full_update_tick(struct tick_sched *ts,
1115 					 ktime_t now)
1116 {
1117 #ifdef CONFIG_NO_HZ_FULL
1118 	int cpu = smp_processor_id();
1119 
1120 	if (can_stop_full_tick(cpu, ts))
1121 		tick_nohz_full_stop_tick(ts, cpu);
1122 	else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
1123 		tick_nohz_restart_sched_tick(ts, now);
1124 #endif
1125 }
1126 
1127 static void tick_nohz_full_update_tick(struct tick_sched *ts)
1128 {
1129 	if (!tick_nohz_full_cpu(smp_processor_id()))
1130 		return;
1131 
1132 	if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1133 		return;
1134 
1135 	__tick_nohz_full_update_tick(ts, ktime_get());
1136 }
1137 
1138 /*
1139  * A pending softirq outside an IRQ (or softirq disabled section) context
1140  * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
1141  * reach this code due to the need_resched() early check in can_stop_idle_tick().
1142  *
1143  * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPUHP_TEARDOWN_CPU on the
1144  * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
1145  * triggering the code below, since wakeup_softirqd() is ignored.
1146  *
1147  */
1148 static bool report_idle_softirq(void)
1149 {
1150 	static int ratelimit;
1151 	unsigned int pending = local_softirq_pending();
1152 
1153 	if (likely(!pending))
1154 		return false;
1155 
1156 	/* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
1157 	if (!cpu_active(smp_processor_id())) {
1158 		pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
1159 		if (!pending)
1160 			return false;
1161 	}
1162 
1163 	if (ratelimit >= 10)
1164 		return false;
1165 
1166 	/* On RT, softirq handling may be waiting on some lock */
1167 	if (local_bh_blocked())
1168 		return false;
1169 
1170 	pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
1171 		pending);
1172 	ratelimit++;
1173 
1174 	return true;
1175 }
1176 
1177 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
1178 {
1179 	WARN_ON_ONCE(cpu_is_offline(cpu));
1180 
1181 	if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
1182 		return false;
1183 
1184 	if (need_resched())
1185 		return false;
1186 
1187 	if (unlikely(report_idle_softirq()))
1188 		return false;
1189 
1190 	if (tick_nohz_full_enabled()) {
1191 		int tick_cpu = READ_ONCE(tick_do_timer_cpu);
1192 
1193 		/*
1194 		 * Keep the tick alive to guarantee timekeeping progression
1195 		 * if there are full dynticks CPUs around
1196 		 */
1197 		if (tick_cpu == cpu)
1198 			return false;
1199 
1200 		/* Should not happen for nohz-full */
1201 		if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
1202 			return false;
1203 	}
1204 
1205 	return true;
1206 }
1207 
1208 /**
1209  * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
1210  *
1211  * When the next event is more than a tick into the future, stop the idle tick
1212  */
1213 void tick_nohz_idle_stop_tick(void)
1214 {
1215 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1216 	int cpu = smp_processor_id();
1217 	ktime_t expires;
1218 
1219 	trace_android_vh_tick_nohz_idle_stop_tick(NULL);
1220 
1221 	/*
1222 	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
1223 	 * tick timer expiration time is known already.
1224 	 */
1225 	if (ts->timer_expires_base)
1226 		expires = ts->timer_expires;
1227 	else if (can_stop_idle_tick(cpu, ts))
1228 		expires = tick_nohz_next_event(ts, cpu);
1229 	else
1230 		return;
1231 
1232 	ts->idle_calls++;
1233 
1234 	if (expires > 0LL) {
1235 		int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
1236 
1237 		tick_nohz_stop_tick(ts, cpu);
1238 
1239 		ts->idle_sleeps++;
1240 		ts->idle_expires = expires;
1241 
1242 		if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1243 			ts->idle_jiffies = ts->last_jiffies;
1244 			nohz_balance_enter_idle(cpu);
1245 		}
1246 	} else {
1247 		tick_nohz_retain_tick(ts);
1248 	}
1249 }
1250 
1251 void tick_nohz_idle_retain_tick(void)
1252 {
1253 	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
1254 }
1255 
1256 /**
1257  * tick_nohz_idle_enter - prepare for entering idle on the current CPU
1258  *
1259  * Called when we start the idle loop.
1260  */
1261 void tick_nohz_idle_enter(void)
1262 {
1263 	struct tick_sched *ts;
1264 
1265 	lockdep_assert_irqs_enabled();
1266 
1267 	local_irq_disable();
1268 
1269 	ts = this_cpu_ptr(&tick_cpu_sched);
1270 
1271 	WARN_ON_ONCE(ts->timer_expires_base);
1272 
1273 	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
1274 	tick_nohz_start_idle(ts);
1275 
1276 	local_irq_enable();
1277 }
1278 
1279 /**
1280  * tick_nohz_irq_exit - Notify the tick about IRQ exit
1281  *
1282  * A timer may have been added/modified/deleted either by the current IRQ,
1283  * or by another place using this IRQ as a notification. This IRQ may have
1284  * also updated the RCU callback list. These events may require a
1285  * re-evaluation of the next tick. Depending on the context:
1286  *
1287  * 1) If the CPU is idle and no resched is pending, just proceed with idle
1288  *    time accounting. The next tick will be re-evaluated on the next idle
1289  *    loop iteration.
1290  *
1291  * 2) If the CPU is nohz_full:
1292  *
1293  *    2.1) If there is any tick dependency, restart the tick if stopped.
1294  *
1295  *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
1296  *         stop/update it accordingly.
1297  */
1298 void tick_nohz_irq_exit(void)
1299 {
1300 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1301 
1302 	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
1303 		tick_nohz_start_idle(ts);
1304 	else
1305 		tick_nohz_full_update_tick(ts);
1306 }
1307 
1308 /**
1309  * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
1310  *
1311  * Return: %true if the tick handler has run, otherwise %false
1312  */
1313 bool tick_nohz_idle_got_tick(void)
1314 {
1315 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1316 
1317 	if (ts->got_idle_tick) {
1318 		ts->got_idle_tick = 0;
1319 		return true;
1320 	}
1321 	return false;
1322 }
1323 
1324 /**
1325  * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
1326  * or the tick, whichever expires first. Note that, if the tick has been
1327  * stopped, it returns the next hrtimer.
1328  *
1329  * Called from power state control code with interrupts disabled
1330  *
1331  * Return: the next expiration time
1332  */
1333 ktime_t tick_nohz_get_next_hrtimer(void)
1334 {
1335 	return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
1336 }
1337 
1338 /**
1339  * tick_nohz_get_sleep_length - return the expected length of the current sleep
1340  * @delta_next: duration until the next event if the tick cannot be stopped
1341  *
1342  * Called from power state control code with interrupts disabled.
1343  *
1344  * The return value of this function and/or the value returned by it through the
1345  * @delta_next pointer can be negative which must be taken into account by its
1346  * callers.
1347  *
1348  * Return: the expected length of the current sleep
1349  */
1350 ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
1351 {
1352 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
1353 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1354 	int cpu = smp_processor_id();
1355 	/*
1356 	 * The idle entry time is expected to be a sufficient approximation of
1357 	 * the current time at this point.
1358 	 */
1359 	ktime_t now = ts->idle_entrytime;
1360 	ktime_t next_event;
1361 
1362 	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
1363 
1364 	*delta_next = ktime_sub(dev->next_event, now);
1365 
1366 	if (!can_stop_idle_tick(cpu, ts))
1367 		return *delta_next;
1368 
1369 	next_event = tick_nohz_next_event(ts, cpu);
1370 	if (!next_event)
1371 		return *delta_next;
1372 
1373 	/*
1374 	 * If the next highres timer to expire is earlier than 'next_event', the
1375 	 * idle governor needs to know that.
1376 	 */
1377 	next_event = min_t(u64, next_event,
1378 			   hrtimer_next_event_without(&ts->sched_timer));
1379 
1380 	return ktime_sub(next_event, now);
1381 }
1382 EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length);
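/*
 * Usage sketch (illustrative only): a cpuidle governor, running with
 * interrupts disabled in the idle path, can bound its wakeup prediction by
 * the expected sleep length, keeping in mind that both values may be
 * negative:
 *
 *	ktime_t delta_tick;
 *	ktime_t duration = tick_nohz_get_sleep_length(&delta_tick);
 *
 *	if (duration <= 0 || delta_tick <= 0)
 *		...	treat this as an imminent wakeup	...
 */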
1383 
1384 /**
1385  * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
1386  * for a particular CPU.
1387  * @cpu: target CPU number
1388  *
1389  * Called from the schedutil frequency scaling governor in scheduler context.
1390  *
1391  * Return: the current idle calls counter value for @cpu
1392  */
1393 unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
1394 {
1395 	struct tick_sched *ts = tick_get_tick_sched(cpu);
1396 
1397 	return ts->idle_calls;
1398 }
1399 EXPORT_SYMBOL_GPL(tick_nohz_get_idle_calls_cpu);
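/*
 * Usage sketch (illustrative only): schedutil-style code can compare this
 * counter between two evaluations to detect whether the CPU entered idle
 * in between. 'prev_idle_calls' is a hypothetical snapshot kept by the
 * caller:
 *
 *	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(cpu);
 *	bool entered_idle = (idle_calls != prev_idle_calls);
 *	prev_idle_calls = idle_calls;
 */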
1400 
1401 static void tick_nohz_account_idle_time(struct tick_sched *ts,
1402 					ktime_t now)
1403 {
1404 	unsigned long ticks;
1405 
1406 	ts->idle_exittime = now;
1407 
1408 	if (vtime_accounting_enabled_this_cpu())
1409 		return;
1410 	/*
1411 	 * We stopped the tick in idle. update_process_times() would miss the
1412 	 * time we slept, as it only accounts a single tick.
1413 	 * Enforce that this is accounted to idle !
1414 	 */
1415 	ticks = jiffies - ts->idle_jiffies;
1416 	/*
1417 	 * We might be one off. Do not randomly account a huge number of ticks!
1418 	 */
1419 	if (ticks && ticks < LONG_MAX)
1420 		account_idle_ticks(ticks);
1421 }
1422 
1423 void tick_nohz_idle_restart_tick(void)
1424 {
1425 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1426 
1427 	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1428 		ktime_t now = ktime_get();
1429 		tick_nohz_restart_sched_tick(ts, now);
1430 		tick_nohz_account_idle_time(ts, now);
1431 	}
1432 }
1433 
1434 static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
1435 {
1436 	if (tick_nohz_full_cpu(smp_processor_id()))
1437 		__tick_nohz_full_update_tick(ts, now);
1438 	else
1439 		tick_nohz_restart_sched_tick(ts, now);
1440 
1441 	tick_nohz_account_idle_time(ts, now);
1442 }
1443 
1444 /**
1445  * tick_nohz_idle_exit - Update the tick upon idle task exit
1446  *
1447  * When the idle task exits, update the tick depending on the
1448  * following situations:
1449  *
1450  * 1) If the CPU is not in nohz_full mode (most cases), then
1451  *    restart the tick.
1452  *
1453  * 2) If the CPU is in nohz_full mode (corner case):
1454  *   2.1) If the tick can be kept stopped (no tick dependencies)
1455  *        then re-evaluate the next tick and try to keep it stopped
1456  *        as long as possible.
1457  *   2.2) If the tick has dependencies, restart the tick.
1458  *
1459  */
1460 void tick_nohz_idle_exit(void)
1461 {
1462 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1463 	bool idle_active, tick_stopped;
1464 	ktime_t now;
1465 
1466 	local_irq_disable();
1467 
1468 	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
1469 	WARN_ON_ONCE(ts->timer_expires_base);
1470 
1471 	tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
1472 	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
1473 	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
1474 
1475 	if (idle_active || tick_stopped)
1476 		now = ktime_get();
1477 
1478 	if (idle_active)
1479 		tick_nohz_stop_idle(ts, now);
1480 
1481 	if (tick_stopped)
1482 		tick_nohz_idle_update_tick(ts, now);
1483 
1484 	local_irq_enable();
1485 }
1486 
1487 /*
1488  * In low-resolution mode, the tick handler must be implemented directly
1489  * at the clockevent level. hrtimer can't be used instead, because its
1490  * infrastructure actually relies on the tick itself as a backend in
1491  * low-resolution mode (see hrtimer_run_queues()).
1492  */
1493 static void tick_nohz_lowres_handler(struct clock_event_device *dev)
1494 {
1495 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1496 
1497 	dev->next_event = KTIME_MAX;
1498 
1499 	if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
1500 		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
1501 }
1502 
1503 static inline void tick_nohz_activate(struct tick_sched *ts)
1504 {
1505 	if (!tick_nohz_enabled)
1506 		return;
1507 	tick_sched_flag_set(ts, TS_FLAG_NOHZ);
1508 	/* One update is enough */
1509 	if (!test_and_set_bit(0, &tick_nohz_active))
1510 		timers_update_nohz();
1511 }
1512 
1513 /**
1514  * tick_nohz_switch_to_nohz - switch to NOHZ mode
1515  */
1516 static void tick_nohz_switch_to_nohz(void)
1517 {
1518 	if (!tick_nohz_enabled)
1519 		return;
1520 
1521 	if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
1522 		return;
1523 
1524 	/*
1525 	 * Recycle the hrtimer in 'ts', so we can share the
1526 	 * highres code.
1527 	 */
1528 	tick_setup_sched_timer(false);
1529 }
1530 
1531 static inline void tick_nohz_irq_enter(void)
1532 {
1533 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1534 	ktime_t now;
1535 
1536 	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
1537 		return;
1538 	now = ktime_get();
1539 	if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
1540 		tick_nohz_stop_idle(ts, now);
1541 	/*
1542 	 * If all CPUs are idle we may need to update a stale jiffies value.
1543 	 * Note nohz_full is a special case: a timekeeper is guaranteed to stay
1544 	 * alive but it might be busy looping with interrupts disabled in some
1545 	 * rare case (typically stop machine). So we must make sure we have a
1546 	 * last resort.
1547 	 */
1548 	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
1549 		tick_nohz_update_jiffies(now);
1550 }
1551 
1552 #else
1553 
1554 static inline void tick_nohz_switch_to_nohz(void) { }
1555 static inline void tick_nohz_irq_enter(void) { }
1556 static inline void tick_nohz_activate(struct tick_sched *ts) { }
1557 
1558 #endif /* CONFIG_NO_HZ_COMMON */
1559 
1560 /*
1561  * Called from irq_enter() to notify about the possible interruption of idle()
1562  */
1563 void tick_irq_enter(void)
1564 {
1565 	tick_check_oneshot_broadcast_this_cpu();
1566 	tick_nohz_irq_enter();
1567 }
1568 
1569 static int sched_skew_tick;
1570 
1571 static int __init skew_tick(char *str)
1572 {
1573 	get_option(&str, &sched_skew_tick);
1574 
1575 	return 0;
1576 }
1577 early_param("skew_tick", skew_tick);
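/*
 * Example: booting with "skew_tick=1" staggers the per-CPU tick timers
 * (see the offset calculation in tick_setup_sched_timer() below), which
 * reduces 'jiffies_lock' contention on larger systems at the cost of
 * potentially higher power consumption.
 */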
1578 
1579 /**
1580  * tick_setup_sched_timer - setup the tick emulation timer
1581  * @hrtimer: whether to use the hrtimer or not
1582  */
1583 void tick_setup_sched_timer(bool hrtimer)
1584 {
1585 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1586 
1587 	/* Emulate tick processing via per-CPU hrtimers: */
1588 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
1589 
1590 	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) {
1591 		tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
1592 		ts->sched_timer.function = tick_nohz_handler;
1593 	}
1594 
1595 	/* Get the next period (per-CPU) */
1596 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1597 
1598 	/* Offset the tick to avert 'jiffies_lock' contention. */
1599 	if (sched_skew_tick) {
1600 		u64 offset = TICK_NSEC >> 1;
1601 		do_div(offset, num_possible_cpus());
1602 		offset *= smp_processor_id();
1603 		hrtimer_add_expires_ns(&ts->sched_timer, offset);
1604 	}
1605 
1606 	hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
1607 	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
1608 		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
1609 	else
1610 		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
1611 	tick_nohz_activate(ts);
1612 }
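/*
 * Worked example of the skew offset above (illustrative numbers): with
 * HZ=250 (TICK_NSEC = 4,000,000 ns) and 8 possible CPUs, the base offset
 * is TICK_NSEC >> 1 = 2,000,000 ns, divided by 8 gives a 250,000 ns step,
 * so CPU 3 fires its tick 750 us after CPU 0.
 */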
1613 
1614 /*
1615  * Shut down the tick and make sure the CPU won't try to retake the timekeeping
1616  * duty before disabling IRQs in idle for the last time.
1617  */
1618 void tick_sched_timer_dying(int cpu)
1619 {
1620 	struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
1621 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1622 	struct clock_event_device *dev = td->evtdev;
1623 	ktime_t idle_sleeptime, iowait_sleeptime;
1624 	unsigned long idle_calls, idle_sleeps;
1625 
1626 	/* This must happen before hrtimers are migrated! */
1627 	tick_sched_timer_cancel(ts);
1628 
1629 	/*
1630 	 * If the clockevent device doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED,
1631 	 * make sure not to call low-res tick handler.
1632 	 */
1633 	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1634 		dev->event_handler = clockevents_handle_noop;
1635 
1636 	idle_sleeptime = ts->idle_sleeptime;
1637 	iowait_sleeptime = ts->iowait_sleeptime;
1638 	idle_calls = ts->idle_calls;
1639 	idle_sleeps = ts->idle_sleeps;
1640 	memset(ts, 0, sizeof(*ts));
1641 	ts->idle_sleeptime = idle_sleeptime;
1642 	ts->iowait_sleeptime = iowait_sleeptime;
1643 	ts->idle_calls = idle_calls;
1644 	ts->idle_sleeps = idle_sleeps;
1645 }
1646 
1647 /*
1648  * Async notification about clocksource changes
1649  */
1650 void tick_clock_notify(void)
1651 {
1652 	int cpu;
1653 
1654 	for_each_possible_cpu(cpu)
1655 		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1656 }
1657 
1658 /*
1659  * Async notification about clock event changes
1660  */
1661 void tick_oneshot_notify(void)
1662 {
1663 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1664 
1665 	set_bit(0, &ts->check_clocks);
1666 }
1667 
1668 /*
1669  * Check if a change happened, which makes oneshot possible.
1670  *
1671  * Called cyclically from the hrtimer softirq (driven by the timer
1672  * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
1673  * mode, because high resolution timers are disabled (either at compile
1674  * time or at run time). Called with interrupts disabled.
1675  */
1676 int tick_check_oneshot_change(int allow_nohz)
1677 {
1678 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1679 
1680 	if (!test_and_clear_bit(0, &ts->check_clocks))
1681 		return 0;
1682 
1683 	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1684 		return 0;
1685 
1686 	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1687 		return 0;
1688 
1689 	if (!allow_nohz)
1690 		return 1;
1691 
1692 	tick_nohz_switch_to_nohz();
1693 	return 0;
1694 }
1695