1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Kernel internal timers
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 *
7 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
8 *
9 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
10 * "A Kernel Model for Precision Timekeeping" by Dave Mills
11 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
12 * serialize accesses to xtime/lost_ticks).
13 * Copyright (C) 1998 Andrea Arcangeli
14 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
15 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
16 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
17 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
18 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
19 */
20
21 #include <linux/kernel_stat.h>
22 #include <linux/export.h>
23 #include <linux/interrupt.h>
24 #include <linux/percpu.h>
25 #include <linux/init.h>
26 #include <linux/mm.h>
27 #include <linux/swap.h>
28 #include <linux/pid_namespace.h>
29 #include <linux/notifier.h>
30 #include <linux/thread_info.h>
31 #include <linux/time.h>
32 #include <linux/jiffies.h>
33 #include <linux/posix-timers.h>
34 #include <linux/cpu.h>
35 #include <linux/syscalls.h>
36 #include <linux/delay.h>
37 #include <linux/tick.h>
38 #include <linux/kallsyms.h>
39 #include <linux/irq_work.h>
40 #include <linux/sched/signal.h>
41 #include <linux/sched/sysctl.h>
42 #include <linux/sched/nohz.h>
43 #include <linux/sched/debug.h>
44 #include <linux/slab.h>
45 #include <linux/compat.h>
46 #include <linux/random.h>
47 #include <linux/sysctl.h>
48
49 #include <linux/uaccess.h>
50 #include <asm/unistd.h>
51 #include <asm/div64.h>
52 #include <asm/timex.h>
53 #include <asm/io.h>
54
55 #include "tick-internal.h"
56 #include "timer_migration.h"
57
58 #define CREATE_TRACE_POINTS
59 #include <trace/events/timer.h>
60 #undef CREATE_TRACE_POINTS
61 #include <trace/hooks/timer.h>
62
63 __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
64
65 EXPORT_SYMBOL(jiffies_64);
66
67 /*
68 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
69 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
70 * level has a different granularity.
71 *
72 * The level granularity is: LVL_CLK_DIV ^ level
73 * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
74 *
75 * The array level of a newly armed timer depends on the relative expiry
76 * time. The farther the expiry time is away the higher the array level and
77 * therefore the granularity becomes.
78 *
79 * Contrary to the original timer wheel implementation, which aims for 'exact'
80 * expiry of the timers, this implementation removes the need for recascading
81 * the timers into the lower array levels. The previous 'classic' timer wheel
82 * implementation of the kernel already violated the 'exact' expiry by adding
83 * slack to the expiry time to provide batched expiration. The granularity
84 * levels provide implicit batching.
85 *
86 * This is an optimization of the original timer wheel implementation for the
87 * majority of the timer wheel use cases: timeouts. The vast majority of
88 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
89 * the timeout expires it indicates that normal operation is disturbed, so it
90 * does not matter much whether the timeout comes with a slight delay.
91 *
92 * The only exception to this are networking timers with a small expiry
93 * time. They rely on the granularity. Those fit into the first wheel level,
94 * which has HZ granularity.
95 *
 * We don't have cascading anymore. Timers with an expiry time above the
97 * capacity of the last wheel level are force expired at the maximum timeout
98 * value of the last wheel level. From data sampling we know that the maximum
99 * value observed is 5 days (network connection tracking), so this should not
100 * be an issue.
101 *
102 * The currently chosen array constants values are a good compromise between
103 * array size and granularity.
104 *
105 * This results in the following granularity and range levels:
106 *
107 * HZ 1000 steps
108 * Level Offset Granularity Range
109 * 0 0 1 ms 0 ms - 63 ms
110 * 1 64 8 ms 64 ms - 511 ms
111 * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
112 * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
113 * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
114 * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
115 * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
116 * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
117 * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
118 *
119 * HZ 300
120 * Level Offset Granularity Range
121 * 0 0 3 ms 0 ms - 210 ms
122 * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
123 * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
124 * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
125 * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
126 * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
127 * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
128 * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
129 * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
130 *
131 * HZ 250
132 * Level Offset Granularity Range
133 * 0 0 4 ms 0 ms - 255 ms
134 * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
135 * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
136 * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
137 * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
138 * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
139 * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
140 * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
141 * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
142 *
143 * HZ 100
144 * Level Offset Granularity Range
145 * 0 0 10 ms 0 ms - 630 ms
146 * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
147 * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
148 * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
149 * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
150 * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
151 * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
152 * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
153 */
154
155 /* Clock divisor for the next level */
156 #define LVL_CLK_SHIFT 3
157 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
158 #define LVL_CLK_MASK (LVL_CLK_DIV - 1)
159 #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
160 #define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
161
162 /*
163 * The time start value for each level to select the bucket at enqueue
164 * time. We start from the last possible delta of the previous level
165 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
166 */
167 #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
168
169 /* Size of each clock level */
170 #define LVL_BITS 6
171 #define LVL_SIZE (1UL << LVL_BITS)
172 #define LVL_MASK (LVL_SIZE - 1)
173 #define LVL_OFFS(n) ((n) * LVL_SIZE)
174
175 /* Level depth */
176 #if HZ > 100
177 # define LVL_DEPTH 9
178 # else
179 # define LVL_DEPTH 8
180 #endif
181
182 /* The cutoff (max. capacity of the wheel) */
183 #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
184 #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
185
186 /*
187 * The resulting wheel size. If NOHZ is configured we allocate two
188 * wheels so we have a separate storage for the deferrable timers.
189 */
190 #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
191
192 #ifdef CONFIG_NO_HZ_COMMON
193 /*
194 * If multiple bases need to be locked, use the base ordering for lock
195 * nesting, i.e. lowest number first.
196 */
197 # define NR_BASES 3
198 # define BASE_LOCAL 0
199 # define BASE_GLOBAL 1
200 # define BASE_DEF 2
201 #else
202 # define NR_BASES 1
203 # define BASE_LOCAL 0
204 # define BASE_GLOBAL 0
205 # define BASE_DEF 0
206 #endif
207
208 /**
209 * struct timer_base - Per CPU timer base (number of base depends on config)
210 * @lock: Lock protecting the timer_base
211 * @running_timer: When expiring timers, the lock is dropped. To make
212 * sure not to race against deleting/modifying a
213 * currently running timer, the pointer is set to the
214 * timer, which expires at the moment. If no timer is
215 * running, the pointer is NULL.
216 * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around
217 * timer expiry callback execution and when trying to
218 * delete a running timer and it wasn't successful in
219 * the first glance. It prevents priority inversion
220 * when callback was preempted on a remote CPU and a
221 * caller tries to delete the running timer. It also
 *			prevents a live lock, when the task which tries to
223 * delete a timer preempted the softirq thread which
224 * is running the timer callback function.
225 * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter
226 * waiting for the end of the timer callback function
227 * execution.
228 * @clk: clock of the timer base; is updated before enqueue
229 * of a timer; during expiry, it is 1 offset ahead of
230 * jiffies to avoid endless requeuing to current
231 * jiffies
232 * @next_expiry: expiry value of the first timer; it is updated when
233 * finding the next timer and during enqueue; the
234 * value is not valid, when next_expiry_recalc is set
235 * @cpu: Number of CPU the timer base belongs to
236 * @next_expiry_recalc: States, whether a recalculation of next_expiry is
237 * required. Value is set true, when a timer was
238 * deleted.
239 * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ
240 * code. This state is only used in standard
241 * base. Deferrable timers, which are enqueued remotely
242 * never wake up an idle CPU. So no matter of supporting it
243 * for this base.
244 * @timers_pending: Is set, when a timer is pending in the base. It is only
245 * reliable when next_expiry_recalc is not set.
246 * @pending_map: bitmap of the timer wheel; each bit reflects a
247 * bucket of the wheel. When a bit is set, at least a
248 * single timer is enqueued in the related bucket.
249 * @vectors: Array of lists; Each array member reflects a bucket
250 * of the timer wheel. The list contains all timers
251 * which are enqueued into a specific bucket.
252 */
/* See the kernel-doc comment above for the meaning of each field. */
struct timer_base {
	raw_spinlock_t		lock;
	struct timer_list	*running_timer;
#ifdef CONFIG_PREEMPT_RT
	spinlock_t		expiry_lock;
	atomic_t		timer_waiters;
#endif
	unsigned long		clk;
	unsigned long		next_expiry;
	unsigned int		cpu;
	bool			next_expiry_recalc;
	bool			is_idle;
	bool			timers_pending;
	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
	struct hlist_head	vectors[WHEEL_SIZE];
	/* Cacheline aligned - presumably to keep adjacent per-CPU bases from
	 * sharing a cache line (false sharing). */
} ____cacheline_aligned;
269
270 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
271
272 #ifdef CONFIG_NO_HZ_COMMON
273
274 static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
275 static DEFINE_MUTEX(timer_keys_mutex);
276
277 static void timer_update_keys(struct work_struct *work);
278 static DECLARE_WORK(timer_update_work, timer_update_keys);
279
280 #ifdef CONFIG_SMP
281 static unsigned int sysctl_timer_migration = 1;
282
283 DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
284
timers_update_migration(void)285 static void timers_update_migration(void)
286 {
287 if (sysctl_timer_migration && tick_nohz_active)
288 static_branch_enable(&timers_migration_enabled);
289 else
290 static_branch_disable(&timers_migration_enabled);
291 }
292
293 #ifdef CONFIG_SYSCTL
/*
 * Sysctl handler for /proc/sys/kernel/timer_migration. The static key
 * update is serialized against other key updates via timer_keys_mutex.
 */
static int timer_migration_handler(const struct ctl_table *table, int write,
				   void *buffer, size_t *lenp, loff_t *ppos)
{
	int err;

	mutex_lock(&timer_keys_mutex);
	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (write && !err)
		timers_update_migration();
	mutex_unlock(&timer_keys_mutex);

	return err;
}
306
/* Boolean (0/1) knob controlling cross-CPU timer migration */
static struct ctl_table timer_sysctl[] = {
	{
		.procname	= "timer_migration",
		.data		= &sysctl_timer_migration,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= timer_migration_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};
318
/* Register the timer knobs under /proc/sys/kernel/ */
static int __init timer_sysctl_init(void)
{
	register_sysctl("kernel", timer_sysctl);
	return 0;
}
device_initcall(timer_sysctl_init);
325 #endif /* CONFIG_SYSCTL */
326 #else /* CONFIG_SMP */
timers_update_migration(void)327 static inline void timers_update_migration(void) { }
328 #endif /* !CONFIG_SMP */
329
/*
 * Workqueue callback (scheduled via timers_update_nohz()) which flips the
 * NOHZ and migration static keys under timer_keys_mutex.
 */
static void timer_update_keys(struct work_struct *work)
{
	mutex_lock(&timer_keys_mutex);
	timers_update_migration();
	static_branch_enable(&timers_nohz_active);
	mutex_unlock(&timer_keys_mutex);
}
337
/* Request (asynchronously, via workqueue) an update of the NOHZ static keys */
void timers_update_nohz(void)
{
	schedule_work(&timer_update_work);
}
342
/* True once NOHZ mode was activated (set by timer_update_keys()) */
static inline bool is_timers_nohz_active(void)
{
	return static_branch_unlikely(&timers_nohz_active);
}
#else
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */
350
/*
 * Round an absolute jiffies value to (approximately) a whole second,
 * skewed per CPU. Returns the original value if rounding would move it
 * into the past.
 */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
		bool force_up)
{
	unsigned long original = j;
	int rem;

	/*
	 * We don't want all cpus firing their timers at once hitting the
	 * same lock or cachelines. Skew each extra cpu by 3 jiffies (a
	 * value inherited from the old mm/ code): add 3*cpunr before
	 * rounding and subtract it again afterwards.
	 */
	j += cpu * 3;
	rem = j % HZ;

	/*
	 * If the target jiffy is only slightly (< 1/4 second) past a whole
	 * second - which can happen due to timer irq delays, long irq-off
	 * regions etc. - round down to that whole second instead of up.
	 * Never round down when @force_up is set.
	 */
	if (force_up || rem >= HZ/4)
		j += HZ - rem;		/* round up */
	else
		j -= rem;		/* round down */

	/* Remove the per-CPU skew again */
	j -= cpu * 3;

	/*
	 * Only hand back the rounded value if it is still in the future;
	 * otherwise return the caller's unmodified value.
	 */
	return time_is_after_jiffies(j) ? j : original;
}
390
/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * Return: the rounded version of @j
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
	return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);
416
/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * Return: the rounded version of @j
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
445
/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * Return: the rounded version of @j
 */
unsigned long round_jiffies(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
466
/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * Return: the rounded version of @j
 */
unsigned long round_jiffies_relative(unsigned long j)
{
	return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
487
/**
 * __round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of @j
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
	return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);
503
/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of @j
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, true) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
522
/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of @j
 */
unsigned long round_jiffies_up(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);
537
/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of @j
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
	return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
552
553
timer_get_idx(struct timer_list * timer)554 static inline unsigned int timer_get_idx(struct timer_list *timer)
555 {
556 return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
557 }
558
timer_set_idx(struct timer_list * timer,unsigned int idx)559 static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
560 {
561 timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
562 idx << TIMER_ARRAYSHIFT;
563 }
564
565 /*
566 * Helper function to calculate the array index for a given expiry
567 * time.
568 */
calc_index(unsigned long expires,unsigned lvl,unsigned long * bucket_expiry)569 static inline unsigned calc_index(unsigned long expires, unsigned lvl,
570 unsigned long *bucket_expiry)
571 {
572
573 /*
574 * The timer wheel has to guarantee that a timer does not fire
575 * early. Early expiry can happen due to:
576 * - Timer is armed at the edge of a tick
577 * - Truncation of the expiry time in the outer wheel levels
578 *
579 * Round up with level granularity to prevent this.
580 */
581 trace_android_vh_timer_calc_index(lvl, &expires);
582 expires = (expires >> LVL_SHIFT(lvl)) + 1;
583 *bucket_expiry = expires << LVL_SHIFT(lvl);
584 return LVL_OFFS(lvl) + (expires & LVL_MASK);
585 }
586
/*
 * Select the wheel level for @expires relative to the base clock @clk and
 * return the flat index into the vectors array. The bucket's effective
 * (granularity-rounded) expiry is stored in @bucket_expiry.
 */
static int calc_wheel_index(unsigned long expires, unsigned long clk,
			    unsigned long *bucket_expiry)
{
	unsigned long delta = expires - clk;
	unsigned int idx;

	if (delta < LVL_START(1)) {
		idx = calc_index(expires, 0, bucket_expiry);
	} else if (delta < LVL_START(2)) {
		idx = calc_index(expires, 1, bucket_expiry);
	} else if (delta < LVL_START(3)) {
		idx = calc_index(expires, 2, bucket_expiry);
	} else if (delta < LVL_START(4)) {
		idx = calc_index(expires, 3, bucket_expiry);
	} else if (delta < LVL_START(5)) {
		idx = calc_index(expires, 4, bucket_expiry);
	} else if (delta < LVL_START(6)) {
		idx = calc_index(expires, 5, bucket_expiry);
	} else if (delta < LVL_START(7)) {
		idx = calc_index(expires, 6, bucket_expiry);
	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
		idx = calc_index(expires, 7, bucket_expiry);
	} else if ((long) delta < 0) {
		/* Expiry is already in the past: use the current bucket */
		idx = clk & LVL_MASK;
		*bucket_expiry = clk;
	} else {
		/*
		 * Force expire obscene large timeouts to expire at the
		 * capacity limit of the wheel.
		 */
		if (delta >= WHEEL_TIMEOUT_CUTOFF)
			expires = clk + WHEEL_TIMEOUT_MAX;

		idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
	}
	return idx;
}
624
/*
 * Decide whether enqueueing @timer on @base requires waking the target
 * CPU so it can re-evaluate its wheel. Runs with base->lock held (see
 * the is_idle comment below).
 */
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
	/*
	 * Deferrable timers do not prevent the CPU from entering dynticks and
	 * are not taken into account on the idle/nohz_full path. An IPI when a
	 * new deferrable timer is enqueued will wake up the remote CPU but
	 * nothing will be done with the deferrable timer base. Therefore skip
	 * the remote IPI for deferrable timers completely.
	 */
	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
		return;

	/*
	 * We might have to IPI the remote CPU if the base is idle and the
	 * timer is pinned. If it is a non pinned timer, it is only queued
	 * on the remote CPU, when timer was running during queueing. Then
	 * everything is handled by remote CPU anyway. If the other CPU is
	 * on the way to idle then it can't set base->is_idle as we hold
	 * the base lock:
	 */
	if (base->is_idle) {
		/* Only pinned timers (or nohz_full CPUs) may see an idle base */
		WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
			       tick_nohz_full_cpu(base->cpu)));
		wake_up_nohz_cpu(base->cpu);
	}
}
652
/*
 * Enqueue the timer into the hash bucket, mark it pending in
 * the bitmap, store the index in the timer flags then wake up
 * the target CPU if needed. Called with base->lock held.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
			  unsigned int idx, unsigned long bucket_expiry)
{

	hlist_add_head(&timer->entry, base->vectors + idx);
	__set_bit(idx, base->pending_map);
	timer_set_idx(timer, idx);

	trace_timer_start(timer, bucket_expiry);

	/*
	 * Check whether this is the new first expiring timer. The
	 * effective expiry time of the timer is required here
	 * (bucket_expiry) instead of timer->expires.
	 */
	if (time_before(bucket_expiry, base->next_expiry)) {
		/*
		 * Set the next expiry time and kick the CPU so it
		 * can reevaluate the wheel:
		 *
		 * NOTE(review): WRITE_ONCE() suggests next_expiry is read
		 * locklessly elsewhere - confirm against the readers.
		 */
		WRITE_ONCE(base->next_expiry, bucket_expiry);
		base->timers_pending = true;
		base->next_expiry_recalc = false;
		trigger_dyntick_cpu(base, timer);
	}
}
684
internal_add_timer(struct timer_base * base,struct timer_list * timer)685 static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
686 {
687 unsigned long bucket_expiry;
688 unsigned int idx;
689
690 idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
691 enqueue_timer(base, timer, idx, bucket_expiry);
692 }
693
694 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
695
696 static const struct debug_obj_descr timer_debug_descr;
697
/*
 * Maps a wrapper timer callback to a more useful debug hint: for the
 * known wrappers below, point at the wrapped work function instead of
 * the generic timer trampoline.
 */
struct timer_hint {
	void	(*function)(struct timer_list *t);
	long	offset;
};

/* Offset of @hintfn relative to @timr inside @container */
#define TIMER_HINT(fn, container, timr, hintfn)		\
	{						\
		.function = fn,				\
		.offset	  = offsetof(container, hintfn) - \
			    offsetof(container, timr)	\
	}

static const struct timer_hint timer_hints[] = {
	TIMER_HINT(delayed_work_timer_fn,
		   struct delayed_work, timer, work.func),
	TIMER_HINT(kthread_delayed_work_timer_fn,
		   struct kthread_delayed_work, timer, work.func),
};
716
timer_debug_hint(void * addr)717 static void *timer_debug_hint(void *addr)
718 {
719 struct timer_list *timer = addr;
720 int i;
721
722 for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
723 if (timer_hints[i].function == timer->function) {
724 void (**fn)(void) = addr + timer_hints[i].offset;
725
726 return *fn;
727 }
728 }
729
730 return timer->function;
731 }
732
timer_is_static_object(void * addr)733 static bool timer_is_static_object(void *addr)
734 {
735 struct timer_list *timer = addr;
736
737 return (timer->entry.pprev == NULL &&
738 timer->entry.next == TIMER_ENTRY_STATIC);
739 }
740
741 /*
742 * timer_fixup_init is called when:
743 * - an active object is initialized
744 */
timer_fixup_init(void * addr,enum debug_obj_state state)745 static bool timer_fixup_init(void *addr, enum debug_obj_state state)
746 {
747 struct timer_list *timer = addr;
748
749 switch (state) {
750 case ODEBUG_STATE_ACTIVE:
751 del_timer_sync(timer);
752 debug_object_init(timer, &timer_debug_descr);
753 return true;
754 default:
755 return false;
756 }
757 }
758
/*
 * Stub timer callback for improperly used timers: installed on untracked
 * timers so that firing one warns instead of jumping through a garbage
 * function pointer.
 */
static void stub_timer(struct timer_list *unused)
{
	WARN_ON(1);
}
764
/*
 * timer_fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		/* Untracked timer: install the warning stub callback */
		timer_setup(timer, stub_timer, 0);
		return true;

	case ODEBUG_STATE_ACTIVE:
		/* Activating an already active timer is a caller bug */
		WARN_ON(1);
		fallthrough;
	default:
		return false;
	}
}
786
787 /*
788 * timer_fixup_free is called when:
789 * - an active object is freed
790 */
timer_fixup_free(void * addr,enum debug_obj_state state)791 static bool timer_fixup_free(void *addr, enum debug_obj_state state)
792 {
793 struct timer_list *timer = addr;
794
795 switch (state) {
796 case ODEBUG_STATE_ACTIVE:
797 del_timer_sync(timer);
798 debug_object_free(timer, &timer_debug_descr);
799 return true;
800 default:
801 return false;
802 }
803 }
804
805 /*
806 * timer_fixup_assert_init is called when:
807 * - an untracked/uninit-ed object is found
808 */
timer_fixup_assert_init(void * addr,enum debug_obj_state state)809 static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
810 {
811 struct timer_list *timer = addr;
812
813 switch (state) {
814 case ODEBUG_STATE_NOTAVAILABLE:
815 timer_setup(timer, stub_timer, 0);
816 return true;
817 default:
818 return false;
819 }
820 }
821
/* Wires timer_list objects into the generic debugobjects machinery */
static const struct debug_obj_descr timer_debug_descr = {
	.name			= "timer_list",
	.debug_hint		= timer_debug_hint,
	.is_static_object	= timer_is_static_object,
	.fixup_init		= timer_fixup_init,
	.fixup_activate		= timer_fixup_activate,
	.fixup_free		= timer_fixup_free,
	.fixup_assert_init	= timer_fixup_assert_init,
};
831
/* Thin wrappers forwarding timer lifecycle events to debugobjects */
static inline void debug_timer_init(struct timer_list *timer)
{
	debug_object_init(timer, &timer_debug_descr);
}

static inline void debug_timer_activate(struct timer_list *timer)
{
	debug_object_activate(timer, &timer_debug_descr);
}

static inline void debug_timer_deactivate(struct timer_list *timer)
{
	debug_object_deactivate(timer, &timer_debug_descr);
}

static inline void debug_timer_assert_init(struct timer_list *timer)
{
	debug_object_assert_init(timer, &timer_debug_descr);
}
851
852 static void do_init_timer(struct timer_list *timer,
853 void (*func)(struct timer_list *),
854 unsigned int flags,
855 const char *name, struct lock_class_key *key);
856
/**
 * init_timer_on_stack_key - initialize a timer which lives on the stack
 * @timer:	the timer to be initialized
 * @func:	timer callback function
 * @flags:	timer flags
 * @name:	name of the timer
 * @key:	lockdep class key of the fake lock used for tracking timer
 *		sync lock dependencies
 *
 * Same as init_timer_key() but additionally registers @timer with
 * debugobjects as an on-stack object. Such a timer must be destroyed
 * with destroy_timer_on_stack() before the stack frame goes away.
 */
void init_timer_on_stack_key(struct timer_list *timer,
			     void (*func)(struct timer_list *),
			     unsigned int flags,
			     const char *name, struct lock_class_key *key)
{
	debug_object_init_on_stack(timer, &timer_debug_descr);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
866
/**
 * destroy_timer_on_stack - remove debugobjects tracking for a stack timer
 * @timer:	the timer initialized with init_timer_on_stack_key()
 *
 * Must be called before the stack memory holding @timer is reused.
 */
void destroy_timer_on_stack(struct timer_list *timer)
{
	debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
872
#else
/* Debug-objects disabled (presumably !CONFIG_DEBUG_OBJECTS_TIMERS — the
 * matching #if is above this hunk): the hooks compile away to nothing. */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif
879
/* Combine the debugobjects hook with the matching tracepoint. */
static inline void debug_init(struct timer_list *timer)
{
	debug_timer_init(timer);
	trace_timer_init(timer);
}

static inline void debug_deactivate(struct timer_list *timer)
{
	debug_timer_deactivate(timer);
	trace_timer_cancel(timer);
}

static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}
896
do_init_timer(struct timer_list * timer,void (* func)(struct timer_list *),unsigned int flags,const char * name,struct lock_class_key * key)897 static void do_init_timer(struct timer_list *timer,
898 void (*func)(struct timer_list *),
899 unsigned int flags,
900 const char *name, struct lock_class_key *key)
901 {
902 timer->entry.pprev = NULL;
903 timer->function = func;
904 if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
905 flags &= TIMER_INIT_FLAGS;
906 timer->flags = flags | raw_smp_processor_id();
907 lockdep_init_map(&timer->lockdep_map, name, key, 0);
908 }
909
/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 * sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void init_timer_key(struct timer_list *timer,
		    void (*func)(struct timer_list *), unsigned int flags,
		    const char *name, struct lock_class_key *key)
{
	/* Register with debugobjects/tracing, then do the real init */
	debug_init(timer);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
930
/*
 * Unlink @timer from its wheel bucket. With @clear_pending the timer
 * becomes "not pending" (pprev == NULL); otherwise the stale pprev is
 * kept so timer_pending() still reports it queued, which the requeue
 * path in __mod_timer() relies on. The forward link is poisoned to
 * catch use-after-detach.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
	debug_deactivate(timer);

	__hlist_del(&timer->entry);
	if (clear_pending)
		timer->entry.pprev = NULL;
	timer->entry.next = LIST_POISON2;
}
942
/*
 * Detach @timer from @base if it is queued.
 *
 * Returns 0 when the timer was not pending, 1 when it was detached.
 * When the removed timer was the last one in its bucket, the bucket's
 * bit in the pending bitmap is cleared and a next_expiry recalculation
 * is requested.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
			     bool clear_pending)
{
	unsigned idx;

	if (!timer_pending(timer))
		return 0;

	idx = timer_get_idx(timer);
	if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
		__clear_bit(idx, base->pending_map);
		base->next_expiry_recalc = true;
	}

	detach_timer(timer, clear_pending);
	return 1;
}
959
/*
 * Map timer flags to the per-CPU base on @cpu: deferrable timers go to
 * BASE_DEF (when NO_HZ_COMMON is enabled), pinned timers to BASE_LOCAL,
 * everything else to BASE_GLOBAL.
 */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
	int index;

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		index = BASE_DEF;
	else
		index = (tflags & TIMER_PINNED) ? BASE_LOCAL : BASE_GLOBAL;

	return per_cpu_ptr(&timer_bases[index], cpu);
}
975
/*
 * Same base selection as get_timer_cpu_base(), but for the current CPU
 * via this_cpu_ptr().
 */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
	int index;

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		index = BASE_DEF;
	else
		index = (tflags & TIMER_PINNED) ? BASE_LOCAL : BASE_GLOBAL;

	return this_cpu_ptr(&timer_bases[index]);
}
991
/* Resolve the base a timer is currently queued on, from the CPU bits
 * stored in its flags. */
static inline struct timer_base *get_timer_base(u32 tflags)
{
	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
996
/*
 * Advance base->clk towards @basej so newly queued timers are placed
 * relative to the current time instead of a stale wheel clock.
 */
static inline void __forward_timer_base(struct timer_base *base,
					unsigned long basej)
{
	/*
	 * Check whether we can forward the base. We can only do that when
	 * @basej is past base->clk otherwise we might rewind base->clk.
	 */
	if (time_before_eq(basej, base->clk))
		return;

	/*
	 * If the next expiry value is > jiffies, then we fast forward to
	 * jiffies otherwise we forward to the next expiry value.
	 */
	if (time_after(base->next_expiry, basej)) {
		base->clk = basej;
	} else {
		/* next_expiry behind clk would mean wheel state corruption */
		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
			return;
		base->clk = base->next_expiry;
	}

}

static inline void forward_timer_base(struct timer_base *base)
{
	__forward_timer_base(base, READ_ONCE(jiffies));
}
1025
/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
					  unsigned long *flags)
	__acquires(timer->base->lock)
{
	for (;;) {
		struct timer_base *base;
		u32 tf;

		/*
		 * We need to use READ_ONCE() here, otherwise the compiler
		 * might re-read @tf between the check for TIMER_MIGRATING
		 * and spin_lock().
		 */
		tf = READ_ONCE(timer->flags);

		if (!(tf & TIMER_MIGRATING)) {
			base = get_timer_base(tf);
			raw_spin_lock_irqsave(&base->lock, *flags);
			/*
			 * Recheck under the lock: if the flags changed, a
			 * migration raced us and we locked the wrong base.
			 */
			if (timer->flags == tf)
				return base;
			raw_spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}
1062
/* Option bits for __mod_timer() */
#define MOD_TIMER_PENDING_ONLY 0x01	/* only modify an already queued timer */
#define MOD_TIMER_REDUCE 0x02		/* only move the expiry time earlier */
#define MOD_TIMER_NOTPENDING 0x04	/* caller guarantees timer is not queued */
1066
/*
 * Common implementation behind mod_timer(), mod_timer_pending(),
 * timer_reduce() and the add_timer*() family. Returns 0 if the timer
 * was inactive (or shutdown), 1 if it was pending and requeued.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
	unsigned long clk = 0, flags, bucket_expiry;
	struct timer_base *base, *new_base;
	unsigned int idx = UINT_MAX;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * This is a common optimization triggered by the networking code - if
	 * the timer is re-modified to have the same timeout or ends up in the
	 * same array bucket then just return:
	 */
	if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
		/*
		 * The downside of this optimization is that it can result in
		 * larger granularity than you would get from adding a new
		 * timer with this expiry.
		 */
		long diff = timer->expires - expires;

		if (!diff)
			return 1;
		if (options & MOD_TIMER_REDUCE && diff <= 0)
			return 1;

		/*
		 * We lock timer base and calculate the bucket index right
		 * here. If the timer ends up in the same bucket, then we
		 * just update the expiry time and avoid the whole
		 * dequeue/enqueue dance.
		 */
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);

		/* Recheck under the lock; the lockless check above raced */
		if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
		    time_before_eq(timer->expires, expires)) {
			ret = 1;
			goto out_unlock;
		}

		clk = base->clk;
		idx = calc_wheel_index(expires, clk, &bucket_expiry);

		/*
		 * Retrieve and compare the array index of the pending
		 * timer. If it matches set the expiry to the new value so a
		 * subsequent call will exit in the expires check above.
		 */
		if (idx == timer_get_idx(timer)) {
			if (!(options & MOD_TIMER_REDUCE))
				timer->expires = expires;
			else if (time_after(timer->expires, expires))
				timer->expires = expires;
			ret = 1;
			goto out_unlock;
		}
	} else {
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);
	}

	ret = detach_if_pending(timer, base, false);
	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
		goto out_unlock;

	new_base = get_timer_this_cpu_base(timer->flags);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the new base.
		 * However we can't change timer's base while it is running,
		 * otherwise timer_delete_sync() can't detect that the timer's
		 * handler yet has not finished. This also guarantees that the
		 * timer is serialized wrt itself.
		 */
		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer->flags |= TIMER_MIGRATING;

			raw_spin_unlock(&base->lock);
			base = new_base;
			raw_spin_lock(&base->lock);
			WRITE_ONCE(timer->flags,
				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
			forward_timer_base(base);
		}
	}

	debug_timer_activate(timer);

	timer->expires = expires;
	/*
	 * If 'idx' was calculated above and the base time did not advance
	 * between calculating 'idx' and possibly switching the base, only
	 * enqueue_timer() is required. Otherwise we need to (re)calculate
	 * the wheel index via internal_add_timer().
	 */
	if (idx != UINT_MAX && clk == base->clk)
		enqueue_timer(base, timer, idx, bucket_expiry);
	else
		internal_add_timer(base, timer);

out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}
1193
/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:	The pending timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *	  shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
	/* Delegate to __mod_timer() with the pending-only constraint */
	return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);
1215
/**
 * mod_timer - Modify a timer's timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the del_timer() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires did
 *	  not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
1248
/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires
 *	  did not change the effective expiry time such that the
 *	  timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);
1274
/**
 * add_timer - Start a timer
 * @timer:	The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
	/* Starting an already queued timer is a caller bug */
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
1302
/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:	The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is set.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	/* Pin to the current CPU before queueing */
	timer->flags |= TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_local);
1319
/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:	The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	/* Allow the timer to be placed on any suitable CPU */
	timer->flags &= ~TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_global);
1336
/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:	The timer to be started
 * @cpu:	The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
 * the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
	struct timer_base *new_base, *base;
	unsigned long flags;

	debug_assert_init(timer);

	if (WARN_ON_ONCE(timer_pending(timer)))
		return;

	/* Make sure timer flags have TIMER_PINNED flag set */
	timer->flags |= TIMER_PINNED;

	new_base = get_timer_cpu_base(timer->flags, cpu);

	/*
	 * If @timer was on a different CPU, it should be migrated with the
	 * old base locked to prevent other operations proceeding with the
	 * wrong base locked. See lock_timer_base().
	 */
	base = lock_timer_base(timer, &flags);
	/*
	 * Has @timer been shutdown? This needs to be evaluated while
	 * holding base lock to prevent a race against the shutdown code.
	 */
	if (!timer->function)
		goto out_unlock;

	if (base != new_base) {
		/* Same migration protocol as __mod_timer() */
		timer->flags |= TIMER_MIGRATING;

		raw_spin_unlock(&base->lock);
		base = new_base;
		raw_spin_lock(&base->lock);
		WRITE_ONCE(timer->flags,
			   (timer->flags & ~TIMER_BASEMASK) | cpu);
	}
	forward_timer_base(base);

	debug_timer_activate(timer);
	internal_add_timer(base, timer);
out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
1394
/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:	The timer to be deactivated
 * @shutdown:	If true, this indicates that the timer is about to be
 *		shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the time. In that
 * case any attempt to rearm @timer after this function returns will be
 * silently ignored.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * If @shutdown is set then the lock has to be taken whether the
	 * timer is pending or not to protect against a concurrent rearm
	 * which might hit between the lockless pending check and the lock
	 * acquisition. By taking the lock it is ensured that such a newly
	 * enqueued timer is dequeued and cannot end up with
	 * timer->function == NULL in the expiry code.
	 *
	 * If timer->function is currently executed, then this makes sure
	 * that the callback cannot requeue the timer.
	 */
	if (timer_pending(timer) || shutdown) {
		base = lock_timer_base(timer, &flags);
		ret = detach_if_pending(timer, base, true);
		if (shutdown)
			timer->function = NULL;
		raw_spin_unlock_irqrestore(&base->lock, flags);
	}

	return ret;
}
1439
/**
 * timer_delete - Deactivate a timer
 * @timer:	The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * It neither prevents rearming of the timer. If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
	return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);
1459
/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:	The timer to be deactivated
 *
 * The function does not wait for an eventually running timer callback on a
 * different CPU but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
	return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);
1480
/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:	Timer to deactivate
 * @shutdown:	If true, this indicates that the timer is about to be
 *		shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function returns will be silently
 * ignored.
 *
 * This function cannot guarantee that the timer cannot be rearmed
 * right after dropping the base lock if @shutdown is false. That
 * needs to be prevented by the calling code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = -1;

	debug_assert_init(timer);

	base = lock_timer_base(timer, &flags);

	/* Only detach when the callback is not currently executing */
	if (base->running_timer != timer)
		ret = detach_if_pending(timer, base, true);
	if (shutdown)
		timer->function = NULL;

	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}
1520
/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:	Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
	return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(try_to_del_timer_sync);
1542
#ifdef CONFIG_PREEMPT_RT
/*
 * Set up the per-base expiry lock used to serialize against a running
 * timer callback on PREEMPT_RT.
 *
 * NOTE(review): this is marked __init while the !RT stub below is a
 * plain inline — confirm every caller is itself __init, otherwise this
 * text is freed after boot and a later call would fault.
 */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
	spin_lock_init(&base->expiry_lock);
}
1548
/* Held by the softirq across a timer callback; see timer_sync_wait_running() */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
	spin_lock(&base->expiry_lock);
}

static inline void timer_base_unlock_expiry(struct timer_base *base)
{
	spin_unlock(&base->expiry_lock);
}
1558
/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
	__releases(&base->lock) __releases(&base->expiry_lock)
	__acquires(&base->expiry_lock) __acquires(&base->lock)
{
	if (atomic_read(&base->timer_waiters)) {
		/* Lock ordering: base->lock nests inside expiry_lock */
		raw_spin_unlock_irq(&base->lock);
		spin_unlock(&base->expiry_lock);
		spin_lock(&base->expiry_lock);
		raw_spin_lock_irq(&base->lock);
	}
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a life lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
	u32 tf;

	tf = READ_ONCE(timer->flags);
	if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
		struct timer_base *base = get_timer_base(tf);

		/*
		 * Mark the base as contended and grab the expiry lock,
		 * which is held by the softirq across the timer
		 * callback. Drop the lock immediately so the softirq can
		 * expire the next timer. In theory the timer could already
		 * be running again, but that's more than unlikely and just
		 * causes another wait loop.
		 */
		atomic_inc(&base->timer_waiters);
		spin_lock_bh(&base->expiry_lock);
		atomic_dec(&base->timer_waiters);
		spin_unlock_bh(&base->expiry_lock);
	}
}
#else
/* !PREEMPT_RT: no expiry lock needed, all of this compiles away. */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
1617
/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *			 for the handler to finish.
 * @timer:	The timer to be deactivated
 * @shutdown:	If true, @timer->function will be set to NULL under the
 *		timer base lock which prevents rearming of @timer
 *
 * If @shutdown is not set the timer can be rearmed later. If the timer can
 * be rearmed concurrently, i.e. after dropping the base lock then the
 * return value is meaningless.
 *
 * If @shutdown is set then @timer->function is set to NULL under timer
 * base lock which prevents rearming of the timer. Any attempt to rearm
 * a shutdown timer is silently ignored.
 *
 * If the timer should be reused after shutdown it has to be initialized
 * again.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
	int ret;

#ifdef CONFIG_LOCKDEP
	unsigned long flags;

	/*
	 * If lockdep gives a backtrace here, please reference
	 * the synchronization rules above.
	 */
	local_irq_save(flags);
	lock_map_acquire(&timer->lockdep_map);
	lock_map_release(&timer->lockdep_map);
	local_irq_restore(flags);
#endif
	/*
	 * don't use it in hardirq context, because it
	 * could lead to deadlock.
	 */
	WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

	/*
	 * Must be able to sleep on PREEMPT_RT because of the slowpath in
	 * del_timer_wait_running().
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
		lockdep_assert_preemption_enabled();

	/* Spin until the callback is no longer running anywhere */
	do {
		ret = __try_to_del_timer_sync(timer, shutdown);

		if (unlikely(ret < 0)) {
			del_timer_wait_running(timer);
			cpu_relax();
		}
	} while (ret < 0);

	return ret;
}
1680
/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:	The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question. Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
	return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);
1728
/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer:	The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 *   - @timer is not queued
 *   - The callback function of @timer is not running
 *   - @timer cannot be enqueued again. Any attempt to rearm
 *     @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() is solving the problem. The correct ordering of
 * calls in this case is:
 *
 *	timer_shutdown_sync(&mything->timer);
 *	workqueue_destroy(&mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
	return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);
1770
/*
 * Invoke the callback @fn of @timer. @baseclk is the wheel clock value the
 * expiry relates to and is used only for tracing.
 *
 * Detects callbacks which leak the preempt count and restores it so the
 * system has a chance to survive for further debugging.
 */
static void call_timer_fn(struct timer_list *timer,
			  void (*fn)(struct timer_list *),
			  unsigned long baseclk)
{
	/* Snapshot to detect a preempt count imbalance caused by @fn */
	int count = preempt_count();

#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the timer from inside the
	 * function that is called from it, this we need to take into
	 * account for lockdep too.  To avoid bogus "held lock freed"
	 * warnings as well as problems when looking into
	 * timer->lockdep_map, make a copy and use that here.
	 */
	struct lockdep_map lockdep_map;

	lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
	/*
	 * Couple the lock chain with the lock chain at
	 * timer_delete_sync() by acquiring the lock_map around the fn()
	 * call here and in timer_delete_sync().
	 */
	lock_map_acquire(&lockdep_map);

	trace_timer_expire_entry(timer, baseclk);
	fn(timer);
	trace_timer_expire_exit(timer);

	lock_map_release(&lockdep_map);

	if (count != preempt_count()) {
		WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
			  fn, count, preempt_count());
		/*
		 * Restore the preempt count. That gives us a decent
		 * chance to survive and extract information. If the
		 * callback kept a lock held, bad luck, but not worse
		 * than the BUG() we had.
		 */
		preempt_count_set(count);
	}
}
1814
/*
 * Expire all timers queued on @head. Caller must hold base->lock; the lock
 * is dropped around each callback invocation. base->running_timer is set
 * for the duration of the callback so timer_delete_sync() can detect a
 * concurrently executing handler.
 */
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
	/*
	 * This value is required only for tracing. base->clk was
	 * incremented directly before expire_timers was called. But expiry
	 * is related to the old base->clk value.
	 */
	unsigned long baseclk = base->clk - 1;

	while (!hlist_empty(head)) {
		struct timer_list *timer;
		void (*fn)(struct timer_list *);

		timer = hlist_entry(head->first, struct timer_list, entry);

		base->running_timer = timer;
		detach_timer(timer, true);

		fn = timer->function;

		if (WARN_ON_ONCE(!fn)) {
			/* Should never happen. Emphasis on should! */
			base->running_timer = NULL;
			continue;
		}

		if (timer->flags & TIMER_IRQSAFE) {
			/* IRQSAFE: callback runs with interrupts still disabled */
			raw_spin_unlock(&base->lock);
			call_timer_fn(timer, fn, baseclk);
			raw_spin_lock(&base->lock);
			base->running_timer = NULL;
		} else {
			raw_spin_unlock_irq(&base->lock);
			call_timer_fn(timer, fn, baseclk);
			raw_spin_lock_irq(&base->lock);
			base->running_timer = NULL;
			timer_sync_wait_running(base);
		}
	}
}
1855
/*
 * Move all expired timer buckets into @heads (one list head per wheel
 * level) and advance base->clk to base->next_expiry.
 *
 * Return: number of levels which contained expired timers.
 */
static int collect_expired_timers(struct timer_base *base,
				  struct hlist_head *heads)
{
	unsigned long clk = base->clk = base->next_expiry;
	struct hlist_head *vec;
	int i, levels = 0;
	unsigned int idx;

	for (i = 0; i < LVL_DEPTH; i++) {
		idx = (clk & LVL_MASK) + i * LVL_SIZE;

		if (__test_and_clear_bit(idx, base->pending_map)) {
			vec = base->vectors + idx;
			hlist_move_list(vec, heads++);
			levels++;
		}
		/* Is it time to look at the next level? */
		if (clk & LVL_CLK_MASK)
			break;
		/* Shift clock for the next level granularity */
		clk >>= LVL_CLK_SHIFT;
	}
	return levels;
}
1880
1881 /*
1882 * Find the next pending bucket of a level. Search from level start (@offset)
1883 * + @clk upwards and if nothing there, search from start of the level
1884 * (@offset) up to @offset + clk.
1885 */
next_pending_bucket(struct timer_base * base,unsigned offset,unsigned clk)1886 static int next_pending_bucket(struct timer_base *base, unsigned offset,
1887 unsigned clk)
1888 {
1889 unsigned pos, start = offset + clk;
1890 unsigned end = offset + LVL_SIZE;
1891
1892 pos = find_next_bit(base->pending_map, end, start);
1893 if (pos < end)
1894 return pos - start;
1895
1896 pos = find_next_bit(base->pending_map, start, offset);
1897 return pos < start ? pos + LVL_SIZE - start : -1;
1898 }
1899
/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry.
 */
static void timer_recalc_next_expiry(struct timer_base *base)
{
	unsigned long clk, next, adj;
	unsigned lvl, offset = 0;

	/* Preset to the farthest possible expiry; found timers pull it in */
	next = base->clk + NEXT_TIMER_MAX_DELTA;
	clk = base->clk;
	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
		unsigned long lvl_clk = clk & LVL_CLK_MASK;

		if (pos >= 0) {
			unsigned long tmp = clk + (unsigned long) pos;

			/* Scale the bucket distance back to jiffies */
			tmp <<= LVL_SHIFT(lvl);
			if (time_before(tmp, next))
				next = tmp;

			/*
			 * If the next expiration happens before we reach
			 * the next level, no need to check further.
			 */
			if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
				break;
		}
		/*
		 * Clock for the next level. If the current level clock lower
		 * bits are zero, we look at the next level as is. If not we
		 * need to advance it by one because that's going to be the
		 * next expiring bucket in that level. base->clk is the next
		 * expiring jiffy. So in case of:
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
		 *  0    0    0    0    0    0
		 *
		 * we have to look at all levels @index 0. With
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
		 *  0    0    0    0    0    2
		 *
		 * LVL0 has the next expiring bucket @index 2. The upper
		 * levels have the next expiring bucket @index 1.
		 *
		 * In case that the propagation wraps the next level the same
		 * rules apply:
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
		 *  0    0    0    0    F    2
		 *
		 * So after looking at LVL0 we get:
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1
		 *  0    0    0    1    0
		 *
		 * So no propagation from LVL1 to LVL2 because that happened
		 * with the add already, but then we need to propagate further
		 * from LVL2 to LVL3.
		 *
		 * So the simple check whether the lower bits of the current
		 * level are 0 or not is sufficient for all cases.
		 */
		adj = lvl_clk ? 1 : 0;
		clk >>= LVL_CLK_SHIFT;
		clk += adj;
	}

	/* Lockless readers (run_local_timers()) pair with this WRITE_ONCE() */
	WRITE_ONCE(base->next_expiry, next);
	base->next_expiry_recalc = false;
	/* next staying at the preset maximum means the base is empty */
	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}
1976
1977 #ifdef CONFIG_NO_HZ_COMMON
1978 /*
1979 * Check, if the next hrtimer event is before the next timer wheel
1980 * event:
1981 */
cmp_next_hrtimer_event(u64 basem,u64 expires)1982 static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1983 {
1984 u64 nextevt = hrtimer_get_next_event();
1985
1986 /*
1987 * If high resolution timers are enabled
1988 * hrtimer_get_next_event() returns KTIME_MAX.
1989 */
1990 if (expires <= nextevt)
1991 return expires;
1992
1993 /*
1994 * If the next timer is already expired, return the tick base
1995 * time so the tick is fired immediately.
1996 */
1997 if (nextevt <= basem)
1998 return basem;
1999
2000 /*
2001 * Round up to the next jiffy. High resolution timers are
2002 * off, so the hrtimers are expired in the tick and we need to
2003 * make sure that this tick really expires the timer to avoid
2004 * a ping pong of the nohz stop code.
2005 *
2006 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
2007 */
2008 return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
2009 }
2010
/*
 * Return the next expiry of @base, recalculating it first when it is
 * marked stale. An empty base reports @basej + NEXT_TIMER_MAX_DELTA.
 */
static unsigned long next_timer_interrupt(struct timer_base *base,
					  unsigned long basej)
{
	if (base->next_expiry_recalc)
		timer_recalc_next_expiry(base);

	/*
	 * Move next_expiry for the empty base into the future to prevent an
	 * unnecessary raise of the timer softirq when the next_expiry value
	 * will be reached even if there is no timer pending.
	 *
	 * This update is also required to make timer_base::next_expiry values
	 * easy comparable to find out which base holds the first pending timer.
	 */
	if (!base->timers_pending)
		WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);

	return base->next_expiry;
}
2030
/*
 * Compute the earliest pending expiry (in jiffies) of @base_local and
 * @base_global and fill @tevt with the corresponding clock monotonic
 * expiry values. Caller must hold the locks of both bases.
 *
 * Return: the earliest expiry in jiffies across both bases.
 */
static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
						struct timer_base *base_local,
						struct timer_base *base_global,
						struct timer_events *tevt)
{
	unsigned long nextevt, nextevt_local, nextevt_global;
	bool local_first;

	nextevt_local = next_timer_interrupt(base_local, basej);
	nextevt_global = next_timer_interrupt(base_global, basej);

	local_first = time_before_eq(nextevt_local, nextevt_global);

	nextevt = local_first ? nextevt_local : nextevt_global;

	/*
	 * If the @nextevt is at max. one tick away, use @nextevt and store
	 * it in the local expiry value. The next global event is irrelevant in
	 * this case and can be left as KTIME_MAX.
	 */
	if (time_before_eq(nextevt, basej + 1)) {
		/* If we missed a tick already, force 0 delta */
		if (time_before(nextevt, basej))
			nextevt = basej;
		tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC;

		/*
		 * This is required for the remote check only but it doesn't
		 * hurt, when it is done for both call sites:
		 *
		 * * The remote callers will only take care of the global timers
		 *   as local timers will be handled by CPU itself. When not
		 *   updating tevt->global with the already missed first global
		 *   timer, it is possible that it will be missed completely.
		 *
		 * * The local callers will ignore the tevt->global anyway, when
		 *   nextevt is max. one tick away.
		 */
		if (!local_first)
			tevt->global = tevt->local;
		return nextevt;
	}

	/*
	 * Update tevt.* values:
	 *
	 * If the local queue expires first, then the global event can be
	 * ignored. If the global queue is empty, nothing to do either.
	 */
	if (!local_first && base_global->timers_pending)
		tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC;

	if (base_local->timers_pending)
		tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC;

	return nextevt;
}
2088
2089 # ifdef CONFIG_SMP
/**
 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
 * @basej:	base time jiffies
 * @basem:	base time clock monotonic
 * @tevt:	Pointer to the storage for the expiry values
 * @cpu:	Remote CPU
 *
 * Stores the next pending local and global timer expiry values in the
 * struct pointed to by @tevt. If a queue is empty the corresponding
 * field is set to KTIME_MAX. If local event expires before global
 * event, global event is set to KTIME_MAX as well.
 *
 * Caller needs to make sure timer base locks are held (use
 * timer_lock_remote_bases() for this purpose).
 */
void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
				       struct timer_events *tevt,
				       unsigned int cpu)
{
	struct timer_base *base_local, *base_global;

	/* Preset local / global events */
	tevt->local = tevt->global = KTIME_MAX;

	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

	lockdep_assert_held(&base_local->lock);
	lockdep_assert_held(&base_global->lock);

	/* The jiffies based return value is only relevant for local callers */
	fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt);
}
2122
/**
 * timer_unlock_remote_bases - unlock timer bases of cpu
 * @cpu:	Remote CPU
 *
 * Unlocks the remote timer bases.
 */
void timer_unlock_remote_bases(unsigned int cpu)
	__releases(timer_bases[BASE_LOCAL]->lock)
	__releases(timer_bases[BASE_GLOBAL]->lock)
{
	struct timer_base *base_local, *base_global;

	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

	/* Release in reverse order of timer_lock_remote_bases() */
	raw_spin_unlock(&base_global->lock);
	raw_spin_unlock(&base_local->lock);
}
2141
/**
 * timer_lock_remote_bases - lock timer bases of cpu
 * @cpu:	Remote CPU
 *
 * Locks the remote timer bases.
 */
void timer_lock_remote_bases(unsigned int cpu)
	__acquires(timer_bases[BASE_LOCAL]->lock)
	__acquires(timer_bases[BASE_GLOBAL]->lock)
{
	struct timer_base *base_local, *base_global;

	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

	lockdep_assert_irqs_disabled();

	/* Same lock order as __get_next_timer_interrupt(): local first */
	raw_spin_lock(&base_local->lock);
	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
}
2162
/**
 * timer_base_is_idle() - Return whether timer base is set idle
 *
 * Return: value of the local timer base's %is_idle flag for this CPU.
 */
bool timer_base_is_idle(void)
{
	return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
}
2172
2173 static void __run_timer_base(struct timer_base *base);
2174
/**
 * timer_expire_remote() - expire global timers of cpu
 * @cpu:	Remote CPU
 *
 * Expire timers of global base of remote CPU.
 */
void timer_expire_remote(unsigned int cpu)
{
	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

	/* __run_timer_base() acquires base->lock itself */
	__run_timer_base(base);
}
2187
/*
 * Hand the first global event to the timer migration hierarchy and, when
 * this CPU has to handle the first hierarchy event itself, pull @nextevt
 * and tevt->local in accordingly.
 *
 * The tmigr function to use depends on the callsite: the timer base is
 * already idle, the tick stop path is deactivating the CPU, or only a
 * quick check is wanted.
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
			    unsigned long *nextevt, bool *tick_stop_path,
			    bool timer_base_idle, struct timer_events *tevt)
{
	u64 next_tmigr;

	if (timer_base_idle)
		next_tmigr = tmigr_cpu_new_timer(tevt->global);
	else if (tick_stop_path)
		next_tmigr = tmigr_cpu_deactivate(tevt->global);
	else
		next_tmigr = tmigr_quick_check(tevt->global);

	/*
	 * If the CPU is the last going idle in timer migration hierarchy, make
	 * sure the CPU will wake up in time to handle remote timers.
	 * next_tmigr == KTIME_MAX if other CPUs are still active.
	 */
	if (next_tmigr < tevt->local) {
		u64 tmp;

		/* If we missed a tick already, force 0 delta */
		if (next_tmigr < basem)
			next_tmigr = basem;

		/* Convert the clock monotonic expiry back to a jiffies delta */
		tmp = div_u64(next_tmigr - basem, TICK_NSEC);

		*nextevt = basej + (unsigned long)tmp;
		tevt->local = next_tmigr;
	}
}
2219 # else
static void timer_use_tmigr(unsigned long basej, u64 basem,
			    unsigned long *nextevt, bool *tick_stop_path,
			    bool timer_base_idle, struct timer_events *tevt)
{
	/*
	 * There is no timer migration hierarchy on !SMP. Fold the first
	 * global event into tevt->local so no timer is missed.
	 */
	if (tevt->global < tevt->local)
		tevt->local = tevt->global;
}
2230 # endif /* CONFIG_SMP */
2231
/*
 * Compute the clock monotonic time of the next timer event. When @idle is
 * non-NULL (tick stop path, see timer_base_try_to_set_idle()) the bases
 * may be marked idle and the resulting idle state is reported via @idle.
 */
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
					     bool *idle)
{
	struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
	struct timer_base *base_local, *base_global;
	unsigned long nextevt;
	bool idle_is_possible;

	/*
	 * When the CPU is offline, the tick is cancelled and nothing is supposed
	 * to try to stop it.
	 */
	if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
		if (idle)
			*idle = true;
		return tevt.local;
	}

	base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
	base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);

	raw_spin_lock(&base_local->lock);
	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);

	nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
					     base_global, &tevt);

	/*
	 * If the next event is only one jiffy ahead there is no need to call
	 * timer migration hierarchy related functions. The value for the next
	 * global timer in @tevt struct equals then KTIME_MAX. This is also
	 * true, when the timer base is idle.
	 *
	 * The proper timer migration hierarchy function depends on the callsite
	 * and whether timer base is idle or not. @nextevt will be updated when
	 * this CPU needs to handle the first timer migration hierarchy
	 * event. See timer_use_tmigr() for detailed information.
	 */
	idle_is_possible = time_after(nextevt, basej + 1);
	if (idle_is_possible)
		timer_use_tmigr(basej, basem, &nextevt, idle,
				base_local->is_idle, &tevt);

	/*
	 * We have a fresh next event. Check whether we can forward the
	 * base.
	 */
	__forward_timer_base(base_local, basej);
	__forward_timer_base(base_global, basej);

	/*
	 * Set base->is_idle only when caller is timer_base_try_to_set_idle()
	 */
	if (idle) {
		/*
		 * Bases are idle if the next event is more than a tick
		 * away. Caution: @nextevt could have changed by enqueueing a
		 * global timer into timer migration hierarchy. Therefore a new
		 * check is required here.
		 *
		 * If the base is marked idle then any timer add operation must
		 * forward the base clk itself to keep granularity small. This
		 * idle logic is only maintained for the BASE_LOCAL and
		 * BASE_GLOBAL base, deferrable timers may still see large
		 * granularity skew (by design).
		 */
		if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
			base_local->is_idle = true;
			/*
			 * Global timers queued locally while running in a task
			 * in nohz_full mode need a self-IPI to kick reprogramming
			 * in IRQ tail.
			 */
			if (tick_nohz_full_cpu(base_local->cpu))
				base_global->is_idle = true;
			trace_timer_base_idle(true, base_local->cpu);
		}
		*idle = base_local->is_idle;

		/*
		 * When timer base is not set idle, undo the effect of
		 * tmigr_cpu_deactivate() to prevent inconsistent states - active
		 * timer base but inactive timer migration hierarchy.
		 *
		 * When timer base was already marked idle, nothing will be
		 * changed here.
		 */
		if (!base_local->is_idle && idle_is_possible)
			tmigr_cpu_activate();
	}

	raw_spin_unlock(&base_global->lock);
	raw_spin_unlock(&base_local->lock);

	return cmp_next_hrtimer_event(basem, tevt.local);
}
2328
/**
 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
 * @basej:	base time jiffies
 * @basem:	base time clock monotonic
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. If timer of global base was queued into
 * timer migration hierarchy, first global timer is not taken into account. If
 * it was the last CPU of timer migration hierarchy going idle, first global
 * event is taken into account.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
	/* idle == NULL: leave the idle state of the bases untouched */
	return __get_next_timer_interrupt(basej, basem, NULL);
}
2344
/**
 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
 * @basej:	base time jiffies
 * @basem:	base time clock monotonic
 * @idle:	pointer to store the value of timer_base->is_idle on return;
 *		*idle contains the information whether tick was already stopped
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
 * returned as well.
 */
u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
{
	/* Tick already stopped: nothing to re-evaluate */
	if (*idle)
		return KTIME_MAX;

	return __get_next_timer_interrupt(basej, basem, idle);
}
2363
/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
	/*
	 * We do this unlocked. The worst outcome is a remote pinned timer
	 * enqueue sending a pointless IPI, but taking the lock would just
	 * make the window for sending the IPI a few instructions smaller
	 * for the cost of taking the lock in the exit from idle
	 * path. Required for BASE_LOCAL only.
	 */
	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
	/*
	 * BASE_GLOBAL is only marked idle on nohz_full CPUs, see
	 * __get_next_timer_interrupt().
	 */
	if (tick_nohz_full_cpu(smp_processor_id()))
		__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
	trace_timer_base_idle(false, smp_processor_id());

	/* Activate without holding the timer_base->lock */
	tmigr_cpu_activate();
}
2386 #endif
2387
/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 */
static inline void __run_timers(struct timer_base *base)
{
	struct hlist_head heads[LVL_DEPTH];
	int levels;

	lockdep_assert_held(&base->lock);

	/* Expiry of this base is already in progress (callback running) */
	if (base->running_timer)
		return;

	while (time_after_eq(jiffies, base->clk) &&
	       time_after_eq(jiffies, base->next_expiry)) {
		levels = collect_expired_timers(base, heads);
		/*
		 * The two possible reasons for not finding any expired
		 * timer at this clk are that all matching timers have been
		 * dequeued or no timer has been queued since
		 * base::next_expiry was set to base::clk +
		 * NEXT_TIMER_MAX_DELTA.
		 */
		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
			     && base->timers_pending);
		/*
		 * While executing timers, base->clk is set 1 offset ahead of
		 * jiffies to avoid endless requeuing to current jiffies.
		 */
		base->clk++;
		timer_recalc_next_expiry(base);

		/* Expire the highest levels first (longest timeouts) */
		while (levels--)
			expire_timers(base, heads + levels);
	}
}
2425
/*
 * Expire all pending timers of @base when its next expiry is due. Takes
 * base->lock and the expiry lock itself.
 */
static void __run_timer_base(struct timer_base *base)
{
	/* Can race against a remote CPU updating next_expiry under the lock */
	if (time_before(jiffies, READ_ONCE(base->next_expiry)))
		return;

	timer_base_lock_expiry(base);
	raw_spin_lock_irq(&base->lock);
	__run_timers(base);
	raw_spin_unlock_irq(&base->lock);
	timer_base_unlock_expiry(base);
}
2438
/* Expire the timer base @index of the current CPU. */
static void run_timer_base(int index)
{
	struct timer_base *base = this_cpu_ptr(&timer_bases[index]);

	__run_timer_base(base);
}
2445
/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static __latent_entropy void run_timer_softirq(void)
{
	run_timer_base(BASE_LOCAL);
	/*
	 * NOTE(review): global/deferrable bases are only handled separately
	 * with NO_HZ_COMMON — presumably they alias BASE_LOCAL otherwise;
	 * confirm against the timer_bases/NR_BASES definitions.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
		run_timer_base(BASE_GLOBAL);
		run_timer_base(BASE_DEF);

		if (is_timers_nohz_active())
			tmigr_handle_remote();
	}
}
2460
/*
 * Called by the local, per-CPU timer interrupt on SMP. Raises the timer
 * softirq when any base has a due expiry.
 */
static void run_local_timers(void)
{
	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);

	/* Low resolution mode expires hrtimers from the tick as well */
	hrtimer_run_queues();

	for (int i = 0; i < NR_BASES; i++, base++) {
		/*
		 * Raise the softirq only if required.
		 *
		 * timer_base::next_expiry can be written by a remote CPU while
		 * holding the lock. If this write happens at the same time than
		 * the lockless local read, sanity checker could complain about
		 * data corruption.
		 *
		 * There are two possible situations where
		 * timer_base::next_expiry is written by a remote CPU:
		 *
		 * 1. Remote CPU expires global timers of this CPU and updates
		 *    timer_base::next_expiry of BASE_GLOBAL afterwards in
		 *    next_timer_interrupt() or timer_recalc_next_expiry(). The
		 *    worst outcome is a superfluous raise of the timer softirq
		 *    when the not yet updated value is read.
		 *
		 * 2. A new first pinned timer is enqueued by a remote CPU
		 *    and therefore timer_base::next_expiry of BASE_LOCAL is
		 *    updated. When this update is missed, this isn't a
		 *    problem, as an IPI is executed nevertheless when the CPU
		 *    was idle before. When the CPU wasn't idle but the update
		 *    is missed, then the timer would expire one jiffy late -
		 *    bad luck.
		 *
		 * Those unlikely corner cases where the worst outcome is only a
		 * one jiffy delay or a superfluous raise of the softirq are
		 * not that expensive as doing the check always while holding
		 * the lock.
		 *
		 * Possible remote writers are using WRITE_ONCE(). Local reader
		 * uses therefore READ_ONCE().
		 */
		if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
		    (i == BASE_DEF && tmigr_requires_handle_remote())) {
			raise_softirq(TIMER_SOFTIRQ);
			return;
		}
	}
}
2511
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(p, user_tick);
	run_local_timers();
	rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
	/* Run pending irq_work from the tick only in hard interrupt context */
	if (in_irq())
		irq_work_tick();
#endif
	sched_tick();
	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
		run_posix_cpu_timers();
}
2532
/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
	struct timer_list timer;	/* on-stack timer, see schedule_timeout() */
	struct task_struct *task;	/* task to wake when the timer expires */
};
2541
/* Timer callback for schedule_timeout(): wake the sleeping task. */
static void process_timeout(struct timer_list *t)
{
	struct process_timer *timeout = from_timer(timeout, t, timer);

	wake_up_process(timeout->task);
}
2548
2549 /**
2550 * schedule_timeout - sleep until timeout
2551 * @timeout: timeout value in jiffies
2552 *
2553 * Make the current task sleep until @timeout jiffies have elapsed.
2554 * The function behavior depends on the current task state
2555 * (see also set_current_state() description):
2556 *
2557 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
2558 * at all. That happens because sched_submit_work() does nothing for
2559 * tasks in %TASK_RUNNING state.
2560 *
2561 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2562 * pass before the routine returns unless the current task is explicitly
2563 * woken up, (e.g. by wake_up_process()).
2564 *
2565 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2566 * delivered to the current task or the current task is explicitly woken
2567 * up.
2568 *
2569 * The current task state is guaranteed to be %TASK_RUNNING when this
2570 * routine returns.
2571 *
2572 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2573 * the CPU away without a bound on the timeout. In this case the return
2574 * value will be %MAX_SCHEDULE_TIMEOUT.
2575 *
2576 * Returns 0 when the timer has expired otherwise the remaining time in
2577 * jiffies will be returned. In all cases the return value is guaranteed
2578 * to be non-negative.
2579 */
schedule_timeout(signed long timeout)2580 signed long __sched schedule_timeout(signed long timeout)
2581 {
2582 struct process_timer timer;
2583 unsigned long expire;
2584
2585 switch (timeout)
2586 {
2587 case MAX_SCHEDULE_TIMEOUT:
2588 /*
2589 * These two special cases are useful to be comfortable
2590 * in the caller. Nothing more. We could take
2591 * MAX_SCHEDULE_TIMEOUT from one of the negative value
2592 * but I' d like to return a valid offset (>=0) to allow
2593 * the caller to do everything it want with the retval.
2594 */
2595 schedule();
2596 goto out;
2597 default:
2598 /*
2599 * Another bit of PARANOID. Note that the retval will be
2600 * 0 since no piece of kernel is supposed to do a check
2601 * for a negative retval of schedule_timeout() (since it
2602 * should never happens anyway). You just have the printk()
2603 * that will tell you if something is gone wrong and where.
2604 */
2605 if (timeout < 0) {
2606 printk(KERN_ERR "schedule_timeout: wrong timeout "
2607 "value %lx\n", timeout);
2608 dump_stack();
2609 __set_current_state(TASK_RUNNING);
2610 goto out;
2611 }
2612 }
2613
2614 expire = timeout + jiffies;
2615
2616 timer.task = current;
2617 timer_setup_on_stack(&timer.timer, process_timeout, 0);
2618 __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
2619 schedule();
2620 del_timer_sync(&timer.timer);
2621
2622 /* Remove the timer from the object tracker */
2623 destroy_timer_on_stack(&timer.timer);
2624
2625 timeout = expire - jiffies;
2626
2627 out:
2628 return timeout < 0 ? 0 : timeout;
2629 }
2630 EXPORT_SYMBOL(schedule_timeout);
2631
/*
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
 */
/* schedule_timeout() with the task state set to %TASK_INTERRUPTIBLE. */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
	__set_current_state(TASK_INTERRUPTIBLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_interruptible);
2642
/* schedule_timeout() with the task state set to %TASK_KILLABLE. */
signed long __sched schedule_timeout_killable(signed long timeout)
{
	__set_current_state(TASK_KILLABLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_killable);
2649
/* schedule_timeout() with the task state set to %TASK_UNINTERRUPTIBLE. */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
	__set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
2656
/*
 * Like schedule_timeout_uninterruptible(), except this task will not contribute
 * to load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
	__set_current_state(TASK_IDLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_idle);
2667
2668 #ifdef CONFIG_HOTPLUG_CPU
/*
 * Move every timer queued on @head over to @new_base.  Used by
 * timers_dead_cpu() to drain a dead CPU's wheel buckets; the caller holds
 * the locks of both the old and the new base.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
	struct timer_list *timer;
	int cpu = new_base->cpu;

	while (!hlist_empty(head)) {
		timer = hlist_entry(head->first, struct timer_list, entry);
		/* Unlink from the old base's bucket */
		detach_timer(timer, false);
		/* Re-home the timer: rewrite the CPU bits in timer->flags */
		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
		/* Requeue on the new base, which re-derives the bucket */
		internal_add_timer(new_base, timer);
	}
}
2681
/*
 * CPU hotplug callback: reset @cpu's timer bases to a pristine state
 * before the CPU is brought (back) online.  Always returns 0 (success),
 * as required by the hotplug callback signature.
 */
int timers_prepare_cpu(unsigned int cpu)
{
	struct timer_base *base;
	int b;

	for (b = 0; b < NR_BASES; b++) {
		base = per_cpu_ptr(&timer_bases[b], cpu);
		/* Restart the wheel clock from "now"... */
		base->clk = jiffies;
		/* ...with nothing queued: push next_expiry out to the maximum */
		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
		base->next_expiry_recalc = false;
		base->timers_pending = false;
		base->is_idle = false;
	}
	return 0;
}
2697
/*
 * CPU hotplug callback: after @cpu has gone down, migrate all timers still
 * queued on its bases over to the current CPU so they are not lost.
 * Always returns 0 (success), as required by the hotplug callback
 * signature.
 */
int timers_dead_cpu(unsigned int cpu)
{
	struct timer_base *old_base;
	struct timer_base *new_base;
	int b, i;

	for (b = 0; b < NR_BASES; b++) {
		old_base = per_cpu_ptr(&timer_bases[b], cpu);
		/* get_cpu_ptr() disables preemption until put_cpu_ptr() below */
		new_base = get_cpu_ptr(&timer_bases[b]);
		/*
		 * The caller is globally serialized and nobody else
		 * takes two locks at once, deadlock is not possible.
		 */
		raw_spin_lock_irq(&new_base->lock);
		raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

		/*
		 * The current CPUs base clock might be stale. Update it
		 * before moving the timers over.
		 */
		forward_timer_base(new_base);

		/* A dead CPU must not be expiring a timer anymore */
		WARN_ON_ONCE(old_base->running_timer);
		old_base->running_timer = NULL;

		/* Drain every wheel bucket of the dead CPU's base */
		for (i = 0; i < WHEEL_SIZE; i++)
			migrate_timer_list(new_base, old_base->vectors + i);

		raw_spin_unlock(&old_base->lock);
		raw_spin_unlock_irq(&new_base->lock);
		/*
		 * NOTE(review): argument is &timer_bases rather than
		 * &timer_bases[b] as passed to get_cpu_ptr() above; harmless
		 * since put_cpu_ptr() only re-enables preemption, but worth
		 * confirming the asymmetry is intentional.
		 */
		put_cpu_ptr(&timer_bases);
	}
	return 0;
}
2732
2733 #endif /* CONFIG_HOTPLUG_CPU */
2734
/* Boot-time setup of one CPU's timer bases: lock, clock and expiry state. */
static void __init init_timer_cpu(int cpu)
{
	struct timer_base *base;
	int i;

	for (i = 0; i < NR_BASES; i++) {
		base = per_cpu_ptr(&timer_bases[i], cpu);
		base->cpu = cpu;
		raw_spin_lock_init(&base->lock);
		base->clk = jiffies;
		/* Nothing queued yet: next expiry as far out as representable */
		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
		timer_base_init_expiry_lock(base);
	}
}
2749
/* Initialize the timer bases of every possible CPU at boot. */
static void __init init_timer_cpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		init_timer_cpu(cpu);
}
2757
/*
 * Boot-time initialization of the timer wheel: set up the per-CPU bases,
 * initialize the posix CPU timers work handling and register the
 * TIMER_SOFTIRQ handler.
 */
void __init init_timers(void)
{
	init_timer_cpus();
	posix_cputimers_init_work();
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
2764
/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 *
 * schedule_timeout_uninterruptible() returns the jiffies still remaining,
 * so keep re-sleeping until the whole period has genuinely elapsed even
 * if an individual sleep is ended early by a wakeup.
 */
void msleep(unsigned int msecs)
{
	unsigned long jif;

	for (jif = msecs_to_jiffies(msecs); jif; )
		jif = schedule_timeout_uninterruptible(jif);
}
EXPORT_SYMBOL(msleep);
2778
2779 /**
2780 * msleep_interruptible - sleep waiting for signals
2781 * @msecs: Time in milliseconds to sleep for
2782 */
msleep_interruptible(unsigned int msecs)2783 unsigned long msleep_interruptible(unsigned int msecs)
2784 {
2785 unsigned long timeout = msecs_to_jiffies(msecs);
2786
2787 while (timeout && !signal_pending(current))
2788 timeout = schedule_timeout_interruptible(timeout);
2789 return jiffies_to_msecs(timeout);
2790 }
2791
2792 EXPORT_SYMBOL(msleep_interruptible);
2793
2794 /**
2795 * usleep_range_state - Sleep for an approximate time in a given state
2796 * @min: Minimum time in usecs to sleep
2797 * @max: Maximum time in usecs to sleep
2798 * @state: State of the current task that will be while sleeping
2799 *
2800 * In non-atomic context where the exact wakeup time is flexible, use
2801 * usleep_range_state() instead of udelay(). The sleep improves responsiveness
2802 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
2803 * power usage by allowing hrtimers to take advantage of an already-
2804 * scheduled interrupt instead of scheduling a new one just for this sleep.
2805 */
usleep_range_state(unsigned long min,unsigned long max,unsigned int state)2806 void __sched usleep_range_state(unsigned long min, unsigned long max,
2807 unsigned int state)
2808 {
2809 ktime_t exp = ktime_add_us(ktime_get(), min);
2810 u64 delta = (u64)(max - min) * NSEC_PER_USEC;
2811
2812 for (;;) {
2813 __set_current_state(state);
2814 /* Do not return before the requested sleep time has elapsed */
2815 if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
2816 break;
2817 }
2818 }
2819 EXPORT_SYMBOL(usleep_range_state);
2820