• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * This file contains the functions which manage clocksource drivers.
4  *
5  * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
6  */
7 
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/device.h>
11 #include <linux/clocksource.h>
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
15 #include <linux/tick.h>
16 #include <linux/kthread.h>
17 #include <linux/prandom.h>
18 #include <linux/cpu.h>
19 
20 #include "tick-internal.h"
21 #include "timekeeping_internal.h"
22 
23 /**
24  * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
25  * @mult:	pointer to mult variable
26  * @shift:	pointer to shift variable
27  * @from:	frequency to convert from
28  * @to:		frequency to convert to
29  * @maxsec:	guaranteed runtime conversion range in seconds
30  *
31  * The function evaluates the shift/mult pair for the scaled math
32  * operations of clocksources and clockevents.
33  *
34  * @to and @from are frequency values in HZ. For clock sources @to is
35  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
36  * event @to is the counter frequency and @from is NSEC_PER_SEC.
37  *
38  * The @maxsec conversion range argument controls the time frame in
39  * seconds which must be covered by the runtime conversion with the
40  * calculated mult and shift factors. This guarantees that no 64bit
41  * overflow happens when the input value of the conversion is
42  * multiplied with the calculated mult factor. Larger ranges may
43  * reduce the conversion accuracy by choosing smaller mult and shift
44  * factors.
45  */
46 void
clocks_calc_mult_shift(u32 * mult,u32 * shift,u32 from,u32 to,u32 maxsec)47 clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
48 {
49 	u64 tmp;
50 	u32 sft, sftacc= 32;
51 
52 	/*
53 	 * Calculate the shift factor which is limiting the conversion
54 	 * range:
55 	 */
56 	tmp = ((u64)maxsec * from) >> 32;
57 	while (tmp) {
58 		tmp >>=1;
59 		sftacc--;
60 	}
61 
62 	/*
63 	 * Find the conversion shift/mult pair which has the best
64 	 * accuracy and fits the maxsec conversion range:
65 	 */
66 	for (sft = 32; sft > 0; sft--) {
67 		tmp = (u64) to << sft;
68 		tmp += from / 2;
69 		do_div(tmp, from);
70 		if ((tmp >> sftacc) == 0)
71 			break;
72 	}
73 	*mult = tmp;
74 	*shift = sft;
75 }
76 EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
77 
78 /*[Clocksource internal variables]---------
79  * curr_clocksource:
80  *	currently selected clocksource.
81  * suspend_clocksource:
82  *	used to calculate the suspend time.
83  * clocksource_list:
84  *	linked list with the registered clocksources
85  * clocksource_mutex:
86  *	protects manipulations to curr_clocksource and the clocksource_list
87  * override_name:
88  *	Name of the user-specified clocksource.
89  */
90 static struct clocksource *curr_clocksource;
91 static struct clocksource *suspend_clocksource;
92 static LIST_HEAD(clocksource_list);
93 static DEFINE_MUTEX(clocksource_mutex);
94 static char override_name[CS_NAME_LEN];
95 static int finished_booting;
96 static u64 suspend_start;
97 
98 /*
99  * Threshold: 0.0312s, when doubled: 0.0625s.
100  * Also a default for cs->uncertainty_margin when registering clocks.
101  */
102 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
103 
104 /*
105  * Maximum permissible delay between two readouts of the watchdog
106  * clocksource surrounding a read of the clocksource being validated.
107  * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
108  * a lower bound for cs->uncertainty_margin values when registering clocks.
109  */
110 #define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
111 
112 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
113 static void clocksource_watchdog_work(struct work_struct *work);
114 static void clocksource_select(void);
115 
116 static LIST_HEAD(watchdog_list);
117 static struct clocksource *watchdog;
118 static struct timer_list watchdog_timer;
119 static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
120 static DEFINE_SPINLOCK(watchdog_lock);
121 static int watchdog_running;
122 static atomic_t watchdog_reset_pending;
123 static int64_t watchdog_max_interval;
124 
clocksource_watchdog_lock(unsigned long * flags)125 static inline void clocksource_watchdog_lock(unsigned long *flags)
126 {
127 	spin_lock_irqsave(&watchdog_lock, *flags);
128 }
129 
clocksource_watchdog_unlock(unsigned long * flags)130 static inline void clocksource_watchdog_unlock(unsigned long *flags)
131 {
132 	spin_unlock_irqrestore(&watchdog_lock, *flags);
133 }
134 
135 static int clocksource_watchdog_kthread(void *data);
136 static void __clocksource_change_rating(struct clocksource *cs, int rating);
137 
138 /*
139  * Interval: 0.5sec.
140  */
141 #define WATCHDOG_INTERVAL (HZ >> 1)
142 #define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
143 
clocksource_watchdog_work(struct work_struct * work)144 static void clocksource_watchdog_work(struct work_struct *work)
145 {
146 	/*
147 	 * We cannot directly run clocksource_watchdog_kthread() here, because
148 	 * clocksource_select() calls timekeeping_notify() which uses
149 	 * stop_machine(). One cannot use stop_machine() from a workqueue() due
150 	 * lock inversions wrt CPU hotplug.
151 	 *
152 	 * Also, we only ever run this work once or twice during the lifetime
153 	 * of the kernel, so there is no point in creating a more permanent
154 	 * kthread for this.
155 	 *
156 	 * If kthread_run fails the next watchdog scan over the
157 	 * watchdog_list will find the unstable clock again.
158 	 */
159 	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
160 }
161 
__clocksource_unstable(struct clocksource * cs)162 static void __clocksource_unstable(struct clocksource *cs)
163 {
164 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
165 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
166 
167 	/*
168 	 * If the clocksource is registered clocksource_watchdog_kthread() will
169 	 * re-rate and re-select.
170 	 */
171 	if (list_empty(&cs->list)) {
172 		cs->rating = 0;
173 		return;
174 	}
175 
176 	if (cs->mark_unstable)
177 		cs->mark_unstable(cs);
178 
179 	/* kick clocksource_watchdog_kthread() */
180 	if (finished_booting)
181 		schedule_work(&watchdog_work);
182 }
183 
184 /**
185  * clocksource_mark_unstable - mark clocksource unstable via watchdog
186  * @cs:		clocksource to be marked unstable
187  *
188  * This function is called by the x86 TSC code to mark clocksources as unstable;
189  * it defers demotion and re-selection to a kthread.
190  */
clocksource_mark_unstable(struct clocksource * cs)191 void clocksource_mark_unstable(struct clocksource *cs)
192 {
193 	unsigned long flags;
194 
195 	spin_lock_irqsave(&watchdog_lock, flags);
196 	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
197 		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
198 			list_add(&cs->wd_list, &watchdog_list);
199 		__clocksource_unstable(cs);
200 	}
201 	spin_unlock_irqrestore(&watchdog_lock, flags);
202 }
203 
204 ulong max_cswd_read_retries = 3;
205 module_param(max_cswd_read_retries, ulong, 0644);
206 EXPORT_SYMBOL_GPL(max_cswd_read_retries);
207 static int verify_n_cpus = 8;
208 module_param(verify_n_cpus, int, 0644);
209 
210 enum wd_read_status {
211 	WD_READ_SUCCESS,
212 	WD_READ_UNSTABLE,
213 	WD_READ_SKIP
214 };
215 
cs_watchdog_read(struct clocksource * cs,u64 * csnow,u64 * wdnow)216 static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
217 {
218 	unsigned int nretries;
219 	u64 wd_end, wd_end2, wd_delta;
220 	int64_t wd_delay, wd_seq_delay;
221 
222 	for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
223 		local_irq_disable();
224 		*wdnow = watchdog->read(watchdog);
225 		*csnow = cs->read(cs);
226 		wd_end = watchdog->read(watchdog);
227 		wd_end2 = watchdog->read(watchdog);
228 		local_irq_enable();
229 
230 		wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
231 		wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
232 					      watchdog->shift);
233 		if (wd_delay <= WATCHDOG_MAX_SKEW) {
234 			if (nretries > 1 || nretries >= max_cswd_read_retries) {
235 				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
236 					smp_processor_id(), watchdog->name, nretries);
237 			}
238 			return WD_READ_SUCCESS;
239 		}
240 
241 		/*
242 		 * Now compute delay in consecutive watchdog read to see if
243 		 * there is too much external interferences that cause
244 		 * significant delay in reading both clocksource and watchdog.
245 		 *
246 		 * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
247 		 * report system busy, reinit the watchdog and skip the current
248 		 * watchdog test.
249 		 */
250 		wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
251 		wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
252 		if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
253 			goto skip_test;
254 	}
255 
256 	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
257 		smp_processor_id(), watchdog->name, wd_delay, nretries);
258 	return WD_READ_UNSTABLE;
259 
260 skip_test:
261 	pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
262 		smp_processor_id(), watchdog->name, wd_seq_delay);
263 	pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
264 		cs->name, wd_delay);
265 	return WD_READ_SKIP;
266 }
267 
268 static u64 csnow_mid;
269 static cpumask_t cpus_ahead;
270 static cpumask_t cpus_behind;
271 static cpumask_t cpus_chosen;
272 
clocksource_verify_choose_cpus(void)273 static void clocksource_verify_choose_cpus(void)
274 {
275 	int cpu, i, n = verify_n_cpus;
276 
277 	if (n < 0) {
278 		/* Check all of the CPUs. */
279 		cpumask_copy(&cpus_chosen, cpu_online_mask);
280 		cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
281 		return;
282 	}
283 
284 	/* If no checking desired, or no other CPU to check, leave. */
285 	cpumask_clear(&cpus_chosen);
286 	if (n == 0 || num_online_cpus() <= 1)
287 		return;
288 
289 	/* Make sure to select at least one CPU other than the current CPU. */
290 	cpu = cpumask_next(-1, cpu_online_mask);
291 	if (cpu == smp_processor_id())
292 		cpu = cpumask_next(cpu, cpu_online_mask);
293 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
294 		return;
295 	cpumask_set_cpu(cpu, &cpus_chosen);
296 
297 	/* Force a sane value for the boot parameter. */
298 	if (n > nr_cpu_ids)
299 		n = nr_cpu_ids;
300 
301 	/*
302 	 * Randomly select the specified number of CPUs.  If the same
303 	 * CPU is selected multiple times, that CPU is checked only once,
304 	 * and no replacement CPU is selected.  This gracefully handles
305 	 * situations where verify_n_cpus is greater than the number of
306 	 * CPUs that are currently online.
307 	 */
308 	for (i = 1; i < n; i++) {
309 		cpu = prandom_u32() % nr_cpu_ids;
310 		cpu = cpumask_next(cpu - 1, cpu_online_mask);
311 		if (cpu >= nr_cpu_ids)
312 			cpu = cpumask_next(-1, cpu_online_mask);
313 		if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
314 			cpumask_set_cpu(cpu, &cpus_chosen);
315 	}
316 
317 	/* Don't verify ourselves. */
318 	cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
319 }
320 
clocksource_verify_one_cpu(void * csin)321 static void clocksource_verify_one_cpu(void *csin)
322 {
323 	struct clocksource *cs = (struct clocksource *)csin;
324 
325 	csnow_mid = cs->read(cs);
326 }
327 
clocksource_verify_percpu(struct clocksource * cs)328 void clocksource_verify_percpu(struct clocksource *cs)
329 {
330 	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
331 	u64 csnow_begin, csnow_end;
332 	int cpu, testcpu;
333 	s64 delta;
334 
335 	if (verify_n_cpus == 0)
336 		return;
337 	cpumask_clear(&cpus_ahead);
338 	cpumask_clear(&cpus_behind);
339 	cpus_read_lock();
340 	preempt_disable();
341 	clocksource_verify_choose_cpus();
342 	if (cpumask_weight(&cpus_chosen) == 0) {
343 		preempt_enable();
344 		cpus_read_unlock();
345 		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
346 		return;
347 	}
348 	testcpu = smp_processor_id();
349 	pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
350 	for_each_cpu(cpu, &cpus_chosen) {
351 		if (cpu == testcpu)
352 			continue;
353 		csnow_begin = cs->read(cs);
354 		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
355 		csnow_end = cs->read(cs);
356 		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
357 		if (delta < 0)
358 			cpumask_set_cpu(cpu, &cpus_behind);
359 		delta = (csnow_end - csnow_mid) & cs->mask;
360 		if (delta < 0)
361 			cpumask_set_cpu(cpu, &cpus_ahead);
362 		delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
363 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
364 		if (cs_nsec > cs_nsec_max)
365 			cs_nsec_max = cs_nsec;
366 		if (cs_nsec < cs_nsec_min)
367 			cs_nsec_min = cs_nsec;
368 	}
369 	preempt_enable();
370 	cpus_read_unlock();
371 	if (!cpumask_empty(&cpus_ahead))
372 		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
373 			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
374 	if (!cpumask_empty(&cpus_behind))
375 		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
376 			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
377 	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
378 		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
379 			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
380 }
381 EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
382 
clocksource_reset_watchdog(void)383 static inline void clocksource_reset_watchdog(void)
384 {
385 	struct clocksource *cs;
386 
387 	list_for_each_entry(cs, &watchdog_list, wd_list)
388 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
389 }
390 
391 
clocksource_watchdog(struct timer_list * unused)392 static void clocksource_watchdog(struct timer_list *unused)
393 {
394 	u64 csnow, wdnow, cslast, wdlast, delta;
395 	int64_t wd_nsec, cs_nsec, interval;
396 	int next_cpu, reset_pending;
397 	struct clocksource *cs;
398 	enum wd_read_status read_ret;
399 	unsigned long extra_wait = 0;
400 	u32 md;
401 
402 	spin_lock(&watchdog_lock);
403 	if (!watchdog_running)
404 		goto out;
405 
406 	reset_pending = atomic_read(&watchdog_reset_pending);
407 
408 	list_for_each_entry(cs, &watchdog_list, wd_list) {
409 
410 		/* Clocksource already marked unstable? */
411 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
412 			if (finished_booting)
413 				schedule_work(&watchdog_work);
414 			continue;
415 		}
416 
417 		read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
418 
419 		if (read_ret == WD_READ_UNSTABLE) {
420 			/* Clock readout unreliable, so give it up. */
421 			__clocksource_unstable(cs);
422 			continue;
423 		}
424 
425 		/*
426 		 * When WD_READ_SKIP is returned, it means the system is likely
427 		 * under very heavy load, where the latency of reading
428 		 * watchdog/clocksource is very big, and affect the accuracy of
429 		 * watchdog check. So give system some space and suspend the
430 		 * watchdog check for 5 minutes.
431 		 */
432 		if (read_ret == WD_READ_SKIP) {
433 			/*
434 			 * As the watchdog timer will be suspended, and
435 			 * cs->last could keep unchanged for 5 minutes, reset
436 			 * the counters.
437 			 */
438 			clocksource_reset_watchdog();
439 			extra_wait = HZ * 300;
440 			break;
441 		}
442 
443 		/* Clocksource initialized ? */
444 		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
445 		    atomic_read(&watchdog_reset_pending)) {
446 			cs->flags |= CLOCK_SOURCE_WATCHDOG;
447 			cs->wd_last = wdnow;
448 			cs->cs_last = csnow;
449 			continue;
450 		}
451 
452 		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
453 		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
454 					     watchdog->shift);
455 
456 		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
457 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
458 		wdlast = cs->wd_last; /* save these in case we print them */
459 		cslast = cs->cs_last;
460 		cs->cs_last = csnow;
461 		cs->wd_last = wdnow;
462 
463 		if (atomic_read(&watchdog_reset_pending))
464 			continue;
465 
466 		/*
467 		 * The processing of timer softirqs can get delayed (usually
468 		 * on account of ksoftirqd not getting to run in a timely
469 		 * manner), which causes the watchdog interval to stretch.
470 		 * Skew detection may fail for longer watchdog intervals
471 		 * on account of fixed margins being used.
472 		 * Some clocksources, e.g. acpi_pm, cannot tolerate
473 		 * watchdog intervals longer than a few seconds.
474 		 */
475 		interval = max(cs_nsec, wd_nsec);
476 		if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
477 			if (system_state > SYSTEM_SCHEDULING &&
478 			    interval > 2 * watchdog_max_interval) {
479 				watchdog_max_interval = interval;
480 				pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
481 					cs_nsec, wd_nsec);
482 			}
483 			watchdog_timer.expires = jiffies;
484 			continue;
485 		}
486 
487 		/* Check the deviation from the watchdog clocksource. */
488 		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
489 		if (abs(cs_nsec - wd_nsec) > md) {
490 			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
491 				smp_processor_id(), cs->name);
492 			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
493 				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
494 			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
495 				cs->name, cs_nsec, csnow, cslast, cs->mask);
496 			if (curr_clocksource == cs)
497 				pr_warn("                      '%s' is current clocksource.\n", cs->name);
498 			else if (curr_clocksource)
499 				pr_warn("                      '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
500 			else
501 				pr_warn("                      No current clocksource.\n");
502 			__clocksource_unstable(cs);
503 			continue;
504 		}
505 
506 		if (cs == curr_clocksource && cs->tick_stable)
507 			cs->tick_stable(cs);
508 
509 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
510 		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
511 		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
512 			/* Mark it valid for high-res. */
513 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
514 
515 			/*
516 			 * clocksource_done_booting() will sort it if
517 			 * finished_booting is not set yet.
518 			 */
519 			if (!finished_booting)
520 				continue;
521 
522 			/*
523 			 * If this is not the current clocksource let
524 			 * the watchdog thread reselect it. Due to the
525 			 * change to high res this clocksource might
526 			 * be preferred now. If it is the current
527 			 * clocksource let the tick code know about
528 			 * that change.
529 			 */
530 			if (cs != curr_clocksource) {
531 				cs->flags |= CLOCK_SOURCE_RESELECT;
532 				schedule_work(&watchdog_work);
533 			} else {
534 				tick_clock_notify();
535 			}
536 		}
537 	}
538 
539 	/*
540 	 * We only clear the watchdog_reset_pending, when we did a
541 	 * full cycle through all clocksources.
542 	 */
543 	if (reset_pending)
544 		atomic_dec(&watchdog_reset_pending);
545 
546 	/*
547 	 * Cycle through CPUs to check if the CPUs stay synchronized
548 	 * to each other.
549 	 */
550 	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
551 	if (next_cpu >= nr_cpu_ids)
552 		next_cpu = cpumask_first(cpu_online_mask);
553 
554 	/*
555 	 * Arm timer if not already pending: could race with concurrent
556 	 * pair clocksource_stop_watchdog() clocksource_start_watchdog().
557 	 */
558 	if (!timer_pending(&watchdog_timer)) {
559 		watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
560 		add_timer_on(&watchdog_timer, next_cpu);
561 	}
562 out:
563 	spin_unlock(&watchdog_lock);
564 }
565 
clocksource_start_watchdog(void)566 static inline void clocksource_start_watchdog(void)
567 {
568 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
569 		return;
570 	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
571 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
572 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
573 	watchdog_running = 1;
574 }
575 
clocksource_stop_watchdog(void)576 static inline void clocksource_stop_watchdog(void)
577 {
578 	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
579 		return;
580 	del_timer(&watchdog_timer);
581 	watchdog_running = 0;
582 }
583 
clocksource_resume_watchdog(void)584 static void clocksource_resume_watchdog(void)
585 {
586 	atomic_inc(&watchdog_reset_pending);
587 }
588 
clocksource_enqueue_watchdog(struct clocksource * cs)589 static void clocksource_enqueue_watchdog(struct clocksource *cs)
590 {
591 	INIT_LIST_HEAD(&cs->wd_list);
592 
593 	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
594 		/* cs is a clocksource to be watched. */
595 		list_add(&cs->wd_list, &watchdog_list);
596 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
597 	} else {
598 		/* cs is a watchdog. */
599 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
600 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
601 	}
602 }
603 
clocksource_select_watchdog(bool fallback)604 static void clocksource_select_watchdog(bool fallback)
605 {
606 	struct clocksource *cs, *old_wd;
607 	unsigned long flags;
608 
609 	spin_lock_irqsave(&watchdog_lock, flags);
610 	/* save current watchdog */
611 	old_wd = watchdog;
612 	if (fallback)
613 		watchdog = NULL;
614 
615 	list_for_each_entry(cs, &clocksource_list, list) {
616 		/* cs is a clocksource to be watched. */
617 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
618 			continue;
619 
620 		/* Skip current if we were requested for a fallback. */
621 		if (fallback && cs == old_wd)
622 			continue;
623 
624 		/* Pick the best watchdog. */
625 		if (!watchdog || cs->rating > watchdog->rating)
626 			watchdog = cs;
627 	}
628 	/* If we failed to find a fallback restore the old one. */
629 	if (!watchdog)
630 		watchdog = old_wd;
631 
632 	/* If we changed the watchdog we need to reset cycles. */
633 	if (watchdog != old_wd)
634 		clocksource_reset_watchdog();
635 
636 	/* Check if the watchdog timer needs to be started. */
637 	clocksource_start_watchdog();
638 	spin_unlock_irqrestore(&watchdog_lock, flags);
639 }
640 
clocksource_dequeue_watchdog(struct clocksource * cs)641 static void clocksource_dequeue_watchdog(struct clocksource *cs)
642 {
643 	if (cs != watchdog) {
644 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
645 			/* cs is a watched clocksource. */
646 			list_del_init(&cs->wd_list);
647 			/* Check if the watchdog timer needs to be stopped. */
648 			clocksource_stop_watchdog();
649 		}
650 	}
651 }
652 
__clocksource_watchdog_kthread(void)653 static int __clocksource_watchdog_kthread(void)
654 {
655 	struct clocksource *cs, *tmp;
656 	unsigned long flags;
657 	int select = 0;
658 
659 	/* Do any required per-CPU skew verification. */
660 	if (curr_clocksource &&
661 	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
662 	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
663 		clocksource_verify_percpu(curr_clocksource);
664 
665 	spin_lock_irqsave(&watchdog_lock, flags);
666 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
667 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
668 			list_del_init(&cs->wd_list);
669 			__clocksource_change_rating(cs, 0);
670 			select = 1;
671 		}
672 		if (cs->flags & CLOCK_SOURCE_RESELECT) {
673 			cs->flags &= ~CLOCK_SOURCE_RESELECT;
674 			select = 1;
675 		}
676 	}
677 	/* Check if the watchdog timer needs to be stopped. */
678 	clocksource_stop_watchdog();
679 	spin_unlock_irqrestore(&watchdog_lock, flags);
680 
681 	return select;
682 }
683 
clocksource_watchdog_kthread(void * data)684 static int clocksource_watchdog_kthread(void *data)
685 {
686 	mutex_lock(&clocksource_mutex);
687 	if (__clocksource_watchdog_kthread())
688 		clocksource_select();
689 	mutex_unlock(&clocksource_mutex);
690 	return 0;
691 }
692 
clocksource_is_watchdog(struct clocksource * cs)693 static bool clocksource_is_watchdog(struct clocksource *cs)
694 {
695 	return cs == watchdog;
696 }
697 
698 #else /* CONFIG_CLOCKSOURCE_WATCHDOG */
699 
clocksource_enqueue_watchdog(struct clocksource * cs)700 static void clocksource_enqueue_watchdog(struct clocksource *cs)
701 {
702 	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
703 		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
704 }
705 
clocksource_select_watchdog(bool fallback)706 static void clocksource_select_watchdog(bool fallback) { }
clocksource_dequeue_watchdog(struct clocksource * cs)707 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
clocksource_resume_watchdog(void)708 static inline void clocksource_resume_watchdog(void) { }
__clocksource_watchdog_kthread(void)709 static inline int __clocksource_watchdog_kthread(void) { return 0; }
clocksource_is_watchdog(struct clocksource * cs)710 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
clocksource_mark_unstable(struct clocksource * cs)711 void clocksource_mark_unstable(struct clocksource *cs) { }
712 
clocksource_watchdog_lock(unsigned long * flags)713 static inline void clocksource_watchdog_lock(unsigned long *flags) { }
clocksource_watchdog_unlock(unsigned long * flags)714 static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
715 
716 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
717 
clocksource_is_suspend(struct clocksource * cs)718 static bool clocksource_is_suspend(struct clocksource *cs)
719 {
720 	return cs == suspend_clocksource;
721 }
722 
__clocksource_suspend_select(struct clocksource * cs)723 static void __clocksource_suspend_select(struct clocksource *cs)
724 {
725 	/*
726 	 * Skip the clocksource which will be stopped in suspend state.
727 	 */
728 	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
729 		return;
730 
731 	/*
732 	 * The nonstop clocksource can be selected as the suspend clocksource to
733 	 * calculate the suspend time, so it should not supply suspend/resume
734 	 * interfaces to suspend the nonstop clocksource when system suspends.
735 	 */
736 	if (cs->suspend || cs->resume) {
737 		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
738 			cs->name);
739 	}
740 
741 	/* Pick the best rating. */
742 	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
743 		suspend_clocksource = cs;
744 }
745 
746 /**
747  * clocksource_suspend_select - Select the best clocksource for suspend timing
748  * @fallback:	if select a fallback clocksource
749  */
clocksource_suspend_select(bool fallback)750 static void clocksource_suspend_select(bool fallback)
751 {
752 	struct clocksource *cs, *old_suspend;
753 
754 	old_suspend = suspend_clocksource;
755 	if (fallback)
756 		suspend_clocksource = NULL;
757 
758 	list_for_each_entry(cs, &clocksource_list, list) {
759 		/* Skip current if we were requested for a fallback. */
760 		if (fallback && cs == old_suspend)
761 			continue;
762 
763 		__clocksource_suspend_select(cs);
764 	}
765 }
766 
767 /**
768  * clocksource_start_suspend_timing - Start measuring the suspend timing
769  * @cs:			current clocksource from timekeeping
770  * @start_cycles:	current cycles from timekeeping
771  *
772  * This function will save the start cycle values of suspend timer to calculate
773  * the suspend time when resuming system.
774  *
775  * This function is called late in the suspend process from timekeeping_suspend(),
776  * that means processes are frozen, non-boot cpus and interrupts are disabled
777  * now. It is therefore possible to start the suspend timer without taking the
778  * clocksource mutex.
779  */
clocksource_start_suspend_timing(struct clocksource * cs,u64 start_cycles)780 void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
781 {
782 	if (!suspend_clocksource)
783 		return;
784 
785 	/*
786 	 * If current clocksource is the suspend timer, we should use the
787 	 * tkr_mono.cycle_last value as suspend_start to avoid same reading
788 	 * from suspend timer.
789 	 */
790 	if (clocksource_is_suspend(cs)) {
791 		suspend_start = start_cycles;
792 		return;
793 	}
794 
795 	if (suspend_clocksource->enable &&
796 	    suspend_clocksource->enable(suspend_clocksource)) {
797 		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
798 		return;
799 	}
800 
801 	suspend_start = suspend_clocksource->read(suspend_clocksource);
802 }
803 
804 /**
805  * clocksource_stop_suspend_timing - Stop measuring the suspend timing
806  * @cs:		current clocksource from timekeeping
807  * @cycle_now:	current cycles from timekeeping
808  *
809  * This function will calculate the suspend time from suspend timer.
810  *
811  * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
812  *
813  * This function is called early in the resume process from timekeeping_resume(),
814  * that means there is only one cpu, no processes are running and the interrupts
815  * are disabled. It is therefore possible to stop the suspend timer without
816  * taking the clocksource mutex.
817  */
clocksource_stop_suspend_timing(struct clocksource * cs,u64 cycle_now)818 u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
819 {
820 	u64 now, delta, nsec = 0;
821 
822 	if (!suspend_clocksource)
823 		return 0;
824 
825 	/*
826 	 * If current clocksource is the suspend timer, we should use the
827 	 * tkr_mono.cycle_last value from timekeeping as current cycle to
828 	 * avoid same reading from suspend timer.
829 	 */
830 	if (clocksource_is_suspend(cs))
831 		now = cycle_now;
832 	else
833 		now = suspend_clocksource->read(suspend_clocksource);
834 
835 	if (now > suspend_start) {
836 		delta = clocksource_delta(now, suspend_start,
837 					  suspend_clocksource->mask);
838 		nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
839 				       suspend_clocksource->shift);
840 	}
841 
842 	/*
843 	 * Disable the suspend timer to save power if current clocksource is
844 	 * not the suspend timer.
845 	 */
846 	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
847 		suspend_clocksource->disable(suspend_clocksource);
848 
849 	return nsec;
850 }
851 
852 /**
853  * clocksource_suspend - suspend the clocksource(s)
854  */
clocksource_suspend(void)855 void clocksource_suspend(void)
856 {
857 	struct clocksource *cs;
858 
859 	list_for_each_entry_reverse(cs, &clocksource_list, list)
860 		if (cs->suspend)
861 			cs->suspend(cs);
862 }
863 
864 /**
865  * clocksource_resume - resume the clocksource(s)
866  */
clocksource_resume(void)867 void clocksource_resume(void)
868 {
869 	struct clocksource *cs;
870 
871 	list_for_each_entry(cs, &clocksource_list, list)
872 		if (cs->resume)
873 			cs->resume(cs);
874 
875 	clocksource_resume_watchdog();
876 }
877 
878 /**
879  * clocksource_touch_watchdog - Update watchdog
880  *
881  * Update the watchdog after exception contexts such as kgdb so as not
882  * to incorrectly trip the watchdog. This might fail when the kernel
883  * was stopped in code which holds watchdog_lock.
884  */
clocksource_touch_watchdog(void)885 void clocksource_touch_watchdog(void)
886 {
887 	clocksource_resume_watchdog();
888 }
889 
890 /**
891  * clocksource_max_adjustment- Returns max adjustment amount
892  * @cs:         Pointer to clocksource
893  *
894  */
clocksource_max_adjustment(struct clocksource * cs)895 static u32 clocksource_max_adjustment(struct clocksource *cs)
896 {
897 	u64 ret;
898 	/*
899 	 * We won't try to correct for more than 11% adjustments (110,000 ppm),
900 	 */
901 	ret = (u64)cs->mult * 11;
902 	do_div(ret,100);
903 	return (u32)ret;
904 }
905 
906 /**
907  * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
908  * @mult:	cycle to nanosecond multiplier
909  * @shift:	cycle to nanosecond divisor (power of two)
910  * @maxadj:	maximum adjustment value to mult (~11%)
911  * @mask:	bitmask for two's complement subtraction of non 64 bit counters
912  * @max_cyc:	maximum cycle value before potential overflow (does not include
913  *		any safety margin)
914  *
915  * NOTE: This function includes a safety margin of 50%, in other words, we
916  * return half the number of nanoseconds the hardware counter can technically
917  * cover. This is done so that we can potentially detect problems caused by
918  * delayed timers or bad hardware, which might result in time intervals that
919  * are larger than what the math used can handle without overflows.
920  */
clocks_calc_max_nsecs(u32 mult,u32 shift,u32 maxadj,u64 mask,u64 * max_cyc)921 u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
922 {
923 	u64 max_nsecs, max_cycles;
924 
925 	/*
926 	 * Calculate the maximum number of cycles that we can pass to the
927 	 * cyc2ns() function without overflowing a 64-bit result.
928 	 */
929 	max_cycles = ULLONG_MAX;
930 	do_div(max_cycles, mult+maxadj);
931 
932 	/*
933 	 * The actual maximum number of cycles we can defer the clocksource is
934 	 * determined by the minimum of max_cycles and mask.
935 	 * Note: Here we subtract the maxadj to make sure we don't sleep for
936 	 * too long if there's a large negative adjustment.
937 	 */
938 	max_cycles = min(max_cycles, mask);
939 	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
940 
941 	/* return the max_cycles value as well if requested */
942 	if (max_cyc)
943 		*max_cyc = max_cycles;
944 
945 	/* Return 50% of the actual maximum, so we can detect bad values */
946 	max_nsecs >>= 1;
947 
948 	return max_nsecs;
949 }
950 
951 /**
952  * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
953  * @cs:         Pointer to clocksource to be updated
954  *
955  */
clocksource_update_max_deferment(struct clocksource * cs)956 static inline void clocksource_update_max_deferment(struct clocksource *cs)
957 {
958 	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
959 						cs->maxadj, cs->mask,
960 						&cs->max_cycles);
961 }
962 
clocksource_find_best(bool oneshot,bool skipcur)963 static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
964 {
965 	struct clocksource *cs;
966 
967 	if (!finished_booting || list_empty(&clocksource_list))
968 		return NULL;
969 
970 	/*
971 	 * We pick the clocksource with the highest rating. If oneshot
972 	 * mode is active, we pick the highres valid clocksource with
973 	 * the best rating.
974 	 */
975 	list_for_each_entry(cs, &clocksource_list, list) {
976 		if (skipcur && cs == curr_clocksource)
977 			continue;
978 		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
979 			continue;
980 		return cs;
981 	}
982 	return NULL;
983 }
984 
__clocksource_select(bool skipcur)985 static void __clocksource_select(bool skipcur)
986 {
987 	bool oneshot = tick_oneshot_mode_active();
988 	struct clocksource *best, *cs;
989 
990 	/* Find the best suitable clocksource */
991 	best = clocksource_find_best(oneshot, skipcur);
992 	if (!best)
993 		return;
994 
995 	if (!strlen(override_name))
996 		goto found;
997 
998 	/* Check for the override clocksource. */
999 	list_for_each_entry(cs, &clocksource_list, list) {
1000 		if (skipcur && cs == curr_clocksource)
1001 			continue;
1002 		if (strcmp(cs->name, override_name) != 0)
1003 			continue;
1004 		/*
1005 		 * Check to make sure we don't switch to a non-highres
1006 		 * capable clocksource if the tick code is in oneshot
1007 		 * mode (highres or nohz)
1008 		 */
1009 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
1010 			/* Override clocksource cannot be used. */
1011 			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
1012 				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
1013 					cs->name);
1014 				override_name[0] = 0;
1015 			} else {
1016 				/*
1017 				 * The override cannot be currently verified.
1018 				 * Deferring to let the watchdog check.
1019 				 */
1020 				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
1021 					cs->name);
1022 			}
1023 		} else
1024 			/* Override clocksource can be used. */
1025 			best = cs;
1026 		break;
1027 	}
1028 
1029 found:
1030 	if (curr_clocksource != best && !timekeeping_notify(best)) {
1031 		pr_info("Switched to clocksource %s\n", best->name);
1032 		curr_clocksource = best;
1033 	}
1034 }
1035 
1036 /**
1037  * clocksource_select - Select the best clocksource available
1038  *
1039  * Private function. Must hold clocksource_mutex when called.
1040  *
1041  * Select the clocksource with the best rating, or the clocksource,
1042  * which is selected by userspace override.
1043  */
clocksource_select(void)1044 static void clocksource_select(void)
1045 {
1046 	__clocksource_select(false);
1047 }
1048 
clocksource_select_fallback(void)1049 static void clocksource_select_fallback(void)
1050 {
1051 	__clocksource_select(true);
1052 }
1053 
1054 /*
1055  * clocksource_done_booting - Called near the end of core bootup
1056  *
1057  * Hack to avoid lots of clocksource churn at boot time.
1058  * We use fs_initcall because we want this to start before
1059  * device_initcall but after subsys_initcall.
1060  */
clocksource_done_booting(void)1061 static int __init clocksource_done_booting(void)
1062 {
1063 	mutex_lock(&clocksource_mutex);
1064 	curr_clocksource = clocksource_default_clock();
1065 	finished_booting = 1;
1066 	/*
1067 	 * Run the watchdog first to eliminate unstable clock sources
1068 	 */
1069 	__clocksource_watchdog_kthread();
1070 	clocksource_select();
1071 	mutex_unlock(&clocksource_mutex);
1072 	return 0;
1073 }
1074 fs_initcall(clocksource_done_booting);
1075 
1076 /*
1077  * Enqueue the clocksource sorted by rating
1078  */
clocksource_enqueue(struct clocksource * cs)1079 static void clocksource_enqueue(struct clocksource *cs)
1080 {
1081 	struct list_head *entry = &clocksource_list;
1082 	struct clocksource *tmp;
1083 
1084 	list_for_each_entry(tmp, &clocksource_list, list) {
1085 		/* Keep track of the place, where to insert */
1086 		if (tmp->rating < cs->rating)
1087 			break;
1088 		entry = &tmp->list;
1089 	}
1090 	list_add(&cs->list, entry);
1091 }
1092 
1093 /**
1094  * __clocksource_update_freq_scale - Used update clocksource with new freq
1095  * @cs:		clocksource to be registered
1096  * @scale:	Scale factor multiplied against freq to get clocksource hz
1097  * @freq:	clocksource frequency (cycles per second) divided by scale
1098  *
1099  * This should only be called from the clocksource->enable() method.
1100  *
1101  * This *SHOULD NOT* be called directly! Please use the
1102  * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
1103  * functions.
1104  */
__clocksource_update_freq_scale(struct clocksource * cs,u32 scale,u32 freq)1105 void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
1106 {
1107 	u64 sec;
1108 
1109 	/*
1110 	 * Default clocksources are *special* and self-define their mult/shift.
1111 	 * But, you're not special, so you should specify a freq value.
1112 	 */
1113 	if (freq) {
1114 		/*
1115 		 * Calc the maximum number of seconds which we can run before
1116 		 * wrapping around. For clocksources which have a mask > 32-bit
1117 		 * we need to limit the max sleep time to have a good
1118 		 * conversion precision. 10 minutes is still a reasonable
1119 		 * amount. That results in a shift value of 24 for a
1120 		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
1121 		 * ~ 0.06ppm granularity for NTP.
1122 		 */
1123 		sec = cs->mask;
1124 		do_div(sec, freq);
1125 		do_div(sec, scale);
1126 		if (!sec)
1127 			sec = 1;
1128 		else if (sec > 600 && cs->mask > UINT_MAX)
1129 			sec = 600;
1130 
1131 		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
1132 				       NSEC_PER_SEC / scale, sec * scale);
1133 	}
1134 
1135 	/*
1136 	 * If the uncertainty margin is not specified, calculate it.
1137 	 * If both scale and freq are non-zero, calculate the clock
1138 	 * period, but bound below at 2*WATCHDOG_MAX_SKEW.  However,
1139 	 * if either of scale or freq is zero, be very conservative and
1140 	 * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
1141 	 * uncertainty margin.  Allow stupidly small uncertainty margins
1142 	 * to be specified by the caller for testing purposes, but warn
1143 	 * to discourage production use of this capability.
1144 	 */
1145 	if (scale && freq && !cs->uncertainty_margin) {
1146 		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
1147 		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
1148 			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
1149 	} else if (!cs->uncertainty_margin) {
1150 		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
1151 	}
1152 	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
1153 
1154 	/*
1155 	 * Ensure clocksources that have large 'mult' values don't overflow
1156 	 * when adjusted.
1157 	 */
1158 	cs->maxadj = clocksource_max_adjustment(cs);
1159 	while (freq && ((cs->mult + cs->maxadj < cs->mult)
1160 		|| (cs->mult - cs->maxadj > cs->mult))) {
1161 		cs->mult >>= 1;
1162 		cs->shift--;
1163 		cs->maxadj = clocksource_max_adjustment(cs);
1164 	}
1165 
1166 	/*
1167 	 * Only warn for *special* clocksources that self-define
1168 	 * their mult/shift values and don't specify a freq.
1169 	 */
1170 	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
1171 		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
1172 		cs->name);
1173 
1174 	clocksource_update_max_deferment(cs);
1175 
1176 	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
1177 		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
1178 }
1179 EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
1180 
1181 /**
1182  * __clocksource_register_scale - Used to install new clocksources
1183  * @cs:		clocksource to be registered
1184  * @scale:	Scale factor multiplied against freq to get clocksource hz
1185  * @freq:	clocksource frequency (cycles per second) divided by scale
1186  *
1187  * Returns -EBUSY if registration fails, zero otherwise.
1188  *
1189  * This *SHOULD NOT* be called directly! Please use the
1190  * clocksource_register_hz() or clocksource_register_khz helper functions.
1191  */
__clocksource_register_scale(struct clocksource * cs,u32 scale,u32 freq)1192 int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
1193 {
1194 	unsigned long flags;
1195 
1196 	clocksource_arch_init(cs);
1197 
1198 	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
1199 		cs->id = CSID_GENERIC;
1200 	if (cs->vdso_clock_mode < 0 ||
1201 	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
1202 		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
1203 			cs->name, cs->vdso_clock_mode);
1204 		cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
1205 	}
1206 
1207 	/* Initialize mult/shift and max_idle_ns */
1208 	__clocksource_update_freq_scale(cs, scale, freq);
1209 
1210 	/* Add clocksource to the clocksource list */
1211 	mutex_lock(&clocksource_mutex);
1212 
1213 	clocksource_watchdog_lock(&flags);
1214 	clocksource_enqueue(cs);
1215 	clocksource_enqueue_watchdog(cs);
1216 	clocksource_watchdog_unlock(&flags);
1217 
1218 	clocksource_select();
1219 	clocksource_select_watchdog(false);
1220 	__clocksource_suspend_select(cs);
1221 	mutex_unlock(&clocksource_mutex);
1222 	return 0;
1223 }
1224 EXPORT_SYMBOL_GPL(__clocksource_register_scale);
1225 
__clocksource_change_rating(struct clocksource * cs,int rating)1226 static void __clocksource_change_rating(struct clocksource *cs, int rating)
1227 {
1228 	list_del(&cs->list);
1229 	cs->rating = rating;
1230 	clocksource_enqueue(cs);
1231 }
1232 
1233 /**
1234  * clocksource_change_rating - Change the rating of a registered clocksource
1235  * @cs:		clocksource to be changed
1236  * @rating:	new rating
1237  */
clocksource_change_rating(struct clocksource * cs,int rating)1238 void clocksource_change_rating(struct clocksource *cs, int rating)
1239 {
1240 	unsigned long flags;
1241 
1242 	mutex_lock(&clocksource_mutex);
1243 	clocksource_watchdog_lock(&flags);
1244 	__clocksource_change_rating(cs, rating);
1245 	clocksource_watchdog_unlock(&flags);
1246 
1247 	clocksource_select();
1248 	clocksource_select_watchdog(false);
1249 	clocksource_suspend_select(false);
1250 	mutex_unlock(&clocksource_mutex);
1251 }
1252 EXPORT_SYMBOL(clocksource_change_rating);
1253 
1254 /*
1255  * Unbind clocksource @cs. Called with clocksource_mutex held
1256  */
clocksource_unbind(struct clocksource * cs)1257 static int clocksource_unbind(struct clocksource *cs)
1258 {
1259 	unsigned long flags;
1260 
1261 	if (clocksource_is_watchdog(cs)) {
1262 		/* Select and try to install a replacement watchdog. */
1263 		clocksource_select_watchdog(true);
1264 		if (clocksource_is_watchdog(cs))
1265 			return -EBUSY;
1266 	}
1267 
1268 	if (cs == curr_clocksource) {
1269 		/* Select and try to install a replacement clock source */
1270 		clocksource_select_fallback();
1271 		if (curr_clocksource == cs)
1272 			return -EBUSY;
1273 	}
1274 
1275 	if (clocksource_is_suspend(cs)) {
1276 		/*
1277 		 * Select and try to install a replacement suspend clocksource.
1278 		 * If no replacement suspend clocksource, we will just let the
1279 		 * clocksource go and have no suspend clocksource.
1280 		 */
1281 		clocksource_suspend_select(true);
1282 	}
1283 
1284 	clocksource_watchdog_lock(&flags);
1285 	clocksource_dequeue_watchdog(cs);
1286 	list_del_init(&cs->list);
1287 	clocksource_watchdog_unlock(&flags);
1288 
1289 	return 0;
1290 }
1291 
1292 /**
1293  * clocksource_unregister - remove a registered clocksource
1294  * @cs:	clocksource to be unregistered
1295  */
clocksource_unregister(struct clocksource * cs)1296 int clocksource_unregister(struct clocksource *cs)
1297 {
1298 	int ret = 0;
1299 
1300 	mutex_lock(&clocksource_mutex);
1301 	if (!list_empty(&cs->list))
1302 		ret = clocksource_unbind(cs);
1303 	mutex_unlock(&clocksource_mutex);
1304 	return ret;
1305 }
1306 EXPORT_SYMBOL(clocksource_unregister);
1307 
1308 #ifdef CONFIG_SYSFS
1309 /**
1310  * current_clocksource_show - sysfs interface for current clocksource
1311  * @dev:	unused
1312  * @attr:	unused
1313  * @buf:	char buffer to be filled with clocksource list
1314  *
1315  * Provides sysfs interface for listing current clocksource.
1316  */
current_clocksource_show(struct device * dev,struct device_attribute * attr,char * buf)1317 static ssize_t current_clocksource_show(struct device *dev,
1318 					struct device_attribute *attr,
1319 					char *buf)
1320 {
1321 	ssize_t count = 0;
1322 
1323 	mutex_lock(&clocksource_mutex);
1324 	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
1325 	mutex_unlock(&clocksource_mutex);
1326 
1327 	return count;
1328 }
1329 
sysfs_get_uname(const char * buf,char * dst,size_t cnt)1330 ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
1331 {
1332 	size_t ret = cnt;
1333 
1334 	/* strings from sysfs write are not 0 terminated! */
1335 	if (!cnt || cnt >= CS_NAME_LEN)
1336 		return -EINVAL;
1337 
1338 	/* strip of \n: */
1339 	if (buf[cnt-1] == '\n')
1340 		cnt--;
1341 	if (cnt > 0)
1342 		memcpy(dst, buf, cnt);
1343 	dst[cnt] = 0;
1344 	return ret;
1345 }
1346 
1347 /**
1348  * current_clocksource_store - interface for manually overriding clocksource
1349  * @dev:	unused
1350  * @attr:	unused
1351  * @buf:	name of override clocksource
1352  * @count:	length of buffer
1353  *
1354  * Takes input from sysfs interface for manually overriding the default
1355  * clocksource selection.
1356  */
current_clocksource_store(struct device * dev,struct device_attribute * attr,const char * buf,size_t count)1357 static ssize_t current_clocksource_store(struct device *dev,
1358 					 struct device_attribute *attr,
1359 					 const char *buf, size_t count)
1360 {
1361 	ssize_t ret;
1362 
1363 	mutex_lock(&clocksource_mutex);
1364 
1365 	ret = sysfs_get_uname(buf, override_name, count);
1366 	if (ret >= 0)
1367 		clocksource_select();
1368 
1369 	mutex_unlock(&clocksource_mutex);
1370 
1371 	return ret;
1372 }
1373 static DEVICE_ATTR_RW(current_clocksource);
1374 
1375 /**
1376  * unbind_clocksource_store - interface for manually unbinding clocksource
1377  * @dev:	unused
1378  * @attr:	unused
1379  * @buf:	unused
1380  * @count:	length of buffer
1381  *
1382  * Takes input from sysfs interface for manually unbinding a clocksource.
1383  */
unbind_clocksource_store(struct device * dev,struct device_attribute * attr,const char * buf,size_t count)1384 static ssize_t unbind_clocksource_store(struct device *dev,
1385 					struct device_attribute *attr,
1386 					const char *buf, size_t count)
1387 {
1388 	struct clocksource *cs;
1389 	char name[CS_NAME_LEN];
1390 	ssize_t ret;
1391 
1392 	ret = sysfs_get_uname(buf, name, count);
1393 	if (ret < 0)
1394 		return ret;
1395 
1396 	ret = -ENODEV;
1397 	mutex_lock(&clocksource_mutex);
1398 	list_for_each_entry(cs, &clocksource_list, list) {
1399 		if (strcmp(cs->name, name))
1400 			continue;
1401 		ret = clocksource_unbind(cs);
1402 		break;
1403 	}
1404 	mutex_unlock(&clocksource_mutex);
1405 
1406 	return ret ? ret : count;
1407 }
1408 static DEVICE_ATTR_WO(unbind_clocksource);
1409 
1410 /**
1411  * available_clocksource_show - sysfs interface for listing clocksource
1412  * @dev:	unused
1413  * @attr:	unused
1414  * @buf:	char buffer to be filled with clocksource list
1415  *
1416  * Provides sysfs interface for listing registered clocksources
1417  */
available_clocksource_show(struct device * dev,struct device_attribute * attr,char * buf)1418 static ssize_t available_clocksource_show(struct device *dev,
1419 					  struct device_attribute *attr,
1420 					  char *buf)
1421 {
1422 	struct clocksource *src;
1423 	ssize_t count = 0;
1424 
1425 	mutex_lock(&clocksource_mutex);
1426 	list_for_each_entry(src, &clocksource_list, list) {
1427 		/*
1428 		 * Don't show non-HRES clocksource if the tick code is
1429 		 * in one shot mode (highres=on or nohz=on)
1430 		 */
1431 		if (!tick_oneshot_mode_active() ||
1432 		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
1433 			count += snprintf(buf + count,
1434 				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
1435 				  "%s ", src->name);
1436 	}
1437 	mutex_unlock(&clocksource_mutex);
1438 
1439 	count += snprintf(buf + count,
1440 			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
1441 
1442 	return count;
1443 }
1444 static DEVICE_ATTR_RO(available_clocksource);
1445 
1446 static struct attribute *clocksource_attrs[] = {
1447 	&dev_attr_current_clocksource.attr,
1448 	&dev_attr_unbind_clocksource.attr,
1449 	&dev_attr_available_clocksource.attr,
1450 	NULL
1451 };
1452 ATTRIBUTE_GROUPS(clocksource);
1453 
1454 static struct bus_type clocksource_subsys = {
1455 	.name = "clocksource",
1456 	.dev_name = "clocksource",
1457 };
1458 
1459 static struct device device_clocksource = {
1460 	.id	= 0,
1461 	.bus	= &clocksource_subsys,
1462 	.groups	= clocksource_groups,
1463 };
1464 
init_clocksource_sysfs(void)1465 static int __init init_clocksource_sysfs(void)
1466 {
1467 	int error = subsys_system_register(&clocksource_subsys, NULL);
1468 
1469 	if (!error)
1470 		error = device_register(&device_clocksource);
1471 
1472 	return error;
1473 }
1474 
1475 device_initcall(init_clocksource_sysfs);
1476 #endif /* CONFIG_SYSFS */
1477 
1478 /**
1479  * boot_override_clocksource - boot clock override
1480  * @str:	override name
1481  *
1482  * Takes a clocksource= boot argument and uses it
1483  * as the clocksource override name.
1484  */
boot_override_clocksource(char * str)1485 static int __init boot_override_clocksource(char* str)
1486 {
1487 	mutex_lock(&clocksource_mutex);
1488 	if (str)
1489 		strlcpy(override_name, str, sizeof(override_name));
1490 	mutex_unlock(&clocksource_mutex);
1491 	return 1;
1492 }
1493 
1494 __setup("clocksource=", boot_override_clocksource);
1495 
1496 /**
1497  * boot_override_clock - Compatibility layer for deprecated boot option
1498  * @str:	override name
1499  *
1500  * DEPRECATED! Takes a clock= boot argument and uses it
1501  * as the clocksource override name
1502  */
boot_override_clock(char * str)1503 static int __init boot_override_clock(char* str)
1504 {
1505 	if (!strcmp(str, "pmtmr")) {
1506 		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
1507 		return boot_override_clocksource("acpi_pm");
1508 	}
1509 	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
1510 	return boot_override_clocksource(str);
1511 }
1512 
1513 __setup("clock=", boot_override_clock);
1514