1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * This file contains the functions which manage clocksource drivers.
4  *
5  * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
6  */
7 
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/device.h>
11 #include <linux/clocksource.h>
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
15 #include <linux/tick.h>
16 #include <linux/kthread.h>
17 
18 #include "tick-internal.h"
19 #include "timekeeping_internal.h"
20 
21 /**
22  * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
23  * @mult:	pointer to mult variable
24  * @shift:	pointer to shift variable
25  * @from:	frequency to convert from
26  * @to:		frequency to convert to
27  * @maxsec:	guaranteed runtime conversion range in seconds
28  *
29  * The function evaluates the shift/mult pair for the scaled math
30  * operations of clocksources and clockevents.
31  *
32  * @to and @from are frequency values in HZ. For clock sources @to is
33  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
34  * event @to is the counter frequency and @from is NSEC_PER_SEC.
35  *
36  * The @maxsec conversion range argument controls the time frame in
37  * seconds which must be covered by the runtime conversion with the
38  * calculated mult and shift factors. This guarantees that no 64bit
39  * overflow happens when the input value of the conversion is
40  * multiplied with the calculated mult factor. Larger ranges may
41  * reduce the conversion accuracy by choosing smaller mult and shift
42  * factors.
43  */
44 void
45 clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
46 {
47 	u64 tmp;
48 	u32 sft, sftacc = 32;
49 
50 	/*
51 	 * Calculate the shift factor which is limiting the conversion
52 	 * range:
53 	 */
54 	tmp = ((u64)maxsec * from) >> 32;
55 	while (tmp) {
56 		tmp >>= 1;
57 		sftacc--;
58 	}
59 
60 	/*
61 	 * Find the conversion shift/mult pair which has the best
62 	 * accuracy and fits the maxsec conversion range:
63 	 */
64 	for (sft = 32; sft > 0; sft--) {
65 		tmp = (u64) to << sft;
66 		tmp += from / 2;
67 		do_div(tmp, from);
68 		if ((tmp >> sftacc) == 0)
69 			break;
70 	}
71 	*mult = tmp;
72 	*shift = sft;
73 }
74 EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
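/*
 * Worked example (illustrative only, not used by this file): for a
 * hypothetical free-running 10 MHz counter converted to nanoseconds
 * (@from = 10000000, @to = NSEC_PER_SEC) with a 600 second conversion
 * range, the loop above settles on shift = 24 and mult = 100 << 24
 * (1677721600), so one cycle scales to exactly 100 ns:
 *
 *	u32 mult, shift;
 *
 *	clocks_calc_mult_shift(&mult, &shift, 10000000, NSEC_PER_SEC, 600);
 *	ns = ((u64)cycles * mult) >> shift;	yields cycles * 100
 */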
75 
76 /*[Clocksource internal variables]---------
77  * curr_clocksource:
78  *	currently selected clocksource.
79  * suspend_clocksource:
80  *	used to calculate the suspend time.
81  * clocksource_list:
82  *	linked list with the registered clocksources
83  * clocksource_mutex:
84  *	protects manipulations to curr_clocksource and the clocksource_list
85  * override_name:
86  *	Name of the user-specified clocksource.
87  */
88 static struct clocksource *curr_clocksource;
89 static struct clocksource *suspend_clocksource;
90 static LIST_HEAD(clocksource_list);
91 static DEFINE_MUTEX(clocksource_mutex);
92 static char override_name[CS_NAME_LEN];
93 static int finished_booting;
94 static u64 suspend_start;
95 
96 /*
97  * Threshold: 0.0312s, when doubled: 0.0625s.
98  * Also a default for cs->uncertainty_margin when registering clocks.
99  */
100 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
101 
102 /*
103  * Maximum permissible delay between two readouts of the watchdog
104  * clocksource surrounding a read of the clocksource being validated.
105  * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
106  * a lower bound for cs->uncertainty_margin values when registering clocks.
107  */
108 #define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
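/*
 * For reference: WATCHDOG_THRESHOLD evaluates to 1000000000 >> 5 ==
 * 31250000 ns (the ~0.0312s quoted above) and WATCHDOG_MAX_SKEW to
 * 100 * 1000 == 100000 ns, i.e. 100 microseconds.
 */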
109 
110 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
111 static void clocksource_watchdog_work(struct work_struct *work);
112 static void clocksource_select(void);
113 
114 static LIST_HEAD(watchdog_list);
115 static struct clocksource *watchdog;
116 static struct timer_list watchdog_timer;
117 static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
118 static DEFINE_SPINLOCK(watchdog_lock);
119 static int watchdog_running;
120 static atomic_t watchdog_reset_pending;
121 static int64_t watchdog_max_interval;
122 
123 static inline void clocksource_watchdog_lock(unsigned long *flags)
124 {
125 	spin_lock_irqsave(&watchdog_lock, *flags);
126 }
127 
128 static inline void clocksource_watchdog_unlock(unsigned long *flags)
129 {
130 	spin_unlock_irqrestore(&watchdog_lock, *flags);
131 }
132 
133 static int clocksource_watchdog_kthread(void *data);
134 static void __clocksource_change_rating(struct clocksource *cs, int rating);
135 
136 /*
137  * Interval: 0.5sec.
138  */
139 #define WATCHDOG_INTERVAL (HZ >> 1)
140 #define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
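/*
 * For reference: WATCHDOG_INTERVAL is half a second worth of jiffies, and
 * since 2 * (HZ >> 1) == HZ for the usual even HZ values,
 * WATCHDOG_INTERVAL_MAX_NS works out to NSEC_PER_SEC, i.e. one second.
 */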
141 
142 static void clocksource_watchdog_work(struct work_struct *work)
143 {
144 	/*
145 	 * We cannot directly run clocksource_watchdog_kthread() here, because
146 	 * clocksource_select() calls timekeeping_notify() which uses
147 	 * stop_machine(). One cannot use stop_machine() from a workqueue() due
148 	 * stop_machine(). One cannot use stop_machine() from a workqueue() due
149 	 * to lock inversions with respect to CPU hotplug.
150 	 * Also, we only ever run this work once or twice during the lifetime
151 	 * of the kernel, so there is no point in creating a more permanent
152 	 * kthread for this.
153 	 *
154 	 * If kthread_run fails the next watchdog scan over the
155 	 * watchdog_list will find the unstable clock again.
156 	 */
157 	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
158 }
159 
160 static void __clocksource_unstable(struct clocksource *cs)
161 {
162 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
163 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
164 
165 	/*
166 	 * If the clocksource is registered clocksource_watchdog_kthread() will
167 	 * re-rate and re-select.
168 	 */
169 	if (list_empty(&cs->list)) {
170 		cs->rating = 0;
171 		return;
172 	}
173 
174 	if (cs->mark_unstable)
175 		cs->mark_unstable(cs);
176 
177 	/* kick clocksource_watchdog_kthread() */
178 	if (finished_booting)
179 		schedule_work(&watchdog_work);
180 }
181 
182 /**
183  * clocksource_mark_unstable - mark clocksource unstable via watchdog
184  * @cs:		clocksource to be marked unstable
185  *
186  * This function is called by the x86 TSC code to mark clocksources as unstable;
187  * it defers demotion and re-selection to a kthread.
188  */
189 void clocksource_mark_unstable(struct clocksource *cs)
190 {
191 	unsigned long flags;
192 
193 	spin_lock_irqsave(&watchdog_lock, flags);
194 	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
195 		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
196 			list_add(&cs->wd_list, &watchdog_list);
197 		__clocksource_unstable(cs);
198 	}
199 	spin_unlock_irqrestore(&watchdog_lock, flags);
200 }
201 
202 static ulong max_cswd_read_retries = 3;
203 module_param(max_cswd_read_retries, ulong, 0644);
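/*
 * Being a module parameter of this (normally built-in) file, the retry
 * limit is typically adjusted via the clocksource.max_cswd_read_retries=
 * boot option or the matching file under /sys/module/clocksource/parameters/
 * (path assumed from the usual KBUILD_MODNAME-based naming).
 */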
204 
205 enum wd_read_status {
206 	WD_READ_SUCCESS,
207 	WD_READ_UNSTABLE,
208 	WD_READ_SKIP
209 };
210 
211 static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
212 {
213 	unsigned int nretries;
214 	u64 wd_end, wd_end2, wd_delta;
215 	int64_t wd_delay, wd_seq_delay;
216 
217 	for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
218 		local_irq_disable();
219 		*wdnow = watchdog->read(watchdog);
220 		*csnow = cs->read(cs);
221 		wd_end = watchdog->read(watchdog);
222 		wd_end2 = watchdog->read(watchdog);
223 		local_irq_enable();
224 
225 		wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
226 		wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
227 					      watchdog->shift);
228 		if (wd_delay <= WATCHDOG_MAX_SKEW) {
229 			if (nretries > 1 || nretries >= max_cswd_read_retries) {
230 				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
231 					smp_processor_id(), watchdog->name, nretries);
232 			}
233 			return WD_READ_SUCCESS;
234 		}
235 
236 		/*
237 		 * Now compute the delay between the two consecutive watchdog reads
238 		 * to see if there is too much external interference causing a
239 		 * significant delay in reading both the clocksource and the watchdog.
240 		 *
241 		 * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
242 		 * report system busy, reinit the watchdog and skip the current
243 		 * watchdog test.
244 		 */
245 		wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
246 		wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
247 		if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
248 			goto skip_test;
249 	}
250 
251 	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
252 		smp_processor_id(), watchdog->name, wd_delay, nretries);
253 	return WD_READ_UNSTABLE;
254 
255 skip_test:
256 	pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
257 		smp_processor_id(), watchdog->name, wd_seq_delay);
258 	pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
259 		cs->name, wd_delay);
260 	return WD_READ_SKIP;
261 }
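/*
 * Worked example of the limits above (illustrative numbers): for a 10 MHz
 * watchdog using mult = 100 << 24 and shift = 24, the 100 us
 * WATCHDOG_MAX_SKEW budget corresponds to ~1000 watchdog cycles elapsing
 * between the two watchdog reads that bracket the clocksource read; a
 * wd_end/wd_end2 gap above ~50 us (WATCHDOG_MAX_SKEW/2) skips the skew
 * test entirely.
 */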
262 
263 static u64 csnow_mid;
264 static cpumask_t cpus_ahead;
265 static cpumask_t cpus_behind;
266 
267 static void clocksource_verify_one_cpu(void *csin)
268 {
269 	struct clocksource *cs = (struct clocksource *)csin;
270 
271 	csnow_mid = cs->read(cs);
272 }
273 
274 static void clocksource_verify_percpu(struct clocksource *cs)
275 {
276 	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
277 	u64 csnow_begin, csnow_end;
278 	int cpu, testcpu;
279 	s64 delta;
280 
281 	cpumask_clear(&cpus_ahead);
282 	cpumask_clear(&cpus_behind);
283 	preempt_disable();
284 	testcpu = smp_processor_id();
285 	pr_warn("Checking clocksource %s synchronization from CPU %d.\n", cs->name, testcpu);
286 	for_each_online_cpu(cpu) {
287 		if (cpu == testcpu)
288 			continue;
289 		csnow_begin = cs->read(cs);
290 		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
291 		csnow_end = cs->read(cs);
292 		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
293 		if (delta < 0)
294 			cpumask_set_cpu(cpu, &cpus_behind);
295 		delta = (csnow_end - csnow_mid) & cs->mask;
296 		if (delta < 0)
297 			cpumask_set_cpu(cpu, &cpus_ahead);
298 		delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
299 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
300 		if (cs_nsec > cs_nsec_max)
301 			cs_nsec_max = cs_nsec;
302 		if (cs_nsec < cs_nsec_min)
303 			cs_nsec_min = cs_nsec;
304 	}
305 	preempt_enable();
306 	if (!cpumask_empty(&cpus_ahead))
307 		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
308 			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
309 	if (!cpumask_empty(&cpus_behind))
310 		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
311 			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
312 	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
313 		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
314 			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
315 }
316 
317 static inline void clocksource_reset_watchdog(void)
318 {
319 	struct clocksource *cs;
320 
321 	list_for_each_entry(cs, &watchdog_list, wd_list)
322 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
323 }
324 
325 
326 static void clocksource_watchdog(struct timer_list *unused)
327 {
328 	u64 csnow, wdnow, cslast, wdlast, delta;
329 	int64_t wd_nsec, cs_nsec, interval;
330 	int next_cpu, reset_pending;
331 	struct clocksource *cs;
332 	enum wd_read_status read_ret;
333 	unsigned long extra_wait = 0;
334 	u32 md;
335 
336 	spin_lock(&watchdog_lock);
337 	if (!watchdog_running)
338 		goto out;
339 
340 	reset_pending = atomic_read(&watchdog_reset_pending);
341 
342 	list_for_each_entry(cs, &watchdog_list, wd_list) {
343 
344 		/* Clocksource already marked unstable? */
345 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
346 			if (finished_booting)
347 				schedule_work(&watchdog_work);
348 			continue;
349 		}
350 
351 		read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
352 
353 		if (read_ret == WD_READ_UNSTABLE) {
354 			/* Clock readout unreliable, so give it up. */
355 			__clocksource_unstable(cs);
356 			continue;
357 		}
358 
359 		/*
360 		 * When WD_READ_SKIP is returned, it means the system is likely
361 		 * under very heavy load, where the latency of reading the
362 		 * watchdog/clocksource is very large and affects the accuracy of
363 		 * the watchdog check. So give the system some breathing room and
364 		 * suspend the watchdog check for 5 minutes.
365 		 */
366 		if (read_ret == WD_READ_SKIP) {
367 			/*
368 			 * As the watchdog timer will be suspended and
369 			 * cs->last could remain unchanged for 5 minutes, reset
370 			 * the counters.
371 			 */
372 			clocksource_reset_watchdog();
373 			extra_wait = HZ * 300;
374 			break;
375 		}
376 
377 		/* Clocksource initialized ? */
378 		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
379 		    atomic_read(&watchdog_reset_pending)) {
380 			cs->flags |= CLOCK_SOURCE_WATCHDOG;
381 			cs->wd_last = wdnow;
382 			cs->cs_last = csnow;
383 			continue;
384 		}
385 
386 		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
387 		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
388 					     watchdog->shift);
389 
390 		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
391 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
392 		wdlast = cs->wd_last; /* save these in case we print them */
393 		cslast = cs->cs_last;
394 		cs->cs_last = csnow;
395 		cs->wd_last = wdnow;
396 
397 		if (atomic_read(&watchdog_reset_pending))
398 			continue;
399 
400 		/*
401 		 * The processing of timer softirqs can get delayed (usually
402 		 * on account of ksoftirqd not getting to run in a timely
403 		 * manner), which causes the watchdog interval to stretch.
404 		 * Skew detection may fail for longer watchdog intervals
405 		 * on account of fixed margins being used.
406 		 * Some clocksources, e.g. acpi_pm, cannot tolerate
407 		 * watchdog intervals longer than a few seconds.
408 		 */
409 		interval = max(cs_nsec, wd_nsec);
410 		if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
411 			if (system_state > SYSTEM_SCHEDULING &&
412 			    interval > 2 * watchdog_max_interval) {
413 				watchdog_max_interval = interval;
414 				pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
415 					cs_nsec, wd_nsec);
416 			}
417 			watchdog_timer.expires = jiffies;
418 			continue;
419 		}
420 
421 		/* Check the deviation from the watchdog clocksource. */
422 		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
423 		if (abs(cs_nsec - wd_nsec) > md) {
424 			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
425 				smp_processor_id(), cs->name);
426 			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
427 				watchdog->name, wdnow, wdlast, watchdog->mask);
428 			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
429 				cs->name, csnow, cslast, cs->mask);
430 			__clocksource_unstable(cs);
431 			continue;
432 		}
433 
434 		if (cs == curr_clocksource && cs->tick_stable)
435 			cs->tick_stable(cs);
436 
437 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
438 		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
439 		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
440 			/* Mark it valid for high-res. */
441 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
442 
443 			/*
444 			 * clocksource_done_booting() will sort it if
445 			 * finished_booting is not set yet.
446 			 */
447 			if (!finished_booting)
448 				continue;
449 
450 			/*
451 			 * If this is not the current clocksource let
452 			 * the watchdog thread reselect it. Due to the
453 			 * change to high res this clocksource might
454 			 * be preferred now. If it is the current
455 			 * clocksource let the tick code know about
456 			 * that change.
457 			 */
458 			if (cs != curr_clocksource) {
459 				cs->flags |= CLOCK_SOURCE_RESELECT;
460 				schedule_work(&watchdog_work);
461 			} else {
462 				tick_clock_notify();
463 			}
464 		}
465 	}
466 
467 	/*
468 	 * We only clear the watchdog_reset_pending, when we did a
469 	 * full cycle through all clocksources.
470 	 */
471 	if (reset_pending)
472 		atomic_dec(&watchdog_reset_pending);
473 
474 	/*
475 	 * Cycle through CPUs to check if the CPUs stay synchronized
476 	 * to each other.
477 	 */
478 	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
479 	if (next_cpu >= nr_cpu_ids)
480 		next_cpu = cpumask_first(cpu_online_mask);
481 
482 	/*
483 	 * Arm the timer if it is not already pending: this could race with a
484 	 * concurrent clocksource_stop_watchdog()/clocksource_start_watchdog() pair.
485 	 */
486 	if (!timer_pending(&watchdog_timer)) {
487 		watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
488 		add_timer_on(&watchdog_timer, next_cpu);
489 	}
490 out:
491 	spin_unlock(&watchdog_lock);
492 }
493 
494 static inline void clocksource_start_watchdog(void)
495 {
496 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
497 		return;
498 	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
499 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
500 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
501 	watchdog_running = 1;
502 }
503 
504 static inline void clocksource_stop_watchdog(void)
505 {
506 	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
507 		return;
508 	del_timer(&watchdog_timer);
509 	watchdog_running = 0;
510 }
511 
512 static void clocksource_resume_watchdog(void)
513 {
514 	atomic_inc(&watchdog_reset_pending);
515 }
516 
517 static void clocksource_enqueue_watchdog(struct clocksource *cs)
518 {
519 	INIT_LIST_HEAD(&cs->wd_list);
520 
521 	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
522 		/* cs is a clocksource to be watched. */
523 		list_add(&cs->wd_list, &watchdog_list);
524 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
525 	} else {
526 		/* cs is a watchdog. */
527 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
528 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
529 	}
530 }
531 
532 static void clocksource_select_watchdog(bool fallback)
533 {
534 	struct clocksource *cs, *old_wd;
535 	unsigned long flags;
536 
537 	spin_lock_irqsave(&watchdog_lock, flags);
538 	/* save current watchdog */
539 	old_wd = watchdog;
540 	if (fallback)
541 		watchdog = NULL;
542 
543 	list_for_each_entry(cs, &clocksource_list, list) {
544 		/* cs is a clocksource to be watched. */
545 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
546 			continue;
547 
548 		/* Skip current if we were requested for a fallback. */
549 		if (fallback && cs == old_wd)
550 			continue;
551 
552 		/* Pick the best watchdog. */
553 		if (!watchdog || cs->rating > watchdog->rating)
554 			watchdog = cs;
555 	}
556 	/* If we failed to find a fallback restore the old one. */
557 	if (!watchdog)
558 		watchdog = old_wd;
559 
560 	/* If we changed the watchdog we need to reset cycles. */
561 	if (watchdog != old_wd)
562 		clocksource_reset_watchdog();
563 
564 	/* Check if the watchdog timer needs to be started. */
565 	clocksource_start_watchdog();
566 	spin_unlock_irqrestore(&watchdog_lock, flags);
567 }
568 
569 static void clocksource_dequeue_watchdog(struct clocksource *cs)
570 {
571 	if (cs != watchdog) {
572 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
573 			/* cs is a watched clocksource. */
574 			list_del_init(&cs->wd_list);
575 			/* Check if the watchdog timer needs to be stopped. */
576 			clocksource_stop_watchdog();
577 		}
578 	}
579 }
580 
581 static int __clocksource_watchdog_kthread(void)
582 {
583 	struct clocksource *cs, *tmp;
584 	unsigned long flags;
585 	int select = 0;
586 
587 	/* Do any required per-CPU skew verification. */
588 	if (curr_clocksource &&
589 	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
590 	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
591 		clocksource_verify_percpu(curr_clocksource);
592 
593 	spin_lock_irqsave(&watchdog_lock, flags);
594 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
595 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
596 			list_del_init(&cs->wd_list);
597 			__clocksource_change_rating(cs, 0);
598 			select = 1;
599 		}
600 		if (cs->flags & CLOCK_SOURCE_RESELECT) {
601 			cs->flags &= ~CLOCK_SOURCE_RESELECT;
602 			select = 1;
603 		}
604 	}
605 	/* Check if the watchdog timer needs to be stopped. */
606 	clocksource_stop_watchdog();
607 	spin_unlock_irqrestore(&watchdog_lock, flags);
608 
609 	return select;
610 }
611 
612 static int clocksource_watchdog_kthread(void *data)
613 {
614 	mutex_lock(&clocksource_mutex);
615 	if (__clocksource_watchdog_kthread())
616 		clocksource_select();
617 	mutex_unlock(&clocksource_mutex);
618 	return 0;
619 }
620 
621 static bool clocksource_is_watchdog(struct clocksource *cs)
622 {
623 	return cs == watchdog;
624 }
625 
626 #else /* CONFIG_CLOCKSOURCE_WATCHDOG */
627 
628 static void clocksource_enqueue_watchdog(struct clocksource *cs)
629 {
630 	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
631 		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
632 }
633 
634 static void clocksource_select_watchdog(bool fallback) { }
635 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
636 static inline void clocksource_resume_watchdog(void) { }
637 static inline int __clocksource_watchdog_kthread(void) { return 0; }
638 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
639 void clocksource_mark_unstable(struct clocksource *cs) { }
640 
641 static inline void clocksource_watchdog_lock(unsigned long *flags) { }
642 static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
643 
644 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
645 
646 static bool clocksource_is_suspend(struct clocksource *cs)
647 {
648 	return cs == suspend_clocksource;
649 }
650 
651 static void __clocksource_suspend_select(struct clocksource *cs)
652 {
653 	/*
654 	 * Skip the clocksource which will be stopped in suspend state.
655 	 */
656 	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
657 		return;
658 
659 	/*
660 	 * A nonstop clocksource can be selected as the suspend clocksource to
661 	 * calculate the suspend time, so it should not supply suspend/resume
662 	 * interfaces that would stop it when the system suspends.
663 	 */
664 	if (cs->suspend || cs->resume) {
665 		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
666 			cs->name);
667 	}
668 
669 	/* Pick the best rating. */
670 	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
671 		suspend_clocksource = cs;
672 }
673 
674 /**
675  * clocksource_suspend_select - Select the best clocksource for suspend timing
676  * @fallback:	whether to select a fallback clocksource
677  */
678 static void clocksource_suspend_select(bool fallback)
679 {
680 	struct clocksource *cs, *old_suspend;
681 
682 	old_suspend = suspend_clocksource;
683 	if (fallback)
684 		suspend_clocksource = NULL;
685 
686 	list_for_each_entry(cs, &clocksource_list, list) {
687 		/* Skip current if we were requested for a fallback. */
688 		if (fallback && cs == old_suspend)
689 			continue;
690 
691 		__clocksource_suspend_select(cs);
692 	}
693 }
694 
695 /**
696  * clocksource_start_suspend_timing - Start measuring the suspend timing
697  * @cs:			current clocksource from timekeeping
698  * @start_cycles:	current cycles from timekeeping
699  *
700  * This function will save the start cycle values of suspend timer to calculate
701  * the suspend time when resuming system.
702  *
703  * This function is called late in the suspend process from timekeeping_suspend(),
704  * which means processes are frozen, and non-boot CPUs and interrupts are
705  * disabled. It is therefore possible to start the suspend timer without taking the
706  * clocksource mutex.
707  */
708 void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
709 {
710 	if (!suspend_clocksource)
711 		return;
712 
713 	/*
714 	 * If current clocksource is the suspend timer, we should use the
715 	 * tkr_mono.cycle_last value as suspend_start to avoid same reading
716 	 * from suspend timer.
717 	 */
718 	if (clocksource_is_suspend(cs)) {
719 		suspend_start = start_cycles;
720 		return;
721 	}
722 
723 	if (suspend_clocksource->enable &&
724 	    suspend_clocksource->enable(suspend_clocksource)) {
725 		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
726 		return;
727 	}
728 
729 	suspend_start = suspend_clocksource->read(suspend_clocksource);
730 }
731 
732 /**
733  * clocksource_stop_suspend_timing - Stop measuring the suspend timing
734  * @cs:		current clocksource from timekeeping
735  * @cycle_now:	current cycles from timekeeping
736  *
737  * This function will calculate the suspend time from suspend timer.
738  *
739  * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
740  *
741  * This function is called early in the resume process from timekeeping_resume(),
742  * which means there is only one CPU, no processes are running and interrupts
743  * are disabled. It is therefore possible to stop the suspend timer without
744  * taking the clocksource mutex.
745  */
746 u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
747 {
748 	u64 now, delta, nsec = 0;
749 
750 	if (!suspend_clocksource)
751 		return 0;
752 
753 	/*
754 	 * If current clocksource is the suspend timer, we should use the
755 	 * tkr_mono.cycle_last value from timekeeping as current cycle to
756 	 * avoid same reading from suspend timer.
757 	 */
758 	if (clocksource_is_suspend(cs))
759 		now = cycle_now;
760 	else
761 		now = suspend_clocksource->read(suspend_clocksource);
762 
763 	if (now > suspend_start) {
764 		delta = clocksource_delta(now, suspend_start,
765 					  suspend_clocksource->mask);
766 		nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
767 				       suspend_clocksource->shift);
768 	}
769 
770 	/*
771 	 * Disable the suspend timer to save power if current clocksource is
772 	 * not the suspend timer.
773 	 */
774 	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
775 		suspend_clocksource->disable(suspend_clocksource);
776 
777 	return nsec;
778 }
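/*
 * Worked example of the conversion above (illustrative numbers): with the
 * 10 MHz suspend clocksource and the mult = 100 << 24, shift = 24 pair used
 * in earlier comments, a delta of 50000000 cycles accumulated across suspend
 * yields mul_u64_u32_shr(50000000, mult, 24) == 5000000000 ns, i.e. a
 * reported suspend time of 5 seconds.
 */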
779 
780 /**
781  * clocksource_suspend - suspend the clocksource(s)
782  */
783 void clocksource_suspend(void)
784 {
785 	struct clocksource *cs;
786 
787 	list_for_each_entry_reverse(cs, &clocksource_list, list)
788 		if (cs->suspend)
789 			cs->suspend(cs);
790 }
791 
792 /**
793  * clocksource_resume - resume the clocksource(s)
794  */
795 void clocksource_resume(void)
796 {
797 	struct clocksource *cs;
798 
799 	list_for_each_entry(cs, &clocksource_list, list)
800 		if (cs->resume)
801 			cs->resume(cs);
802 
803 	clocksource_resume_watchdog();
804 }
805 
806 /**
807  * clocksource_touch_watchdog - Update watchdog
808  *
809  * Update the watchdog after exception contexts such as kgdb so as not
810  * to incorrectly trip the watchdog. This might fail when the kernel
811  * was stopped in code which holds watchdog_lock.
812  */
813 void clocksource_touch_watchdog(void)
814 {
815 	clocksource_resume_watchdog();
816 }
817 
818 /**
819  * clocksource_max_adjustment - Returns the maximum adjustment amount
820  * @cs:         Pointer to clocksource
821  *
822  */
823 static u32 clocksource_max_adjustment(struct clocksource *cs)
824 {
825 	u64 ret;
826 	/*
827 	 * We won't try to correct for more than 11% adjustments (110,000 ppm).
828 	 */
829 	ret = (u64)cs->mult * 11;
830 	do_div(ret, 100);
831 	return (u32)ret;
832 }
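/* For example, mult = 100 << 24 (1677721600) gives maxadj = 184549376, i.e. 11%. */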
833 
834 /**
835  * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
836  * @mult:	cycle to nanosecond multiplier
837  * @shift:	cycle to nanosecond divisor (power of two)
838  * @maxadj:	maximum adjustment value to mult (~11%)
839  * @mask:	bitmask for two's complement subtraction of non 64 bit counters
840  * @max_cyc:	maximum cycle value before potential overflow (does not include
841  *		any safety margin)
842  *
843  * NOTE: This function includes a safety margin of 50%, in other words, we
844  * return half the number of nanoseconds the hardware counter can technically
845  * cover. This is done so that we can potentially detect problems caused by
846  * delayed timers or bad hardware, which might result in time intervals that
847  * are larger than what the math used can handle without overflows.
848  */
849 u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
850 {
851 	u64 max_nsecs, max_cycles;
852 
853 	/*
854 	 * Calculate the maximum number of cycles that we can pass to the
855 	 * cyc2ns() function without overflowing a 64-bit result.
856 	 */
857 	max_cycles = ULLONG_MAX;
858 	do_div(max_cycles, mult+maxadj);
859 
860 	/*
861 	 * The actual maximum number of cycles we can defer the clocksource is
862 	 * determined by the minimum of max_cycles and mask.
863 	 * Note: Here we subtract the maxadj to make sure we don't sleep for
864 	 * too long if there's a large negative adjustment.
865 	 */
866 	max_cycles = min(max_cycles, mask);
867 	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
868 
869 	/* return the max_cycles value as well if requested */
870 	if (max_cyc)
871 		*max_cyc = max_cycles;
872 
873 	/* Return 50% of the actual maximum, so we can detect bad values */
874 	max_nsecs >>= 1;
875 
876 	return max_nsecs;
877 }
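/*
 * Worked example (illustrative numbers): a 32-bit counter at 10 MHz with
 * mult = 100 << 24, shift = 24 and a maxadj of ~11% of mult wraps after
 * ~429 s. The mask limits max_cycles to 0xffffffff, cyc2ns() with
 * (mult - maxadj) then gives roughly 382 s, and the 50% safety margin makes
 * this function return about 191 s.
 */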
878 
879 /**
880  * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
881  * @cs:         Pointer to clocksource to be updated
882  *
883  */
884 static inline void clocksource_update_max_deferment(struct clocksource *cs)
885 {
886 	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
887 						cs->maxadj, cs->mask,
888 						&cs->max_cycles);
889 }
890 
891 #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
892 
893 static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
894 {
895 	struct clocksource *cs;
896 
897 	if (!finished_booting || list_empty(&clocksource_list))
898 		return NULL;
899 
900 	/*
901 	 * We pick the clocksource with the highest rating. If oneshot
902 	 * mode is active, we pick the highres valid clocksource with
903 	 * the best rating.
904 	 */
905 	list_for_each_entry(cs, &clocksource_list, list) {
906 		if (skipcur && cs == curr_clocksource)
907 			continue;
908 		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
909 			continue;
910 		return cs;
911 	}
912 	return NULL;
913 }
914 
915 static void __clocksource_select(bool skipcur)
916 {
917 	bool oneshot = tick_oneshot_mode_active();
918 	struct clocksource *best, *cs;
919 
920 	/* Find the best suitable clocksource */
921 	best = clocksource_find_best(oneshot, skipcur);
922 	if (!best)
923 		return;
924 
925 	if (!strlen(override_name))
926 		goto found;
927 
928 	/* Check for the override clocksource. */
929 	list_for_each_entry(cs, &clocksource_list, list) {
930 		if (skipcur && cs == curr_clocksource)
931 			continue;
932 		if (strcmp(cs->name, override_name) != 0)
933 			continue;
934 		/*
935 		 * Check to make sure we don't switch to a non-highres
936 		 * capable clocksource if the tick code is in oneshot
937 		 * mode (highres or nohz)
938 		 */
939 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
940 			/* Override clocksource cannot be used. */
941 			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
942 				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
943 					cs->name);
944 				override_name[0] = 0;
945 			} else {
946 				/*
947 				 * The override cannot be currently verified.
948 				 * Deferring to let the watchdog check.
949 				 */
950 				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
951 					cs->name);
952 			}
953 		} else
954 			/* Override clocksource can be used. */
955 			best = cs;
956 		break;
957 	}
958 
959 found:
960 	if (curr_clocksource != best && !timekeeping_notify(best)) {
961 		pr_info("Switched to clocksource %s\n", best->name);
962 		curr_clocksource = best;
963 	}
964 }
965 
966 /**
967  * clocksource_select - Select the best clocksource available
968  *
969  * Private function. Must hold clocksource_mutex when called.
970  *
971  * Select the clocksource with the best rating, or the clocksource,
972  * which is selected by userspace override.
973  */
974 static void clocksource_select(void)
975 {
976 	__clocksource_select(false);
977 }
978 
979 static void clocksource_select_fallback(void)
980 {
981 	__clocksource_select(true);
982 }
983 
984 #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
985 static inline void clocksource_select(void) { }
986 static inline void clocksource_select_fallback(void) { }
987 
988 #endif
989 
990 /*
991  * clocksource_done_booting - Called near the end of core bootup
992  *
993  * Hack to avoid lots of clocksource churn at boot time.
994  * We use fs_initcall because we want this to start before
995  * device_initcall but after subsys_initcall.
996  */
997 static int __init clocksource_done_booting(void)
998 {
999 	mutex_lock(&clocksource_mutex);
1000 	curr_clocksource = clocksource_default_clock();
1001 	finished_booting = 1;
1002 	/*
1003 	 * Run the watchdog first to eliminate unstable clock sources
1004 	 */
1005 	__clocksource_watchdog_kthread();
1006 	clocksource_select();
1007 	mutex_unlock(&clocksource_mutex);
1008 	return 0;
1009 }
1010 fs_initcall(clocksource_done_booting);
1011 
1012 /*
1013  * Enqueue the clocksource sorted by rating
1014  */
1015 static void clocksource_enqueue(struct clocksource *cs)
1016 {
1017 	struct list_head *entry = &clocksource_list;
1018 	struct clocksource *tmp;
1019 
1020 	list_for_each_entry(tmp, &clocksource_list, list) {
1021 		/* Keep track of the place where to insert */
1022 		if (tmp->rating < cs->rating)
1023 			break;
1024 		entry = &tmp->list;
1025 	}
1026 	list_add(&cs->list, entry);
1027 }
1028 
1029 /**
1030  * __clocksource_update_freq_scale - Used to update the clocksource with a new freq
1031  * @cs:		clocksource to be registered
1032  * @scale:	Scale factor multiplied against freq to get clocksource hz
1033  * @freq:	clocksource frequency (cycles per second) divided by scale
1034  *
1035  * This should only be called from the clocksource->enable() method.
1036  *
1037  * This *SHOULD NOT* be called directly! Please use the
1038  * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
1039  * functions.
1040  */
1041 void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
1042 {
1043 	u64 sec;
1044 
1045 	/*
1046 	 * Default clocksources are *special* and self-define their mult/shift.
1047 	 * But, you're not special, so you should specify a freq value.
1048 	 */
1049 	if (freq) {
1050 		/*
1051 		 * Calc the maximum number of seconds which we can run before
1052 		 * wrapping around. For clocksources which have a mask > 32-bit
1053 		 * we need to limit the max sleep time to have a good
1054 		 * conversion precision. 10 minutes is still a reasonable
1055 		 * amount. That results in a shift value of 24 for a
1056 		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
1057 		 * ~ 0.06ppm granularity for NTP.
1058 		 */
1059 		sec = cs->mask;
1060 		do_div(sec, freq);
1061 		do_div(sec, scale);
1062 		if (!sec)
1063 			sec = 1;
1064 		else if (sec > 600 && cs->mask > UINT_MAX)
1065 			sec = 600;
1066 
1067 		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
1068 				       NSEC_PER_SEC / scale, sec * scale);
1069 	}
1070 
1071 	/*
1072 	 * If the uncertainty margin is not specified, calculate it.
1073 	 * If both scale and freq are non-zero, calculate the clock
1074 	 * period, but bound below at 2*WATCHDOG_MAX_SKEW.  However,
1075 	 * if either of scale or freq is zero, be very conservative and
1076 	 * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
1077 	 * uncertainty margin.  Allow stupidly small uncertainty margins
1078 	 * to be specified by the caller for testing purposes, but warn
1079 	 * to discourage production use of this capability.
1080 	 */
1081 	if (scale && freq && !cs->uncertainty_margin) {
1082 		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
1083 		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
1084 			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
1085 	} else if (!cs->uncertainty_margin) {
1086 		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
1087 	}
1088 	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
1089 
1090 	/*
1091 	 * Ensure clocksources that have large 'mult' values don't overflow
1092 	 * when adjusted.
1093 	 */
1094 	cs->maxadj = clocksource_max_adjustment(cs);
1095 	while (freq && ((cs->mult + cs->maxadj < cs->mult)
1096 		|| (cs->mult - cs->maxadj > cs->mult))) {
1097 		cs->mult >>= 1;
1098 		cs->shift--;
1099 		cs->maxadj = clocksource_max_adjustment(cs);
1100 	}
1101 
1102 	/*
1103 	 * Only warn for *special* clocksources that self-define
1104 	 * their mult/shift values and don't specify a freq.
1105 	 */
1106 	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
1107 		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
1108 		cs->name);
1109 
1110 	clocksource_update_max_deferment(cs);
1111 
1112 	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
1113 		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
1114 }
1115 EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
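/*
 * Worked example (illustrative numbers): registering a 32-bit, 10 MHz
 * clocksource with scale = 1 gives sec = 0xffffffff / 10000000 ~= 429,
 * well under the 600 s cap, and a clock period of NSEC_PER_SEC / 10000000
 * = 100 ns, which is then raised to the 2 * WATCHDOG_MAX_SKEW (200 us)
 * lower bound for cs->uncertainty_margin.
 */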
1116 
1117 /**
1118  * __clocksource_register_scale - Used to install new clocksources
1119  * @cs:		clocksource to be registered
1120  * @scale:	Scale factor multiplied against freq to get clocksource hz
1121  * @freq:	clocksource frequency (cycles per second) divided by scale
1122  *
1123  * Returns -EBUSY if registration fails, zero otherwise.
1124  *
1125  * This *SHOULD NOT* be called directly! Please use the
1126  * clocksource_register_hz() or clocksource_register_khz() helper functions.
1127  */
1128 int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
1129 {
1130 	unsigned long flags;
1131 
1132 	clocksource_arch_init(cs);
1133 
1134 	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
1135 		cs->id = CSID_GENERIC;
1136 	if (cs->vdso_clock_mode < 0 ||
1137 	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
1138 		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
1139 			cs->name, cs->vdso_clock_mode);
1140 		cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
1141 	}
1142 
1143 	/* Initialize mult/shift and max_idle_ns */
1144 	__clocksource_update_freq_scale(cs, scale, freq);
1145 
1146 	/* Add clocksource to the clocksource list */
1147 	mutex_lock(&clocksource_mutex);
1148 
1149 	clocksource_watchdog_lock(&flags);
1150 	clocksource_enqueue(cs);
1151 	clocksource_enqueue_watchdog(cs);
1152 	clocksource_watchdog_unlock(&flags);
1153 
1154 	clocksource_select();
1155 	clocksource_select_watchdog(false);
1156 	__clocksource_suspend_select(cs);
1157 	mutex_unlock(&clocksource_mutex);
1158 	return 0;
1159 }
1160 EXPORT_SYMBOL_GPL(__clocksource_register_scale);
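/*
 * Minimal registration sketch (illustrative only; the "foo" names below are
 * hypothetical and not part of this file): a driver with a free-running
 * 10 MHz counter would typically wire it up roughly like this and let
 * clocksource_register_hz() -> __clocksource_register_scale() compute
 * mult/shift, maxadj, the uncertainty margin and max_idle_ns.
 */
#if 0	/* example only */
static void __iomem *foo_timer_base;		/* hypothetical MMIO base */

static u64 foo_cs_read(struct clocksource *cs)
{
	/* hypothetical counter register at offset 0x08 */
	return (u64)readl_relaxed(foo_timer_base + 0x08);
}

static struct clocksource foo_cs = {
	.name	= "foo-timer",
	.rating	= 300,
	.read	= foo_cs_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init foo_timer_init(void)
{
	return clocksource_register_hz(&foo_cs, 10000000);
}
#endif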
1161 
1162 static void __clocksource_change_rating(struct clocksource *cs, int rating)
1163 {
1164 	list_del(&cs->list);
1165 	cs->rating = rating;
1166 	clocksource_enqueue(cs);
1167 }
1168 
1169 /**
1170  * clocksource_change_rating - Change the rating of a registered clocksource
1171  * @cs:		clocksource to be changed
1172  * @rating:	new rating
1173  */
1174 void clocksource_change_rating(struct clocksource *cs, int rating)
1175 {
1176 	unsigned long flags;
1177 
1178 	mutex_lock(&clocksource_mutex);
1179 	clocksource_watchdog_lock(&flags);
1180 	__clocksource_change_rating(cs, rating);
1181 	clocksource_watchdog_unlock(&flags);
1182 
1183 	clocksource_select();
1184 	clocksource_select_watchdog(false);
1185 	clocksource_suspend_select(false);
1186 	mutex_unlock(&clocksource_mutex);
1187 }
1188 EXPORT_SYMBOL(clocksource_change_rating);
1189 
1190 /*
1191  * Unbind clocksource @cs. Called with clocksource_mutex held
1192  */
1193 static int clocksource_unbind(struct clocksource *cs)
1194 {
1195 	unsigned long flags;
1196 
1197 	if (clocksource_is_watchdog(cs)) {
1198 		/* Select and try to install a replacement watchdog. */
1199 		clocksource_select_watchdog(true);
1200 		if (clocksource_is_watchdog(cs))
1201 			return -EBUSY;
1202 	}
1203 
1204 	if (cs == curr_clocksource) {
1205 		/* Select and try to install a replacement clock source */
1206 		clocksource_select_fallback();
1207 		if (curr_clocksource == cs)
1208 			return -EBUSY;
1209 	}
1210 
1211 	if (clocksource_is_suspend(cs)) {
1212 		/*
1213 		 * Select and try to install a replacement suspend clocksource.
1214 		 * If no replacement suspend clocksource, we will just let the
1215 		 * clocksource go and have no suspend clocksource.
1216 		 */
1217 		clocksource_suspend_select(true);
1218 	}
1219 
1220 	clocksource_watchdog_lock(&flags);
1221 	clocksource_dequeue_watchdog(cs);
1222 	list_del_init(&cs->list);
1223 	clocksource_watchdog_unlock(&flags);
1224 
1225 	return 0;
1226 }
1227 
1228 /**
1229  * clocksource_unregister - remove a registered clocksource
1230  * @cs:	clocksource to be unregistered
1231  */
1232 int clocksource_unregister(struct clocksource *cs)
1233 {
1234 	int ret = 0;
1235 
1236 	mutex_lock(&clocksource_mutex);
1237 	if (!list_empty(&cs->list))
1238 		ret = clocksource_unbind(cs);
1239 	mutex_unlock(&clocksource_mutex);
1240 	return ret;
1241 }
1242 EXPORT_SYMBOL(clocksource_unregister);
1243 
1244 #ifdef CONFIG_SYSFS
1245 /**
1246  * current_clocksource_show - sysfs interface for current clocksource
1247  * @dev:	unused
1248  * @attr:	unused
1249  * @buf:	char buffer to be filled with the name of the current clocksource
1250  *
1251  * Provides the sysfs interface for showing the current clocksource.
1252  */
1253 static ssize_t current_clocksource_show(struct device *dev,
1254 					struct device_attribute *attr,
1255 					char *buf)
1256 {
1257 	ssize_t count = 0;
1258 
1259 	mutex_lock(&clocksource_mutex);
1260 	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
1261 	mutex_unlock(&clocksource_mutex);
1262 
1263 	return count;
1264 }
1265 
1266 ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
1267 {
1268 	size_t ret = cnt;
1269 
1270 	/* strings from sysfs write are not 0 terminated! */
1271 	if (!cnt || cnt >= CS_NAME_LEN)
1272 		return -EINVAL;
1273 
1274 	/* strip off the trailing \n: */
1275 	if (buf[cnt-1] == '\n')
1276 		cnt--;
1277 	if (cnt > 0)
1278 		memcpy(dst, buf, cnt);
1279 	dst[cnt] = 0;
1280 	return ret;
1281 }
1282 
1283 /**
1284  * current_clocksource_store - interface for manually overriding clocksource
1285  * @dev:	unused
1286  * @attr:	unused
1287  * @buf:	name of override clocksource
1288  * @count:	length of buffer
1289  *
1290  * Takes input from sysfs interface for manually overriding the default
1291  * clocksource selection.
1292  */
1293 static ssize_t current_clocksource_store(struct device *dev,
1294 					 struct device_attribute *attr,
1295 					 const char *buf, size_t count)
1296 {
1297 	ssize_t ret;
1298 
1299 	mutex_lock(&clocksource_mutex);
1300 
1301 	ret = sysfs_get_uname(buf, override_name, count);
1302 	if (ret >= 0)
1303 		clocksource_select();
1304 
1305 	mutex_unlock(&clocksource_mutex);
1306 
1307 	return ret;
1308 }
1309 static DEVICE_ATTR_RW(current_clocksource);
1310 
1311 /**
1312  * unbind_clocksource_store - interface for manually unbinding clocksource
1313  * @dev:	unused
1314  * @attr:	unused
1315  * @buf:	unused
1316  * @count:	length of buffer
1317  *
1318  * Takes input from sysfs interface for manually unbinding a clocksource.
1319  */
1320 static ssize_t unbind_clocksource_store(struct device *dev,
1321 					struct device_attribute *attr,
1322 					const char *buf, size_t count)
1323 {
1324 	struct clocksource *cs;
1325 	char name[CS_NAME_LEN];
1326 	ssize_t ret;
1327 
1328 	ret = sysfs_get_uname(buf, name, count);
1329 	if (ret < 0)
1330 		return ret;
1331 
1332 	ret = -ENODEV;
1333 	mutex_lock(&clocksource_mutex);
1334 	list_for_each_entry(cs, &clocksource_list, list) {
1335 		if (strcmp(cs->name, name))
1336 			continue;
1337 		ret = clocksource_unbind(cs);
1338 		break;
1339 	}
1340 	mutex_unlock(&clocksource_mutex);
1341 
1342 	return ret ? ret : count;
1343 }
1344 static DEVICE_ATTR_WO(unbind_clocksource);
1345 
1346 /**
1347  * available_clocksource_show - sysfs interface for listing clocksource
1348  * @dev:	unused
1349  * @attr:	unused
1350  * @buf:	char buffer to be filled with clocksource list
1351  *
1352  * Provides sysfs interface for listing registered clocksources
1353  */
1354 static ssize_t available_clocksource_show(struct device *dev,
1355 					  struct device_attribute *attr,
1356 					  char *buf)
1357 {
1358 	struct clocksource *src;
1359 	ssize_t count = 0;
1360 
1361 	mutex_lock(&clocksource_mutex);
1362 	list_for_each_entry(src, &clocksource_list, list) {
1363 		/*
1364 		 * Don't show non-HRES clocksource if the tick code is
1365 		 * in one shot mode (highres=on or nohz=on)
1366 		 */
1367 		if (!tick_oneshot_mode_active() ||
1368 		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
1369 			count += snprintf(buf + count,
1370 				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
1371 				  "%s ", src->name);
1372 	}
1373 	mutex_unlock(&clocksource_mutex);
1374 
1375 	count += snprintf(buf + count,
1376 			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
1377 
1378 	return count;
1379 }
1380 static DEVICE_ATTR_RO(available_clocksource);
1381 
1382 static struct attribute *clocksource_attrs[] = {
1383 	&dev_attr_current_clocksource.attr,
1384 	&dev_attr_unbind_clocksource.attr,
1385 	&dev_attr_available_clocksource.attr,
1386 	NULL
1387 };
1388 ATTRIBUTE_GROUPS(clocksource);
1389 
1390 static struct bus_type clocksource_subsys = {
1391 	.name = "clocksource",
1392 	.dev_name = "clocksource",
1393 };
1394 
1395 static struct device device_clocksource = {
1396 	.id	= 0,
1397 	.bus	= &clocksource_subsys,
1398 	.groups	= clocksource_groups,
1399 };
1400 
1401 static int __init init_clocksource_sysfs(void)
1402 {
1403 	int error = subsys_system_register(&clocksource_subsys, NULL);
1404 
1405 	if (!error)
1406 		error = device_register(&device_clocksource);
1407 
1408 	return error;
1409 }
1410 
1411 device_initcall(init_clocksource_sysfs);
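/*
 * The attributes above appear under
 * /sys/devices/system/clocksource/clocksource0/, so the usual interaction
 * looks like:
 *
 *	cat /sys/devices/system/clocksource/clocksource0/available_clocksource
 *	cat /sys/devices/system/clocksource/clocksource0/current_clocksource
 *	echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource
 *	echo hpet > /sys/devices/system/clocksource/clocksource0/unbind_clocksource
 */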
1412 #endif /* CONFIG_SYSFS */
1413 
1414 /**
1415  * boot_override_clocksource - boot clock override
1416  * @str:	override name
1417  *
1418  * Takes a clocksource= boot argument and uses it
1419  * as the clocksource override name.
1420  */
1421 static int __init boot_override_clocksource(char *str)
1422 {
1423 	mutex_lock(&clocksource_mutex);
1424 	if (str)
1425 		strlcpy(override_name, str, sizeof(override_name));
1426 	mutex_unlock(&clocksource_mutex);
1427 	return 1;
1428 }
1429 
1430 __setup("clocksource=", boot_override_clocksource);
1431 
1432 /**
1433  * boot_override_clock - Compatibility layer for deprecated boot option
1434  * @str:	override name
1435  *
1436  * DEPRECATED! Takes a clock= boot argument and uses it
1437  * as the clocksource override name
1438  */
1439 static int __init boot_override_clock(char *str)
1440 {
1441 	if (!strcmp(str, "pmtmr")) {
1442 		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
1443 		return boot_override_clocksource("acpi_pm");
1444 	}
1445 	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
1446 	return boot_override_clocksource(str);
1447 }
1448 
1449 __setup("clock=", boot_override_clock);
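/*
 * Example kernel command line usage:
 *
 *	clocksource=hpet
 *	clocksource=acpi_pm
 *
 * The deprecated "clock=pmtmr" form is rewritten to "clocksource=acpi_pm"
 * by boot_override_clock() above.
 */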
1450