/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

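/* Tell the hypervisor where to keep this CPU's runstate info updated. */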
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

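/*
 * Convert the runstate deltas accumulated since the last snapshot into
 * stolen and idle ticks, carrying any sub-tick remainder over to the
 * next call.
 */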
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_ticks(ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__get_cpu_var(residual_blocked) = blocked;
	account_idle_ticks(ticks);
}

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, i.e. the nanoseconds the VCPU has spent in the
 * RUNNING and BLOCKED states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}


/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	return pvclock_tsc_khz(info);
}

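/* Read the current system time, in nanoseconds, from this CPU's pvclock area. */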
cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

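/* Current wall time: Xen's boot-time wallclock plus the elapsed system time. */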
static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, a 100Hz
   tick sharing the same event channel is delivered while the vcpu is
   running.  We don't care about or use this tick, but it will cause
   the core time code to think the timer fired too soon, and will end
   up resetting it each time.  It could be filtered, but doing so has
   complications when the ktime clocksource is not yet the xen
   clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to the kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

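/* timer_op has no periodic mode; shutdown simply cancels any pending timeout. */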
static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

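/* Program the hypervisor's single-shot timer for a deadline delta ns from now. */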
static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};


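/* Start/stop the per-vcpu hypervisor timers to match the requested clockevent mode. */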
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

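/* Arm the vcpu single-shot timer; returns -ETIME if the deadline has already passed. */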
static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

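/* Per-cpu VIRQ_TIMER handler: run the clockevent handler and do stolen-time accounting. */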
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

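/* Bind VIRQ_TIMER for a CPU and initialise its per-cpu clockevent and runstate area. */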
void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

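/* Unbind a secondary CPU's timer interrupt; never called for the boot CPU. */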
void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	BUG_ON(cpu == 0);
	evt = &per_cpu(xen_clock_events, cpu);
	unbind_from_irqhandler(evt->irq, NULL);
}

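/* Register the calling CPU's clockevent device (preemption must be disabled). */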
void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

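/*
 * After a save/restore cycle, make sure the hypervisor's periodic tick
 * stays off on every online vcpu (only relevant for the vcpuop interface).
 */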
void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}

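/*
 * Boot-time setup: register the Xen clocksource, switch to the vcpuop
 * clockevent backend when available, seed xtime from the Xen wallclock
 * and set up the boot CPU's timer.
 */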
__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}