// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! Virtio version of a Linux pvclock clocksource.
//!
//! Driver source is here:
//! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
//!
//! # Background
//!
//! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
//! Large jumps can signal problems (e.g., triggering Android watchdogs).
//! This assumption breaks down in virtualized environments, where a VM's suspension isn't
//! inherently linked to the guest kernel's concept of "suspend".
//! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
//! to collaborate on emulating the expected clock behavior around suspend/resume.
//!
//! # How it works
//!
//! ## Core functions of the virtio-pvclock device:
//!
//! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
//!    suspended.
//!    - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
//! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
//!    its clocks accordingly.
//!    - Since the offset between CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
//!      kernel, applying the adjustment is the guest driver's responsibility.
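//!
//! For reference, a pvclock reader computes the clock value roughly as (see
//! `pvclock_clocksource_read` and `pvclock_scale_delta` in the Linux kernel):
//!
//! `clock_ns ~= system_time + ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)`
//!
//! This is why the device below mainly populates the multiplier/shift pair, the flags, the
//! seqlock, and a TSC delta in the shared pvclock page.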
//!
//! ## Expected guest clock behaviors when virtio-pvclock is enabled
//!
//! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
//! - CLOCK_MONOTONIC will not include the time spent while crosvm is suspended (from its run mode
//!   perspective).
//! - CLOCK_BOOTTIME will be adjusted to include the time spent while crosvm is suspended.
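//!
//! For example, if crosvm is suspended for 10 seconds while the guest is running, the guest's
//! CLOCK_MONOTONIC does not advance across that window, while CLOCK_BOOTTIME is advanced by
//! roughly 10 seconds once the guest driver applies the injected suspend duration.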
//!
//! # Why it is needed
//!
//! Because the existing solution does not cover some of the expectations we need.
//!
//! kvm-clock lets the host manage the offsets of CLOCK_MONOTONIC.
//! However, it does not address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC across
//! the host's suspend/resume, as it is mainly designed to keep CLOCK_REALTIME in sync.

#[cfg(target_arch = "aarch64")]
use std::arch::asm;
use std::collections::BTreeMap;
use std::mem::replace;
use std::mem::size_of;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
#[cfg(windows)]
use base::CloseNotifier;
use base::Error;
use base::Event;
use base::EventToken;
use base::RawDescriptor;
use base::ReadNotifier;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use chrono::DateTime;
use chrono::Utc;
use data_model::Le32;
use data_model::Le64;
use serde::Deserialize;
use serde::Serialize;
use snapshot::AnySnapshot;
use vm_control::PvClockCommand;
use vm_control::PvClockCommandResponse;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::FromBytes;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

use super::copy_config;
use super::DeviceType;
use super::Interrupt;
use super::Queue;
use super::VirtioDevice;

// Pvclock has one virtio queue: set_pvclock_page
const QUEUE_SIZE: u16 = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// pvclock flag bits
const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
const PVCLOCK_GUEST_STOPPED: u8 = 2;

// The feature bitmap for virtio pvclock
const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating

// Status values for a virtio_pvclock request.
const VIRTIO_PVCLOCK_S_OK: u8 = 0;
const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;

const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;
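// Note: 450 is presumably chosen to be above the ratings of the common existing guest
// clocksources (kvm-clock and the arm arch timer typically register with a rating of 400), so
// that the guest prefers virtio-pvclock when VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING is negotiated.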

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn read_clock_counter() -> u64 {
    // SAFETY: rdtsc is unprivileged and has no side effects.
    unsafe { std::arch::x86_64::_rdtsc() }
}

#[cfg(target_arch = "aarch64")]
fn read_clock_counter() -> u64 {
    let mut x: u64;
    // SAFETY: This instruction has no side effect apart from storing the current timestamp
    // counter into the specified register.
    unsafe {
        asm!("mrs {x}, cntvct_el0",
            x = out(reg) x,
        );
    }
    x
}

/// Calculate a (multiplier, shift) pair for scaled math of clocks.
/// The values are passed on to `pvclock_scale_delta` in the guest kernel and satisfy the following
/// (approximate) equality:
/// `n * scaled_hz / base_hz ~= ((n << shift) * multiplier) >> 32`
/// The logic here is roughly based on `kvm_get_time_scale` (but simplified as we can use u128).
/// # Arguments
/// * `scaled_hz` - Frequency to convert to. When dealing with clocksources, this is NSEC_PER_SEC.
/// * `base_hz` - Frequency to convert from. When dealing with clocksources, this is the counter
///   frequency.
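///
/// # Example
/// For instance, `freq_scale_shift(1_000_000_000, 2_000_000_000)` (scaling a 2 GHz counter to
/// nanoseconds) returns `(0x8000_0000, 0)`: a 0.32 fixed-point multiplier of 0.5 with no shift,
/// so each counter tick corresponds to 0.5 ns.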
fn freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8) {
    assert!(scaled_hz > 0 && base_hz > 0);
    // We treat `multiplier` as a 0.32 fixed-point number by folding the >> 32 into its
    // definition. With this definition, `multiplier` can be calculated as
    // `(scaled_hz / base_hz) >> shift` with a corresponding `shift`.
    //
    // The value of `shift` should satisfy a few constraints:
    // 1. `multiplier` needs to be < 1.0 due to the representable range of 0.32 fixed-point
    //    (maximum (2^32-1)/2^32).
    // 2. `shift` should be minimized because `pvclock_scale_delta` applies `shift` on the 64-bit
    //    TSC value before extending to 128-bit and large positive shifts reduce the TSC rollover
    //    time.
    //
    // Minimizing `shift` means maximizing `multiplier`. From the < 1.0 constraint, this is
    // equivalent to having a multiplier within [0.5, 1.0). The logic below picks a multiplier
    // satisfying that, while updating `shift` accordingly when we double or halve the multiplier.
    let mut shift = 0;
    // Convert to u128 so that overflow handling becomes much easier.
    let mut scaled_hz = scaled_hz as u128;
    let mut base_hz = base_hz as u128;
    if scaled_hz >= base_hz {
        while scaled_hz >= base_hz {
            // `multiplier` >= 1.0; iteratively scale it down
            // scaled_hz is at most 64 bits, so after this loop base_hz is at most 65 bits.
            base_hz <<= 1;
            shift += 1;
        }
    } else {
        while base_hz > 2 * scaled_hz {
            // `multiplier` < 0.5; iteratively scale it up
            // base_hz is at most 64 bits. If the loop condition passes then scaled_hz is at most
            // 63 bits, otherwise at most 64 bits. Post-loop scaled_hz is at most 64 bits.
            scaled_hz <<= 1;
            shift -= 1;
        }
    }
    // From above, we know that the values are at most 65 bits. This provides sufficient headroom
    // for scaled_hz << 32 below.
    assert!(base_hz < (1u128 << 65) && scaled_hz < (1u128 << 65));
    let mult: u32 = ((scaled_hz << 32) / base_hz)
        .try_into()
        .expect("should not overflow");
    (mult, shift)
}

// The config structure being exposed to the guest to tell them how much suspend time should be
// injected to the guest's CLOCK_BOOTTIME.
#[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_config {
    // Total duration the VM has been paused while the guest kernel is not in the suspended state
    // (from the power management and timekeeping perspective).
    suspend_time_ns: Le64,
    // Device-suggested rating of the pvclock clocksource.
    clocksource_rating: Le32,
    padding: u32,
}

#[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_set_pvclock_page_req {
    // Physical address of pvclock page.
    pvclock_page_pa: Le64,
    // Current system time.
    system_time: Le64,
    // Current tsc value.
    tsc_timestamp: Le64,
    // Status of this request, one of VIRTIO_PVCLOCK_S_*.
    status: u8,
    padding: [u8; 7],
}

// Data structure for interacting with pvclock shared memory.
struct PvclockSharedData {
    mem: GuestMemory,
    seqlock_addr: GuestAddress,
    tsc_suspended_delta_addr: GuestAddress,
    tsc_frequency_multiplier_addr: GuestAddress,
    tsc_frequency_shift_addr: GuestAddress,
    flags_addr: GuestAddress,
}

impl PvclockSharedData {
    pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
        PvclockSharedData {
            mem,
            // The addresses of the various fields that we need to modify are relative to the
            // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
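            // As a reference, the Linux pvclock ABI (struct pvclock_vcpu_time_info) lays the
            // page out roughly as:
            //   offset  0: u32 version (seqlock),  8: u64 tsc_timestamp, 16: u64 system_time,
            //          24: u32 tsc_to_system_mul, 28: i8 tsc_shift,      29: u8 flags
            // which is where the field offsets below come from.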
            seqlock_addr: addr,
            tsc_suspended_delta_addr: addr.unchecked_add(8),
            tsc_frequency_multiplier_addr: addr.unchecked_add(24),
            tsc_frequency_shift_addr: addr.unchecked_add(28),
            flags_addr: addr.unchecked_add(29),
        }
    }

    /// Only the seqlock_addr is needed to re-create this struct at restore
    /// time, so that is all our snapshot contains.
    fn snapshot(&self) -> GuestAddress {
        self.seqlock_addr
    }

    /// Set all fields to zero.
    pub fn zero_fill(&mut self) -> Result<()> {
        // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
        self.mem
            .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
            .context("failed to zero fill the pvclock shared data")
    }

    pub fn increment_seqlock(&mut self) -> Result<()> {
        // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
        // guaranteed to be atomic. Although this should not be a problem for the seqlock
        // or the other fields in the pvclock shared data (which are protected via the seqlock)
        // we might want to update these calls to be as atomic as possible if/when we have
        // the ability to do so, just as a general cleanup and to be consistent.
        let value = self
            .mem
            .read_obj_from_addr::<u32>(self.seqlock_addr)
            .context("failed to read seqlock value")?;
        self.mem
            .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
            .context("failed to write seqlock value")
    }

    pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
        self.mem
            .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
            .context("failed to write tsc suspended delta")
    }

    pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
        let (multiplier, shift): (u32, i8) = freq_scale_shift(1_000_000_000, frequency);

        self.mem
            .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
            .context("failed to write tsc frequency multiplier")?;
        self.mem
            .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
            .context("failed to write tsc frequency shift")
    }

    pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
        let value = self
            .mem
            .read_obj_from_addr::<u8>(self.flags_addr)
            .context("failed to read flags")?;
        self.mem
            .write_obj_at_addr(value | flags, self.flags_addr)
            .context("failed to write flags")
    }
}

/// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
#[derive(Serialize, Deserialize)]
struct PvClockState {
    tsc_frequency: u64,
    /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
    /// will be stored here. (We can't just store the worker itself as it contains an object
    /// tree with references to [GuestMemory].)
    paused_main_worker: Option<PvClockWorkerSnapshot>,
    /// The total time the vm has been suspended. This is in an `Arc<AtomicU64>` because it's set
    /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_suspend_ns: Arc<AtomicU64>,
    features: u64,
    acked_features: u64,
}

/// An enum to keep dynamic state of pvclock workers in a type safe manner.
enum PvClockWorkerState {
    /// Idle means no worker is running.
    /// This tube is for communicating with this device from the crosvm threads.
    Idle(Tube),
    /// A stub worker to respond to pvclock commands when the device is not activated yet.
    Stub(WorkerThread<StubWorkerReturn>),
    /// A main worker to respond to pvclock commands while the device is active.
    Main(WorkerThread<MainWorkerReturn>),
    /// None is used only for handling transitional state between the states above.
    None,
}

/// A struct that represents a virtio-pvclock device.
pub struct PvClock {
    state: PvClockState,
    worker_state: PvClockWorkerState,
}

impl PvClock {
    pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
        let state = PvClockState {
            tsc_frequency,
            paused_main_worker: None,
            total_suspend_ns: Arc::new(AtomicU64::new(0)),
            features: base_features
                | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
                | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
                | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
            acked_features: 0,
        };
        PvClock {
            state,
            worker_state: PvClockWorkerState::Idle(suspend_tube),
        }
    }

    fn get_config(&self) -> virtio_pvclock_config {
        virtio_pvclock_config {
            suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
            clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
            padding: 0,
        }
    }

    /// Use switch_to_*_worker unless needed to keep the state transition consistent
    fn start_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            if queues.len() != QUEUE_SIZES.len() {
                self.worker_state = PvClockWorkerState::Idle(suspend_tube);
                return Err(anyhow!(
                    "expected {} queues, got {}",
                    QUEUE_SIZES.len(),
                    queues.len()
                ));
            }
            let set_pvclock_page_queue = queues.remove(&0).unwrap();
            self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
                "virtio_pvclock".to_string(),
                move |kill_evt| {
                    run_main_worker(
                        pvclock_worker,
                        set_pvclock_page_queue,
                        suspend_tube,
                        interrupt,
                        kill_evt,
                    )
                },
            ));
        } else {
            panic!("Invalid state transition");
        }
        Ok(())
    }

    /// Use switch_to_*_worker unless needed to keep the state transition consistent
    fn start_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            PvClockWorkerState::Stub(WorkerThread::start(
                "virtio_pvclock_stub".to_string(),
                move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
            ))
        } else {
            panic!("Invalid state transition");
        };
    }

    /// Use switch_to_*_worker unless needed to keep the state transition consistent
    fn stop_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
            let stub_worker_ret = stub_worker_thread.stop();
            PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
        } else {
            panic!("Invalid state transition");
        }
    }

    /// Use switch_to_*_worker unless needed to keep the state transition consistent
    fn stop_main_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            let mut queues = BTreeMap::new();
            queues.insert(0, main_worker_ret.set_pvclock_page_queue);
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
        } else {
            panic!("Invalid state transition");
        }
    }

    fn switch_to_stub_worker(&mut self) {
        self.stop_main_worker();
        self.start_stub_worker();
    }

    fn switch_to_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        self.stop_stub_worker();
        self.start_main_worker(interrupt, pvclock_worker, queues)
    }
}

/// Represents a moment in time including the TSC counter value at that time.
#[derive(Serialize, Deserialize, Clone)]
struct PvclockInstant {
    time: DateTime<Utc>,
    tsc_value: u64,
}

/// The unique data retained by [PvClockWorker] which can be used to re-create
/// an identical worker.
#[derive(Serialize, Deserialize, Clone)]
struct PvClockWorkerSnapshot {
    suspend_time: Option<PvclockInstant>,
    total_suspend_tsc_delta: u64,
    pvclock_shared_data_base_address: Option<GuestAddress>,
}

impl From<PvClockWorker> for PvClockWorkerSnapshot {
    fn from(worker: PvClockWorker) -> Self {
        PvClockWorkerSnapshot {
            suspend_time: worker.suspend_time,
            total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
            pvclock_shared_data_base_address: worker
                .pvclock_shared_data
                .map(|pvclock| pvclock.snapshot()),
        }
    }
}

/// Worker struct for the virtio-pvclock device.
///
/// Handles virtio requests, storing information about suspend/resume, adjusting the
/// pvclock data in shared memory, and injecting suspend durations via config
/// changes.
struct PvClockWorker {
    tsc_frequency: u64,
    // The moment the last suspend occurred.
    suspend_time: Option<PvclockInstant>,
    // The total time the vm has been suspended. This is in an Arc<AtomicU64> because it's set
    // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_injected_ns: Arc<AtomicU64>,
    // The total change in the TSC value over suspensions.
    total_suspend_tsc_delta: u64,
    // Pvclock shared data.
    pvclock_shared_data: Option<PvclockSharedData>,
    mem: GuestMemory,
}

impl PvClockWorker {
    pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: None,
            total_injected_ns,
            total_suspend_tsc_delta: 0,
            pvclock_shared_data: None,
            mem,
        }
    }

    fn from_snapshot(
        tsc_frequency: u64,
        total_injected_ns: Arc<AtomicU64>,
        snap: PvClockWorkerSnapshot,
        mem: GuestMemory,
    ) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: snap.suspend_time,
            total_injected_ns,
            total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
            pvclock_shared_data: snap
                .pvclock_shared_data_base_address
                .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
            mem,
        }
    }

    /// Initialize the pvclock for initial boot. We assume that the system time of 0 corresponds
    /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
    /// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
    /// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
    /// fields doesn't matter at this point, but does matter when updating.
    fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
        if self.pvclock_shared_data.is_some() {
            return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
        }

        let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));

        // set all fields to 0 first
        shared_data.zero_fill()?;

        shared_data.set_tsc_frequency(self.tsc_frequency)?;
        shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;

        self.pvclock_shared_data = Some(shared_data);
        Ok(())
    }

    pub fn suspend(&mut self) {
        if self.suspend_time.is_some() {
            warn!("Suspend time already set, ignoring new suspend time");
            return;
        }
        self.suspend_time = Some(PvclockInstant {
            time: Utc::now(),
            tsc_value: read_clock_counter(),
        });
    }

    pub fn resume(&mut self) -> Result<u64> {
        // First, increment the sequence lock by 1 before writing to the pvclock page.
        self.increment_pvclock_seqlock()?;

        // The guest makes sure there are memory barriers in between reads of the seqlock and other
        // fields; we should make sure there are memory barriers in between writes of seqlock and
        // writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Set the guest_stopped_bit and tsc suspended delta in pvclock struct. We only need to set
        // the bit; the guest will unset it once the guest has handled the stoppage.
        // We get the result here because we want to call increment_pvclock_seqlock regardless of
        // the result of these calls.
        let result = self
            .set_guest_stopped_bit()
            .and_then(|_| self.set_suspended_time());

        // The guest makes sure there are memory barriers in between reads of the seqlock and other
        // fields; we should make sure there are memory barriers in between writes of seqlock and
        // writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Do a final increment once changes are done.
        self.increment_pvclock_seqlock()?;

        result
    }

    fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
        match Utc::now().signed_duration_since(suspend_time.time).to_std() {
            Ok(duration) => duration,
            Err(e) => {
                error!(
                    "pvclock found suspend time in the future (was the host \
                    clock adjusted?). Guest boot/realtime clock may now be \
                    incorrect. Details: {}",
                    e
                );
                Duration::ZERO
            }
        }
    }

    fn set_suspended_time(&mut self) -> Result<u64> {
        let (this_suspend_duration, this_suspend_tsc_delta) =
            if let Some(suspend_time) = self.suspend_time.take() {
                (
                    Self::get_suspended_duration(&suspend_time),
                    // NB: This calculation may wrap around, as TSC can be reset to zero when
                    // the device has resumed from the "deep" suspend state (it may not happen for
                    // s2idle cases). It also happens when the tsc value itself wraps.
                    read_clock_counter().wrapping_sub(suspend_time.tsc_value),
                )
            } else {
                return Err(Error::new(libc::ENOTSUP))
                    .context("Cannot set suspend time because suspend was never called");
            };

        // update the total tsc delta during all suspends
        // NB: This calculation may wrap around, as the suspend time can exceed the u64 range.
        self.total_suspend_tsc_delta = self
            .total_suspend_tsc_delta
            .wrapping_add(this_suspend_tsc_delta);

        // save tsc_suspended_delta to shared memory
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;

        info!(
            "set total suspend tsc delta to {}",
            self.total_suspend_tsc_delta
        );

        // update total suspend ns
        self.total_injected_ns
            .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);

        Ok(self.total_suspend_tsc_delta)
    }

    fn increment_pvclock_seqlock(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .increment_seqlock()
    }

    fn set_guest_stopped_bit(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
    }
}

fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
    for cause in error.chain() {
        if let Some(e) = cause.downcast_ref::<base::Error>() {
            return *e;
        }

        if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
            return match e {
                // Two kinds of GuestMemoryError contain base::Error
                GuestMemoryError::MemoryAddSealsFailed(e) => *e,
                GuestMemoryError::MemoryCreationFailed(e) => *e,
                // Otherwise return EINVAL
                _ => Error::new(libc::EINVAL),
            };
        }
    }
    // Unknown base error
    Error::new(libc::EFAULT)
}

struct StubWorkerReturn {
    suspend_tube: Tube,
}

/// A stub worker to respond to any requests when the device is inactive.
fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
    #[derive(EventToken, Debug)]
    enum Token {
        SomePvClockRequest,
        Kill,
    }
    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(wait_ctx) => wait_ctx,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return StubWorkerReturn { suspend_tube };
        }
    };
    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SomePvClockRequest => {
                    match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };
                    if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }
    StubWorkerReturn { suspend_tube }
}

struct MainWorkerReturn {
    worker: PvClockWorker,
    set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
}

// TODO(b/237300012): asyncify this device.
/// A worker to process PvClockCommand requests
fn run_main_worker(
    mut worker: PvClockWorker,
    mut set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
    interrupt: Interrupt,
    kill_evt: Event,
) -> MainWorkerReturn {
    #[derive(EventToken)]
    enum Token {
        SetPvClockPageQueue,
        SuspendResume,
        Kill,
    }

    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
        (suspend_tube.get_read_notifier(), Token::SuspendResume),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(pc) => pc,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    };

    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SetPvClockPageQueue => {
                    let _ = set_pvclock_page_queue.event().wait();
                    let desc_chain = match set_pvclock_page_queue.pop() {
                        Some(desc_chain) => desc_chain,
                        None => {
                            // Spurious doorbells from the driver are permitted
                            // by the virtio spec (v1.3; section 2.9).
                            continue;
                        }
                    };

                    // This device does not follow the virtio spec requirements for device-readable
                    // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
                    // first descriptor from the chain and assume the whole req structure is
                    // contained within it.
                    let desc = desc_chain
                        .reader
                        .get_remaining_regions()
                        .chain(desc_chain.writer.get_remaining_regions())
                        .next()
                        .unwrap();

                    let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
                        error!("pvclock descriptor too short");
                        0
                    } else {
                        let addr = GuestAddress(desc.offset);
                        let mut req: virtio_pvclock_set_pvclock_page_req = match worker
                            .mem
                            .read_obj_from_addr(addr)
                        {
                            Ok(req) => req,
                            Err(e) => {
                                error!("failed to read request from set_pvclock_page queue: {}", e);
                                continue;
                            }
                        };

                        req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
                            Err(e) => {
                                error!("failed to set pvclock page: {:#}", e);
                                VIRTIO_PVCLOCK_S_IOERR
                            }
                            Ok(_) => VIRTIO_PVCLOCK_S_OK,
                        };

                        if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
                            error!("failed to write set_pvclock_page status: {}", e);
                            continue;
                        }

                        desc.len as u32
                    };

                    set_pvclock_page_queue.add_used(desc_chain, len);
                    set_pvclock_page_queue.trigger_interrupt();
                }
                Token::SuspendResume => {
                    let req = match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };

                    let resp = match req {
                        PvClockCommand::Suspend => {
                            worker.suspend();
                            PvClockCommandResponse::Ok
                        }
                        PvClockCommand::Resume => {
                            match worker.resume() {
                                Ok(total_suspended_ticks) => {
                                    // signal to the driver that the total_suspend_ns has changed
                                    interrupt.signal_config_changed();
                                    PvClockCommandResponse::Resumed {
                                        total_suspended_ticks,
                                    }
                                }
                                Err(e) => {
                                    error!("Failed to resume pvclock: {:#}", e);
                                    PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(
                                        e,
                                    ))
                                }
                            }
                        }
                    };

                    if let Err(e) = suspend_tube.send(&resp) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }

    MainWorkerReturn {
        suspend_tube,
        set_pvclock_page_queue,
        worker,
    }
}

impl VirtioDevice for PvClock {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
            vec![suspend_tube.as_raw_descriptor()]
        } else {
            Vec::new()
        }
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pvclock
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.state.features
    }

    fn ack_features(&mut self, mut value: u64) {
        if value & !self.features() != 0 {
            warn!("virtio-pvclock got unknown feature ack {:x}", value);
            value &= self.features();
        }
        self.state.acked_features |= value;
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        copy_config(data, 0, self.get_config().as_bytes(), offset);
    }

    fn write_config(&mut self, offset: u64, data: &[u8]) {
        // Pvclock device doesn't expect a guest write to config
        warn!(
            "Unexpected write to virtio-pvclock config at offset {}: {:?}",
            offset, data
        );
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let tsc_frequency = self.state.tsc_frequency;
        let total_suspend_ns = self.state.total_suspend_ns.clone();
        let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
        self.switch_to_main_worker(interrupt, worker, queues)
    }

    fn reset(&mut self) -> Result<()> {
        self.switch_to_stub_worker();
        Ok(())
    }

    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        match last_state {
            PvClockWorkerState::Main(main_worker_thread) => {
                let main_worker_ret = main_worker_thread.stop();
                let mut queues = BTreeMap::new();
                queues.insert(0, main_worker_ret.set_pvclock_page_queue);
                self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
                self.state.paused_main_worker = Some(main_worker_ret.worker.into());
                Ok(Some(queues))
            }
            PvClockWorkerState::Stub(stub_worker_thread) => {
                let stub_ret = stub_worker_thread.stop();
                self.worker_state = PvClockWorkerState::Idle(stub_ret.suspend_tube);
                Ok(None)
            }
            PvClockWorkerState::Idle(suspend_tube) => {
                self.worker_state = PvClockWorkerState::Idle(suspend_tube);
                Ok(None)
            }
            PvClockWorkerState::None => panic!("invalid state transition"),
        }
    }

    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            let worker_snap = self
                .state
                .paused_main_worker
                .take()
                .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
            let worker = PvClockWorker::from_snapshot(
                self.state.tsc_frequency,
                self.state.total_suspend_ns.clone(),
                worker_snap,
                mem,
            );
            // Use unchecked as no worker is running at this point
            self.start_main_worker(interrupt, worker, queues)?;
        } else {
            // If the device wasn't activated, we should bring up the stub worker since that's
            // what is supposed to be running for an un-activated device.
            self.start_stub_worker();
        }
        Ok(())
    }

    fn virtio_snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        AnySnapshot::to_any(&self.state).context("failed to serialize PvClockState")
    }

    fn virtio_restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let state: PvClockState = AnySnapshot::from_any(data).context("error deserializing")?;
        if state.features != self.features() {
            bail!(
                "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
                self.features(),
                state.features,
            );
        }
        // TODO(b/291346907): we assume that the TSC frequency has NOT changed
        // since the snapshot was made. Assuming we have not moved machines,
        // this is a reasonable assumption. We don't verify the frequency
        // because TSC calibration is noisy.
        self.state = state;
        Ok(())
    }

    fn on_device_sandboxed(&mut self) {
        self.start_stub_worker();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::virtio::QueueConfig;

    const TEST_QUEUE_SIZE: u16 = 2048;

    fn make_interrupt() -> Interrupt {
        Interrupt::new_for_test()
    }

    fn create_pvclock_device() -> (Tube, PvClock) {
        let (host_tube, device_tube) = Tube::pair().unwrap();
        let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);

        // Simulate the device initialization to start the stub thread.
        // In the real case, on_device_sandboxed will be called after the device is sandboxed
        // (or at some point during the device initialization when the sandbox is disabled) to
        // allow devices to use multiple threads (as spawning new threads before sandboxing is
        // prohibited because of the minijail's restriction).
        pvclock_device.on_device_sandboxed();

        (host_tube, pvclock_device)
    }

    fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
        let (_host_tube, mut pvclock_device) = create_pvclock_device();

        // The queue won't actually be used, so passing one that isn't
        // fully configured is fine.
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
        let interrupt = make_interrupt();
        pvclock_device
            .activate(
                mem.clone(),
                interrupt.clone(),
                BTreeMap::from([(
                    0,
                    fake_queue
                        .activate(&mem, Event::new().unwrap(), interrupt)
                        .unwrap(),
                )]),
            )
            .expect("activate should succeed");
        let queues = pvclock_device
            .virtio_sleep()
            .expect("sleep should succeed")
            .expect("sleep should yield queues");
        assert_eq!(queues.len(), 1);
        assert_eq!(
            queues.get(&0).expect("queue must be present").size(),
            TEST_QUEUE_SIZE
        );
        assert!(pvclock_device.state.paused_main_worker.is_some());
        (pvclock_device, mem, _host_tube)
    }

    fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
        // We just create a new queue here, because it isn't actually accessed
        // by the device in these tests.
        let mut wake_queues = BTreeMap::new();
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        let interrupt = make_interrupt();
        fake_queue.set_ready(true);
        wake_queues.insert(
            0,
            fake_queue
                .activate(mem, Event::new().unwrap(), interrupt.clone())
                .unwrap(),
        );
        let queues_state = (mem.clone(), interrupt, wake_queues);
        pvclock_device
            .virtio_wake(Some(queues_state))
            .expect("wake should succeed");
        assert!(pvclock_device.state.paused_main_worker.is_none());
    }

    #[test]
    fn test_command_response_when_inactive() {
        let (host_tube, _pvclock_device) = create_pvclock_device();
        assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
        let res = host_tube.recv::<PvClockCommandResponse>();
        assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
    }

    #[test]
    fn test_sleep_wake_smoke() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        assert_wake_successful(&mut pvclock_device, &mem);
    }

    #[test]
    fn test_save_restore() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        let test_suspend_ns = 9999;

        // Store a test value we can look for later in the test to verify
        // we're restoring properties.
        pvclock_device
            .state
            .total_suspend_ns
            .store(test_suspend_ns, Ordering::SeqCst);

        let snap = pvclock_device.virtio_snapshot().unwrap();
        pvclock_device
            .state
            .total_suspend_ns
            .store(0, Ordering::SeqCst);
        pvclock_device.virtio_restore(snap).unwrap();
        assert_eq!(
            pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
            test_suspend_ns
        );

        assert_wake_successful(&mut pvclock_device, &mem);
    }

    /// A simplified clone of `pvclock_scale_delta` from the Linux kernel to emulate
    /// what the kernel does when converting TSC to ktime.
    fn pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u64 {
        let shifted = if shift < 0 {
            tsc >> -shift
        } else {
            tsc << shift
        };
        let product = shifted as u128 * mult as u128;
        (product >> 32).try_into().expect("should not overflow")
    }

    /// Helper function for checking the behavior of `freq_scale_shift`.
    fn check_freq_scale(f: u64, input: u64) {
        // We only test `scaled_hz` = 1GHz because that is the only value used in the code base.
        let (mult, shift) = freq_scale_shift(1_000_000_000, f);

        let scaled = pvclock_scale_tsc(mult, shift, input);

        // Use relative error <= 1e-8 as the target. TSC can be huge so this isn't really a super
        // accurate target, and our goal is to simply sanity check the math without adding too many
        // requirements about rounding errors.
        let expected: u64 = (input as u128 * 1_000_000_000u128 / f as u128) as u64;
        let expected_lo: u64 = (input as u128 * 999_999_990u128 / f as u128) as u64;
        let expected_hi: u64 = (input as u128 * 1_000_000_010u128 / f as u128) as u64;
        assert!(
            (expected_lo..=expected_hi).contains(&scaled),
            "{scaled} should be close to {expected} (base_hz={f}, mult={mult}, shift={shift})"
        );
    }

    #[test]
    fn test_freq_scale_shift_accuracy() {
        // Basic check for formula correctness: converting `base_hz` worth of ticks (one second of
        // counter time) should yield `scaled_hz` nanoseconds.
        for f in (1..=50).map(|n| n * 100_000_000) {
            check_freq_scale(f, f);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_high_freq() {
        // For scale factors < 1.0, test that we can correctly convert the maximum TSC value
        // without overflow. We must be able to handle values as large as the TSC can
        // realistically be, as the kernel clock breaks if the calculated ktime goes backwards
        // (b/342168920).
        for f in (11..=50).map(|n| n * 100_000_000) {
            check_freq_scale(f, u64::MAX);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_low_freq() {
        fn prev_power_of_two(n: u64) -> u64 {
            assert_ne!(n, 0);
            let highest_bit_set = 63 - n.leading_zeros();
            1 << highest_bit_set
        }
        // Same test as above, but for scale factors >= 1.0. The difference is that for scale
        // factors >= 1.0 we first round up the factor, then apply a multiplier (< 1.0). We reflect
        // this limitation in our tested maximum value.
        for f in (1..=10).map(|n| n * 100_000_000) {
            // Truncate the remainder since prev_power_of_two rounds down anyway.
            let factor = 1_000_000_000 / f;
            // This is like (exp2(floor(log2(factor)) + 1)).
            let target = u64::MAX / (prev_power_of_two(factor) << 1);
            check_freq_scale(f, target);
        }
    }
}