• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! Virtio version of a linux pvclock clocksource.
6 //!
7 //! Driver source is here:
8 //! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
9 //!
10 //! # Background
11 //!
12 //! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
13 //! Large jumps can signal problems (e.g., triggering Android watchdogs).
14 //! This assumption breaks down in virtualized environments, where a VM's suspension isn't
15 //! inherently linked to the guest kernel's concept of "suspend".
16 //! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
17 //! to collaborate on emulating the expected clock behavior around suspend/resume.
18 //!
19 //! # How it works
20 //!
21 //! ## Core functions of virtio-pvclock device:
22 //!
23 //! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
24 //!    suspended.
25 //!   - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
26 //! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
27 //!    its clocks accordingly.
28 //!   - Since the offset between the CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
29 //!     kernel, applying the adjustment is the guest driver's responsibility.
30 //!
31 //! ## Expected guest clock behaviors under virtio-pvclock is enabled
32 //!
33 //! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
34 //! - CLOCK_MONOTONIC will not include the time passed during crosvm is suspended from its run mode
35 //!   perspective.
36 //! - CLOCK_BOOTTIME will be adjusted to include the time passed during crosvm is suspended.
37 //!
38 //! # Why it is needed
39 //!
40 //! Because the existing solution does not cover some expectations we need.
41 //!
42 //! kvm-clock is letting the host to manage the offsets of CLOCK_MONOTONIC.
43 //! However, it doesn't address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC related
44 //! to host's suspend/resume, as it is designed to maintain the CLOCK_REALTIME in sync mainly.
45 
46 #[cfg(target_arch = "aarch64")]
47 use std::arch::asm;
48 use std::collections::BTreeMap;
49 use std::mem::replace;
50 use std::mem::size_of;
51 use std::sync::atomic::AtomicU64;
52 use std::sync::atomic::Ordering;
53 use std::sync::Arc;
54 use std::time::Duration;
55 
56 use anyhow::anyhow;
57 use anyhow::bail;
58 use anyhow::Context;
59 use anyhow::Result;
60 use base::error;
61 use base::info;
62 use base::warn;
63 use base::AsRawDescriptor;
64 #[cfg(windows)]
65 use base::CloseNotifier;
66 use base::Error;
67 use base::Event;
68 use base::EventToken;
69 use base::RawDescriptor;
70 use base::ReadNotifier;
71 use base::Tube;
72 use base::WaitContext;
73 use base::WorkerThread;
74 use chrono::DateTime;
75 use chrono::Utc;
76 use data_model::Le32;
77 use data_model::Le64;
78 use serde::Deserialize;
79 use serde::Serialize;
80 use snapshot::AnySnapshot;
81 use vm_control::PvClockCommand;
82 use vm_control::PvClockCommandResponse;
83 use vm_memory::GuestAddress;
84 use vm_memory::GuestMemory;
85 use vm_memory::GuestMemoryError;
86 use zerocopy::FromBytes;
87 use zerocopy::Immutable;
88 use zerocopy::IntoBytes;
89 use zerocopy::KnownLayout;
90 
91 use super::copy_config;
92 use super::DeviceType;
93 use super::Interrupt;
94 use super::Queue;
95 use super::VirtioDevice;
96 
97 // Pvclock has one virtio queue: set_pvclock_page
98 const QUEUE_SIZE: u16 = 1;
99 const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];
100 
101 // pvclock flag bits
102 const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
103 const PVCLOCK_GUEST_STOPPED: u8 = 2;
104 
105 // The feature bitmap for virtio pvclock
106 const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
107 const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
108 const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating
109 
110 // Status values for a virtio_pvclock request.
111 const VIRTIO_PVCLOCK_S_OK: u8 = 0;
112 const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;
113 
114 const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;
115 
116 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
read_clock_counter() -> u64117 fn read_clock_counter() -> u64 {
118     // SAFETY: rdtsc is unprivileged and have no side effects.
119     unsafe { std::arch::x86_64::_rdtsc() }
120 }
121 
122 #[cfg(target_arch = "aarch64")]
read_clock_counter() -> u64123 fn read_clock_counter() -> u64 {
124     let mut x: u64;
125     // SAFETY: This instruction have no side effect apart from storing the current timestamp counter
126     //         into the specified register.
127     unsafe {
128         asm!("mrs {x}, cntvct_el0",
129             x = out(reg) x,
130         );
131     }
132     x
133 }
134 
135 /// Calculate a (multiplier, shift) pair for scaled math of clocks.
136 /// The values are passed on to `pvclock_scale_delta` in the guest kernel and satisfy the following
137 /// (approximate) equality:
138 /// `n * scaled_hz / base_hz ~= ((n << shift) * multiplier) >> 32`
139 /// The logic here is roughly based on `kvm_get_time_scale` (but simplified as we can use u128).
140 /// # Arguments
141 /// * `scaled_hz` - Frequency to convert to. When dealing with clocksources, this is NSEC_PER_SEC.
142 /// * `base_hz` - Frequency to convert from. When dealing with clocksources, this is the counter
143 ///   frequency.
freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8)144 fn freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8) {
145     assert!(scaled_hz > 0 && base_hz > 0);
146     // We treat `multiplier` as a 0.32 fixed-point number by folding the >> 32 into its definition.
147     // With this definition, `multiplier` can be calculated as `(scaled_hz / base_hz) >> shift`
148     // with a corresponding `shift`.
149     //
150     // The value of `shift` should satisfy a few constraints:
151     // 1. `multiplier` needs to be < 1.0 due to the representable range of 0.32 fixed-point (maximum
152     //    (2^32-1)/2^32).
153     // 2. `shift` should be minimized because `pvclock_scale_delta` applies `shift` on the 64-bit
154     //    TSC value before extending to 128-bit and large positive shifts reduce the TSC rollover
155     //    time.
156     //
157     // Minimizing `shift` means maximizing `multiplier`. From the < 1.0 constraint, this is
158     // equivalent to having a multiplier within [0.5, 1.0). The logic below picks a multiplier
159     // satisfying that, while updating `shift` accordingly when we double or halve the multiplier.
160     let mut shift = 0;
161     // Convert to u128 so that overflow handling becomes much easier.
162     let mut scaled_hz = scaled_hz as u128;
163     let mut base_hz = base_hz as u128;
164     if scaled_hz >= base_hz {
165         while scaled_hz >= base_hz {
166             // `multiplier` >= 1.0; iteratively scale it down
167             // scaled_hz is at most 64 bits, so after this loop base_hz is at most 65 bits.
168             base_hz <<= 1;
169             shift += 1;
170         }
171     } else {
172         while base_hz > 2 * scaled_hz {
173             // `multiplier` < 0.5; iteratively scale it up
174             // base_hz is at most 64 bits. If the loop condition passes then scaled_hz is at most 63
175             // bits, otherwise at most 64 bits. Post-loop scaled_hz is at most 64 bits.
176             scaled_hz <<= 1;
177             shift -= 1;
178         }
179     }
180     // From above, we know that the values are at most 65 bits. This provides sufficient headroom
181     // for scaled_hz << 32 below.
182     assert!(base_hz < (1u128 << 65) && scaled_hz < (1u128 << 65));
183     let mult: u32 = ((scaled_hz << 32) / base_hz)
184         .try_into()
185         .expect("should not overflow");
186     (mult, shift)
187 }
188 
189 // The config structure being exposed to the guest to tell them how much suspend time should be
190 // injected to the guest's CLOCK_BOOTTIME.
191 #[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
192 #[allow(non_camel_case_types)]
193 #[repr(C)]
194 struct virtio_pvclock_config {
195     // Total duration the VM has been paused while the guest kernel is not in the suspended state
196     // (from the power management and timekeeping perspective).
197     suspend_time_ns: Le64,
198     // Device-suggested rating of the pvclock clocksource.
199     clocksource_rating: Le32,
200     padding: u32,
201 }
202 
203 #[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
204 #[allow(non_camel_case_types)]
205 #[repr(C)]
206 struct virtio_pvclock_set_pvclock_page_req {
207     // Physical address of pvclock page.
208     pvclock_page_pa: Le64,
209     // Current system time.
210     system_time: Le64,
211     // Current tsc value.
212     tsc_timestamp: Le64,
213     // Status of this request, one of VIRTIO_PVCLOCK_S_*.
214     status: u8,
215     padding: [u8; 7],
216 }
217 
218 // Data structure for interacting with pvclock shared memory.
219 struct PvclockSharedData {
220     mem: GuestMemory,
221     seqlock_addr: GuestAddress,
222     tsc_suspended_delta_addr: GuestAddress,
223     tsc_frequency_multiplier_addr: GuestAddress,
224     tsc_frequency_shift_addr: GuestAddress,
225     flags_addr: GuestAddress,
226 }
227 
228 impl PvclockSharedData {
new(mem: GuestMemory, addr: GuestAddress) -> Self229     pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
230         PvclockSharedData {
231             mem,
232             // The addresses of the various fields that we need to modify are relative to the
233             // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
234             seqlock_addr: addr,
235             tsc_suspended_delta_addr: addr.unchecked_add(8),
236             tsc_frequency_multiplier_addr: addr.unchecked_add(24),
237             tsc_frequency_shift_addr: addr.unchecked_add(28),
238             flags_addr: addr.unchecked_add(29),
239         }
240     }
241 
242     /// Only the seqlock_addr is needed to re-create this struct at restore
243     /// time, so that is all our snapshot contains.
snapshot(&self) -> GuestAddress244     fn snapshot(&self) -> GuestAddress {
245         self.seqlock_addr
246     }
247 
248     /// Set all fields to zero.
zero_fill(&mut self) -> Result<()>249     pub fn zero_fill(&mut self) -> Result<()> {
250         // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
251         self.mem
252             .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
253             .context("failed to zero fill the pvclock shared data")
254     }
255 
increment_seqlock(&mut self) -> Result<()>256     pub fn increment_seqlock(&mut self) -> Result<()> {
257         // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
258         //  guaranteed to be atomic. Although this should not be a problem for the seqlock
259         //  or the other fields in the pvclock shared data (whch are protected via the seqlock)
260         //  we might want to update these calls to be as atomic as possible if/when we have
261         //  the ability to do so, just as a general cleanup and to be consistent.
262         let value = self
263             .mem
264             .read_obj_from_addr::<u32>(self.seqlock_addr)
265             .context("failed to read seqlock value")?;
266         self.mem
267             .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
268             .context("failed to write seqlock value")
269     }
270 
set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()>271     pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
272         self.mem
273             .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
274             .context("failed to write tsc suspended delta")
275     }
276 
set_tsc_frequency(&mut self, frequency: u64) -> Result<()>277     pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
278         let (multiplier, shift): (u32, i8) = freq_scale_shift(1_000_000_000, frequency);
279 
280         self.mem
281             .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
282             .context("failed to write tsc frequency mlutiplier")?;
283         self.mem
284             .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
285             .context("failed to write tsc frequency shift")
286     }
287 
enable_pvclock_flags(&mut self, flags: u8) -> Result<()>288     pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
289         let value = self
290             .mem
291             .read_obj_from_addr::<u8>(self.flags_addr)
292             .context("failed to read flags")?;
293         self.mem
294             .write_obj_at_addr(value | flags, self.flags_addr)
295             .context("failed to write flags")
296     }
297 }
298 
299 /// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
300 #[derive(Serialize, Deserialize)]
301 struct PvClockState {
302     tsc_frequency: u64,
303     /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
304     /// will be stored here. (We can't just store the worker itself as it contains an object
305     /// tree with references to [GuestMemory].)
306     paused_main_worker: Option<PvClockWorkerSnapshot>,
307     /// The total time the vm has been suspended, this is in an `Arc<AtomicU64>>` because it's set
308     /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
309     total_suspend_ns: Arc<AtomicU64>,
310     features: u64,
311     acked_features: u64,
312 }
313 
314 /// An enum to keep dynamic state of pvclock workers in a type safe manner.
315 enum PvClockWorkerState {
316     /// Idle means no worker is running.
317     /// This tube is for communicating with this device from the crosvm threads.
318     Idle(Tube),
319     /// A stub worker to respond pvclock commands when the device is not activated yet.
320     Stub(WorkerThread<StubWorkerReturn>),
321     /// A main worker to respond pvclock commands while the device is active.
322     Main(WorkerThread<MainWorkerReturn>),
323     /// None is used only for handling transitional state between the states above.
324     None,
325 }
326 
327 /// A struct that represents virtio-pvclock device.
328 pub struct PvClock {
329     state: PvClockState,
330     worker_state: PvClockWorkerState,
331 }
332 
333 impl PvClock {
new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self334     pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
335         let state = PvClockState {
336             tsc_frequency,
337             paused_main_worker: None,
338             total_suspend_ns: Arc::new(AtomicU64::new(0)),
339             features: base_features
340                 | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
341                 | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
342                 | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
343             acked_features: 0,
344         };
345         PvClock {
346             state,
347             worker_state: PvClockWorkerState::Idle(suspend_tube),
348         }
349     }
350 
get_config(&self) -> virtio_pvclock_config351     fn get_config(&self) -> virtio_pvclock_config {
352         virtio_pvclock_config {
353             suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
354             clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
355             padding: 0,
356         }
357     }
358 
359     /// Use switch_to_*_worker unless needed to keep the state transition consistent
start_main_worker( &mut self, interrupt: Interrupt, pvclock_worker: PvClockWorker, mut queues: BTreeMap<usize, Queue>, ) -> anyhow::Result<()>360     fn start_main_worker(
361         &mut self,
362         interrupt: Interrupt,
363         pvclock_worker: PvClockWorker,
364         mut queues: BTreeMap<usize, Queue>,
365     ) -> anyhow::Result<()> {
366         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
367         if let PvClockWorkerState::Idle(suspend_tube) = last_state {
368             if queues.len() != QUEUE_SIZES.len() {
369                 self.worker_state = PvClockWorkerState::Idle(suspend_tube);
370                 return Err(anyhow!(
371                     "expected {} queues, got {}",
372                     QUEUE_SIZES.len(),
373                     queues.len()
374                 ));
375             }
376             let set_pvclock_page_queue = queues.remove(&0).unwrap();
377             self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
378                 "virtio_pvclock".to_string(),
379                 move |kill_evt| {
380                     run_main_worker(
381                         pvclock_worker,
382                         set_pvclock_page_queue,
383                         suspend_tube,
384                         interrupt,
385                         kill_evt,
386                     )
387                 },
388             ));
389         } else {
390             panic!("Invalid state transition");
391         }
392         Ok(())
393     }
394 
395     /// Use switch_to_*_worker unless needed to keep the state transition consistent
start_stub_worker(&mut self)396     fn start_stub_worker(&mut self) {
397         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
398         self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
399             PvClockWorkerState::Stub(WorkerThread::start(
400                 "virtio_pvclock_stub".to_string(),
401                 move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
402             ))
403         } else {
404             panic!("Invalid state transition");
405         };
406     }
407 
408     /// Use switch_to_*_worker unless needed to keep the state transition consistent
stop_stub_worker(&mut self)409     fn stop_stub_worker(&mut self) {
410         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
411         self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
412             let stub_worker_ret = stub_worker_thread.stop();
413             PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
414         } else {
415             panic!("Invalid state transition");
416         }
417     }
418 
419     /// Use switch_to_*_worker unless needed to keep the state transition consistent
stop_main_worker(&mut self)420     fn stop_main_worker(&mut self) {
421         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
422         if let PvClockWorkerState::Main(main_worker_thread) = last_state {
423             let main_worker_ret = main_worker_thread.stop();
424             self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
425             let mut queues = BTreeMap::new();
426             queues.insert(0, main_worker_ret.set_pvclock_page_queue);
427             self.state.paused_main_worker = Some(main_worker_ret.worker.into());
428         } else {
429             panic!("Invalid state transition");
430         }
431     }
432 
switch_to_stub_worker(&mut self)433     fn switch_to_stub_worker(&mut self) {
434         self.stop_main_worker();
435         self.start_stub_worker();
436     }
437 
switch_to_main_worker( &mut self, interrupt: Interrupt, pvclock_worker: PvClockWorker, queues: BTreeMap<usize, Queue>, ) -> anyhow::Result<()>438     fn switch_to_main_worker(
439         &mut self,
440         interrupt: Interrupt,
441         pvclock_worker: PvClockWorker,
442         queues: BTreeMap<usize, Queue>,
443     ) -> anyhow::Result<()> {
444         self.stop_stub_worker();
445         self.start_main_worker(interrupt, pvclock_worker, queues)
446     }
447 }
448 
449 /// Represents a moment in time including the TSC counter value at that time.
450 #[derive(Serialize, Deserialize, Clone)]
451 struct PvclockInstant {
452     time: DateTime<Utc>,
453     tsc_value: u64,
454 }
455 
456 /// The unique data retained by [PvClockWorker] which can be used to re-create
457 /// an identical worker.
458 #[derive(Serialize, Deserialize, Clone)]
459 struct PvClockWorkerSnapshot {
460     suspend_time: Option<PvclockInstant>,
461     total_suspend_tsc_delta: u64,
462     pvclock_shared_data_base_address: Option<GuestAddress>,
463 }
464 
465 impl From<PvClockWorker> for PvClockWorkerSnapshot {
from(worker: PvClockWorker) -> Self466     fn from(worker: PvClockWorker) -> Self {
467         PvClockWorkerSnapshot {
468             suspend_time: worker.suspend_time,
469             total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
470             pvclock_shared_data_base_address: worker
471                 .pvclock_shared_data
472                 .map(|pvclock| pvclock.snapshot()),
473         }
474     }
475 }
476 
477 /// Worker struct for the virtio-pvclock device.
478 ///
479 /// Handles virtio requests, storing information about suspend/resume, adjusting the
480 /// pvclock data in shared memory, and injecting suspend durations via config
481 /// changes.
482 struct PvClockWorker {
483     tsc_frequency: u64,
484     // The moment the last suspend occurred.
485     suspend_time: Option<PvclockInstant>,
486     // The total time the vm has been suspended, this is in an Arc<AtomicU64>> because it's set
487     // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
488     total_injected_ns: Arc<AtomicU64>,
489     // The total change in the TSC value over suspensions.
490     total_suspend_tsc_delta: u64,
491     // Pvclock shared data.
492     pvclock_shared_data: Option<PvclockSharedData>,
493     mem: GuestMemory,
494 }
495 
496 impl PvClockWorker {
new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self497     pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
498         PvClockWorker {
499             tsc_frequency,
500             suspend_time: None,
501             total_injected_ns,
502             total_suspend_tsc_delta: 0,
503             pvclock_shared_data: None,
504             mem,
505         }
506     }
507 
from_snapshot( tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, snap: PvClockWorkerSnapshot, mem: GuestMemory, ) -> Self508     fn from_snapshot(
509         tsc_frequency: u64,
510         total_injected_ns: Arc<AtomicU64>,
511         snap: PvClockWorkerSnapshot,
512         mem: GuestMemory,
513     ) -> Self {
514         PvClockWorker {
515             tsc_frequency,
516             suspend_time: snap.suspend_time,
517             total_injected_ns,
518             total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
519             pvclock_shared_data: snap
520                 .pvclock_shared_data_base_address
521                 .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
522             mem,
523         }
524     }
525 
526     /// Initialize the pvclock for initial boot. We assume that the systemtime of 0 corresponds
527     /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
528     /// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
529     /// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
530     /// fields doesn't matter at this point, but does matter when updating.
set_pvclock_page(&mut self, addr: u64) -> Result<()>531     fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
532         if self.pvclock_shared_data.is_some() {
533             return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
534         }
535 
536         let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));
537 
538         // set all fields to 0 first
539         shared_data.zero_fill()?;
540 
541         shared_data.set_tsc_frequency(self.tsc_frequency)?;
542         shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;
543 
544         self.pvclock_shared_data = Some(shared_data);
545         Ok(())
546     }
547 
suspend(&mut self)548     pub fn suspend(&mut self) {
549         if self.suspend_time.is_some() {
550             warn!("Suspend time already set, ignoring new suspend time");
551             return;
552         }
553         self.suspend_time = Some(PvclockInstant {
554             time: Utc::now(),
555             tsc_value: read_clock_counter(),
556         });
557     }
558 
resume(&mut self) -> Result<u64>559     pub fn resume(&mut self) -> Result<u64> {
560         // First, increment the sequence lock by 1 before writing to the pvclock page.
561         self.increment_pvclock_seqlock()?;
562 
563         // The guest makes sure there are memory barriers in between reads of the seqlock and other
564         // fields, we should make sure there are memory barriers in between writes of seqlock and
565         // writes to other fields.
566         std::sync::atomic::fence(Ordering::SeqCst);
567 
568         // Set the guest_stopped_bit and tsc suspended delta in pvclock struct. We only need to set
569         // the bit, the guest will unset it once the guest has handled the stoppage.
570         // We get the result here because we want to call increment_pvclock_seqlock regardless of
571         // the result of these calls.
572         let result = self
573             .set_guest_stopped_bit()
574             .and_then(|_| self.set_suspended_time());
575 
576         // The guest makes sure there are memory barriers in between reads of the seqlock and other
577         // fields, we should make sure there are memory barriers in between writes of seqlock and
578         // writes to other fields.
579         std::sync::atomic::fence(Ordering::SeqCst);
580 
581         // Do a final increment once changes are done.
582         self.increment_pvclock_seqlock()?;
583 
584         result
585     }
586 
get_suspended_duration(suspend_time: &PvclockInstant) -> Duration587     fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
588         match Utc::now().signed_duration_since(suspend_time.time).to_std() {
589             Ok(duration) => duration,
590             Err(e) => {
591                 error!(
592                     "pvclock found suspend time in the future (was the host \
593                     clock adjusted?). Guest boot/realtime clock may now be \
594                     incorrect. Details: {}",
595                     e
596                 );
597                 Duration::ZERO
598             }
599         }
600     }
601 
set_suspended_time(&mut self) -> Result<u64>602     fn set_suspended_time(&mut self) -> Result<u64> {
603         let (this_suspend_duration, this_suspend_tsc_delta) =
604             if let Some(suspend_time) = self.suspend_time.take() {
605                 (
606                     Self::get_suspended_duration(&suspend_time),
607                     // NB: This calculation may wrap around, as TSC can be reset to zero when
608                     // the device has resumed from the "deep" suspend state (it may not happen for
609                     // s2idle cases). It also happens when the tsc value itself wraps.
610                     read_clock_counter().wrapping_sub(suspend_time.tsc_value),
611                 )
612             } else {
613                 return Err(Error::new(libc::ENOTSUP))
614                     .context("Cannot set suspend time because suspend was never called");
615             };
616 
617         // update the total tsc delta during all suspends
618         // NB: This calculation may wrap around, as the suspend time can be bigger than u64 range.
619         self.total_suspend_tsc_delta = self
620             .total_suspend_tsc_delta
621             .wrapping_add(this_suspend_tsc_delta);
622 
623         // save tsc_suspended_delta to shared memory
624         self.pvclock_shared_data
625             .as_mut()
626             .ok_or(
627                 anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
628             )?
629             .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;
630 
631         info!(
632             "set total suspend tsc delta to {}",
633             self.total_suspend_tsc_delta
634         );
635 
636         // update total suspend ns
637         self.total_injected_ns
638             .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);
639 
640         Ok(self.total_suspend_tsc_delta)
641     }
642 
increment_pvclock_seqlock(&mut self) -> Result<()>643     fn increment_pvclock_seqlock(&mut self) -> Result<()> {
644         self.pvclock_shared_data
645             .as_mut()
646             .ok_or(
647                 anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
648             )?
649             .increment_seqlock()
650     }
651 
set_guest_stopped_bit(&mut self) -> Result<()>652     fn set_guest_stopped_bit(&mut self) -> Result<()> {
653         self.pvclock_shared_data
654             .as_mut()
655             .ok_or(
656                 anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
657             )?
658             .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
659     }
660 }
661 
pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error662 fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
663     for cause in error.chain() {
664         if let Some(e) = cause.downcast_ref::<base::Error>() {
665             return *e;
666         }
667 
668         if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
669             return match e {
670                 // Two kinds of GuestMemoryError contain base::Error
671                 GuestMemoryError::MemoryAddSealsFailed(e) => *e,
672                 GuestMemoryError::MemoryCreationFailed(e) => *e,
673                 // Otherwise return EINVAL
674                 _ => Error::new(libc::EINVAL),
675             };
676         }
677     }
678     // Unknown base error
679     Error::new(libc::EFAULT)
680 }
681 
682 struct StubWorkerReturn {
683     suspend_tube: Tube,
684 }
685 
686 /// A stub worker to respond any requests when the device is inactive.
run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn687 fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
688     #[derive(EventToken, Debug)]
689     enum Token {
690         SomePvClockRequest,
691         Kill,
692     }
693     let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
694         (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
695         // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
696         // implemented for Tube.
697         #[cfg(windows)]
698         (suspend_tube.get_close_notifier(), Token::Kill),
699         (&kill_evt, Token::Kill),
700     ]) {
701         Ok(wait_ctx) => wait_ctx,
702         Err(e) => {
703             error!("failed creating WaitContext: {}", e);
704             return StubWorkerReturn { suspend_tube };
705         }
706     };
707     'wait: loop {
708         let events = match wait_ctx.wait() {
709             Ok(v) => v,
710             Err(e) => {
711                 error!("failed polling for events: {}", e);
712                 break;
713             }
714         };
715         for event in events.iter().filter(|e| e.is_readable) {
716             match event.token {
717                 Token::SomePvClockRequest => {
718                     match suspend_tube.recv::<PvClockCommand>() {
719                         Ok(req) => req,
720                         Err(e) => {
721                             error!("failed to receive request: {}", e);
722                             continue;
723                         }
724                     };
725                     if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
726                         error!("error sending PvClockCommandResponse: {}", e);
727                     }
728                 }
729                 Token::Kill => {
730                     break 'wait;
731                 }
732             }
733         }
734     }
735     StubWorkerReturn { suspend_tube }
736 }
737 
738 struct MainWorkerReturn {
739     worker: PvClockWorker,
740     set_pvclock_page_queue: Queue,
741     suspend_tube: Tube,
742 }
743 
744 // TODO(b/237300012): asyncify this device.
745 /// A worker to process PvClockCommand requests
run_main_worker( mut worker: PvClockWorker, mut set_pvclock_page_queue: Queue, suspend_tube: Tube, interrupt: Interrupt, kill_evt: Event, ) -> MainWorkerReturn746 fn run_main_worker(
747     mut worker: PvClockWorker,
748     mut set_pvclock_page_queue: Queue,
749     suspend_tube: Tube,
750     interrupt: Interrupt,
751     kill_evt: Event,
752 ) -> MainWorkerReturn {
753     #[derive(EventToken)]
754     enum Token {
755         SetPvClockPageQueue,
756         SuspendResume,
757         Kill,
758     }
759 
760     let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
761         (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
762         (suspend_tube.get_read_notifier(), Token::SuspendResume),
763         // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
764         // implemented for Tube.
765         #[cfg(windows)]
766         (suspend_tube.get_close_notifier(), Token::Kill),
767         (&kill_evt, Token::Kill),
768     ]) {
769         Ok(pc) => pc,
770         Err(e) => {
771             error!("failed creating WaitContext: {}", e);
772             return MainWorkerReturn {
773                 suspend_tube,
774                 set_pvclock_page_queue,
775                 worker,
776             };
777         }
778     };
779 
780     'wait: loop {
781         let events = match wait_ctx.wait() {
782             Ok(v) => v,
783             Err(e) => {
784                 error!("failed polling for events: {}", e);
785                 break;
786             }
787         };
788 
789         for event in events.iter().filter(|e| e.is_readable) {
790             match event.token {
791                 Token::SetPvClockPageQueue => {
792                     let _ = set_pvclock_page_queue.event().wait();
793                     let desc_chain = match set_pvclock_page_queue.pop() {
794                         Some(desc_chain) => desc_chain,
795                         None => {
796                             // Spurious doorbells from the driver are permitted
797                             // by the virtio spec (v1.3; section 2.9).
798                             continue;
799                         }
800                     };
801 
802                     // This device does not follow the virtio spec requirements for device-readable
803                     // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
804                     // first descriptor from the chain and assume the whole req structure is
805                     // contained within it.
806                     let desc = desc_chain
807                         .reader
808                         .get_remaining_regions()
809                         .chain(desc_chain.writer.get_remaining_regions())
810                         .next()
811                         .unwrap();
812 
813                     let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
814                         error!("pvclock descriptor too short");
815                         0
816                     } else {
817                         let addr = GuestAddress(desc.offset);
818                         let mut req: virtio_pvclock_set_pvclock_page_req = match worker
819                             .mem
820                             .read_obj_from_addr(addr)
821                         {
822                             Ok(req) => req,
823                             Err(e) => {
824                                 error!("failed to read request from set_pvclock_page queue: {}", e);
825                                 continue;
826                             }
827                         };
828 
829                         req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
830                             Err(e) => {
831                                 error!("failed to set pvclock page: {:#}", e);
832                                 VIRTIO_PVCLOCK_S_IOERR
833                             }
834                             Ok(_) => VIRTIO_PVCLOCK_S_OK,
835                         };
836 
837                         if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
838                             error!("failed to write set_pvclock_page status: {}", e);
839                             continue;
840                         }
841 
842                         desc.len as u32
843                     };
844 
845                     set_pvclock_page_queue.add_used(desc_chain, len);
846                     set_pvclock_page_queue.trigger_interrupt();
847                 }
848                 Token::SuspendResume => {
849                     let req = match suspend_tube.recv::<PvClockCommand>() {
850                         Ok(req) => req,
851                         Err(e) => {
852                             error!("failed to receive request: {}", e);
853                             continue;
854                         }
855                     };
856 
857                     let resp = match req {
858                         PvClockCommand::Suspend => {
859                             worker.suspend();
860                             PvClockCommandResponse::Ok
861                         }
862                         PvClockCommand::Resume => {
863                             match worker.resume() {
864                                 Ok(total_suspended_ticks) => {
865                                     // signal to the driver that the total_suspend_ns has changed
866                                     interrupt.signal_config_changed();
867                                     PvClockCommandResponse::Resumed {
868                                         total_suspended_ticks,
869                                     }
870                                 }
871                                 Err(e) => {
872                                     error!("Failed to resume pvclock: {:#}", e);
873                                     PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(
874                                         e,
875                                     ))
876                                 }
877                             }
878                         }
879                     };
880 
881                     if let Err(e) = suspend_tube.send(&resp) {
882                         error!("error sending PvClockCommandResponse: {}", e);
883                     }
884                 }
885                 Token::Kill => {
886                     break 'wait;
887                 }
888             }
889         }
890     }
891 
892     MainWorkerReturn {
893         suspend_tube,
894         set_pvclock_page_queue,
895         worker,
896     }
897 }
898 
899 impl VirtioDevice for PvClock {
keep_rds(&self) -> Vec<RawDescriptor>900     fn keep_rds(&self) -> Vec<RawDescriptor> {
901         if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
902             vec![suspend_tube.as_raw_descriptor()]
903         } else {
904             Vec::new()
905         }
906     }
907 
device_type(&self) -> DeviceType908     fn device_type(&self) -> DeviceType {
909         DeviceType::Pvclock
910     }
911 
queue_max_sizes(&self) -> &[u16]912     fn queue_max_sizes(&self) -> &[u16] {
913         QUEUE_SIZES
914     }
915 
features(&self) -> u64916     fn features(&self) -> u64 {
917         self.state.features
918     }
919 
ack_features(&mut self, mut value: u64)920     fn ack_features(&mut self, mut value: u64) {
921         if value & !self.features() != 0 {
922             warn!("virtio-pvclock got unknown feature ack {:x}", value);
923             value &= self.features();
924         }
925         self.state.acked_features |= value;
926     }
927 
read_config(&self, offset: u64, data: &mut [u8])928     fn read_config(&self, offset: u64, data: &mut [u8]) {
929         copy_config(data, 0, self.get_config().as_bytes(), offset);
930     }
931 
write_config(&mut self, offset: u64, data: &[u8])932     fn write_config(&mut self, offset: u64, data: &[u8]) {
933         // Pvclock device doesn't expect a guest write to config
934         warn!(
935             "Unexpected write to virtio-pvclock config at offset {}: {:?}",
936             offset, data
937         );
938     }
939 
activate( &mut self, mem: GuestMemory, interrupt: Interrupt, queues: BTreeMap<usize, Queue>, ) -> anyhow::Result<()>940     fn activate(
941         &mut self,
942         mem: GuestMemory,
943         interrupt: Interrupt,
944         queues: BTreeMap<usize, Queue>,
945     ) -> anyhow::Result<()> {
946         let tsc_frequency = self.state.tsc_frequency;
947         let total_suspend_ns = self.state.total_suspend_ns.clone();
948         let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
949         self.switch_to_main_worker(interrupt, worker, queues)
950     }
951 
reset(&mut self) -> Result<()>952     fn reset(&mut self) -> Result<()> {
953         self.switch_to_stub_worker();
954         Ok(())
955     }
956 
virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>>957     fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
958         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
959         match last_state {
960             PvClockWorkerState::Main(main_worker_thread) => {
961                 let main_worker_ret = main_worker_thread.stop();
962                 let mut queues = BTreeMap::new();
963                 queues.insert(0, main_worker_ret.set_pvclock_page_queue);
964                 self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
965                 self.state.paused_main_worker = Some(main_worker_ret.worker.into());
966                 Ok(Some(queues))
967             }
968             PvClockWorkerState::Stub(stub_worker_thread) => {
969                 let stub_ret = stub_worker_thread.stop();
970                 self.worker_state = PvClockWorkerState::Idle(stub_ret.suspend_tube);
971                 Ok(None)
972             }
973             PvClockWorkerState::Idle(suspend_tube) => {
974                 self.worker_state = PvClockWorkerState::Idle(suspend_tube);
975                 Ok(None)
976             }
977             PvClockWorkerState::None => panic!("invalid state transition"),
978         }
979     }
980 
virtio_wake( &mut self, queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>, ) -> anyhow::Result<()>981     fn virtio_wake(
982         &mut self,
983         queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
984     ) -> anyhow::Result<()> {
985         if let Some((mem, interrupt, queues)) = queues_state {
986             let worker_snap = self
987                 .state
988                 .paused_main_worker
989                 .take()
990                 .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
991             let worker = PvClockWorker::from_snapshot(
992                 self.state.tsc_frequency,
993                 self.state.total_suspend_ns.clone(),
994                 worker_snap,
995                 mem,
996             );
997             // Use unchecked as no worker is running at this point
998             self.start_main_worker(interrupt, worker, queues)?;
999         } else {
1000             // If the device wasn't activated, we should bring up the stub worker since that's
1001             // what is supposed to be running for an un-activated device.
1002             self.start_stub_worker();
1003         }
1004         Ok(())
1005     }
1006 
virtio_snapshot(&mut self) -> anyhow::Result<AnySnapshot>1007     fn virtio_snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
1008         AnySnapshot::to_any(&self.state).context("failed to serialize PvClockState")
1009     }
1010 
virtio_restore(&mut self, data: AnySnapshot) -> anyhow::Result<()>1011     fn virtio_restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
1012         let state: PvClockState = AnySnapshot::from_any(data).context("error deserializing")?;
1013         if state.features != self.features() {
1014             bail!(
1015                 "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
1016                 self.features(),
1017                 state.features,
1018             );
1019         }
1020         // TODO(b/291346907): we assume that the TSC frequency has NOT changed
1021         // since the snapshot was made. Assuming we have not moved machines,
1022         // this is a reasonable assumption. We don't verify the frequency
1023         // because TSC calibration noisy.
1024         self.state = state;
1025         Ok(())
1026     }
1027 
on_device_sandboxed(&mut self)1028     fn on_device_sandboxed(&mut self) {
1029         self.start_stub_worker();
1030     }
1031 }
1032 
1033 #[cfg(test)]
1034 mod tests {
1035     use super::*;
1036     use crate::virtio::QueueConfig;
1037 
1038     const TEST_QUEUE_SIZE: u16 = 2048;
1039 
make_interrupt() -> Interrupt1040     fn make_interrupt() -> Interrupt {
1041         Interrupt::new_for_test()
1042     }
1043 
create_pvclock_device() -> (Tube, PvClock)1044     fn create_pvclock_device() -> (Tube, PvClock) {
1045         let (host_tube, device_tube) = Tube::pair().unwrap();
1046         let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);
1047 
1048         // Simulate the device initialization to start the stub thread.
1049         // In the real case, on_device_sandboxed will be called after the device is sandboxed
1050         // (or at some point during the device initializtion when the sandbox is disabled) to
1051         // allow devices to use multi-threads (as spawning new threads before sandboxing is
1052         // prohibited because of the minijail's restriction).
1053         pvclock_device.on_device_sandboxed();
1054 
1055         (host_tube, pvclock_device)
1056     }
1057 
create_sleeping_device() -> (PvClock, GuestMemory, Tube)1058     fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
1059         let (_host_tube, mut pvclock_device) = create_pvclock_device();
1060 
1061         // The queue won't actually be used, so passing one that isn't
1062         // fully configured is fine.
1063         let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
1064         fake_queue.set_ready(true);
1065         let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
1066         let interrupt = make_interrupt();
1067         pvclock_device
1068             .activate(
1069                 mem.clone(),
1070                 interrupt.clone(),
1071                 BTreeMap::from([(
1072                     0,
1073                     fake_queue
1074                         .activate(&mem, Event::new().unwrap(), interrupt)
1075                         .unwrap(),
1076                 )]),
1077             )
1078             .expect("activate should succeed");
1079         let queues = pvclock_device
1080             .virtio_sleep()
1081             .expect("sleep should succeed")
1082             .expect("sleep should yield queues");
1083         assert_eq!(queues.len(), 1);
1084         assert_eq!(
1085             queues.get(&0).expect("queue must be present").size(),
1086             TEST_QUEUE_SIZE
1087         );
1088         assert!(pvclock_device.state.paused_main_worker.is_some());
1089         (pvclock_device, mem, _host_tube)
1090     }
1091 
assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory)1092     fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
1093         // We just create a new queue here, because it isn't actually accessed
1094         // by the device in these tests.
1095         let mut wake_queues = BTreeMap::new();
1096         let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
1097         let interrupt = make_interrupt();
1098         fake_queue.set_ready(true);
1099         wake_queues.insert(
1100             0,
1101             fake_queue
1102                 .activate(mem, Event::new().unwrap(), interrupt.clone())
1103                 .unwrap(),
1104         );
1105         let queues_state = (mem.clone(), interrupt, wake_queues);
1106         pvclock_device
1107             .virtio_wake(Some(queues_state))
1108             .expect("wake should succeed");
1109         assert!(pvclock_device.state.paused_main_worker.is_none());
1110     }
1111 
1112     #[test]
test_command_response_when_inactive()1113     fn test_command_response_when_inactive() {
1114         let (host_tube, _pvclock_device) = create_pvclock_device();
1115         assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
1116         let res = host_tube.recv::<PvClockCommandResponse>();
1117         assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
1118     }
1119 
1120     #[test]
test_sleep_wake_smoke()1121     fn test_sleep_wake_smoke() {
1122         let (mut pvclock_device, mem, _tube) = create_sleeping_device();
1123         assert_wake_successful(&mut pvclock_device, &mem);
1124     }
1125 
1126     #[test]
test_save_restore()1127     fn test_save_restore() {
1128         let (mut pvclock_device, mem, _tube) = create_sleeping_device();
1129         let test_suspend_ns = 9999;
1130 
1131         // Store a test value we can look for later in the test to verify
1132         // we're restoring properties.
1133         pvclock_device
1134             .state
1135             .total_suspend_ns
1136             .store(test_suspend_ns, Ordering::SeqCst);
1137 
1138         let snap = pvclock_device.virtio_snapshot().unwrap();
1139         pvclock_device
1140             .state
1141             .total_suspend_ns
1142             .store(0, Ordering::SeqCst);
1143         pvclock_device.virtio_restore(snap).unwrap();
1144         assert_eq!(
1145             pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
1146             test_suspend_ns
1147         );
1148 
1149         assert_wake_successful(&mut pvclock_device, &mem);
1150     }
1151 
1152     /// A simplified clone of `pvclock_scale_delta` from Linux kernel to emulate
1153     /// what the kernel does when converting TSC to ktime.
pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u641154     fn pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u64 {
1155         let shifted = if shift < 0 {
1156             tsc >> -shift
1157         } else {
1158             tsc << shift
1159         };
1160         let product = shifted as u128 * mult as u128;
1161         (product >> 32).try_into().expect("should not overflow")
1162     }
1163 
1164     /// Helper function for checking the behavior of `freq_scale_shift`.
check_freq_scale(f: u64, input: u64)1165     fn check_freq_scale(f: u64, input: u64) {
1166         // We only test `scaled_hz` = 1GHz because that is the only value used in the code base.
1167         let (mult, shift) = freq_scale_shift(1_000_000_000, f);
1168 
1169         let scaled = pvclock_scale_tsc(mult, shift, input);
1170 
1171         // Use relative error <= 1e-8 as the target. TSC can be huge so this isn't really a super
1172         // accurate target, and our goal is to simply sanity check the math without adding too many
1173         // requirements about rounding errors.
1174         let expected: u64 = (input as u128 * 1_000_000_000u128 / f as u128) as u64;
1175         let expected_lo: u64 = (input as u128 * 999_999_990u128 / f as u128) as u64;
1176         let expected_hi: u64 = (input as u128 * 1_000_000_010u128 / f as u128) as u64;
1177         assert!(
1178             (expected_lo..=expected_hi).contains(&scaled),
1179             "{scaled} should be close to {expected} (base_hz={f}, mult={mult}, shift={shift})"
1180         );
1181     }
1182 
1183     #[test]
test_freq_scale_shift_accuracy()1184     fn test_freq_scale_shift_accuracy() {
1185         // Basic check for formula correctness: scaling `scaled_hz` to `base_hz` should yield
1186         // `base_hz`.
1187         for f in (1..=50).map(|n| n * 100_000_000) {
1188             check_freq_scale(f, f);
1189         }
1190     }
1191 
1192     #[test]
test_freq_scale_shift_overflow_high_freq()1193     fn test_freq_scale_shift_overflow_high_freq() {
1194         // For scale factors < 1.0, test that we can correctly convert the maximum TSC value without
1195         // overflow. We must be able to handle values as large as it realistically can be, as the
1196         // kernel clock breaks if the calculated ktime goes backwards (b/342168920).
1197         for f in (11..=50).map(|n| n * 100_000_000) {
1198             check_freq_scale(f, u64::MAX);
1199         }
1200     }
1201 
1202     #[test]
test_freq_scale_shift_overflow_low_freq()1203     fn test_freq_scale_shift_overflow_low_freq() {
1204         fn prev_power_of_two(n: u64) -> u64 {
1205             assert_ne!(n, 0);
1206             let highest_bit_set = 63 - n.leading_zeros();
1207             1 << highest_bit_set
1208         }
1209         // Same test as above, but for scale factors >= 1.0. The difference is that for scale
1210         // factors >= 1.0 we first round up the factor, then apply a multiplier (< 1.0). We reflect
1211         // this limitation in our tested maximum value.
1212         for f in (1..=10).map(|n| n * 100_000_000) {
1213             // Truncate the remainder since prev_power_of_two rounds down anyway.
1214             let factor = 1_000_000_000 / f;
1215             // This is like (exp2(floor(log2(factor)) + 1)).
1216             let target = u64::MAX / (prev_power_of_two(factor) << 1);
1217             check_freq_scale(f, target);
1218         }
1219     }
1220 }
1221