// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! Virtio version of a Linux pvclock clocksource.
//!
//! Driver source is here:
//! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
//!
//! # Background
//!
//! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
//! Large jumps can signal problems (e.g., triggering Android watchdogs).
//! This assumption breaks down in virtualized environments, where a VM's suspension isn't
//! inherently linked to the guest kernel's concept of "suspend".
//! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
//! to collaborate on emulating the expected clock behavior around suspend/resume.
//!
//! # How it works
//!
//! ## Core functions of the virtio-pvclock device:
//!
//! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
//!    suspended.
//!   - This is achieved through the pvclock mechanism implemented in x86 KVM, as used by kvm-clock.
//! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
//!    its clocks accordingly.
//!   - Since the offset between CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
//!     kernel, applying the adjustment is the guest driver's responsibility.
//!
//! ## Expected guest clock behaviors when virtio-pvclock is enabled
//!
//! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
//! - CLOCK_MONOTONIC will not include the time that passes while crosvm is suspended (from
//!   crosvm's run-mode perspective).
//! - CLOCK_BOOTTIME will be adjusted to include the time that passes while crosvm is suspended.
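//!
//! For example, if crosvm is suspended for roughly three seconds and then resumed while the
//! guest kernel itself never suspends, a guest program would observe something like the
//! following (an illustrative sketch using the `libc` crate; exact deltas depend on scheduling):
//!
//! ```ignore
//! fn now(clock: libc::clockid_t) -> libc::timespec {
//!     let mut ts = libc::timespec { tv_sec: 0, tv_nsec: 0 };
//!     // SAFETY: clock_gettime only writes to the provided timespec.
//!     unsafe { libc::clock_gettime(clock, &mut ts) };
//!     ts
//! }
//!
//! let mono0 = now(libc::CLOCK_MONOTONIC);
//! let boot0 = now(libc::CLOCK_BOOTTIME);
//! // ... crosvm is suspended for ~3s, then resumed ...
//! let mono1 = now(libc::CLOCK_MONOTONIC);
//! let boot1 = now(libc::CLOCK_BOOTTIME);
//! // (mono1 - mono0) excludes the ~3s of suspension; (boot1 - boot0) includes it.
//! ```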
//!
//! # Why it is needed
//!
//! Because the existing solution does not cover some of the expectations described above.
//!
//! kvm-clock lets the host manage the offsets of CLOCK_MONOTONIC.
//! However, it doesn't address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC across
//! the host's suspend/resume, as it is mainly designed to keep CLOCK_REALTIME in sync.

use std::arch::x86_64::_rdtsc;
use std::collections::BTreeMap;
use std::mem::replace;
use std::mem::size_of;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
#[cfg(windows)]
use base::CloseNotifier;
use base::Error;
use base::Event;
use base::EventToken;
use base::RawDescriptor;
use base::ReadNotifier;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use chrono::DateTime;
use chrono::Utc;
use data_model::Le32;
use data_model::Le64;
use serde::Deserialize;
use serde::Serialize;
use vm_control::PvClockCommand;
use vm_control::PvClockCommandResponse;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;

use super::copy_config;
use super::DeviceType;
use super::Interrupt;
use super::Queue;
use super::VirtioDevice;
// Pvclock has one virtio queue: set_pvclock_page
const QUEUE_SIZE: u16 = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// pvclock flag bits
const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
const PVCLOCK_GUEST_STOPPED: u8 = 2;

// The feature bitmap for virtio pvclock
const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating

// Status values for a virtio_pvclock request.
const VIRTIO_PVCLOCK_S_OK: u8 = 0;
const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;

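// NB: in the Linux kernel, kvm-clock registers its clocksource with rating 400, so a device
// rating of 450 should make the guest prefer virtio-pvclock when
// VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING is negotiated (the final choice is up to the guest).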
const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;

// The config structure exposed to the guest to tell it how much suspend time should be
// injected into the guest's CLOCK_BOOTTIME.
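// On a config-change interrupt, the guest driver is expected to re-read suspend_time_ns and
// fold the new total into its CLOCK_BOOTTIME offset (see the driver linked in the module docs
// for the authoritative behavior).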
#[derive(Debug, Clone, Copy, Default, AsBytes, FromZeroes, FromBytes)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_config {
    // Total duration the VM has been paused while the guest kernel is not in the suspended state
    // (from the power management and timekeeping perspective).
    suspend_time_ns: Le64,
    // Device-suggested rating of the pvclock clocksource.
    clocksource_rating: Le32,
    padding: u32,
}

#[derive(Debug, Clone, Copy, Default, FromZeroes, FromBytes, AsBytes)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_set_pvclock_page_req {
    // Physical address of pvclock page.
    pvclock_page_pa: Le64,
    // Current system time.
    system_time: Le64,
    // Current tsc value.
    tsc_timestamp: Le64,
    // Status of this request, one of VIRTIO_PVCLOCK_S_*.
    status: u8,
    padding: [u8; 7],
}

// Data structure for interacting with pvclock shared memory.
struct PvclockSharedData {
    mem: GuestMemory,
    seqlock_addr: GuestAddress,
    tsc_suspended_delta_addr: GuestAddress,
    tsc_frequency_multiplier_addr: GuestAddress,
    tsc_frequency_shift_addr: GuestAddress,
    flags_addr: GuestAddress,
}

impl PvclockSharedData {
    pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
        PvclockSharedData {
            mem,
            // The addresses of the various fields that we need to modify are relative to the
            // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
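            // Its layout (32 bytes, packed) is, for reference (assumed from the KVM ABI):
            //    0: u32 version (the seqlock)
            //    4: u32 pad0
            //    8: u64 tsc_timestamp
            //   16: u64 system_time
            //   24: u32 tsc_to_system_mul
            //   28: i8  tsc_shift
            //   29: u8  flags
            //   30: u8  pad[2]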
            seqlock_addr: addr,
            tsc_suspended_delta_addr: addr.unchecked_add(8),
            tsc_frequency_multiplier_addr: addr.unchecked_add(24),
            tsc_frequency_shift_addr: addr.unchecked_add(28),
            flags_addr: addr.unchecked_add(29),
        }
    }

    /// Only the seqlock_addr is needed to re-create this struct at restore
    /// time, so that is all our snapshot contains.
    fn snapshot(&self) -> GuestAddress {
        self.seqlock_addr
    }

    /// Set all fields to zero.
    pub fn zero_fill(&mut self) -> Result<()> {
        // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
        self.mem
            .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
            .context("failed to zero fill the pvclock shared data")
    }

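    /// Increments the pvclock seqlock (the `version` field of pvclock_vcpu_time_info).
    /// Updates are wrapped in two increments so the value is odd while an update is in
    /// progress; a guest reader retries if it observes an odd or changed version.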
    pub fn increment_seqlock(&mut self) -> Result<()> {
        // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
        //  guaranteed to be atomic. Although this should not be a problem for the seqlock
        //  or the other fields in the pvclock shared data (which are protected via the seqlock)
        //  we might want to update these calls to be as atomic as possible if/when we have
        //  the ability to do so, just as a general cleanup and to be consistent.
        let value = self
            .mem
            .read_obj_from_addr::<u32>(self.seqlock_addr)
            .context("failed to read seqlock value")?;
        self.mem
            .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
            .context("failed to write seqlock value")
    }

    pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
        self.mem
            .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
            .context("failed to write tsc suspended delta")
    }

    pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
        // TSC values are converted to timestamps using the following algorithm:
        //   delta = _rdtsc() - tsc_suspended_delta
        //   if tsc_frequency_shift > 0:
        //     delta <<= tsc_frequency_shift
        //   else:
        //     delta >>= -tsc_frequency_shift
        //   return (delta * tsc_frequency_multiplier) >> 32
        //
        // So, tsc_frequency_multiplier needs to be something like 1e9/tsc_frequency, in which
        // case tsc_frequency_shift would be 32 (to counteract the final 32 right shift). But
        // 1e9/tsc_frequency is <1 so we actually need to scale that value up and scale down
        // the tsc_frequency_shift so we don't lose precision in the frequency. Our tsc_frequency
        // isn't *that* precise, so we scale it up by 16 and scale down the tsc_frequency_shift by
        // 16 (so it's also 16).
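        //
        // Worked example (illustrative): for a 3GHz TSC, multiplier = (1e9 << 16) / 3e9
        // = 21845 and shift = 16, so a delta of 3e9 ticks (one second) becomes
        // ((3e9 << 16) * 21845) >> 32 ~= 1e9 ns, as expected.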
        let shift = 16i8;
        let multiplier: u32 = ((1_000_000_000u128 << shift) / frequency as u128)
            .try_into()
            .context(format!(
                "tsc frequency multiplier overflow, frequency {}Hz is too small",
                frequency
            ))?;

        self.mem
            .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
            .context("failed to write tsc frequency multiplier")?;
        self.mem
            .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
            .context("failed to write tsc frequency shift")
    }

    pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
        let value = self
            .mem
            .read_obj_from_addr::<u8>(self.flags_addr)
            .context("failed to read flags")?;
        self.mem
            .write_obj_at_addr(value | flags, self.flags_addr)
            .context("failed to write flags")
    }
}

/// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
#[derive(Serialize, Deserialize)]
struct PvClockState {
    tsc_frequency: u64,
    /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
    /// will be stored here. (We can't just store the worker itself as it contains an object
    /// tree with references to [GuestMemory].)
    paused_main_worker: Option<PvClockWorkerSnapshot>,
    /// The total time the VM has been suspended. This is in an `Arc<AtomicU64>` because it's
    /// set by the PvClockWorker thread but read by PvClock from the mmio bus in the main
    /// thread.
    total_suspend_ns: Arc<AtomicU64>,
    features: u64,
    acked_features: u64,
}

/// An enum to keep the dynamic state of pvclock workers in a type-safe manner.
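/// Expected transitions: Idle -> Stub (on_device_sandboxed), Stub -> Idle -> Main (activate),
/// and Main -> Idle (reset / virtio_sleep), matching the start_*/stop_* helpers below.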
enum PvClockWorkerState {
    /// Idle means no worker is running.
    /// This tube is for communicating with this device from the crosvm threads.
    Idle(Tube),
    /// A stub worker to respond to pvclock commands when the device is not activated yet.
    Stub(WorkerThread<StubWorkerReturn>),
    /// A main worker to respond to pvclock commands while the device is active.
    Main(WorkerThread<MainWorkerReturn>),
    /// None is used only for handling the transitional state between the states above.
    None,
}

/// A struct that represents the virtio-pvclock device.
pub struct PvClock {
    state: PvClockState,
    worker_state: PvClockWorkerState,
}

impl PvClock {
    pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
        let state = PvClockState {
            tsc_frequency,
            paused_main_worker: None,
            total_suspend_ns: Arc::new(AtomicU64::new(0)),
            features: base_features
                | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
                | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
                | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
            acked_features: 0,
        };
        PvClock {
            state,
            worker_state: PvClockWorkerState::Idle(suspend_tube),
        }
    }

    fn get_config(&self) -> virtio_pvclock_config {
        virtio_pvclock_config {
            suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
            clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
            padding: 0,
        }
    }

    /// Prefer switch_to_*_worker where possible to keep the state transitions consistent.
    fn start_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            if queues.len() != QUEUE_SIZES.len() {
                return Err(anyhow!(
                    "expected {} queues, got {}",
                    QUEUE_SIZES.len(),
                    queues.len()
                ));
            }
            let set_pvclock_page_queue = queues.remove(&0).unwrap();
            self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
                "virtio_pvclock".to_string(),
                move |kill_evt| {
                    run_main_worker(
                        pvclock_worker,
                        set_pvclock_page_queue,
                        suspend_tube,
                        interrupt,
                        kill_evt,
                    )
                },
            ));
        } else {
            panic!("Invalid state transition");
        }
        Ok(())
    }

    /// Prefer switch_to_*_worker where possible to keep the state transitions consistent.
    fn start_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            PvClockWorkerState::Stub(WorkerThread::start(
                "virtio_pvclock_stub".to_string(),
                move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
            ))
        } else {
            panic!("Invalid state transition");
        };
    }

    /// Prefer switch_to_*_worker where possible to keep the state transitions consistent.
    fn stop_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
            let stub_worker_ret = stub_worker_thread.stop();
            PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
        } else {
            panic!("Invalid state transition");
        }
    }

    /// Prefer switch_to_*_worker where possible to keep the state transitions consistent.
    fn stop_main_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            let mut queues = BTreeMap::new();
            queues.insert(0, main_worker_ret.set_pvclock_page_queue);
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
        } else {
            panic!("Invalid state transition");
        }
    }

    fn switch_to_stub_worker(&mut self) {
        self.stop_main_worker();
        self.start_stub_worker();
    }

    fn switch_to_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        self.stop_stub_worker();
        self.start_main_worker(interrupt, pvclock_worker, queues)
    }
}

/// Represents a moment in time including the TSC counter value at that time.
#[derive(Serialize, Deserialize, Clone)]
struct PvclockInstant {
    time: DateTime<Utc>,
    tsc_value: u64,
}

/// The unique data retained by [PvClockWorker] which can be used to re-create
/// an identical worker.
#[derive(Serialize, Deserialize, Clone)]
struct PvClockWorkerSnapshot {
    suspend_time: Option<PvclockInstant>,
    total_suspend_tsc_delta: u64,
    pvclock_shared_data_base_address: Option<GuestAddress>,
}
impl From<PvClockWorker> for PvClockWorkerSnapshot {
    fn from(worker: PvClockWorker) -> Self {
        PvClockWorkerSnapshot {
            suspend_time: worker.suspend_time,
            total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
            pvclock_shared_data_base_address: worker
                .pvclock_shared_data
                .map(|pvclock| pvclock.snapshot()),
        }
    }
}

/// Worker struct for the virtio-pvclock device.
///
/// Handles virtio requests, storing information about suspend/resume, adjusting the
/// pvclock data in shared memory, and injecting suspend durations via config
/// changes.
struct PvClockWorker {
    tsc_frequency: u64,
    // The moment the last suspend occurred.
    suspend_time: Option<PvclockInstant>,
    // The total time the VM has been suspended. This is in an Arc<AtomicU64> because it's set
    // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_injected_ns: Arc<AtomicU64>,
    // The total change in the TSC value over suspensions.
    total_suspend_tsc_delta: u64,
    // Pvclock shared data.
    pvclock_shared_data: Option<PvclockSharedData>,
    mem: GuestMemory,
}

impl PvClockWorker {
    pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: None,
            total_injected_ns,
            total_suspend_tsc_delta: 0,
            pvclock_shared_data: None,
            mem,
        }
    }

    fn from_snapshot(
        tsc_frequency: u64,
        total_injected_ns: Arc<AtomicU64>,
        snap: PvClockWorkerSnapshot,
        mem: GuestMemory,
    ) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: snap.suspend_time,
            total_injected_ns,
            total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
            pvclock_shared_data: snap
                .pvclock_shared_data_base_address
                .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
            mem,
        }
    }

    /// Initialize the pvclock for initial boot. We assume that the system time of 0 corresponds
    /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
    /// tsc frequency, and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
    /// safe to use vcpu0's pvclock page from the vdso. The order of writing the different
    /// fields doesn't matter at this point, but does matter when updating.
    fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
        if self.pvclock_shared_data.is_some() {
            return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
        }

        let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));

        // set all fields to 0 first
        shared_data.zero_fill()?;

        shared_data.set_tsc_frequency(self.tsc_frequency)?;
        shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;

        self.pvclock_shared_data = Some(shared_data);
        Ok(())
    }

    pub fn suspend(&mut self) {
        if self.suspend_time.is_some() {
            warn!("Suspend time already set, ignoring new suspend time");
            return;
        }
        self.suspend_time = Some(PvclockInstant {
            time: Utc::now(),
            // SAFETY:
            // Safe because _rdtsc takes no arguments, and we trust _rdtsc to not modify any other
            // memory.
            tsc_value: unsafe { _rdtsc() },
        });
    }

    pub fn resume(&mut self) -> Result<()> {
        // First, increment the sequence lock by 1 before writing to the pvclock page.
        self.increment_pvclock_seqlock()?;

        // The guest makes sure there are memory barriers in between reads of the seqlock and other
        // fields, we should make sure there are memory barriers in between writes of seqlock and
        // writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Set the tsc suspended delta and the guest_stopped bit in the pvclock struct. We only
        // need to set the bit; the guest will clear it once it has handled the stoppage.
        // We keep the result here because we want to call increment_pvclock_seqlock regardless of
        // the outcome of these calls.
        let result = self
            .set_suspended_time()
            .and_then(|_| self.set_guest_stopped_bit());

        // The guest makes sure there are memory barriers in between reads of the seqlock and other
        // fields, we should make sure there are memory barriers in between writes of seqlock and
        // writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Do a final increment once changes are done.
        self.increment_pvclock_seqlock()?;

        result
    }

    fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
        match Utc::now().signed_duration_since(suspend_time.time).to_std() {
            Ok(duration) => duration,
            Err(e) => {
                error!(
                    "pvclock found suspend time in the future (was the host \
                    clock adjusted?). Guest boot/realtime clock may now be \
                    incorrect. Details: {}",
                    e
                );
                Duration::ZERO
            }
        }
    }

    fn set_suspended_time(&mut self) -> Result<()> {
        let (this_suspend_duration, this_suspend_tsc_delta) =
            if let Some(suspend_time) = self.suspend_time.take() {
                (
                    Self::get_suspended_duration(&suspend_time),
                    // SAFETY:
                    // Safe because _rdtsc takes no arguments, and we trust _rdtsc to not modify
                    // any other memory.
                    // NB: This calculation may wrap around, as the TSC can be reset to zero when
                    // the device has resumed from a "deep" suspend state (this may not happen in
                    // s2idle cases). It also happens when the tsc value itself wraps.
                    unsafe { _rdtsc() }.wrapping_sub(suspend_time.tsc_value),
                )
            } else {
                return Err(Error::new(libc::ENOTSUP))
                    .context("Cannot set suspend time because suspend was never called");
            };

        // update the total tsc delta during all suspends
        // NB: This calculation may wrap around, as the total suspend time can exceed the u64
        // range.
        self.total_suspend_tsc_delta = self
            .total_suspend_tsc_delta
            .wrapping_add(this_suspend_tsc_delta);

        // save tsc_suspended_delta to shared memory
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;

        info!(
            "set total suspend tsc delta to {}",
            self.total_suspend_tsc_delta
        );

        // update total suspend ns
        self.total_injected_ns
            .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);

        Ok(())
    }

    fn increment_pvclock_seqlock(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .increment_seqlock()
    }

    fn set_guest_stopped_bit(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
    }
}

fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
    for cause in error.chain() {
        if let Some(e) = cause.downcast_ref::<base::Error>() {
            return *e;
        }

        if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
            return match e {
                // Two kinds of GuestMemoryError contain base::Error
                GuestMemoryError::MemoryAddSealsFailed(e) => *e,
                GuestMemoryError::MemoryCreationFailed(e) => *e,
                // Otherwise return EINVAL
                _ => Error::new(libc::EINVAL),
            };
        }
    }
    // Unknown base error
    Error::new(libc::EFAULT)
}

struct StubWorkerReturn {
    suspend_tube: Tube,
}

/// A stub worker to respond to any requests when the device is inactive.
fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
    #[derive(EventToken, Debug)]
    enum Token {
        SomePvClockRequest,
        Kill,
    }
    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(wait_ctx) => wait_ctx,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return StubWorkerReturn { suspend_tube };
        }
    };
    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SomePvClockRequest => {
                    match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };
                    if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }
    StubWorkerReturn { suspend_tube }
}

struct MainWorkerReturn {
    worker: PvClockWorker,
    set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
}

// TODO(b/237300012): asyncify this device.
/// A worker to process PvClockCommand requests.
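/// The worker services two event sources: the set_pvclock_page queue, on which the guest driver
/// sends a virtio_pvclock_set_pvclock_page_req once it has allocated the shared pvclock page,
/// and the suspend tube, over which crosvm delivers PvClockCommand::Suspend/Resume.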
fn run_main_worker(
    mut worker: PvClockWorker,
    mut set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
    interrupt: Interrupt,
    kill_evt: Event,
) -> MainWorkerReturn {
    #[derive(EventToken)]
    enum Token {
        SetPvClockPageQueue,
        SuspendResume,
        InterruptResample,
        Kill,
    }

    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
        (suspend_tube.get_read_notifier(), Token::SuspendResume),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(pc) => pc,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    };
    if let Some(resample_evt) = interrupt.get_resample_evt() {
        if wait_ctx
            .add(resample_evt, Token::InterruptResample)
            .is_err()
        {
            error!("failed creating WaitContext");
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    }

    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SetPvClockPageQueue => {
                    let _ = set_pvclock_page_queue.event().wait();
                    let desc_chain = match set_pvclock_page_queue.pop() {
                        Some(desc_chain) => desc_chain,
                        None => {
                            error!("set_pvclock_page queue was empty");
                            continue;
                        }
                    };

                    // This device does not follow the virtio spec requirements for device-readable
                    // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
                    // first descriptor from the chain and assume the whole req structure is
                    // contained within it.
                    let desc = desc_chain
                        .reader
                        .get_remaining_regions()
                        .chain(desc_chain.writer.get_remaining_regions())
                        .next()
                        .unwrap();

                    let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
                        error!("pvclock descriptor too short");
                        0
                    } else {
                        let addr = GuestAddress(desc.offset);
                        let mut req: virtio_pvclock_set_pvclock_page_req = match worker
                            .mem
                            .read_obj_from_addr(addr)
                        {
                            Ok(req) => req,
                            Err(e) => {
                                error!("failed to read request from set_pvclock_page queue: {}", e);
                                continue;
                            }
                        };

                        req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
                            Err(e) => {
                                error!("failed to set pvclock page: {:#}", e);
                                VIRTIO_PVCLOCK_S_IOERR
                            }
                            Ok(_) => VIRTIO_PVCLOCK_S_OK,
                        };

                        if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
                            error!("failed to write set_pvclock_page status: {}", e);
                            continue;
                        }

                        desc.len as u32
                    };

                    set_pvclock_page_queue.add_used(desc_chain, len);
                    set_pvclock_page_queue.trigger_interrupt(&interrupt);
                }
                Token::SuspendResume => {
                    let req = match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };

                    let resp = match req {
                        PvClockCommand::Suspend => {
                            worker.suspend();
                            PvClockCommandResponse::Ok
                        }
                        PvClockCommand::Resume => {
                            if let Err(e) = worker.resume() {
                                error!("Failed to resume pvclock: {:#}", e);
                                PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(e))
                            } else {
                                // signal to the driver that the total_suspend_ns has changed
                                interrupt.signal_config_changed();
                                PvClockCommandResponse::Ok
                            }
                        }
                    };

                    if let Err(e) = suspend_tube.send(&resp) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }

                Token::InterruptResample => {
                    interrupt.interrupt_resample();
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }

    MainWorkerReturn {
        suspend_tube,
        set_pvclock_page_queue,
        worker,
    }
}

impl VirtioDevice for PvClock {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
            vec![suspend_tube.as_raw_descriptor()]
        } else {
            Vec::new()
        }
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pvclock
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.state.features
    }

    fn ack_features(&mut self, mut value: u64) {
        if value & !self.features() != 0 {
            warn!("virtio-pvclock got unknown feature ack {:x}", value);
            value &= self.features();
        }
        self.state.acked_features |= value;
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        copy_config(data, 0, self.get_config().as_bytes(), offset);
    }

    fn write_config(&mut self, offset: u64, data: &[u8]) {
        // The pvclock device doesn't expect a guest write to its config.
        warn!(
            "Unexpected write to virtio-pvclock config at offset {}: {:?}",
            offset, data
        );
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let tsc_frequency = self.state.tsc_frequency;
        let total_suspend_ns = self.state.total_suspend_ns.clone();
        let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
        self.switch_to_main_worker(interrupt, worker, queues)
    }

    fn reset(&mut self) -> Result<()> {
        self.switch_to_stub_worker();
        Ok(())
    }

    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            let mut queues = BTreeMap::new();
            queues.insert(0, main_worker_ret.set_pvclock_page_queue);
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
            Ok(Some(queues))
        } else {
            Ok(None)
        }
    }

    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            let worker_snap = self
                .state
                .paused_main_worker
                .take()
                .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
            let worker = PvClockWorker::from_snapshot(
                self.state.tsc_frequency,
                self.state.total_suspend_ns.clone(),
                worker_snap,
                mem,
            );
            // Call start_main_worker directly, as no worker is running at this point.
            self.start_main_worker(interrupt, worker, queues)?;
        }
        Ok(())
    }

    fn virtio_snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
        serde_json::to_value(&self.state).context("failed to serialize PvClockState")
    }

    fn virtio_restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
        let state: PvClockState = serde_json::from_value(data).context("error deserializing")?;
        if state.features != self.features() {
            bail!(
                "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
                self.features(),
                state.features,
            );
        }
        // TODO(b/291346907): we assume that the TSC frequency has NOT changed
        // since the snapshot was made. Assuming we have not moved machines,
        // this is a reasonable assumption. We don't verify the frequency
        // because TSC calibration is noisy.
        self.state = state;
        Ok(())
    }

    fn on_device_sandboxed(&mut self) {
        self.start_stub_worker();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::virtio::QueueConfig;

    const TEST_QUEUE_SIZE: u16 = 2048;

    fn make_interrupt() -> Interrupt {
        Interrupt::new_for_test()
    }

    fn create_pvclock_device() -> (Tube, PvClock) {
        let (host_tube, device_tube) = Tube::pair().unwrap();
        let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);

        // Simulate the device initialization to start the stub thread.
        // In the real case, on_device_sandboxed will be called after the device is sandboxed
        // (or at some point during the device initialization when the sandbox is disabled) to
        // allow devices to use multiple threads (as spawning new threads before sandboxing is
        // prohibited because of minijail's restrictions).
        pvclock_device.on_device_sandboxed();

        (host_tube, pvclock_device)
    }

    fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
        let (_host_tube, mut pvclock_device) = create_pvclock_device();

        // The queue won't actually be used, so passing one that isn't
        // fully configured is fine.
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
        pvclock_device
            .activate(
                mem.clone(),
                make_interrupt(),
                BTreeMap::from([(0, fake_queue.activate(&mem, Event::new().unwrap()).unwrap())]),
            )
            .expect("activate should succeed");
        let queues = pvclock_device
            .virtio_sleep()
            .expect("sleep should succeed")
            .expect("sleep should yield queues");
        assert_eq!(queues.len(), 1);
        assert_eq!(
            queues.get(&0).expect("queue must be present").size(),
            TEST_QUEUE_SIZE
        );
        assert!(pvclock_device.state.paused_main_worker.is_some());
        (pvclock_device, mem, _host_tube)
    }

    fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
        // We just create a new queue here, because it isn't actually accessed
        // by the device in these tests.
        let mut wake_queues = BTreeMap::new();
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        wake_queues.insert(0, fake_queue.activate(mem, Event::new().unwrap()).unwrap());
        let queues_state = (mem.clone(), make_interrupt(), wake_queues);
        pvclock_device
            .virtio_wake(Some(queues_state))
            .expect("wake should succeed");
        assert!(pvclock_device.state.paused_main_worker.is_none());
    }

    #[test]
    fn test_command_response_when_inactive() {
        let (host_tube, _pvclock_device) = create_pvclock_device();
        assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
        let res = host_tube.recv::<PvClockCommandResponse>();
        assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
    }

    #[test]
    fn test_sleep_wake_smoke() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        assert_wake_successful(&mut pvclock_device, &mem);
    }

    #[test]
    fn test_save_restore() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        let test_suspend_ns = 9999;

        // Store a test value we can look for later in the test to verify
        // we're restoring properties.
        pvclock_device
            .state
            .total_suspend_ns
            .store(test_suspend_ns, Ordering::SeqCst);

        let snap = pvclock_device.virtio_snapshot().unwrap();
        pvclock_device
            .state
            .total_suspend_ns
            .store(0, Ordering::SeqCst);
        pvclock_device.virtio_restore(snap).unwrap();
        assert_eq!(
            pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
            test_suspend_ns
        );

        assert_wake_successful(&mut pvclock_device, &mem);
    }
}