// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! Virtio version of a linux pvclock clocksource.
//!
//! Driver source is here:
//! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
//!
//! # Background
//!
//! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
//! Large jumps can signal problems (e.g., triggering Android watchdogs).
//! This assumption breaks down in virtualized environments, where a VM's suspension isn't
//! inherently linked to the guest kernel's concept of "suspend".
//! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
//! to collaborate on emulating the expected clock behavior around suspend/resume.
//!
//! # How it works
//!
//! ## Core functions of virtio-pvclock device:
//!
//! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
//!    suspended.
//!    - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
//! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
//!    its clocks accordingly.
//!    - Since the offset between the CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
//!      kernel, applying the adjustment is the guest driver's responsibility.
//!
//! ## Expected guest clock behaviors while virtio-pvclock is enabled
//!
//! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
//! - CLOCK_MONOTONIC will not include the time that passes while crosvm is suspended, from the
//!   perspective of crosvm's run mode.
//! - CLOCK_BOOTTIME will be adjusted to include the time that passes while crosvm is suspended
//!   (an illustrative timeline follows this list).
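//!
//! For illustration only (hypothetical numbers, not taken from any real measurement): suppose
//! crosvm suspends the VM for 10 seconds while the guest kernel itself never enters suspend.
//!
//! ```text
//! event                        CLOCK_MONOTONIC   CLOCK_BOOTTIME
//! just before crosvm suspend   100 s             100 s
//! just after crosvm resume     100 s             110 s   (10 s injected via suspend_time_ns)
//! ```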
//!
//! # Why it is needed
//!
//! The existing solution (kvm-clock) does not cover all of the expectations above.
//!
//! kvm-clock lets the host manage the offset of CLOCK_MONOTONIC.
//! However, it does not address the divergence between CLOCK_BOOTTIME and CLOCK_MONOTONIC caused
//! by the host's suspend/resume, as it is mainly designed to keep CLOCK_REALTIME in sync.

use std::arch::x86_64::_rdtsc;
use std::collections::BTreeMap;
use std::mem::replace;
use std::mem::size_of;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
#[cfg(windows)]
use base::CloseNotifier;
use base::Error;
use base::Event;
use base::EventToken;
use base::RawDescriptor;
use base::ReadNotifier;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use chrono::DateTime;
use chrono::Utc;
use data_model::Le32;
use data_model::Le64;
use serde::Deserialize;
use serde::Serialize;
use vm_control::PvClockCommand;
use vm_control::PvClockCommandResponse;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;

use super::copy_config;
use super::DeviceType;
use super::Interrupt;
use super::Queue;
use super::VirtioDevice;

// Pvclock has one virtio queue: set_pvclock_page
const QUEUE_SIZE: u16 = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// pvclock flag bits
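// Note: these values match the flag bits defined for the `flags` field of the pvclock page in the
// Linux kernel ABI (PVCLOCK_TSC_STABLE_BIT is bit 0, PVCLOCK_GUEST_STOPPED is bit 1).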
const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
const PVCLOCK_GUEST_STOPPED: u8 = 2;

// The feature bitmap for virtio pvclock
const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating

// Status values for a virtio_pvclock request.
const VIRTIO_PVCLOCK_S_OK: u8 = 0;
const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;

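// Note: in the Linux kernel, the kvm-clock clocksource registers with rating 400, so 450 is
// presumably chosen so that virtio-pvclock wins clocksource selection when the guest honors the
// device-provided rating (an editorial inference, not something stated by the driver).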
const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;

// The config structure exposed to the guest to tell it how much suspend time should be injected
// into the guest's CLOCK_BOOTTIME.
#[derive(Debug, Clone, Copy, Default, AsBytes, FromZeroes, FromBytes)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_config {
    // Total duration the VM has been paused while the guest kernel is not in the suspended state
    // (from the power management and timekeeping perspective).
    suspend_time_ns: Le64,
    // Device-suggested rating of the pvclock clocksource.
    clocksource_rating: Le32,
    padding: u32,
}

#[derive(Debug, Clone, Copy, Default, FromZeroes, FromBytes, AsBytes)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_set_pvclock_page_req {
    // Physical address of pvclock page.
    pvclock_page_pa: Le64,
    // Current system time.
    system_time: Le64,
    // Current tsc value.
    tsc_timestamp: Le64,
    // Status of this request, one of VIRTIO_PVCLOCK_S_*.
    status: u8,
    padding: [u8; 7],
}

// Data structure for interacting with pvclock shared memory.
struct PvclockSharedData {
    mem: GuestMemory,
    seqlock_addr: GuestAddress,
    tsc_suspended_delta_addr: GuestAddress,
    tsc_frequency_multiplier_addr: GuestAddress,
    tsc_frequency_shift_addr: GuestAddress,
    flags_addr: GuestAddress,
}

impl PvclockSharedData {
    pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
        PvclockSharedData {
            mem,
            // The addresses of the various fields that we need to modify are relative to the
            // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
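            // As a reminder, its layout (summarized here from the Linux kernel's
            // `struct pvclock_vcpu_time_info`; treat the exact offsets as informational) is:
            //   offset  0: u32 version (the seqlock)
            //   offset  4: u32 pad0
            //   offset  8: u64 tsc_timestamp
            //   offset 16: u64 system_time
            //   offset 24: u32 tsc_to_system_mul
            //   offset 28: i8  tsc_shift
            //   offset 29: u8  flags
            //   offset 30: u8  pad[2]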
            seqlock_addr: addr,
            tsc_suspended_delta_addr: addr.unchecked_add(8),
            tsc_frequency_multiplier_addr: addr.unchecked_add(24),
            tsc_frequency_shift_addr: addr.unchecked_add(28),
            flags_addr: addr.unchecked_add(29),
        }
    }

    /// Only the seqlock_addr is needed to re-create this struct at restore
    /// time, so that is all our snapshot contains.
    fn snapshot(&self) -> GuestAddress {
        self.seqlock_addr
    }

    /// Set all fields to zero.
    pub fn zero_fill(&mut self) -> Result<()> {
        // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
        self.mem
            .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
            .context("failed to zero fill the pvclock shared data")
    }

    pub fn increment_seqlock(&mut self) -> Result<()> {
        // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
        // guaranteed to be atomic. Although this should not be a problem for the seqlock
        // or the other fields in the pvclock shared data (which are protected via the seqlock),
        // we might want to update these calls to be as atomic as possible if/when we have
        // the ability to do so, just as a general cleanup and to be consistent.
        let value = self
            .mem
            .read_obj_from_addr::<u32>(self.seqlock_addr)
            .context("failed to read seqlock value")?;
        self.mem
            .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
            .context("failed to write seqlock value")
    }

    pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
        self.mem
            .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
            .context("failed to write tsc suspended delta")
    }

    pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
        // TSC values are converted to timestamps using the following algorithm:
        //   delta = _rdtsc() - tsc_suspended_delta
        //   if tsc_frequency_shift > 0:
        //       delta <<= tsc_frequency_shift
        //   else:
        //       delta >>= -tsc_frequency_shift
        //   return (delta * tsc_frequency_multiplier) >> 32
        //
        // So, tsc_frequency_multiplier needs to be something like 1e9/tsc_frequency, in which case
        // tsc_frequency_shift would be 32 (to counteract the final 32-bit right shift). But
        // 1e9/tsc_frequency is <1, so we actually need to scale that value up and scale down
        // the tsc_frequency_shift so we don't lose precision in the frequency. Our tsc_frequency
        // isn't *that* precise, so we scale it up by 16 and scale down the tsc_frequency_shift by
        // 16 (so it's also 16).
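        //
        // A hypothetical worked example (editor's illustration, not from the original source):
        // for a 1 GHz TSC, multiplier = (1e9 << 16) / 1e9 = 65536 and shift = 16, so a delta of
        // 1e9 ticks becomes ((1e9 << 16) * 65536) >> 32 = 1e9 ns, i.e. exactly one second.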
        let shift = 16i8;
        let multiplier: u32 = ((1_000_000_000u128 << shift) / frequency as u128)
            .try_into()
            .context(format!(
                "tsc frequency multiplier overflow, frequency {}Hz is too small",
                frequency
            ))?;

        self.mem
            .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
            .context("failed to write tsc frequency multiplier")?;
        self.mem
            .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
            .context("failed to write tsc frequency shift")
    }

    pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
        let value = self
            .mem
            .read_obj_from_addr::<u8>(self.flags_addr)
            .context("failed to read flags")?;
        self.mem
            .write_obj_at_addr(value | flags, self.flags_addr)
            .context("failed to write flags")
    }
}

/// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
#[derive(Serialize, Deserialize)]
struct PvClockState {
    tsc_frequency: u64,
    /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
    /// will be stored here. (We can't just store the worker itself as it contains an object
    /// tree with references to [GuestMemory].)
    paused_main_worker: Option<PvClockWorkerSnapshot>,
    /// The total time the vm has been suspended. This is in an `Arc<AtomicU64>` because it's set
    /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_suspend_ns: Arc<AtomicU64>,
    features: u64,
    acked_features: u64,
}

/// An enum to keep dynamic state of pvclock workers in a type safe manner.
enum PvClockWorkerState {
    /// Idle means no worker is running.
    /// This tube is for communicating with this device from the crosvm threads.
    Idle(Tube),
    /// A stub worker that responds to pvclock commands when the device is not activated yet.
    Stub(WorkerThread<StubWorkerReturn>),
    /// A main worker that responds to pvclock commands while the device is active.
    Main(WorkerThread<MainWorkerReturn>),
    /// None is used only for handling the transitional state between the states above.
    None,
}
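
// A rough sketch of the worker state transitions, summarized by the editor from the methods on
// [PvClock] below (not an authoritative diagram):
//
//   Idle --(on_device_sandboxed)--> Stub
//   Stub --(activate)--> Main
//   Main --(reset)--> Stub
//   Main --(virtio_sleep)--> Idle
//   Idle --(virtio_wake)--> Main
//
// `None` only appears transiently while the previous state is being taken out of `worker_state`.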

/// A struct that represents virtio-pvclock device.
pub struct PvClock {
    state: PvClockState,
    worker_state: PvClockWorkerState,
}

impl PvClock {
    pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
        let state = PvClockState {
            tsc_frequency,
            paused_main_worker: None,
            total_suspend_ns: Arc::new(AtomicU64::new(0)),
            features: base_features
                | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
                | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
                | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
            acked_features: 0,
        };
        PvClock {
            state,
            worker_state: PvClockWorkerState::Idle(suspend_tube),
        }
    }

    fn get_config(&self) -> virtio_pvclock_config {
        virtio_pvclock_config {
            suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
            clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
            padding: 0,
        }
    }

    /// Prefer switch_to_*_worker unless calling this directly is needed, to keep the state
    /// transitions consistent.
    fn start_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            if queues.len() != QUEUE_SIZES.len() {
                return Err(anyhow!(
                    "expected {} queues, got {}",
                    QUEUE_SIZES.len(),
                    queues.len()
                ));
            }
            let set_pvclock_page_queue = queues.remove(&0).unwrap();
            self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
                "virtio_pvclock".to_string(),
                move |kill_evt| {
                    run_main_worker(
                        pvclock_worker,
                        set_pvclock_page_queue,
                        suspend_tube,
                        interrupt,
                        kill_evt,
                    )
                },
            ));
        } else {
            panic!("Invalid state transition");
        }
        Ok(())
    }

    /// Prefer switch_to_*_worker unless calling this directly is needed, to keep the state
    /// transitions consistent.
    fn start_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            PvClockWorkerState::Stub(WorkerThread::start(
                "virtio_pvclock_stub".to_string(),
                move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
            ))
        } else {
            panic!("Invalid state transition");
        };
    }

    /// Prefer switch_to_*_worker unless calling this directly is needed, to keep the state
    /// transitions consistent.
    fn stop_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
            let stub_worker_ret = stub_worker_thread.stop();
            PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
        } else {
            panic!("Invalid state transition");
        }
    }

    /// Prefer switch_to_*_worker unless calling this directly is needed, to keep the state
    /// transitions consistent.
    fn stop_main_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            let mut queues = BTreeMap::new();
            queues.insert(0, main_worker_ret.set_pvclock_page_queue);
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
        } else {
            panic!("Invalid state transition");
        }
    }

    fn switch_to_stub_worker(&mut self) {
        self.stop_main_worker();
        self.start_stub_worker();
    }

    fn switch_to_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        self.stop_stub_worker();
        self.start_main_worker(interrupt, pvclock_worker, queues)
    }
}

/// Represents a moment in time including the TSC counter value at that time.
#[derive(Serialize, Deserialize, Clone)]
struct PvclockInstant {
    time: DateTime<Utc>,
    tsc_value: u64,
}

/// The unique data retained by [PvClockWorker] which can be used to re-create
/// an identical worker.
#[derive(Serialize, Deserialize, Clone)]
struct PvClockWorkerSnapshot {
    suspend_time: Option<PvclockInstant>,
    total_suspend_tsc_delta: u64,
    pvclock_shared_data_base_address: Option<GuestAddress>,
}

impl From<PvClockWorker> for PvClockWorkerSnapshot {
    fn from(worker: PvClockWorker) -> Self {
        PvClockWorkerSnapshot {
            suspend_time: worker.suspend_time,
            total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
            pvclock_shared_data_base_address: worker
                .pvclock_shared_data
                .map(|pvclock| pvclock.snapshot()),
        }
    }
}

/// Worker struct for the virtio-pvclock device.
///
/// Handles virtio requests, storing information about suspend/resume, adjusting the
/// pvclock data in shared memory, and injecting suspend durations via config
/// changes.
struct PvClockWorker {
    tsc_frequency: u64,
    // The moment the last suspend occurred.
    suspend_time: Option<PvclockInstant>,
    // The total time the vm has been suspended. This is in an Arc<AtomicU64> because it's set
    // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_injected_ns: Arc<AtomicU64>,
    // The total change in the TSC value over suspensions.
    total_suspend_tsc_delta: u64,
    // Pvclock shared data.
    pvclock_shared_data: Option<PvclockSharedData>,
    mem: GuestMemory,
}

impl PvClockWorker {
    pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: None,
            total_injected_ns,
            total_suspend_tsc_delta: 0,
            pvclock_shared_data: None,
            mem,
        }
    }

    fn from_snapshot(
        tsc_frequency: u64,
        total_injected_ns: Arc<AtomicU64>,
        snap: PvClockWorkerSnapshot,
        mem: GuestMemory,
    ) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: snap.suspend_time,
            total_injected_ns,
            total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
            pvclock_shared_data: snap
                .pvclock_shared_data_base_address
                .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
            mem,
        }
    }

    /// Initialize the pvclock for initial boot. We assume that the systemtime of 0 corresponds
    /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
    /// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
    /// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
    /// fields doesn't matter at this point, but does matter when updating.
    fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
        if self.pvclock_shared_data.is_some() {
            return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
        }

        let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));

        // set all fields to 0 first
        shared_data.zero_fill()?;

        shared_data.set_tsc_frequency(self.tsc_frequency)?;
        shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;

        self.pvclock_shared_data = Some(shared_data);
        Ok(())
    }

    pub fn suspend(&mut self) {
        if self.suspend_time.is_some() {
            warn!("Suspend time already set, ignoring new suspend time");
            return;
        }
        self.suspend_time = Some(PvclockInstant {
            time: Utc::now(),
            // SAFETY:
            // Safe because _rdtsc takes no arguments, and we trust _rdtsc to not modify any other
            // memory.
            tsc_value: unsafe { _rdtsc() },
        });
    }

    pub fn resume(&mut self) -> Result<()> {
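        // For context, the guest reads the pvclock page with the usual seqlock pattern, roughly
        // (an illustrative sketch, not the actual guest kernel code):
        //
        //   loop {
        //       let v1 = read(seqlock);           // an odd value means a write is in progress
        //       /* read tsc_timestamp, system_time, mul, shift, flags */
        //       let v2 = read(seqlock);
        //       if v1 == v2 && v1 % 2 == 0 { break; }
        //   }
        //
        // The two increments in this function therefore bracket our updates so that a reader
        // never consumes a half-written page.
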
        // First, increment the sequence lock by 1 before writing to the pvclock page.
        self.increment_pvclock_seqlock()?;

        // The guest makes sure there are memory barriers in between reads of the seqlock and other
        // fields; we should make sure there are memory barriers in between writes of the seqlock
        // and writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Set the tsc suspended delta and guest_stopped_bit in the pvclock struct. We only need to
        // set the bit; the guest will unset it once the guest has handled the stoppage.
        // We get the result here because we want to call increment_pvclock_seqlock regardless of
        // the result of these calls.
        let result = self
            .set_suspended_time()
            .and_then(|_| self.set_guest_stopped_bit());

        // The guest makes sure there are memory barriers in between reads of the seqlock and other
        // fields; we should make sure there are memory barriers in between writes of the seqlock
        // and writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Do a final increment once changes are done.
        self.increment_pvclock_seqlock()?;

        result
    }

    fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
        match Utc::now().signed_duration_since(suspend_time.time).to_std() {
            Ok(duration) => duration,
            Err(e) => {
                error!(
                    "pvclock found suspend time in the future (was the host \
                    clock adjusted?). Guest boot/realtime clock may now be \
                    incorrect. Details: {}",
                    e
                );
                Duration::ZERO
            }
        }
    }

    fn set_suspended_time(&mut self) -> Result<()> {
        let (this_suspend_duration, this_suspend_tsc_delta) =
            if let Some(suspend_time) = self.suspend_time.take() {
                (
                    Self::get_suspended_duration(&suspend_time),
                    // SAFETY:
                    // Safe because _rdtsc takes no arguments, and we trust _rdtsc to not modify
                    // any other memory.
                    // NB: This calculation may wrap around, as the TSC can be reset to zero when
                    // the device has resumed from a "deep" suspend state (which may not happen in
                    // s2idle cases). It also happens when the tsc value itself wraps.
                    unsafe { _rdtsc() }.wrapping_sub(suspend_time.tsc_value),
                )
            } else {
                return Err(Error::new(libc::ENOTSUP))
                    .context("Cannot set suspend time because suspend was never called");
            };

        // update the total tsc delta during all suspends
        // NB: This calculation may wrap around, as the accumulated delta can exceed the u64 range.
        self.total_suspend_tsc_delta = self
            .total_suspend_tsc_delta
            .wrapping_add(this_suspend_tsc_delta);

        // save tsc_suspended_delta to shared memory
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;

        info!(
            "set total suspend tsc delta to {}",
            self.total_suspend_tsc_delta
        );

        // update total suspend ns
        self.total_injected_ns
            .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);

        Ok(())
    }

    fn increment_pvclock_seqlock(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .increment_seqlock()
    }

    fn set_guest_stopped_bit(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
    }
}

fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
    for cause in error.chain() {
        if let Some(e) = cause.downcast_ref::<base::Error>() {
            return *e;
        }

        if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
            return match e {
                // Two kinds of GuestMemoryError contain base::Error
                GuestMemoryError::MemoryAddSealsFailed(e) => *e,
                GuestMemoryError::MemoryCreationFailed(e) => *e,
                // Otherwise return EINVAL
                _ => Error::new(libc::EINVAL),
            };
        }
    }
    // Unknown base error
    Error::new(libc::EFAULT)
}

struct StubWorkerReturn {
    suspend_tube: Tube,
}

/// A stub worker that responds to any requests when the device is inactive.
fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
    #[derive(EventToken, Debug)]
    enum Token {
        SomePvClockRequest,
        Kill,
    }
    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(wait_ctx) => wait_ctx,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return StubWorkerReturn { suspend_tube };
        }
    };
    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SomePvClockRequest => {
                    match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };
                    if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }
    StubWorkerReturn { suspend_tube }
}

struct MainWorkerReturn {
    worker: PvClockWorker,
    set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
}

// TODO(b/237300012): asyncify this device.
/// A worker to process PvClockCommand requests
fn run_main_worker(
    mut worker: PvClockWorker,
    mut set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
    interrupt: Interrupt,
    kill_evt: Event,
) -> MainWorkerReturn {
    #[derive(EventToken)]
    enum Token {
        SetPvClockPageQueue,
        SuspendResume,
        InterruptResample,
        Kill,
    }

    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
        (suspend_tube.get_read_notifier(), Token::SuspendResume),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(pc) => pc,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    };
    if let Some(resample_evt) = interrupt.get_resample_evt() {
        if wait_ctx
            .add(resample_evt, Token::InterruptResample)
            .is_err()
        {
            error!("failed creating WaitContext");
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    }

    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SetPvClockPageQueue => {
                    let _ = set_pvclock_page_queue.event().wait();
                    let desc_chain = match set_pvclock_page_queue.pop() {
                        Some(desc_chain) => desc_chain,
                        None => {
                            error!("set_pvclock_page queue was empty");
                            continue;
                        }
                    };

                    // This device does not follow the virtio spec requirements for device-readable
                    // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
                    // first descriptor from the chain and assume the whole req structure is
                    // contained within it.
                    let desc = desc_chain
                        .reader
                        .get_remaining_regions()
                        .chain(desc_chain.writer.get_remaining_regions())
                        .next()
                        .unwrap();

                    let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
                        error!("pvclock descriptor too short");
                        0
                    } else {
                        let addr = GuestAddress(desc.offset);
                        let mut req: virtio_pvclock_set_pvclock_page_req = match worker
                            .mem
                            .read_obj_from_addr(addr)
                        {
                            Ok(req) => req,
                            Err(e) => {
                                error!("failed to read request from set_pvclock_page queue: {}", e);
                                continue;
                            }
                        };

                        req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
                            Err(e) => {
                                error!("failed to set pvclock page: {:#}", e);
                                VIRTIO_PVCLOCK_S_IOERR
                            }
                            Ok(_) => VIRTIO_PVCLOCK_S_OK,
                        };

                        if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
                            error!("failed to write set_pvclock_page status: {}", e);
                            continue;
                        }

                        desc.len as u32
                    };

                    set_pvclock_page_queue.add_used(desc_chain, len);
                    set_pvclock_page_queue.trigger_interrupt(&interrupt);
                }
                Token::SuspendResume => {
                    let req = match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };

                    let resp = match req {
                        PvClockCommand::Suspend => {
                            worker.suspend();
                            PvClockCommandResponse::Ok
                        }
                        PvClockCommand::Resume => {
                            if let Err(e) = worker.resume() {
                                error!("Failed to resume pvclock: {:#}", e);
                                PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(e))
                            } else {
                                // signal to the driver that the total_suspend_ns has changed
                                interrupt.signal_config_changed();
                                PvClockCommandResponse::Ok
                            }
                        }
                    };

                    if let Err(e) = suspend_tube.send(&resp) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }

                Token::InterruptResample => {
                    interrupt.interrupt_resample();
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }

    MainWorkerReturn {
        suspend_tube,
        set_pvclock_page_queue,
        worker,
    }
}

impl VirtioDevice for PvClock {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
            vec![suspend_tube.as_raw_descriptor()]
        } else {
            Vec::new()
        }
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pvclock
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.state.features
    }

    fn ack_features(&mut self, mut value: u64) {
        if value & !self.features() != 0 {
            warn!("virtio-pvclock got unknown feature ack {:x}", value);
            value &= self.features();
        }
        self.state.acked_features |= value;
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        copy_config(data, 0, self.get_config().as_bytes(), offset);
    }

    fn write_config(&mut self, offset: u64, data: &[u8]) {
        // Pvclock device doesn't expect a guest write to config
        warn!(
            "Unexpected write to virtio-pvclock config at offset {}: {:?}",
            offset, data
        );
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let tsc_frequency = self.state.tsc_frequency;
        let total_suspend_ns = self.state.total_suspend_ns.clone();
        let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
        self.switch_to_main_worker(interrupt, worker, queues)
    }

    fn reset(&mut self) -> Result<()> {
        self.switch_to_stub_worker();
        Ok(())
    }

    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            let mut queues = BTreeMap::new();
            queues.insert(0, main_worker_ret.set_pvclock_page_queue);
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
            Ok(Some(queues))
        } else {
            Ok(None)
        }
    }

    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            let worker_snap = self
                .state
                .paused_main_worker
                .take()
                .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
            let worker = PvClockWorker::from_snapshot(
                self.state.tsc_frequency,
                self.state.total_suspend_ns.clone(),
                worker_snap,
                mem,
            );
            // Start the main worker directly (rather than via switch_to_*) as no worker is
            // running at this point.
            self.start_main_worker(interrupt, worker, queues)?;
        }
        Ok(())
    }

    fn virtio_snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
        serde_json::to_value(&self.state).context("failed to serialize PvClockState")
    }

    fn virtio_restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
        let state: PvClockState = serde_json::from_value(data).context("error deserializing")?;
        if state.features != self.features() {
            bail!(
                "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
                self.features(),
                state.features,
            );
        }
        // TODO(b/291346907): we assume that the TSC frequency has NOT changed
        // since the snapshot was made. Assuming we have not moved machines,
        // this is a reasonable assumption. We don't verify the frequency
        // because TSC calibration is noisy.
        self.state = state;
        Ok(())
    }

    fn on_device_sandboxed(&mut self) {
        self.start_stub_worker();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::virtio::QueueConfig;

    const TEST_QUEUE_SIZE: u16 = 2048;

    fn make_interrupt() -> Interrupt {
        Interrupt::new_for_test()
    }

    fn create_pvclock_device() -> (Tube, PvClock) {
        let (host_tube, device_tube) = Tube::pair().unwrap();
        let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);

        // Simulate the device initialization to start the stub thread.
        // In the real case, on_device_sandboxed will be called after the device is sandboxed
        // (or at some point during the device initialization when the sandbox is disabled) to
        // allow devices to use multiple threads (as spawning new threads before sandboxing is
        // prohibited because of minijail's restrictions).
        pvclock_device.on_device_sandboxed();

        (host_tube, pvclock_device)
    }

    fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
        let (_host_tube, mut pvclock_device) = create_pvclock_device();

        // The queue won't actually be used, so passing one that isn't
        // fully configured is fine.
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
        pvclock_device
            .activate(
                mem.clone(),
                make_interrupt(),
                BTreeMap::from([(0, fake_queue.activate(&mem, Event::new().unwrap()).unwrap())]),
            )
            .expect("activate should succeed");
        let queues = pvclock_device
            .virtio_sleep()
            .expect("sleep should succeed")
            .expect("sleep should yield queues");
        assert_eq!(queues.len(), 1);
        assert_eq!(
            queues.get(&0).expect("queue must be present").size(),
            TEST_QUEUE_SIZE
        );
        assert!(pvclock_device.state.paused_main_worker.is_some());
        (pvclock_device, mem, _host_tube)
    }

    fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
        // We just create a new queue here, because it isn't actually accessed
        // by the device in these tests.
        let mut wake_queues = BTreeMap::new();
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        wake_queues.insert(0, fake_queue.activate(mem, Event::new().unwrap()).unwrap());
        let queues_state = (mem.clone(), make_interrupt(), wake_queues);
        pvclock_device
            .virtio_wake(Some(queues_state))
            .expect("wake should succeed");
        assert!(pvclock_device.state.paused_main_worker.is_none());
    }

    #[test]
    fn test_command_response_when_inactive() {
        let (host_tube, _pvclock_device) = create_pvclock_device();
        assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
        let res = host_tube.recv::<PvClockCommandResponse>();
        assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
    }

    #[test]
    fn test_sleep_wake_smoke() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        assert_wake_successful(&mut pvclock_device, &mem);
    }

    #[test]
    fn test_save_restore() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        let test_suspend_ns = 9999;

        // Store a test value we can look for later in the test to verify
        // we're restoring properties.
        pvclock_device
            .state
            .total_suspend_ns
            .store(test_suspend_ns, Ordering::SeqCst);

        let snap = pvclock_device.virtio_snapshot().unwrap();
        pvclock_device
            .state
            .total_suspend_ns
            .store(0, Ordering::SeqCst);
        pvclock_device.virtio_restore(snap).unwrap();
        assert_eq!(
            pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
            test_suspend_ns
        );

        assert_wake_successful(&mut pvclock_device, &mem);
    }
}