// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::arch::x86_64::__cpuid;
use std::arch::x86_64::__cpuid_count;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::mpsc;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread;
use std::thread::JoinHandle;
use std::time::Duration;
use std::time::Instant;

#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use anyhow::anyhow;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::CpuSet;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuAffinity;
use arch::VcpuArch;
use arch::VmArch;
use base::error;
use base::info;
use base::set_audio_thread_priority;
use base::set_cpu_affinity;
use base::warn;
use base::Event;
use base::Result as BaseResult;
use base::SafeMultimediaHandle;
use base::SendTube;
use base::Timer;
use base::Tube;
use base::VmEventType;
use cros_async::select2;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::SelectResult;
use cros_async::TimerAsync;
use cros_tracing::trace_event;
use crosvm_cli::bail_exit_code;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use devices::tsc::TscSyncMitigations;
use devices::Bus;
use devices::VcpuRunState;
use futures::pin_mut;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVcpu;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuConfigX86_64;
use hypervisor::HypervisorCap;
use hypervisor::IoEventAddress;
use hypervisor::IoOperation;
use hypervisor::IoParams;
use hypervisor::VcpuExit;
use hypervisor::VcpuInitX86_64;
use sync::Condvar;
use sync::Mutex;
use vm_control::VcpuControl;
use vm_control::VmRunMode;
use winapi::shared::winerror::ERROR_RETRY;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::adjust_cpuid;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::CpuIdContext;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;

#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::VmExitStatistics;
use crate::sys::windows::save_vcpu_tsc_offset;
use crate::sys::windows::ExitState;

const ERROR_RETRY_I32: i32 = ERROR_RETRY as i32;

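/// Tracks the VM-wide run mode and wakes any vcpu threads blocked on the
/// condvar when it changes.
///
/// A minimal usage sketch (the waiting side is a simplified stand-in for the
/// wait loops in `vcpu_loop` and `VcpuStallMonitor` below):
///
/// ```ignore
/// let run_mode = Arc::new(VcpuRunMode::default());
///
/// // Requestor side: publish the new mode and wake all waiters.
/// run_mode.set_and_notify(VmRunMode::Exiting);
///
/// // Vcpu side: sleep on the condvar until the mode changes.
/// let mut mode = run_mode.mtx.lock();
/// while *mode == VmRunMode::Suspending {
///     mode = run_mode.cvar.wait(mode);
/// }
/// ```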
#[derive(Default)]
pub struct VcpuRunMode {
    mtx: Mutex<VmRunMode>,
    cvar: Condvar,
}

impl VcpuRunMode {
    pub fn get_mode(&self) -> VmRunMode {
        *self.mtx.lock()
    }

    pub fn set_and_notify(&self, new_mode: VmRunMode) {
        *self.mtx.lock() = new_mode;
        self.cvar.notify_all();
    }
}

struct RunnableVcpuInfo<V> {
    vcpu: V,
    thread_priority_handle: Option<SafeMultimediaHandle>,
}

#[derive(Clone, Debug)]
struct VcpuMonitoringMetadata {
    pub start_instant: Instant,
    // Milliseconds since the baseline start_instant
    pub last_run_time: Arc<AtomicU64>,
    pub last_exit_snapshot: Arc<Mutex<Option<VcpuExitData>>>,
}

#[derive(Clone, Debug)]
struct VcpuRunThread {
    pub cpu_id: usize,
    pub monitoring_metadata: Option<VcpuMonitoringMetadata>,
}

impl VcpuRunThread {
    pub fn new(cpu_id: usize, enable_vcpu_monitoring: bool) -> VcpuRunThread {
        VcpuRunThread {
            cpu_id,
            monitoring_metadata: enable_vcpu_monitoring.then(|| VcpuMonitoringMetadata {
                start_instant: Instant::now(),
                last_run_time: Arc::new(AtomicU64::new(0)),
                last_exit_snapshot: Arc::new(Mutex::new(Option::None)),
            }),
        }
    }

    /// Perform WHPX-specific vcpu configurations
    #[cfg(feature = "whpx")]
    fn whpx_configure_vcpu(vcpu: &mut dyn VcpuArch, irq_chip: &mut dyn IrqChipArch) {
        // only apply to actual WhpxVcpu instances
        if let Some(whpx_vcpu) = vcpu.downcast_mut::<WhpxVcpu>() {
            // WhpxVcpu instances need to know the TSC and Lapic frequencies to handle Hyper-V MSR
            // reads and writes.
            let tsc_freq = devices::tsc::tsc_frequency()
                .map_err(|e| {
                    error!(
                        "Could not determine TSC frequency, WHPX vcpu will not be configured with \
                        a TSC Frequency: {e}"
                    );
                    e
                })
                .ok();
            whpx_vcpu.set_frequencies(tsc_freq, irq_chip.lapic_frequency());
        }
    }
    // Sets up a vcpu and converts it into a runnable vcpu.
    fn runnable_vcpu<V>(
        cpu_id: usize,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vm: &impl VmArch,
        irq_chip: &mut dyn IrqChipArch,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        no_smt: bool,
        host_cpu_topology: bool,
        force_calibrated_tsc_leaf: bool,
    ) -> Result<RunnableVcpuInfo<V>>
    where
        V: VcpuArch,
    {
        let mut vcpu = match vcpu {
            Some(v) => v,
            None => {
                // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called
                // from the vcpu thread.
                match vm
                    .create_vcpu(cpu_id)
                    .exit_context(Exit::CreateVcpu, "failed to create vcpu")?
                    .downcast::<V>()
                {
                    Ok(v) => *v,
                    Err(_) => panic!("VM created wrong type of VCPU"),
                }
            }
        };

        irq_chip
            .add_vcpu(cpu_id, &vcpu)
            .exit_context(Exit::AddIrqChipVcpu, "failed to add vcpu to irq chip")?;

        if let Some(affinity) = vcpu_affinity {
            if let Err(e) = set_cpu_affinity(affinity) {
                error!("Failed to set CPU affinity: {}", e);
            }
        }

        #[cfg(target_arch = "x86_64")]
        let cpu_config = Some(CpuConfigX86_64::new(
            force_calibrated_tsc_leaf,
            host_cpu_topology,
            false, /* enable_hwp */
            no_smt,
            false, /* itmt */
            None,  /* hybrid_type */
        ));

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        let cpu_config = None;

        Arch::configure_vcpu(
            vm,
            vm.get_hypervisor(),
            irq_chip,
            &mut vcpu,
            vcpu_init,
            cpu_id,
            vcpu_count,
            cpu_config,
        )
        .exit_context(Exit::ConfigureVcpu, "failed to configure vcpu")?;

        #[cfg(feature = "whpx")]
        Self::whpx_configure_vcpu(&mut vcpu, irq_chip);

        let mut thread_priority_handle = None;
        if run_rt {
            // Until we are multi process on Windows, we can't use the normal thread priority APIs;
            // instead, we use a trick from the audio device which is able to set a thread RT even
            // though the process itself is not RT.
            thread_priority_handle = match set_audio_thread_priority() {
                Ok(hndl) => Some(hndl),
                Err(e) => {
                    warn!("Failed to set vcpu thread to real time priority: {}", e);
                    None
                }
            };
        }

        Ok(RunnableVcpuInfo {
            vcpu,
            thread_priority_handle,
        })
    }

    pub fn run<V>(
        &self,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vcpus: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
        vm: impl VmArch + 'static,
        mut irq_chip: Box<dyn IrqChipArch + 'static>,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        delay_rt: bool,
        no_smt: bool,
        start_barrier: Arc<Barrier>,
        vcpu_create_barrier: Arc<Barrier>,
        mut io_bus: devices::Bus,
        mut mmio_bus: devices::Bus,
        vm_evt_wrtube: SendTube,
        run_mode_arc: Arc<VcpuRunMode>,
        #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
        host_cpu_topology: bool,
        tsc_offset: Option<u64>,
        force_calibrated_tsc_leaf: bool,
        vcpu_control: mpsc::Receiver<VcpuControl>,
    ) -> Result<JoinHandle<Result<()>>>
    where
        V: VcpuArch + 'static,
    {
        let context = self.clone();
        thread::Builder::new()
            .name(format!("crosvm_vcpu{}", self.cpu_id))
            .spawn(move || {
                // Having a closure returning ExitState guarantees that we
                // send a VmEventType on all code paths after the closure
                // returns.
                let vcpu_fn = || -> Result<ExitState> {
                    let runnable_vcpu = Self::runnable_vcpu(
                        context.cpu_id,
                        vcpu,
                        vcpu_init,
                        &vm,
                        irq_chip.as_mut(),
                        vcpu_count,
                        run_rt && !delay_rt,
                        vcpu_affinity,
                        no_smt,
                        host_cpu_topology,
                        force_calibrated_tsc_leaf,
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpu_config = CpuConfigX86_64::new(
                        force_calibrated_tsc_leaf,
                        host_cpu_topology,
                        false, /* enable_hwp */
                        no_smt,
                        false, /* itmt */
                        None,  /* hybrid_type */
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpuid_context = CpuIdContext::new(
                        context.cpu_id,
                        vcpu_count,
                        Some(irq_chip.as_ref()),
                        cpu_config,
                        vm.get_hypervisor()
                            .check_capability(HypervisorCap::CalibratedTscLeafRequired),
                        __cpuid_count,
                        __cpuid,
                    );

                    // The vcpu_create_barrier is supplied from the main thread in order for it to
                    // wait until this thread is done creating its vcpu.
                    vcpu_create_barrier.wait();

                    // Wait for this barrier before continuing forward.
                    start_barrier.wait();

                    let RunnableVcpuInfo {
                        vcpu,
                        thread_priority_handle: _thread_priority_handle,
                    } = runnable_vcpu?;

                    if let Some(offset) = tsc_offset {
                        vcpu.set_tsc_offset(offset).unwrap_or_else(|e| {
                            error!(
                                "Failed to set tsc_offset of {} on vcpu {}: {}",
                                offset, context.cpu_id, e
                            )
                        });
                    }

                    // Clone vcpu so it can be used by the main thread to force a vcpu run to exit.
                    vcpus
                        .lock()
                        .push(Box::new(vcpu.try_clone().expect("Could not clone vcpu!")));

                    mmio_bus.set_access_id(context.cpu_id);
                    io_bus.set_access_id(context.cpu_id);

                    vcpu_loop(
                        &context,
                        vcpu,
                        vm,
                        irq_chip,
                        io_bus,
                        mmio_bus,
                        run_mode_arc,
                        #[cfg(feature = "stats")]
                        stats,
                        #[cfg(target_arch = "x86_64")]
                        cpuid_context,
                        vcpu_control,
                    )
                };

                let final_event_data = match vcpu_fn().unwrap_or_else(|e| {
                    error!(
                        "vcpu {} run loop exited with error: {:#}",
                        context.cpu_id, e
                    );
                    ExitState::Stop
                }) {
                    ExitState::Stop => VmEventType::Exit,
                    _ => unreachable!(),
                };
                vm_evt_wrtube
                    .send::<VmEventType>(&final_event_data)
                    .unwrap_or_else(|e| {
                        error!(
                            "failed to send final event {:?} on vcpu {}: {}",
                            final_event_data, context.cpu_id, e
                        )
                    });
                Ok(())
            })
            .exit_context(Exit::SpawnVcpu, "failed to spawn VCPU thread")
    }
}

#[derive(Clone, Debug)]
struct VcpuExitData {
    // Represented by duration since baseline start_instant
    exit_time: Duration,
    exit_result: BaseResult<VcpuExit>,
}

impl Display for VcpuExitData {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "exit result: {:?}", self.exit_result)
    }
}

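/// Watches vcpu threads for host-side stalls: a stall is reported when a vcpu
/// has been sitting in exit handling (outside `Vcpu::run`) for longer than
/// `HOST_STALL_TIMEOUT`.
///
/// A minimal wiring sketch, mirroring how `run_all_vcpus` uses it below:
///
/// ```ignore
/// let mut monitor = VcpuStallMonitor::init(run_mode_arc.clone());
/// monitor.add_vcpu_thread(vcpu_run_thread);
/// // Spawns the monitor thread; it shuts down once exit_evt is signaled.
/// let monitor_handle = monitor.run(&exit_evt)?;
/// ```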
struct VcpuStallMonitor {
    vcpu_run_threads: Vec<VcpuRunThread>,
    run_mode: Arc<VcpuRunMode>,
}

impl VcpuStallMonitor {
    const HOST_STALL_TIMEOUT: Duration = Duration::from_secs(2);
    const VCPU_CHECKUP_INTERVAL: Duration = Duration::from_secs(1);
    const STALL_REPORTING_LIMITER: Duration = Duration::from_secs(10);

    pub fn init(run_mode: Arc<VcpuRunMode>) -> VcpuStallMonitor {
        VcpuStallMonitor {
            vcpu_run_threads: vec![],
            run_mode,
        }
    }

    pub fn add_vcpu_thread(&mut self, thread: VcpuRunThread) {
        self.vcpu_run_threads.push(thread);
    }

    pub fn run(self, exit_event: &Event) -> Result<JoinHandle<Result<()>>> {
        let cloned_exit_event = exit_event
            .try_clone()
            .exit_context(Exit::CloneEvent, "failed to clone event")?;
        thread::Builder::new()
            .name("crosvm_vcpu_stall_monitor".to_string())
            .spawn(move || {
                let ex = Executor::new()?;

                let mut timer = TimerAsync::new(Timer::new()?, &ex)?;
                let mut reset_timer = true;

                let exit_evt_async = EventAsync::new(cloned_exit_event, &ex)?;
                let exit_future = exit_evt_async.next_val();
                pin_mut!(exit_future);
                'main: loop {
                    if reset_timer {
                        timer.reset(
                            Self::VCPU_CHECKUP_INTERVAL,
                            Some(Self::VCPU_CHECKUP_INTERVAL),
                        )?;
                        reset_timer = false;
                    }
                    let timer_future = timer.wait();
                    pin_mut!(timer_future);
                    match ex.run_until(select2(timer_future, exit_future)) {
                        Ok((timer_result, exit_result)) => {
                            match exit_result {
                                SelectResult::Finished(_) => {
                                    info!("vcpu monitor got exit event");
                                    break 'main;
                                }
                                SelectResult::Pending(future) => exit_future = future,
                            }

                            match timer_result {
                                SelectResult::Finished(Err(e)) => {
                                    error!(
                                        "vcpu monitor aborting due to error awaiting future: {}",
                                        e
                                    );
                                    break 'main;
                                }
                                SelectResult::Finished(_) => self.report_any_stalls(),
                                _ => (),
                            }
                        }
                        Err(e) => {
                            error!("vcpu monitor failed to wait on future set: {:?}", e);
                            break 'main;
                        }
                    }

                    // Always ensure the vcpus aren't suspended before continuing to monitor.
                    let mut run_mode_lock = self.run_mode.mtx.lock();
                    loop {
                        match *run_mode_lock {
                            VmRunMode::Running => break,
                            VmRunMode::Suspending | VmRunMode::Breakpoint => {
                                info!("vcpu monitor pausing until end of suspension");
                                run_mode_lock = self.run_mode.cvar.wait(run_mode_lock);
                                reset_timer = true;
                            }
                            VmRunMode::Exiting => {
                                info!("vcpu monitor detected vm exit");
                                break 'main;
                            }
                        }
                    }
                }

                Ok(())
            })
            .exit_context(
                Exit::SpawnVcpuMonitor,
                "failed to spawn VCPU stall monitor thread",
            )
    }

    fn report_any_stalls(&self) {
        // TODO(b/208267651): Add and fire Clearcut events for stalls (and add tests)
        // TODO(b/208267651): Also test guest stalls (vcpu.run() goes too long without exiting)
        let now = Instant::now();
        for vcpu_thread in self.vcpu_run_threads.iter() {
            let monitoring_metadata = vcpu_thread.monitoring_metadata.as_ref().unwrap();
            if let Some(ref exit_snapshot) = monitoring_metadata.last_exit_snapshot.lock().clone() {
                let last_run =
                    Duration::from_millis(monitoring_metadata.last_run_time.load(Ordering::SeqCst));
                if last_run < exit_snapshot.exit_time {
                    // The vcpu is between runs: its last recorded run started before its last
                    // recorded exit.
                    let time_since_exit = now.saturating_duration_since(
                        monitoring_metadata.start_instant + exit_snapshot.exit_time,
                    );
                    if time_since_exit > Self::HOST_STALL_TIMEOUT {
                        self.report_stall(vcpu_thread.cpu_id, exit_snapshot, time_since_exit);
                    }
                }
            };
        }
    }

    fn report_stall(&self, cpu_id: usize, exit_data: &VcpuExitData, stall_time: Duration) {
        if stall_time > Self::STALL_REPORTING_LIMITER {
            return;
        }
        // Double check the VM is running. We don't care about stalls during suspension/exit.
        if *self.run_mode.mtx.lock() == VmRunMode::Running {
            let duration_string = format!("{:.1}sec", stall_time.as_secs_f32());
            error!(
                "Host stall for {} on VCPU {} exit while handling: {}",
                duration_string, cpu_id, exit_data,
            );
        }
    }
}

fn setup_vcpu_signal_handler() -> Result<()> {
    // No-op on Windows; kept so run_all_vcpus can call it unconditionally.
    Ok(())
}

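/// Spawns one thread per vcpu (plus an optional stall monitor thread), creating
/// vcpus one at a time, then releases them all at once via a shared start
/// barrier. Returns the threads' join handles along with a control channel for
/// each vcpu.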
pub fn run_all_vcpus<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    vcpus: Vec<Option<Vcpu>>,
    vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
    guest_os: &RunnableLinuxVm<V, Vcpu>,
    exit_evt: &Event,
    vm_evt_wrtube: &SendTube,
    #[cfg(feature = "stats")] stats: &Option<Arc<Mutex<StatisticsCollector>>>,
    host_cpu_topology: bool,
    run_mode_arc: Arc<VcpuRunMode>,
    tsc_sync_mitigations: TscSyncMitigations,
    force_calibrated_tsc_leaf: bool,
) -> Result<(Vec<JoinHandle<Result<()>>>, Vec<mpsc::Sender<VcpuControl>>)> {
    let mut vcpu_threads = Vec::with_capacity(guest_os.vcpu_count + 1);
    let mut vcpu_control_channels = Vec::with_capacity(guest_os.vcpu_count);
    let start_barrier = Arc::new(Barrier::new(guest_os.vcpu_count + 1));
    let enable_vcpu_monitoring = anti_tamper::enable_vcpu_monitoring();
    setup_vcpu_signal_handler()?;

    let mut stall_monitor =
        enable_vcpu_monitoring.then(|| VcpuStallMonitor::init(run_mode_arc.clone()));
    for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
        let vcpu_affinity = match guest_os.vcpu_affinity.clone() {
            Some(VcpuAffinity::Global(v)) => Some(v),
            Some(VcpuAffinity::PerVcpu(mut m)) => Some(m.remove(&cpu_id).unwrap_or_default()),
            None => None,
        };

        // TSC sync mitigations may set vcpu affinity and set a TSC offset.
        let (vcpu_affinity, tsc_offset): (Option<CpuSet>, Option<u64>) =
            if let Some(mitigation_affinity) = tsc_sync_mitigations.get_vcpu_affinity(cpu_id) {
                if vcpu_affinity.is_none() {
                    (
                        Some(CpuSet::new(mitigation_affinity)),
                        tsc_sync_mitigations.get_vcpu_tsc_offset(cpu_id),
                    )
                } else {
                    error!(
                        "Core affinity {:?} specified via commandline conflicts and overrides \
                        affinity needed for TSC sync mitigation: {:?}.",
                        vcpu_affinity, mitigation_affinity
                    );
                    (vcpu_affinity, None)
                }
            } else {
                (vcpu_affinity, None)
            };

        let vcpu_init = &guest_os.vcpu_init[cpu_id];
        // The vcpu_create_barrier allows the main thread to delay the spawning of additional
        // vcpu threads until a single spawned vcpu thread has finished creating its vcpu.
        // We currently use this to allow creation of 1 vcpu at a time for all hypervisors.
        // There are issues with multiple hypervisors with this approach:
        // - Windows 11 has a regression which causes a BSOD with creation of multiple vcpus
        //   in parallel. See http://b/229635845 for more details.
        // - GHAXM/HAXM cannot create vcpu0 in parallel with other vcpus.
        let vcpu_create_barrier = Arc::new(Barrier::new(2));
        let vcpu_run_thread = VcpuRunThread::new(cpu_id, enable_vcpu_monitoring);
        let (vcpu_control_send, vcpu_control_recv) = mpsc::channel();
        vcpu_control_channels.push(vcpu_control_send);
        let join_handle = vcpu_run_thread.run(
            vcpu,
            vcpu_init.clone(),
            vcpu_boxes.clone(),
            guest_os
                .vm
                .try_clone()
                .exit_context(Exit::CloneEvent, "failed to clone vm")?,
            guest_os
                .irq_chip
                .try_box_clone()
                .exit_context(Exit::CloneEvent, "failed to clone event")?,
            guest_os.vcpu_count,
            guest_os.rt_cpus.contains(&cpu_id),
            vcpu_affinity,
            guest_os.delay_rt,
            guest_os.no_smt,
            start_barrier.clone(),
            vcpu_create_barrier.clone(),
            (*guest_os.io_bus).clone(),
            (*guest_os.mmio_bus).clone(),
            vm_evt_wrtube
                .try_clone()
                .exit_context(Exit::CloneTube, "failed to clone tube")?,
            run_mode_arc.clone(),
            #[cfg(feature = "stats")]
            stats.clone(),
            host_cpu_topology,
            tsc_offset,
            force_calibrated_tsc_leaf,
            vcpu_control_recv,
        )?;
        if let Some(ref mut monitor) = stall_monitor {
            monitor.add_vcpu_thread(vcpu_run_thread);
        }

        // Wait until the vcpu is created before we start a new vcpu thread.
        vcpu_create_barrier.wait();

        vcpu_threads.push(join_handle);
    }
    if let Some(monitor) = stall_monitor {
        vcpu_threads.push(monitor.run(exit_evt)?);
    }
    // Now wait on the start barrier to start all threads at the same time.
    start_barrier.wait();
    Ok((vcpu_threads, vcpu_control_channels))
}

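/// The per-vcpu run loop: repeatedly runs the vcpu, dispatches its exits to the
/// io/mmio buses and the irq chip, and parks on the run mode condvar whenever
/// the VM is not in VmRunMode::Running.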
fn vcpu_loop<V>(
    context: &VcpuRunThread,
    mut vcpu: V,
    vm: impl VmArch + 'static,
    irq_chip: Box<dyn IrqChipArch + 'static>,
    io_bus: Bus,
    mmio_bus: Bus,
    run_mode_arc: Arc<VcpuRunMode>,
    #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
    #[cfg(target_arch = "x86_64")] cpuid_context: CpuIdContext,
    vcpu_control: mpsc::Receiver<VcpuControl>,
) -> Result<ExitState>
where
    V: VcpuArch + 'static,
{
    #[cfg(feature = "stats")]
    let mut exit_stats = VmExitStatistics::new();

    #[cfg(feature = "stats")]
    {
        mmio_bus.stats.lock().set_enabled(stats.is_some());
        io_bus.stats.lock().set_enabled(stats.is_some());
        exit_stats.set_enabled(stats.is_some());
    }

    let mut save_tsc_offset = true;

    loop {
        let _trace_event = trace_event!(crosvm, "vcpu loop");
        let mut check_vm_shutdown = run_mode_arc.get_mode() != VmRunMode::Running;

        match irq_chip.wait_until_runnable(&vcpu).with_exit_context(
            Exit::WaitUntilRunnable,
            || {
                format!(
                    "error waiting for vcpu {} to become runnable",
                    context.cpu_id
                )
            },
        )? {
            VcpuRunState::Runnable => {}
            VcpuRunState::Interrupted => check_vm_shutdown = true,
        }

        if !check_vm_shutdown {
            let exit = {
                let _trace_event = trace_event!(crosvm, "vcpu::run");
                if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                    monitoring_metadata.last_run_time.store(
                        // Safe conversion: the elapsed milliseconds always fit in a u64.
                        monitoring_metadata
                            .start_instant
                            .elapsed()
                            .as_millis()
                            .try_into()
                            .unwrap(),
                        Ordering::SeqCst,
                    );
                }
                vcpu.run()
            };
            if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                *monitoring_metadata.last_exit_snapshot.lock() = Some(VcpuExitData {
                    exit_time: monitoring_metadata.start_instant.elapsed(),
                    exit_result: exit,
                });
            }

            // Save the TSC offset if we need to.
            if save_tsc_offset {
                if let Ok(offset) = vcpu.get_tsc_offset() {
                    save_vcpu_tsc_offset(offset, context.cpu_id);
                } else {
                    error!("Unable to determine TSC offset");
                }
                save_tsc_offset = false;
            }

            #[cfg(feature = "stats")]
            let start = exit_stats.start_stat();

            match exit {
                Ok(VcpuExit::Io) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Io");
                    vcpu.handle_io(&mut |IoParams { address, mut size, operation }| {
                        match operation {
                            IoOperation::Read => {
                                let mut data = [0u8; 8];
                                if size > data.len() {
                                    error!("unsupported IoIn size of {} bytes", size);
                                    size = data.len();
                                }
                                io_bus.read(address, &mut data[..size]);
                                Some(data)
                            }
                            IoOperation::Write { data } => {
                                if size > data.len() {
                                    error!("unsupported IoOut size of {} bytes", size);
                                    size = data.len()
                                }
                                vm.handle_io_events(IoEventAddress::Pio(address), &data[..size])
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for pio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                io_bus.write(address, &data[..size]);
                                None
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle io: {}", e));
                }
                Ok(VcpuExit::Mmio) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Mmio");
                    vcpu.handle_mmio(&mut |IoParams { address, mut size, operation }| {
                        match operation {
                            IoOperation::Read => {
                                let mut data = [0u8; 8];
                                if size > data.len() {
                                    error!("unsupported MmioRead size of {} bytes", size);
                                    size = data.len();
                                }
                                {
                                    let data = &mut data[..size];
                                    if !mmio_bus.read(address, data) {
                                        info!(
                                            "mmio read failed: {:x}; trying memory read..",
                                            address
                                        );
                                        vm.get_memory()
                                            .read_exact_at_addr(
                                                data,
                                                vm_memory::GuestAddress(address),
                                            )
                                            .unwrap_or_else(|e| {
                                                error!(
                                                    "guest memory read failed at {:x}: {}",
                                                    address, e
                                                )
                                            });
                                    }
                                }
                                Some(data)
                            }
                            IoOperation::Write { data } => {
                                if size > data.len() {
                                    error!("unsupported MmioWrite size of {} bytes", size);
                                    size = data.len()
                                }
                                let data = &data[..size];
                                vm.handle_io_events(IoEventAddress::Mmio(address), data)
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for mmio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                if !mmio_bus.write(address, data) {
                                    info!(
                                        "mmio write failed: {:x}; trying memory write..",
                                        address
                                    );
                                    vm.get_memory()
                                        .write_all_at_addr(data, vm_memory::GuestAddress(address))
                                        .unwrap_or_else(|e| error!(
                                            "guest memory write failed at {:x}: {}",
                                            address, e
                                        ));
                                }
                                None
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle mmio: {}", e));
                }
                Ok(VcpuExit::IoapicEoi { vector }) => {
                    irq_chip.broadcast_eoi(vector).unwrap_or_else(|e| {
                        error!(
                            "failed to broadcast eoi {} on vcpu {}: {}",
                            vector, context.cpu_id, e
                        )
                    });
                }
                Ok(VcpuExit::IrqWindowOpen) => {}
                Ok(VcpuExit::Hlt) => irq_chip.halted(context.cpu_id),

                // VcpuExit::Shutdown is always an error on Windows. HAXM exits with
                // Shutdown only for triple faults and other vcpu panics. WHPX never exits
                // with Shutdown. Normal reboots and shutdowns, like window close, use
                // the vm event tube and VmRunMode::Exiting instead of VcpuExit::Shutdown.
                Ok(VcpuExit::Shutdown) => bail_exit_code!(Exit::VcpuShutdown, "vcpu shutdown"),
                Ok(VcpuExit::FailEntry {
                    hardware_entry_failure_reason,
                }) => bail_exit_code!(
                    Exit::VcpuFailEntry,
                    "vcpu hw run failure: {:#x}",
                    hardware_entry_failure_reason,
                ),
                Ok(VcpuExit::SystemEventShutdown) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventShutdown")
                }
                Ok(VcpuExit::SystemEventReset) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventReset")
                }
                Ok(VcpuExit::SystemEventCrash) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventCrash")
                }

                // When we're shutting down (e.g., emulator window gets closed), GVM vmexits
                // with KVM_EXIT_INTR, which vcpu.run maps to VcpuExit::Intr. But KVM_EXIT_INTR
                // can happen during normal operation too, when GVM's timer finds requests
                // pending from the host. So we set check_vm_shutdown, then below check the
                // VmRunMode state to see if we should exit the run loop.
                Ok(VcpuExit::Intr) => check_vm_shutdown = true,
                Ok(VcpuExit::Canceled) => check_vm_shutdown = true,
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::Cpuid { mut entry }) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Cpuid");
                    // Adjust the results based on crosvm logic.
                    adjust_cpuid(&mut entry, &cpuid_context);

                    // Let the vcpu finish handling the exit.
                    vcpu.handle_cpuid(&entry).unwrap_or_else(|e| {
                        error!(
                            "failed to handle setting cpuid results on cpu {}: {}",
                            context.cpu_id, e
                        )
                    });
                }
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::MsrAccess) => {} // MsrAccess handled by hypervisor impl
                Ok(r) => {
                    error!("unexpected vcpu.run return value: {:?}", r);
                    check_vm_shutdown = true;
                }
                Err(e) => match e.errno() {
                    ERROR_RETRY_I32 => {}
                    _ => {
                        run_mode_arc.set_and_notify(VmRunMode::Exiting);
                        Err(e).exit_context(Exit::VcpuRunError, "vcpu run error")?;
                    }
                },
            }

            #[cfg(feature = "stats")]
            exit_stats.end_stat(&exit, start);
        }

        if check_vm_shutdown {
            let mut run_mode_lock = run_mode_arc.mtx.lock();
            loop {
                match *run_mode_lock {
                    VmRunMode::Running => {
                        process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);
                        break;
                    }
                    VmRunMode::Suspending => {
                        if let Err(e) = vcpu.on_suspend() {
                            error!(
                                "failed to signal to hypervisor that vcpu {} is being suspended: {}",
                                context.cpu_id, e
                            );
                        }
                    }
                    VmRunMode::Breakpoint => {}
                    VmRunMode::Exiting => {
                        #[cfg(feature = "stats")]
                        if let Some(stats) = stats {
                            let mut collector = stats.lock();
                            collector.pio_bus_stats.push(io_bus.stats);
                            collector.mmio_bus_stats.push(mmio_bus.stats);
                            collector.vm_exit_stats.push(exit_stats);
                        }
                        return Ok(ExitState::Stop);
                    }
                }

                // For non-running modes, we don't want to process messages until we've completed
                // *all* work for any VmRunMode transition. This is because one control message
                // asks us to inform the requestor of our current state. We want to make sure
                // our state has completely transitioned before we respond to the requestor. If
                // we do this elsewhere, we might respond while in a partial state, which could
                // break features like snapshotting (e.g. by introducing a race condition).
                process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);

                // Give ownership of our exclusive lock to the condition variable that
                // will block. When the condition variable is notified, `wait` will
                // unblock and return a new exclusive lock.
                run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock);
            }
        }

        irq_chip.inject_interrupts(&vcpu).unwrap_or_else(|e| {
            error!(
                "failed to inject interrupts for vcpu {}: {}",
                context.cpu_id, e
            )
        });
    }
}

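/// Drains and handles all pending messages on this vcpu's control channel.
/// `run_mode` is echoed back to `GetStates` requestors as proof that the vcpu
/// has finished transitioning to that mode.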
fn process_vcpu_control_messages<V>(
    vcpu: &mut V,
    run_mode: VmRunMode,
    vcpu_control: &mpsc::Receiver<VcpuControl>,
) where
    V: VcpuArch + 'static,
{
    let control_messages: Vec<VcpuControl> = vcpu_control.try_iter().collect();

    for msg in control_messages {
        match msg {
            VcpuControl::RunState(_) => {
                panic!("VCPUs do not handle RunState messages on Windows")
            }
            #[cfg(feature = "gdb")]
            VcpuControl::Debug(_) => {
                unimplemented!("Windows VCPUs do not support debug yet.");
            }
            VcpuControl::MakeRT => {
                unimplemented!("Windows VCPUs do not support on demand RT.");
            }
            VcpuControl::GetStates(response_chan) => {
                // Wondering why we need this given that the state value is already in an Arc?
                //
                // The control loop generally sets the run mode directly via the Arc; however,
                // it has no way of knowing *when* the VCPU threads have actually acknowledged
                // the new value. By returning the value here, we prove to the control loop
                // that we have accepted the new value and are done with our state change.
                if let Err(e) = response_chan.send(run_mode) {
                    error!("Failed to send GetState: {}", e);
                };
            }
            VcpuControl::Snapshot(snapshot_writer, response_chan) => {
                let resp = vcpu
                    .snapshot()
                    .and_then(|s| snapshot_writer.write_fragment(&format!("vcpu{}", vcpu.id()), &s))
                    .with_context(|| format!("Failed to snapshot Vcpu #{}", vcpu.id()));
                if let Err(e) = response_chan.send(resp) {
                    error!("Failed to send snapshot response: {}", e);
                }
            }
            VcpuControl::Restore(req) => {
                let resp = req
                    .snapshot_reader
                    .read_fragment(&format!("vcpu{}", vcpu.id()))
                    .and_then(|s| vcpu.restore(&s, req.host_tsc_reference_moment))
                    .with_context(|| format!("Failed to restore Vcpu #{}", vcpu.id()));
                if let Err(e) = req.result_sender.send(resp) {
                    error!("Failed to send restore response: {}", e);
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    struct SetupData {
        pub monitor: VcpuStallMonitor,
        pub exit_evt: Event,
    }

    fn set_up_stall_monitor(vcpu_count: usize) -> Result<SetupData> {
        let run_mode = Arc::new(VcpuRunMode::default());
        let mut monitor = VcpuStallMonitor::init(run_mode);

        for id in 0..vcpu_count {
            let new_vcpu = VcpuRunThread::new(id, true /* enable_vcpu_monitoring */);
            monitor.add_vcpu_thread(new_vcpu);
        }

        Ok(SetupData {
            monitor,
            exit_evt: Event::new().expect("Failed to create event"),
        })
    }

    #[test]
    fn stall_monitor_closes_on_exit_evt() -> Result<()> {
        let SetupData { monitor, exit_evt } = set_up_stall_monitor(1)?;

        exit_evt.signal()?;
        let _ = monitor
            .run(&exit_evt)?
            .join()
            .unwrap_or_else(|e| panic!("Thread join failed: {:?}", e));
        Ok(())
    }
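
    #[test]
    fn run_mode_set_and_notify_updates_mode() {
        // A minimal sketch exercising the VcpuRunMode handoff; assumes VmRunMode's
        // default variant is Running, as the vcpu loop expects.
        let run_mode = VcpuRunMode::default();
        assert!(matches!(run_mode.get_mode(), VmRunMode::Running));
        run_mode.set_and_notify(VmRunMode::Exiting);
        assert!(matches!(run_mode.get_mode(), VmRunMode::Exiting));
    }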
}