// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::arch::x86_64::__cpuid;
use std::arch::x86_64::__cpuid_count;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::mpsc;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread;
use std::thread::JoinHandle;
use std::time::Duration;
use std::time::Instant;

#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use anyhow::anyhow;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::CpuSet;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuAffinity;
use arch::VcpuArch;
use arch::VmArch;
use base::error;
use base::info;
use base::set_audio_thread_priority;
use base::set_cpu_affinity;
use base::warn;
use base::Event;
use base::Result as BaseResult;
use base::SafeMultimediaHandle;
use base::SendTube;
use base::Timer;
use base::Tube;
use base::VmEventType;
use cros_async::select2;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::SelectResult;
use cros_async::TimerAsync;
use cros_tracing::trace_event;
use crosvm_cli::bail_exit_code;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use devices::tsc::TscSyncMitigations;
use devices::Bus;
use devices::VcpuRunState;
use futures::pin_mut;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVcpu;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuConfigX86_64;
use hypervisor::HypervisorCap;
use hypervisor::IoEventAddress;
use hypervisor::IoOperation;
use hypervisor::IoParams;
use hypervisor::VcpuExit;
use hypervisor::VcpuInitX86_64;
use sync::Condvar;
use sync::Mutex;
use vm_control::VcpuControl;
use vm_control::VmRunMode;
use winapi::shared::winerror::ERROR_RETRY;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::adjust_cpuid;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::CpuIdContext;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;

#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::VmExitStatistics;
use crate::sys::windows::save_vcpu_tsc_offset;
use crate::sys::windows::ExitState;

const ERROR_RETRY_I32: i32 = ERROR_RETRY as i32;

#[derive(Default)]
pub struct VcpuRunMode {
    mtx: Mutex<VmRunMode>,
    cvar: Condvar,
}

impl VcpuRunMode {
    pub fn get_mode(&self) -> VmRunMode {
        *self.mtx.lock()
    }

    pub fn set_and_notify(&self, new_mode: VmRunMode) {
        *self.mtx.lock() = new_mode;
        self.cvar.notify_all();
    }
}
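
// Illustrative sketch (not part of the original source): a control thread
// flips the shared mode with `set_and_notify`, while a vcpu thread holds the
// lock and blocks on `cvar` until the mode changes, mirroring the wait loop
// in `vcpu_loop` below.
//
//     let run_mode = Arc::new(VcpuRunMode::default());
//     // Control thread:
//     run_mode.set_and_notify(VmRunMode::Suspending);
//     // Vcpu thread:
//     let mut guard = run_mode.mtx.lock();
//     while *guard == VmRunMode::Running {
//         guard = run_mode.cvar.wait(guard);
//     }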

struct RunnableVcpuInfo<V> {
    vcpu: V,
    thread_priority_handle: Option<SafeMultimediaHandle>,
}

#[derive(Clone, Debug)]
struct VcpuMonitoringMetadata {
    pub start_instant: Instant,
    // Milliseconds since the baseline start_instant
    pub last_run_time: Arc<AtomicU64>,
    pub last_exit_snapshot: Arc<Mutex<Option<VcpuExitData>>>,
}
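
// Both timestamps share `start_instant` as their baseline: the vcpu is taken
// to be inside `Vcpu::run()` while `last_run_time` is newer than the
// `exit_time` recorded in `last_exit_snapshot`, and to be handling an exit on
// the host otherwise. `VcpuStallMonitor::report_any_stalls` below relies on
// this ordering.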

#[derive(Clone, Debug)]
struct VcpuRunThread {
    pub cpu_id: usize,
    pub monitoring_metadata: Option<VcpuMonitoringMetadata>,
}

impl VcpuRunThread {
    pub fn new(cpu_id: usize, enable_vcpu_monitoring: bool) -> VcpuRunThread {
        VcpuRunThread {
            cpu_id,
            monitoring_metadata: enable_vcpu_monitoring.then(|| VcpuMonitoringMetadata {
                start_instant: Instant::now(),
                last_run_time: Arc::new(AtomicU64::new(0)),
                last_exit_snapshot: Arc::new(Mutex::new(Option::None)),
            }),
        }
    }

    /// Perform WHPX-specific vcpu configurations
    #[cfg(feature = "whpx")]
    fn whpx_configure_vcpu(vcpu: &mut dyn VcpuArch, irq_chip: &mut dyn IrqChipArch) {
        // only apply to actual WhpxVcpu instances
        if let Some(whpx_vcpu) = vcpu.downcast_mut::<WhpxVcpu>() {
            // WhpxVcpu instances need to know the TSC and Lapic frequencies to handle Hyper-V MSR
            // reads and writes.
            let tsc_freq = devices::tsc::tsc_frequency()
                .map_err(|e| {
                    error!(
                        "Could not determine TSC frequency, WHPX vcpu will not be configured with \
                        a TSC Frequency: {e}"
                    );
                    e
                })
                .ok();
            whpx_vcpu.set_frequencies(tsc_freq, irq_chip.lapic_frequency());
        }
    }

    // Sets up a vcpu and converts it into a runnable vcpu.
    fn runnable_vcpu<V>(
        cpu_id: usize,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vm: &impl VmArch,
        irq_chip: &mut dyn IrqChipArch,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        no_smt: bool,
        host_cpu_topology: bool,
        force_calibrated_tsc_leaf: bool,
    ) -> Result<RunnableVcpuInfo<V>>
    where
        V: VcpuArch,
    {
        let mut vcpu = match vcpu {
            Some(v) => v,
            None => {
                // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called
                // from the vcpu thread.
                match vm
                    .create_vcpu(cpu_id)
                    .exit_context(Exit::CreateVcpu, "failed to create vcpu")?
                    .downcast::<V>()
                {
                    Ok(v) => *v,
                    Err(_) => panic!("VM created wrong type of VCPU"),
                }
            }
        };

        irq_chip
            .add_vcpu(cpu_id, &vcpu)
            .exit_context(Exit::AddIrqChipVcpu, "failed to add vcpu to irq chip")?;

        if let Some(affinity) = vcpu_affinity {
            if let Err(e) = set_cpu_affinity(affinity) {
                error!("Failed to set CPU affinity: {}", e);
            }
        }

        #[cfg(target_arch = "x86_64")]
        let cpu_config = Some(CpuConfigX86_64::new(
            force_calibrated_tsc_leaf,
            host_cpu_topology,
            false, /* enable_hwp */
            no_smt,
            false, /* itmt */
            None,  /* hybrid_type */
        ));

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        let cpu_config = None;

        Arch::configure_vcpu(
            vm,
            vm.get_hypervisor(),
            irq_chip,
            &mut vcpu,
            vcpu_init,
            cpu_id,
            vcpu_count,
            cpu_config,
        )
        .exit_context(Exit::ConfigureVcpu, "failed to configure vcpu")?;

        #[cfg(feature = "whpx")]
        Self::whpx_configure_vcpu(&mut vcpu, irq_chip);

        let mut thread_priority_handle = None;
        if run_rt {
            // Until we are multi process on Windows, we can't use the normal thread priority APIs;
            // instead, we use a trick from the audio device which is able to set a thread RT even
            // though the process itself is not RT.
            thread_priority_handle = match set_audio_thread_priority() {
                Ok(hndl) => Some(hndl),
                Err(e) => {
                    warn!("Failed to set vcpu thread to real time priority: {}", e);
                    None
                }
            };
        }

        Ok(RunnableVcpuInfo {
            vcpu,
            thread_priority_handle,
        })
    }

    pub fn run<V>(
        &self,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vcpus: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
        vm: impl VmArch + 'static,
        mut irq_chip: Box<dyn IrqChipArch + 'static>,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        delay_rt: bool,
        no_smt: bool,
        start_barrier: Arc<Barrier>,
        vcpu_create_barrier: Arc<Barrier>,
        mut io_bus: devices::Bus,
        mut mmio_bus: devices::Bus,
        vm_evt_wrtube: SendTube,
        run_mode_arc: Arc<VcpuRunMode>,
        #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
        host_cpu_topology: bool,
        tsc_offset: Option<u64>,
        force_calibrated_tsc_leaf: bool,
        vcpu_control: mpsc::Receiver<VcpuControl>,
    ) -> Result<JoinHandle<Result<()>>>
    where
        V: VcpuArch + 'static,
    {
        let context = self.clone();
        thread::Builder::new()
            .name(format!("crosvm_vcpu{}", self.cpu_id))
            .spawn(move || {
                // Having a closure returning ExitState guarantees that we
                // send a VmEventType on all code paths after the closure
                // returns.
                let vcpu_fn = || -> Result<ExitState> {
                    let runnable_vcpu = Self::runnable_vcpu(
                        context.cpu_id,
                        vcpu,
                        vcpu_init,
                        &vm,
                        irq_chip.as_mut(),
                        vcpu_count,
                        run_rt && !delay_rt,
                        vcpu_affinity,
                        no_smt,
                        host_cpu_topology,
                        force_calibrated_tsc_leaf,
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpu_config = CpuConfigX86_64::new(
                        force_calibrated_tsc_leaf,
                        host_cpu_topology,
                        false, /* enable_hwp */
                        no_smt,
                        false, /* itmt */
                        None,  /* hybrid_type */
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpuid_context = CpuIdContext::new(
                        context.cpu_id,
                        vcpu_count,
                        Some(irq_chip.as_ref()),
                        cpu_config,
                        vm.get_hypervisor()
                            .check_capability(HypervisorCap::CalibratedTscLeafRequired),
                        __cpuid_count,
                        __cpuid,
                    );

                    // The vcpu_create_barrier is supplied from the main thread in order for it to
                    // wait until this thread is done creating its vcpu.
                    vcpu_create_barrier.wait();

                    // Wait for this barrier before continuing forward.
                    start_barrier.wait();

                    let RunnableVcpuInfo {
                        vcpu,
                        thread_priority_handle: _thread_priority_handle,
                    } = runnable_vcpu?;

                    if let Some(offset) = tsc_offset {
                        vcpu.set_tsc_offset(offset).unwrap_or_else(|e| {
                            error!(
                                "Failed to set tsc_offset of {} on vcpu {}: {}",
                                offset, context.cpu_id, e
                            )
                        });
                    }

                    // Clone vcpu so it can be used by the main thread to force a vcpu run to exit
                    vcpus
                        .lock()
                        .push(Box::new(vcpu.try_clone().expect("Could not clone vcpu!")));

                    mmio_bus.set_access_id(context.cpu_id);
                    io_bus.set_access_id(context.cpu_id);

                    vcpu_loop(
                        &context,
                        vcpu,
                        vm,
                        irq_chip,
                        io_bus,
                        mmio_bus,
                        run_mode_arc,
                        #[cfg(feature = "stats")]
                        stats,
                        #[cfg(target_arch = "x86_64")]
                        cpuid_context,
                        vcpu_control,
                    )
                };

                let final_event_data = match vcpu_fn().unwrap_or_else(|e| {
                    error!(
                        "vcpu {} run loop exited with error: {:#}",
                        context.cpu_id, e
                    );
                    ExitState::Stop
                }) {
                    ExitState::Stop => VmEventType::Exit,
                    _ => unreachable!(),
                };
                vm_evt_wrtube
                    .send::<VmEventType>(&final_event_data)
                    .unwrap_or_else(|e| {
                        error!(
                            "failed to send final event {:?} on vcpu {}: {}",
                            final_event_data, context.cpu_id, e
                        )
                    });
                Ok(())
            })
            .exit_context(Exit::SpawnVcpu, "failed to spawn VCPU thread")
    }
}

#[derive(Clone, Debug)]
struct VcpuExitData {
    // Represented by duration since baseline start_instant
    exit_time: Duration,
    exit_result: BaseResult<VcpuExit>,
}

impl Display for VcpuExitData {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "exit result: {:?}", self.exit_result)
    }
}

struct VcpuStallMonitor {
    vcpu_run_threads: Vec<VcpuRunThread>,
    run_mode: Arc<VcpuRunMode>,
}

impl VcpuStallMonitor {
    const HOST_STALL_TIMEOUT: Duration = Duration::from_secs(2);
    const VCPU_CHECKUP_INTERVAL: Duration = Duration::from_secs(1);
    const STALL_REPORTING_LIMITER: Duration = Duration::from_secs(10);

    pub fn init(run_mode: Arc<VcpuRunMode>) -> VcpuStallMonitor {
        VcpuStallMonitor {
            vcpu_run_threads: vec![],
            run_mode,
        }
    }

    pub fn add_vcpu_thread(&mut self, thread: VcpuRunThread) {
        self.vcpu_run_threads.push(thread);
    }

    pub fn run(self, exit_event: &Event) -> Result<JoinHandle<Result<()>>> {
        let cloned_exit_event = exit_event
            .try_clone()
            .exit_context(Exit::CloneEvent, "failed to clone event")?;
        thread::Builder::new()
            .name("crosvm_vcpu_stall_monitor".to_string())
            .spawn(move || {
                let ex = Executor::new()?;

                let mut timer = TimerAsync::new(Timer::new()?, &ex)?;
                let mut reset_timer = true;

                let exit_evt_async = EventAsync::new(cloned_exit_event, &ex)?;
                let exit_future = exit_evt_async.next_val();
                pin_mut!(exit_future);
                'main: loop {
                    if reset_timer {
                        timer.reset(
                            Self::VCPU_CHECKUP_INTERVAL,
                            Some(Self::VCPU_CHECKUP_INTERVAL),
                        )?;
                        reset_timer = false;
                    }
                    let timer_future = timer.wait();
                    pin_mut!(timer_future);
                    match ex.run_until(select2(timer_future, exit_future)) {
                        Ok((timer_result, exit_result)) => {
                            match exit_result {
                                SelectResult::Finished(_) => {
                                    info!("vcpu monitor got exit event");
                                    break 'main;
                                }
                                SelectResult::Pending(future) => exit_future = future,
                            }

                            match timer_result {
                                SelectResult::Finished(Err(e)) => {
                                    error!(
                                        "vcpu monitor aborting due to error awaiting future: {}",
                                        e
                                    );
                                    break 'main;
                                }
                                SelectResult::Finished(_) => self.report_any_stalls(),
                                _ => (),
                            }
                        }
                        Err(e) => {
                            error!("vcpu monitor failed to wait on future set: {:?}", e);
                            break 'main;
                        }
                    }

                    // Always ensure the vcpus aren't suspended before continuing to monitor.
                    let mut run_mode_lock = self.run_mode.mtx.lock();
                    loop {
                        match *run_mode_lock {
                            VmRunMode::Running => break,
                            VmRunMode::Suspending | VmRunMode::Breakpoint => {
                                info!("vcpu monitor pausing until end of suspension");
                                run_mode_lock = self.run_mode.cvar.wait(run_mode_lock);
                                reset_timer = true;
                            }
                            VmRunMode::Exiting => {
                                info!("vcpu monitor detected vm exit");
                                break 'main;
                            }
                        }
                    }
                }

                Ok(())
            })
            .exit_context(
                Exit::SpawnVcpuMonitor,
                "failed to spawn VCPU stall monitor thread",
            )
    }

    fn report_any_stalls(&self) {
        // TODO(b/208267651): Add and fire Clearcut events for stalls (and add tests)
        // TODO(b/208267651): Also test guest stalls (vcpu.run() goes too long without exiting)
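        // Worked example with hypothetical numbers: if last_run_time is
        // 5_000ms and the snapshot's exit_time is 5_200ms, the last exit is
        // newer than the last guest entry, so the vcpu is still on the host
        // handling that exit. If `now` is more than HOST_STALL_TIMEOUT (2s)
        // past start_instant + 5_200ms, that exit handling is reported as a
        // host stall.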
        let now = Instant::now();
        for vcpu_thread in self.vcpu_run_threads.iter() {
            let monitoring_metadata = vcpu_thread.monitoring_metadata.as_ref().unwrap();
            if let Some(ref exit_snapshot) = monitoring_metadata.last_exit_snapshot.lock().clone() {
                let last_run =
                    Duration::from_millis(monitoring_metadata.last_run_time.load(Ordering::SeqCst));
                if last_run < exit_snapshot.exit_time {
                    // VCPU is between runs
                    let time_since_exit = now.saturating_duration_since(
                        monitoring_metadata.start_instant + exit_snapshot.exit_time,
                    );
                    if time_since_exit > Self::HOST_STALL_TIMEOUT {
                        self.report_stall(vcpu_thread.cpu_id, exit_snapshot, time_since_exit);
                    }
                }
            };
        }
    }

    fn report_stall(&self, cpu_id: usize, exit_data: &VcpuExitData, stall_time: Duration) {
        if stall_time > Self::STALL_REPORTING_LIMITER {
            return;
        }
        // Double check the Vm is running. We don't care about stalls during suspension/exit.
        if *self.run_mode.mtx.lock() == VmRunMode::Running {
            let duration_string = format!("{:.1}sec", stall_time.as_secs_f32());
            error!(
                "Host stall for {} on VCPU {} exit while handling: {}",
                duration_string, cpu_id, exit_data,
            );
        }
    }
}

fn setup_vcpu_signal_handler() -> Result<()> {
    Ok(())
}

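/// Spawns one thread per vcpu (plus, when vcpu monitoring is enabled, a stall
/// monitor thread) and returns the join handles together with one control
/// channel per vcpu.
///
/// Minimal shutdown sketch (illustrative only; assumes a fully constructed VM
/// and elides the real control loop):
///
/// ```ignore
/// let (vcpu_threads, _vcpu_controls) = run_all_vcpus(/* ... */)?;
/// run_mode_arc.set_and_notify(VmRunMode::Exiting);
/// for handle in vcpu_threads {
///     handle.join().expect("vcpu thread panicked")?;
/// }
/// ```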
pub fn run_all_vcpus<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    vcpus: Vec<Option<Vcpu>>,
    vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
    guest_os: &RunnableLinuxVm<V, Vcpu>,
    exit_evt: &Event,
    vm_evt_wrtube: &SendTube,
    #[cfg(feature = "stats")] stats: &Option<Arc<Mutex<StatisticsCollector>>>,
    host_cpu_topology: bool,
    run_mode_arc: Arc<VcpuRunMode>,
    tsc_sync_mitigations: TscSyncMitigations,
    force_calibrated_tsc_leaf: bool,
) -> Result<(Vec<JoinHandle<Result<()>>>, Vec<mpsc::Sender<VcpuControl>>)> {
    let mut vcpu_threads = Vec::with_capacity(guest_os.vcpu_count + 1);
    let mut vcpu_control_channels = Vec::with_capacity(guest_os.vcpu_count);
    let start_barrier = Arc::new(Barrier::new(guest_os.vcpu_count + 1));
    let enable_vcpu_monitoring = anti_tamper::enable_vcpu_monitoring();
    setup_vcpu_signal_handler()?;

    let mut stall_monitor =
        enable_vcpu_monitoring.then(|| VcpuStallMonitor::init(run_mode_arc.clone()));
    for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
        let vcpu_affinity = match guest_os.vcpu_affinity.clone() {
            Some(VcpuAffinity::Global(v)) => Some(v),
            Some(VcpuAffinity::PerVcpu(mut m)) => Some(m.remove(&cpu_id).unwrap_or_default()),
            None => None,
        };

        // TSC sync mitigations may set vcpu affinity and set a TSC offset
        let (vcpu_affinity, tsc_offset): (Option<CpuSet>, Option<u64>) =
            if let Some(mitigation_affinity) = tsc_sync_mitigations.get_vcpu_affinity(cpu_id) {
                if vcpu_affinity.is_none() {
                    (
                        Some(CpuSet::new(mitigation_affinity)),
                        tsc_sync_mitigations.get_vcpu_tsc_offset(cpu_id),
                    )
                } else {
                    error!(
                        "Core affinity {:?} specified via commandline conflicts with and \
                        overrides affinity needed for TSC sync mitigation: {:?}.",
                        vcpu_affinity, mitigation_affinity
                    );
                    (vcpu_affinity, None)
                }
            } else {
                (vcpu_affinity, None)
            };

        let vcpu_init = &guest_os.vcpu_init[cpu_id];
        // The vcpu_create_barrier allows the main thread to delay the spawning of additional
        // vcpu threads until the vcpu thread just spawned has finished creating its vcpu.
        // We currently use this to allow creation of 1 vcpu at a time for all hypervisors.
        // Parallel creation is known to be problematic on multiple hypervisors:
        // - Windows 11 has a regression which causes a BSOD when multiple vcpus are created
        //   in parallel. See http://b/229635845 for more details.
        // - GHAXM/HAXM cannot create vcpu0 in parallel with other Vcpus.
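        // Rough timeline of the two barriers for each spawned vcpu thread
        // (illustrative sketch):
        //
        //   main thread                      vcpu thread N
        //   -----------                      -------------
        //   spawn thread N  ---------------> runnable_vcpu() (create vcpu)
        //   vcpu_create_barrier.wait() <---> vcpu_create_barrier.wait()
        //   spawn thread N+1, ...            start_barrier.wait() (blocks)
        //   start_barrier.wait() <---------> all vcpu threads start together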
        let vcpu_create_barrier = Arc::new(Barrier::new(2));
        let vcpu_run_thread = VcpuRunThread::new(cpu_id, enable_vcpu_monitoring);
        let (vcpu_control_send, vcpu_control_recv) = mpsc::channel();
        vcpu_control_channels.push(vcpu_control_send);
        let join_handle = vcpu_run_thread.run(
            vcpu,
            vcpu_init.clone(),
            vcpu_boxes.clone(),
            guest_os
                .vm
                .try_clone()
                .exit_context(Exit::CloneEvent, "failed to clone vm")?,
            guest_os
                .irq_chip
                .try_box_clone()
                .exit_context(Exit::CloneEvent, "failed to clone event")?,
            guest_os.vcpu_count,
            guest_os.rt_cpus.contains(&cpu_id),
            vcpu_affinity,
            guest_os.delay_rt,
            guest_os.no_smt,
            start_barrier.clone(),
            vcpu_create_barrier.clone(),
            (*guest_os.io_bus).clone(),
            (*guest_os.mmio_bus).clone(),
            vm_evt_wrtube
                .try_clone()
                .exit_context(Exit::CloneTube, "failed to clone tube")?,
            run_mode_arc.clone(),
            #[cfg(feature = "stats")]
            stats.clone(),
            host_cpu_topology,
            tsc_offset,
            force_calibrated_tsc_leaf,
            vcpu_control_recv,
        )?;
        if let Some(ref mut monitor) = stall_monitor {
            monitor.add_vcpu_thread(vcpu_run_thread);
        }

        // Wait until the vcpu is created before we start a new vcpu thread
        vcpu_create_barrier.wait();

        vcpu_threads.push(join_handle);
    }
    if let Some(monitor) = stall_monitor {
        vcpu_threads.push(monitor.run(exit_evt)?);
    }
    // Now wait on the start barrier to start all threads at the same time.
    start_barrier.wait();
    Ok((vcpu_threads, vcpu_control_channels))
}

fn vcpu_loop<V>(
    context: &VcpuRunThread,
    mut vcpu: V,
    vm: impl VmArch + 'static,
    irq_chip: Box<dyn IrqChipArch + 'static>,
    io_bus: Bus,
    mmio_bus: Bus,
    run_mode_arc: Arc<VcpuRunMode>,
    #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
    #[cfg(target_arch = "x86_64")] cpuid_context: CpuIdContext,
    vcpu_control: mpsc::Receiver<VcpuControl>,
) -> Result<ExitState>
where
    V: VcpuArch + 'static,
{
    #[cfg(feature = "stats")]
    let mut exit_stats = VmExitStatistics::new();

    #[cfg(feature = "stats")]
    {
        mmio_bus.stats.lock().set_enabled(stats.is_some());
        io_bus.stats.lock().set_enabled(stats.is_some());
        exit_stats.set_enabled(stats.is_some());
    }

    let mut save_tsc_offset = true;

    loop {
        let _trace_event = trace_event!(crosvm, "vcpu loop");
        let mut check_vm_shutdown = run_mode_arc.get_mode() != VmRunMode::Running;

        match irq_chip.wait_until_runnable(&vcpu).with_exit_context(
            Exit::WaitUntilRunnable,
            || {
                format!(
                    "error waiting for vcpu {} to become runnable",
                    context.cpu_id
                )
            },
        )? {
            VcpuRunState::Runnable => {}
            VcpuRunState::Interrupted => check_vm_shutdown = true,
        }

        if !check_vm_shutdown {
            let exit = {
                let _trace_event = trace_event!(crosvm, "vcpu::run");
                if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                    monitoring_metadata.last_run_time.store(
                        // Safe conversion because millis will always be < u64::MAX
                        monitoring_metadata
                            .start_instant
                            .elapsed()
                            .as_millis()
                            .try_into()
                            .unwrap(),
                        Ordering::SeqCst,
                    );
                }
                vcpu.run()
            };
            if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                *monitoring_metadata.last_exit_snapshot.lock() = Some(VcpuExitData {
                    exit_time: monitoring_metadata.start_instant.elapsed(),
                    exit_result: exit,
                });
            }

            // save the tsc offset if we need to
            if save_tsc_offset {
                if let Ok(offset) = vcpu.get_tsc_offset() {
                    save_vcpu_tsc_offset(offset, context.cpu_id);
                } else {
                    error!("Unable to determine TSC offset");
                }
                save_tsc_offset = false;
            }

            #[cfg(feature = "stats")]
            let start = exit_stats.start_stat();

            match exit {
                Ok(VcpuExit::Io) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Io");
                    vcpu.handle_io(&mut |IoParams { address, mut size, operation }| {
                        match operation {
                            IoOperation::Read => {
                                let mut data = [0u8; 8];
                                if size > data.len() {
                                    error!("unsupported IoIn size of {} bytes", size);
                                    size = data.len();
                                }
                                io_bus.read(address, &mut data[..size]);
                                Some(data)
                            }
                            IoOperation::Write { data } => {
                                if size > data.len() {
                                    error!("unsupported IoOut size of {} bytes", size);
                                    size = data.len()
                                }
                                vm.handle_io_events(IoEventAddress::Pio(address), &data[..size])
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for pio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                io_bus.write(address, &data[..size]);
                                None
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle io: {}", e));
                }
                Ok(VcpuExit::Mmio) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Mmio");
                    vcpu.handle_mmio(&mut |IoParams { address, mut size, operation }| {
                        match operation {
                            IoOperation::Read => {
                                let mut data = [0u8; 8];
                                if size > data.len() {
                                    error!("unsupported MmioRead size of {} bytes", size);
                                    size = data.len();
                                }
                                {
                                    let data = &mut data[..size];
                                    if !mmio_bus.read(address, data) {
                                        info!(
                                            "mmio read failed: {:x}; trying memory read..",
                                            address
                                        );
                                        vm.get_memory()
                                            .read_exact_at_addr(
                                                data,
                                                vm_memory::GuestAddress(address),
                                            )
                                            .unwrap_or_else(|e| {
                                                error!(
                                                    "guest memory read failed at {:x}: {}",
                                                    address, e
                                                )
                                            });
                                    }
                                }
                                Some(data)
                            }
                            IoOperation::Write { data } => {
                                if size > data.len() {
                                    error!("unsupported MmioWrite size of {} bytes", size);
                                    size = data.len()
                                }
                                let data = &data[..size];
                                vm.handle_io_events(IoEventAddress::Mmio(address), data)
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for mmio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                if !mmio_bus.write(address, data) {
                                    info!(
                                        "mmio write failed: {:x}; trying memory write..",
                                        address
                                    );
                                    vm.get_memory()
                                        .write_all_at_addr(data, vm_memory::GuestAddress(address))
                                        .unwrap_or_else(|e| error!(
                                            "guest memory write failed at {:x}: {}",
                                            address, e
                                        ));
                                }
                                None
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle mmio: {}", e));
                }
                Ok(VcpuExit::IoapicEoi { vector }) => {
                    irq_chip.broadcast_eoi(vector).unwrap_or_else(|e| {
                        error!(
                            "failed to broadcast eoi {} on vcpu {}: {}",
                            vector, context.cpu_id, e
                        )
                    });
                }
                Ok(VcpuExit::IrqWindowOpen) => {}
                Ok(VcpuExit::Hlt) => irq_chip.halted(context.cpu_id),

                // VcpuExit::Shutdown is always an error on Windows.  HAXM exits with
                // Shutdown only for triple faults and other vcpu panics.  WHPX never exits
                // with Shutdown.  Normal reboots and shutdowns, like window close, use
                // the vm event tube and VmRunMode::Exiting instead of VcpuExit::Shutdown.
                Ok(VcpuExit::Shutdown) => bail_exit_code!(Exit::VcpuShutdown, "vcpu shutdown"),
                Ok(VcpuExit::FailEntry {
                    hardware_entry_failure_reason,
                }) => bail_exit_code!(
                    Exit::VcpuFailEntry,
                    "vcpu hw run failure: {:#x}",
                    hardware_entry_failure_reason,
                ),
                Ok(VcpuExit::SystemEventShutdown) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventShutdown")
                }
                Ok(VcpuExit::SystemEventReset) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventReset")
                }
                Ok(VcpuExit::SystemEventCrash) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventCrash")
                }

                // When we're shutting down (e.g., emulator window gets closed), GVM vmexits
                // with KVM_EXIT_INTR, which vcpu.run maps to VcpuExit::Intr.  But KVM_EXIT_INTR
                // can happen during normal operation too, when GVM's timer finds requests
                // pending from the host.  So we set check_vm_shutdown, then below check the
                // VmRunMode state to see if we should exit the run loop.
                Ok(VcpuExit::Intr) => check_vm_shutdown = true,
                Ok(VcpuExit::Canceled) => check_vm_shutdown = true,
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::Cpuid { mut entry }) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Cpuid");
                    // adjust the results based on crosvm logic
                    adjust_cpuid(&mut entry, &cpuid_context);

                    // let the vcpu finish handling the exit
                    vcpu.handle_cpuid(&entry).unwrap_or_else(|e| {
                        error!(
                            "failed to handle setting cpuid results on cpu {}: {}",
                            context.cpu_id, e
                        )
                    });
                }
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::MsrAccess) => {} // MsrAccess handled by hypervisor impl
                Ok(r) => {
                    error!("unexpected vcpu.run return value: {:?}", r);
                    check_vm_shutdown = true;
                }
                Err(e) => match e.errno() {
                    ERROR_RETRY_I32 => {}
                    _ => {
                        run_mode_arc.set_and_notify(VmRunMode::Exiting);
                        Err(e).exit_context(Exit::VcpuRunError, "vcpu run error")?;
                    }
                },
            }

            #[cfg(feature = "stats")]
            exit_stats.end_stat(&exit, start);
        }

        if check_vm_shutdown {
            let mut run_mode_lock = run_mode_arc.mtx.lock();
            loop {
                match *run_mode_lock {
                    VmRunMode::Running => {
                        process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);
                        break;
                    }
                    VmRunMode::Suspending => {
                        if let Err(e) = vcpu.on_suspend() {
                            error!(
                                "failed to signal to hypervisor that vcpu {} is being suspended: {}",
                                context.cpu_id, e
                            );
                        }
                    }
                    VmRunMode::Breakpoint => {}
                    VmRunMode::Exiting => {
                        #[cfg(feature = "stats")]
                        if let Some(stats) = stats {
                            let mut collector = stats.lock();
                            collector.pio_bus_stats.push(io_bus.stats);
                            collector.mmio_bus_stats.push(mmio_bus.stats);
                            collector.vm_exit_stats.push(exit_stats);
                        }
                        return Ok(ExitState::Stop);
                    }
                }

                // For non-running modes, we don't want to process messages until we've completed
                // *all* work for any VmRunMode transition. This is because one control message
                // asks us to inform the requestor of our current state. We want to make sure
                // our state has completely transitioned before we respond to the requestor. If
                // we do this elsewhere, we might respond while in a partial state which could
                // break features like snapshotting (e.g. by introducing a race condition).
                process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);

                // Give ownership of our exclusive lock to the condition variable that
                // will block. When the condition variable is notified, `wait` will
                // unblock and return a new exclusive lock.
                run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock);
            }
        }

        irq_chip.inject_interrupts(&vcpu).unwrap_or_else(|e| {
            error!(
                "failed to inject interrupts for vcpu {}: {}",
                context.cpu_id, e
            )
        });
    }
}

fn process_vcpu_control_messages<V>(
    vcpu: &mut V,
    run_mode: VmRunMode,
    vcpu_control: &mpsc::Receiver<VcpuControl>,
) where
    V: VcpuArch + 'static,
{
    let control_messages: Vec<VcpuControl> = vcpu_control.try_iter().collect();

    for msg in control_messages {
        match msg {
            VcpuControl::RunState(_new_mode) => {
                panic!("VCPUs do not handle RunState messages on Windows")
            }
            #[cfg(feature = "gdb")]
            VcpuControl::Debug(_d) => {
                unimplemented!("Windows VCPUs do not support debug yet.");
            }
            VcpuControl::MakeRT => {
                unimplemented!("Windows VCPUs do not support on demand RT.");
            }
970                 // Wondering why we need this given that the state value is already in an Arc?
971                 //
972                 // The control loop generally sets the run mode directly via the Arc; however,
973                 // it has no way of knowing *when* the VCPU threads have actually acknowledged
974                 // the new value. By returning the value in here, we prove the the control loop
975                 // we have accepted the new value and are done with our state change.
976                 if let Err(e) = response_chan.send(run_mode) {
977                     error!("Failed to send GetState: {}", e);
978                 };
979             }
            VcpuControl::Snapshot(snapshot_writer, response_chan) => {
                let resp = vcpu
                    .snapshot()
                    .and_then(|s| snapshot_writer.write_fragment(&format!("vcpu{}", vcpu.id()), &s))
                    .with_context(|| format!("Failed to snapshot Vcpu #{}", vcpu.id()));
                if let Err(e) = response_chan.send(resp) {
                    error!("Failed to send snapshot response: {}", e);
                }
            }
            VcpuControl::Restore(req) => {
                let resp = req
                    .snapshot_reader
                    .read_fragment(&format!("vcpu{}", vcpu.id()))
                    .and_then(|s| vcpu.restore(&s, req.host_tsc_reference_moment))
                    .with_context(|| format!("Failed to restore Vcpu #{}", vcpu.id()));
                if let Err(e) = req.result_sender.send(resp) {
                    error!("Failed to send restore response: {}", e);
                }
            }
        }
    }
}
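
// Control-loop side of the `GetStates` handshake above (illustrative sketch;
// the real control loop lives elsewhere in crosvm and also has to kick each
// vcpu out of `Vcpu::run()` so it notices the new mode):
//
//     run_mode_arc.set_and_notify(VmRunMode::Suspending);
//     let (tx, rx) = mpsc::channel();
//     for channel in &vcpu_control_channels {
//         channel.send(VcpuControl::GetStates(tx.clone()))?;
//     }
//     // Each vcpu replies only after it has fully entered the new mode.
//     for _ in &vcpu_control_channels {
//         assert_eq!(rx.recv()?, VmRunMode::Suspending);
//     }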

#[cfg(test)]
mod tests {
    use super::*;

    struct SetupData {
        pub monitor: VcpuStallMonitor,
        pub exit_evt: Event,
    }

    fn set_up_stall_monitor(vcpu_count: usize) -> Result<SetupData> {
        let run_mode = Arc::new(VcpuRunMode::default());
        let mut monitor = VcpuStallMonitor::init(run_mode);

        for id in 0..vcpu_count {
            let new_vcpu = VcpuRunThread::new(id, true /* enable_vcpu_monitoring */);
            monitor.add_vcpu_thread(new_vcpu);
        }

        Ok(SetupData {
            monitor,
            exit_evt: Event::new().expect("Failed to create event"),
        })
    }

    #[test]
    fn stall_monitor_closes_on_exit_evt() -> Result<()> {
        let SetupData { monitor, exit_evt } = set_up_stall_monitor(1)?;

        exit_evt.signal()?;
        let _ = monitor
            .run(&exit_evt)?
            .join()
            .unwrap_or_else(|e| panic!("Thread join failed: {:?}", e));
        Ok(())
    }
}