1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 //! crate for the vmm-swap feature.
6
7 #![cfg(unix)]
8 #![deny(missing_docs)]
9
10 mod file;
11 mod logger;
12 mod pagesize;
13 mod present_list;
14 // this is public only for integration tests.
15 pub mod page_handler;
16 mod processes;
17 mod staging;
18 // this is public only for integration tests.
19 pub mod userfaultfd;
20 // this is public only for integration tests.
21 pub mod worker;
22
23 use std::fs::File;
24 use std::fs::OpenOptions;
25 use std::io::stderr;
26 use std::io::stdout;
27 use std::ops::Range;
28 use std::os::unix::fs::OpenOptionsExt;
29 use std::path::Path;
30 use std::thread::Scope;
31 use std::thread::ScopedJoinHandle;
32 use std::time::Duration;
33 use std::time::Instant;
34
35 use anyhow::bail;
36 use anyhow::Context;
37 use base::debug;
38 use base::error;
39 use base::info;
40 use base::syslog;
41 use base::unix::process::fork_process;
42 use base::unix::process::Child;
43 use base::warn;
44 use base::AsRawDescriptor;
45 use base::AsRawDescriptors;
46 use base::EventToken;
47 use base::FromRawDescriptor;
48 use base::RawDescriptor;
49 use base::SharedMemory;
50 use base::Tube;
51 use base::WaitContext;
52 use jail::create_base_minijail;
53 use jail::create_sandbox_minijail;
54 use jail::JailConfig;
55 use jail::SandboxConfig;
56 use jail::MAX_OPEN_FILES_DEFAULT;
57 use once_cell::sync::Lazy;
58 use serde::Deserialize;
59 use serde::Serialize;
60 use sync::Mutex;
61 use vm_memory::GuestMemory;
62 use vm_memory::MemoryRegionInformation;
63
64 #[cfg(feature = "log_page_fault")]
65 use crate::logger::PageFaultEventLogger;
66 use crate::page_handler::MoveToStaging;
67 use crate::page_handler::PageHandler;
68 use crate::page_handler::MLOCK_BUDGET;
69 use crate::pagesize::THP_SIZE;
70 use crate::processes::freeze_child_processes;
71 use crate::processes::ProcessesGuard;
72 use crate::userfaultfd::register_regions;
73 use crate::userfaultfd::unregister_regions;
74 use crate::userfaultfd::Factory as UffdFactory;
75 use crate::userfaultfd::UffdEvent;
76 use crate::userfaultfd::Userfaultfd;
77 use crate::worker::BackgroundJobControl;
78 use crate::worker::Worker;
79
/// Upper bound on the size of a chunk swapped out/in by a single operation (2 MiB).
const MAX_SWAP_CHUNK_SIZE: usize = 2 * 1024 * 1024;
/// Upper bound on the number of pages trimmed by a single operation.
const MAX_TRIM_PAGES: usize = 1024;
84
85 /// Current state of vmm-swap.
86 ///
87 /// This should not contain fields but be a plain enum because this will be displayed to user using
88 /// `serde_json` crate.
89 #[derive(Serialize, Deserialize, Debug, Clone)]
90 pub enum State {
91 /// vmm-swap is ready. userfaultfd is disabled until vmm-swap is enabled.
92 Ready,
93 /// Pages in guest memory are moved to the staging memory.
94 Pending,
95 /// Trimming staging memory.
96 TrimInProgress,
97 /// swap-out is in progress.
98 SwapOutInProgress,
99 /// swap out succeeded.
100 Active,
101 /// swap-in is in progress.
102 SwapInInProgress,
103 /// swap out failed.
104 Failed,
105 }
106
107 impl From<&SwapState<'_>> for State {
from(state: &SwapState<'_>) -> Self108 fn from(state: &SwapState<'_>) -> Self {
109 match state {
110 SwapState::SwapOutPending => State::Pending,
111 SwapState::Trim(_) => State::TrimInProgress,
112 SwapState::SwapOutInProgress { .. } => State::SwapOutInProgress,
113 SwapState::SwapOutCompleted => State::Active,
114 SwapState::SwapInInProgress(_) => State::SwapInInProgress,
115 SwapState::Failed => State::Failed,
116 }
117 }
118 }
119
120 /// Latency and number of pages of swap operations (move to staging, swap out, swap in).
121 ///
122 /// The meaning of `StateTransition` depends on `State`.
123 ///
124 /// | `State` | `StateTransition` |
125 /// |---------------------|----------------------------------------------|
126 /// | `Ready` | empty or transition record of `swap disable` |
127 /// | `Pending` | transition record of `swap enable` |
128 /// | `SwapOutInProgress` | transition record of `swap out` |
129 /// | `Active` | transition record of `swap out` |
130 /// | `SwapInInProgress` | transition record of `swap disable` |
131 /// | `Failed` | empty |
132 #[derive(Serialize, Deserialize, Debug, Clone, Copy, Default)]
133 pub struct StateTransition {
134 /// The number of pages moved for the state transition.
135 pages: usize,
136 /// Time taken for the state transition.
137 time_ms: u128,
138 }
139
140 /// Current metrics of vmm-swap.
141 ///
142 /// This is only available while vmm-swap is enabled.
143 #[derive(Serialize, Deserialize, Debug, Clone, Default)]
144 pub struct Metrics {
145 /// count of pages on RAM.
146 resident_pages: usize,
147 /// count of pages copied from the vmm-swap file.
148 copied_from_file_pages: usize,
149 /// count of pages copied from the staging memory.
150 copied_from_staging_pages: usize,
151 /// count of pages initialized with zero.
152 zeroed_pages: usize,
153 /// count of pages which were already initialized on page faults. This can happen when several
154 /// threads/processes access the uninitialized/removed page at the same time.
155 redundant_pages: usize,
156 /// count of pages in staging memory.
157 staging_pages: usize,
158 /// count of pages in swap files.
159 swap_pages: usize,
160 }
161
162 impl Metrics {
new(page_handler: &PageHandler) -> Self163 fn new(page_handler: &PageHandler) -> Self {
164 Self {
165 resident_pages: page_handler.compute_resident_pages(),
166 copied_from_file_pages: page_handler.compute_copied_from_file_pages(),
167 copied_from_staging_pages: page_handler.compute_copied_from_staging_pages(),
168 zeroed_pages: page_handler.compute_zeroed_pages(),
169 redundant_pages: page_handler.compute_redundant_pages(),
170 staging_pages: page_handler.compute_staging_pages(),
171 swap_pages: page_handler.compute_swap_pages(),
172 }
173 }
174 }
175
176 /// The response to `crosvm swap status` command.
177 #[derive(Serialize, Deserialize, Debug, Clone)]
178 pub struct Status {
179 state: State,
180 metrics: Metrics,
181 state_transition: StateTransition,
182 }
183
184 impl Status {
new( state: &SwapState, state_transition: StateTransition, page_handler: &PageHandler, ) -> Self185 fn new(
186 state: &SwapState,
187 state_transition: StateTransition,
188 page_handler: &PageHandler,
189 ) -> Self {
190 Status {
191 state: state.into(),
192 metrics: Metrics::new(page_handler),
193 state_transition,
194 }
195 }
196
disabled(state_transition: &StateTransition) -> Self197 fn disabled(state_transition: &StateTransition) -> Self {
198 Status {
199 state: State::Ready,
200 metrics: Metrics::default(),
201 state_transition: *state_transition,
202 }
203 }
204
dummy() -> Self205 fn dummy() -> Self {
206 Status {
207 state: State::Pending,
208 metrics: Metrics::default(),
209 state_transition: StateTransition::default(),
210 }
211 }
212 }
213
214 /// Commands used in vmm-swap feature internally sent to the monitor process from the main and other
215 /// processes.
216 ///
217 /// This is mainly originated from the `crosvm swap <command>` command line.
218 #[derive(Serialize, Deserialize, Debug)]
219 enum Command {
220 Enable,
221 Trim,
222 SwapOut,
223 Disable,
224 Exit,
225 Status,
226 #[serde(with = "base::platform::with_raw_descriptor")]
227 ProcessForked(RawDescriptor),
228 }
229
230 /// [SwapController] provides APIs to control vmm-swap.
231 pub struct SwapController {
232 child_process: Child,
233 uffd_factory: UffdFactory,
234 command_tube: Tube,
235 }
236
237 impl SwapController {
238 /// Launch a monitor process for vmm-swap and return a controller.
239 ///
240 /// Pages on the [GuestMemory] are registered to userfaultfd to track pagefault events.
241 ///
242 /// # Arguments
243 ///
244 /// * `guest_memory` - fresh new [GuestMemory]. Any pages on the [GuestMemory] must not be
245 /// touched.
246 /// * `swap_dir` - directory to store swap files.
launch( guest_memory: GuestMemory, swap_dir: &Path, jail_config: &Option<JailConfig>, ) -> anyhow::Result<Self>247 pub fn launch(
248 guest_memory: GuestMemory,
249 swap_dir: &Path,
250 jail_config: &Option<JailConfig>,
251 ) -> anyhow::Result<Self> {
252 info!("vmm-swap is enabled. launch monitor process.");
253
254 let uffd_factory = UffdFactory::new();
255 let uffd = uffd_factory.create().context("create userfaultfd")?;
256
257 // The swap file is created as `O_TMPFILE` from the specified directory. As benefits:
258 //
259 // * it has no chance to conflict.
260 // * it has a security benefit that no one (except root) can access the swap file.
261 // * it will be automatically deleted by the kernel when crosvm exits/dies or on reboot if
262 // the device panics/hard-resets while crosvm is running.
263 let swap_file = OpenOptions::new()
264 .read(true)
265 .write(true)
266 .custom_flags(libc::O_TMPFILE | libc::O_EXCL)
267 .mode(0o000) // other processes with the same uid can't open the file
268 .open(swap_dir)?;
269 // The internal tube in which [Command]s sent from other processes than the monitor process
270 // to the monitor process. The response is `Status` only.
271 let (command_tube_main, command_tube_monitor) =
272 Tube::pair().context("create swap command tube")?;
273
274 // Allocate eventfd before creating sandbox.
275 let bg_job_control = BackgroundJobControl::new().context("create background job event")?;
276
277 #[cfg(feature = "log_page_fault")]
278 let page_fault_logger = PageFaultEventLogger::create(&swap_dir, &guest_memory)
279 .context("create page fault logger")?;
280
281 let mut keep_rds = vec![
282 stdout().as_raw_descriptor(),
283 stderr().as_raw_descriptor(),
284 uffd.as_raw_descriptor(),
285 swap_file.as_raw_descriptor(),
286 command_tube_monitor.as_raw_descriptor(),
287 bg_job_control.get_completion_event().as_raw_descriptor(),
288 #[cfg(feature = "log_page_fault")]
289 page_fault_logger.as_raw_descriptor(),
290 ];
291
292 syslog::push_descriptors(&mut keep_rds);
293 cros_tracing::push_descriptors!(&mut keep_rds);
294 keep_rds.extend(guest_memory.as_raw_descriptors());
295
296 keep_rds.extend(uffd_factory.as_raw_descriptors());
297
298 // Load and cache transparent hugepage size from sysfs before jumping into sandbox.
299 Lazy::force(&THP_SIZE);
300
301 let mut jail = if let Some(jail_config) = jail_config {
302 let config = SandboxConfig::new(jail_config, "swap_monitor");
303 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)
304 .context("create sandbox jail")?
305 } else {
306 create_base_minijail(Path::new("/"), MAX_OPEN_FILES_DEFAULT)
307 .context("create minijail")?
308 };
309 jail.set_rlimit(
310 libc::RLIMIT_MEMLOCK as libc::c_int,
311 MLOCK_BUDGET as u64,
312 MLOCK_BUDGET as u64,
313 )
314 .context("error setting RLIMIT_MEMLOCK")?;
315
316 // Start a page fault monitoring process (this will be the first child process of the
317 // current process)
318 let child_process =
319 fork_process(jail, keep_rds, Some(String::from("swap monitor")), || {
320 if let Err(e) = monitor_process(
321 command_tube_monitor,
322 guest_memory,
323 uffd,
324 swap_file,
325 bg_job_control,
326 #[cfg(feature = "log_page_fault")]
327 page_fault_logger,
328 ) {
329 panic!("page_fault_handler_thread exited with error: {:?}", e)
330 }
331 })
332 .context("fork monitor process")?;
333
334 // send first status request to the monitor process and wait for the response until setup on
335 // the monitor process completes.
336 command_tube_main.send(&Command::Status)?;
337 match command_tube_main
338 .recv::<Status>()
339 .context("recv initial status")?
340 .state
341 {
342 State::Ready => {
343 // The initial state of swap status is Ready and this is a signal that the
344 // monitoring process completes setup and is running.
345 }
346 status => {
347 bail!("initial state is not Ready, but {:?}", status);
348 }
349 };
350
351 Ok(Self {
352 child_process,
353 uffd_factory,
354 command_tube: command_tube_main,
355 })
356 }
357
358 /// Enable monitoring page faults and move guest memory to staging memory.
359 ///
360 /// The pages will be swapped in from the staging memory to the guest memory on page faults
361 /// until pages are written into the swap file by [Self::swap_out()].
362 ///
363 /// This waits until enabling vmm-swap finishes on the monitor process.
364 ///
365 /// The caller must guarantee that any contents on the guest memory is not updated during
366 /// enabling vmm-swap.
367 ///
368 /// # Note
369 ///
370 /// Enabling does not write pages to the swap file. User should call [Self::swap_out()]
371 /// after a suitable time.
372 ///
373 /// Just after enabling vmm-swap, some amount of pages are swapped in as soon as guest resumes.
374 /// By splitting the enable/swap_out operation and by delaying write to the swap file operation,
375 /// it has a benefit of reducing file I/O for hot pages.
enable(&self) -> anyhow::Result<()>376 pub fn enable(&self) -> anyhow::Result<()> {
377 self.command_tube
378 .send(&Command::Enable)
379 .context("send swap enable request")?;
380
381 let _ = self
382 .command_tube
383 .recv::<Status>()
384 .context("receive swap status")?;
385 Ok(())
386 }
387
388 /// Trim pages in the staging memory which are needless to be written back to the swap file.
389 ///
390 /// * zero pages
391 /// * pages which are the same as the pages in the swap file.
trim(&self) -> anyhow::Result<()>392 pub fn trim(&self) -> anyhow::Result<()> {
393 self.command_tube
394 .send(&Command::Trim)
395 .context("send swap trim request")?;
396 Ok(())
397 }
398
399 /// Swap out all the pages in the staging memory to the swap files.
400 ///
401 /// This returns as soon as it succeeds to send request to the monitor process.
402 ///
403 /// Users should call [Self::enable()] before this. See the comment of [Self::enable()] as well.
swap_out(&self) -> anyhow::Result<()>404 pub fn swap_out(&self) -> anyhow::Result<()> {
405 self.command_tube
406 .send(&Command::SwapOut)
407 .context("send swap out request")?;
408 Ok(())
409 }
410
411 /// Swap in all the guest memory and disable monitoring page faults.
412 ///
413 /// This returns as soon as it succeeds to send request to the monitor process.
disable(&self) -> anyhow::Result<()>414 pub fn disable(&self) -> anyhow::Result<()> {
415 self.command_tube
416 .send(&Command::Disable)
417 .context("send swap disable request")?;
418 Ok(())
419 }
420
421 /// Return current swap status.
422 ///
423 /// This blocks until response from the monitor process arrives to the main process.
status(&self) -> anyhow::Result<Status>424 pub fn status(&self) -> anyhow::Result<Status> {
425 self.command_tube
426 .send(&Command::Status)
427 .context("send swap status request")?;
428 let status = self.command_tube.recv().context("receive swap status")?;
429 Ok(status)
430 }
431
432 /// Shutdown the monitor process.
433 ///
434 /// This blocks until the monitor process exits.
435 ///
436 /// This should be called once.
exit(self) -> anyhow::Result<()>437 pub fn exit(self) -> anyhow::Result<()> {
438 self.command_tube
439 .send(&Command::Exit)
440 .context("send exit command")?;
441 self.child_process
442 .wait()
443 .context("wait monitor process shutdown")?;
444 Ok(())
445 }
446
447 /// Create a new userfaultfd and send it to the monitor process.
448 ///
449 /// This must be called as soon as a child process which may touch the guest memory is forked.
450 ///
451 /// Userfaultfd(2) originally has `UFFD_FEATURE_EVENT_FORK`. But it is not applicable to crosvm
452 /// since it does not support non-root user namespace.
on_process_forked(&self) -> anyhow::Result<()>453 pub fn on_process_forked(&self) -> anyhow::Result<()> {
454 let uffd = self.uffd_factory.create().context("create userfaultfd")?;
455 self.command_tube
456 .send(&Command::ProcessForked(uffd.as_raw_descriptor()))
457 .context("send forked event")?;
458 // The fd for Userfaultfd in this process is droped when this method exits, but the
459 // userfaultfd keeps alive in the monitor process which it is sent to.
460 Ok(())
461 }
462
463 /// Suspend device processes using `SIGSTOP` signal.
464 ///
465 /// When the returned `ProcessesGuard` is dropped, the devices resume.
466 ///
467 /// This must be called from the main process.
suspend_devices(&self) -> anyhow::Result<ProcessesGuard>468 pub fn suspend_devices(&self) -> anyhow::Result<ProcessesGuard> {
469 freeze_child_processes(self.child_process.pid)
470 }
471 }
472
473 impl AsRawDescriptors for SwapController {
as_raw_descriptors(&self) -> Vec<RawDescriptor>474 fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
475 let mut rds = self.uffd_factory.as_raw_descriptors();
476 rds.push(self.command_tube.as_raw_descriptor());
477 rds
478 }
479 }
480
481 #[derive(EventToken)]
482 enum Token {
483 UffdEvents(u32),
484 Command,
485 BackgroundJobCompleted,
486 }
487
488 struct UffdList<'a> {
489 list: Vec<Userfaultfd>,
490 wait_ctx: &'a WaitContext<Token>,
491 }
492
493 impl<'a> UffdList<'a> {
494 const ID_MAIN_UFFD: u32 = 0;
495
new(main_uffd: Userfaultfd, wait_ctx: &'a WaitContext<Token>) -> Self496 fn new(main_uffd: Userfaultfd, wait_ctx: &'a WaitContext<Token>) -> Self {
497 Self {
498 list: vec![main_uffd],
499 wait_ctx,
500 }
501 }
502
register(&mut self, uffd: Userfaultfd) -> anyhow::Result<()>503 fn register(&mut self, uffd: Userfaultfd) -> anyhow::Result<()> {
504 let id_uffd = self
505 .list
506 .len()
507 .try_into()
508 .context("too many userfaultfd forked")?;
509
510 self.wait_ctx
511 .add(&uffd, Token::UffdEvents(id_uffd))
512 .context("add to wait context")?;
513 self.list.push(uffd);
514
515 Ok(())
516 }
517
get(&self, id: u32) -> Option<&Userfaultfd>518 fn get(&self, id: u32) -> Option<&Userfaultfd> {
519 self.list.get(id as usize)
520 }
521
main_uffd(&self) -> &Userfaultfd522 fn main_uffd(&self) -> &Userfaultfd {
523 &self.list[Self::ID_MAIN_UFFD as usize]
524 }
525
get_list(&self) -> &[Userfaultfd]526 fn get_list(&self) -> &[Userfaultfd] {
527 &self.list
528 }
529 }
530
regions_from_guest_memory(guest_memory: &GuestMemory) -> Vec<Range<usize>>531 fn regions_from_guest_memory(guest_memory: &GuestMemory) -> Vec<Range<usize>> {
532 let mut regions = Vec::new();
533 guest_memory
534 .with_regions::<_, ()>(
535 |MemoryRegionInformation {
536 size, host_addr, ..
537 }| {
538 regions.push(host_addr..(host_addr + size));
539 Ok(())
540 },
541 )
542 .unwrap(); // the callback never return error.
543 regions
544 }
545
546 /// The main thread of the monitor process.
monitor_process( command_tube: Tube, guest_memory: GuestMemory, uffd: Userfaultfd, swap_file: File, bg_job_control: BackgroundJobControl, #[cfg(feature = "log_page_fault")] mut page_fault_logger: PageFaultEventLogger, ) -> anyhow::Result<()>547 fn monitor_process(
548 command_tube: Tube,
549 guest_memory: GuestMemory,
550 uffd: Userfaultfd,
551 swap_file: File,
552 bg_job_control: BackgroundJobControl,
553 #[cfg(feature = "log_page_fault")] mut page_fault_logger: PageFaultEventLogger,
554 ) -> anyhow::Result<()> {
555 info!("monitor_process started");
556
557 let wait_ctx = WaitContext::build_with(&[
558 (&command_tube, Token::Command),
559 // Even though swap isn't enabled until the enable command is received, it's necessary to
560 // start waiting on the main uffd here so that uffd fork events can be processed, because
561 // child processes will block until their corresponding uffd fork event is read.
562 (&uffd, Token::UffdEvents(UffdList::ID_MAIN_UFFD)),
563 (
564 bg_job_control.get_completion_event(),
565 Token::BackgroundJobCompleted,
566 ),
567 ])
568 .context("create wait context")?;
569
570 let n_worker = num_cpus::get();
571 info!("start {} workers for staging memory move", n_worker);
572 // The worker threads are killed when the main thread of the monitor process dies.
573 let worker = Worker::new(n_worker, n_worker);
574
575 let mut uffd_list = UffdList::new(uffd, &wait_ctx);
576 let mut state_transition = StateTransition::default();
577
578 loop {
579 let events = wait_ctx.wait().context("wait poll events")?;
580
581 for event in events.iter() {
582 match event.token {
583 Token::UffdEvents(id_uffd) => {
584 let uffd = uffd_list
585 .get(id_uffd)
586 .with_context(|| format!("uffd is not found for idx: {}", id_uffd))?;
587 // Userfaultfd does not work as level triggered but as edge triggered. We need
588 // to read all the events in the userfaultfd here.
589 while let Some(event) = uffd.read_event().context("read userfaultfd event")? {
590 match event {
591 UffdEvent::Remove { .. } => {
592 // BUG(b/272620051): This is a bug of userfaultfd that
593 // UFFD_EVENT_REMOVE can be read even after unregistering memory
594 // from the userfaultfd.
595 warn!("page remove event while vmm-swap disabled");
596 }
597 event => {
598 bail!("unexpected uffd event: {:?}", event);
599 }
600 }
601 }
602 }
603 Token::Command => match command_tube
604 .recv::<Command>()
605 .context("recv swap command")?
606 {
607 Command::ProcessForked(raw_descriptor) => {
608 debug!("new fork uffd: {:?}", raw_descriptor);
609 // Safe because the raw_descriptor is sent from another process via Tube and
610 // no one in this process owns it.
611 let uffd = unsafe { Userfaultfd::from_raw_descriptor(raw_descriptor) };
612 uffd_list.register(uffd).context("register forked uffd")?;
613 }
614 Command::Enable => {
615 info!("enabling vmm-swap");
616
617 let staging_shmem =
618 SharedMemory::new("swap staging memory", guest_memory.memory_size())
619 .context("create staging shmem")?;
620
621 let regions = regions_from_guest_memory(&guest_memory);
622
623 let page_handler = match PageHandler::create(
624 &swap_file,
625 &staging_shmem,
626 ®ions,
627 worker.channel.clone(),
628 ) {
629 Ok(page_handler) => page_handler,
630 Err(e) => {
631 error!("failed to create swap handler: {:?}", e);
632 continue;
633 }
634 };
635
636 // TODO(b/272634283): Should just disable vmm-swap without crash.
637 // Safe because the regions are from guest memory and uffd_list contains all
638 // the processes of crosvm.
639 unsafe { register_regions(®ions, uffd_list.get_list()) }
640 .context("register regions")?;
641
642 // events may contain unprocessed entries, but those pending events will be
643 // immediately re-created when handle_vmm_swap checks wait_ctx because
644 // WaitContext is level triggered.
645 drop(events);
646
647 let mutex_transition = Mutex::new(state_transition);
648
649 bg_job_control.reset()?;
650 let exit = std::thread::scope(|scope| {
651 let exit = handle_vmm_swap(
652 scope,
653 &wait_ctx,
654 &page_handler,
655 &uffd_list,
656 &guest_memory,
657 &command_tube,
658 &worker,
659 &mutex_transition,
660 &bg_job_control,
661 #[cfg(feature = "log_page_fault")]
662 &mut page_fault_logger,
663 );
664 // Abort background jobs to unblock ScopedJoinHandle eariler on a
665 // failure.
666 bg_job_control.abort();
667 exit
668 })?;
669 if exit {
670 return Ok(());
671 }
672 state_transition = mutex_transition.into_inner();
673
674 unregister_regions(®ions, uffd_list.get_list())
675 .context("unregister regions")?;
676
677 // Truncate the swap file to hold minimum resources while disabled.
678 if let Err(e) = swap_file.set_len(0) {
679 error!("failed to clear swap file: {:?}", e);
680 };
681
682 info!("vmm-swap is disabled");
683 // events are obsolete. Run `WaitContext::wait()` again
684 break;
685 }
686 Command::Trim => {
687 warn!("swap trim while disabled");
688 }
689 Command::SwapOut => {
690 warn!("swap out while disabled");
691 }
692 Command::Disable => {
693 warn!("swap is already disabled");
694 }
695 Command::Exit => {
696 return Ok(());
697 }
698 Command::Status => {
699 let status = Status::disabled(&state_transition);
700 command_tube.send(&status).context("send status response")?;
701 info!("swap status: {:?}", status);
702 }
703 },
704 Token::BackgroundJobCompleted => {
705 error!("unexpected background job completed event while swap is disabled");
706 bg_job_control.reset()?;
707 }
708 };
709 }
710 }
711 }
712
713 enum SwapState<'scope> {
714 SwapOutPending,
715 Trim(ScopedJoinHandle<'scope, anyhow::Result<()>>),
716 SwapOutInProgress { started_time: Instant },
717 SwapOutCompleted,
718 SwapInInProgress(ScopedJoinHandle<'scope, anyhow::Result<()>>),
719 Failed,
720 }
721
handle_enable_command<'scope>( state: SwapState, bg_job_control: &BackgroundJobControl, page_handler: &PageHandler, guest_memory: &GuestMemory, worker: &Worker<MoveToStaging>, state_transition: &Mutex<StateTransition>, ) -> anyhow::Result<SwapState<'scope>>722 fn handle_enable_command<'scope>(
723 state: SwapState,
724 bg_job_control: &BackgroundJobControl,
725 page_handler: &PageHandler,
726 guest_memory: &GuestMemory,
727 worker: &Worker<MoveToStaging>,
728 state_transition: &Mutex<StateTransition>,
729 ) -> anyhow::Result<SwapState<'scope>> {
730 match state {
731 SwapState::SwapInInProgress(join_handle) => {
732 info!("abort swap-in");
733 abort_background_job(join_handle, bg_job_control).context("abort swap-in")?;
734 }
735 SwapState::Trim(join_handle) => {
736 info!("abort trim");
737 abort_background_job(join_handle, bg_job_control).context("abort trim")?;
738 }
739 _ => {}
740 }
741
742 info!("start moving memory to staging");
743 match move_guest_to_staging(page_handler, guest_memory, worker) {
744 Ok(new_state_transition) => {
745 info!(
746 "move {} pages to staging in {} ms",
747 new_state_transition.pages, new_state_transition.time_ms
748 );
749 *state_transition.lock() = new_state_transition;
750 Ok(SwapState::SwapOutPending)
751 }
752 Err(e) => {
753 error!("failed to move memory to staging: {}", e);
754 *state_transition.lock() = StateTransition::default();
755 Ok(SwapState::Failed)
756 }
757 }
758 }
759
move_guest_to_staging( page_handler: &PageHandler, guest_memory: &GuestMemory, worker: &Worker<MoveToStaging>, ) -> anyhow::Result<StateTransition>760 fn move_guest_to_staging(
761 page_handler: &PageHandler,
762 guest_memory: &GuestMemory,
763 worker: &Worker<MoveToStaging>,
764 ) -> anyhow::Result<StateTransition> {
765 let start_time = std::time::Instant::now();
766
767 let mut pages = 0;
768
769 let result = guest_memory.with_regions::<_, anyhow::Error>(
770 |MemoryRegionInformation {
771 host_addr,
772 shm,
773 shm_offset,
774 ..
775 }| {
776 // safe because:
777 // * all the regions are registered to all userfaultfd
778 // * no process access the guest memory
779 // * page fault events are handled by PageHandler
780 // * wait for all the copy completed within _processes_guard
781 pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
782 .context("move to staging")?;
783 Ok(())
784 },
785 );
786 worker.channel.wait_complete();
787
788 match result {
789 Ok(()) => {
790 if page_handler.compute_resident_pages() > 0 {
791 error!(
792 "active page is not zero just after swap out but {} pages",
793 page_handler.compute_resident_pages()
794 );
795 }
796 let time_ms = start_time.elapsed().as_millis();
797 Ok(StateTransition { pages, time_ms })
798 }
799 Err(e) => Err(e),
800 }
801 }
802
abort_background_job<T>( join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>, bg_job_control: &BackgroundJobControl, ) -> anyhow::Result<T>803 fn abort_background_job<T>(
804 join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>,
805 bg_job_control: &BackgroundJobControl,
806 ) -> anyhow::Result<T> {
807 bg_job_control.abort();
808 // Wait until the background job is aborted and the thread finishes.
809 let result = join_handle
810 .join()
811 .expect("panic on the background job thread");
812 bg_job_control.reset().context("reset swap in event")?;
813 result.context("failure on background job thread")
814 }
815
handle_vmm_swap<'scope, 'env>( scope: &'scope Scope<'scope, 'env>, wait_ctx: &WaitContext<Token>, page_handler: &'env PageHandler<'env>, uffd_list: &'env UffdList, guest_memory: &GuestMemory, command_tube: &Tube, worker: &Worker<MoveToStaging>, state_transition: &'env Mutex<StateTransition>, bg_job_control: &'env BackgroundJobControl, #[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger, ) -> anyhow::Result<bool>816 fn handle_vmm_swap<'scope, 'env>(
817 scope: &'scope Scope<'scope, 'env>,
818 wait_ctx: &WaitContext<Token>,
819 page_handler: &'env PageHandler<'env>,
820 uffd_list: &'env UffdList,
821 guest_memory: &GuestMemory,
822 command_tube: &Tube,
823 worker: &Worker<MoveToStaging>,
824 state_transition: &'env Mutex<StateTransition>,
825 bg_job_control: &'env BackgroundJobControl,
826 #[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger,
827 ) -> anyhow::Result<bool> {
828 let mut state = match move_guest_to_staging(page_handler, guest_memory, worker) {
829 Ok(transition) => {
830 info!(
831 "move {} pages to staging in {} ms",
832 transition.pages, transition.time_ms
833 );
834 *state_transition.lock() = transition;
835 SwapState::SwapOutPending
836 }
837 Err(e) => {
838 error!("failed to move memory to staging: {}", e);
839 *state_transition.lock() = StateTransition::default();
840 SwapState::Failed
841 }
842 };
843 command_tube
844 .send(&Status::dummy())
845 .context("send enable finish signal")?;
846
847 loop {
848 let events = match &state {
849 SwapState::SwapOutInProgress { started_time } => {
850 let events = wait_ctx
851 .wait_timeout(Duration::ZERO)
852 .context("wait poll events")?;
853
854 // TODO(b/273129441): swap out on a background thread.
855 // Proceed swap out only when there is no page fault (or other) events.
856 if events.is_empty() {
857 match page_handler.swap_out(MAX_SWAP_CHUNK_SIZE) {
858 Ok(num_pages) => {
859 let mut state_transition = state_transition.lock();
860 state_transition.pages += num_pages;
861 state_transition.time_ms = started_time.elapsed().as_millis();
862 if num_pages == 0 {
863 info!(
864 "swap out all {} pages to file in {} ms",
865 state_transition.pages, state_transition.time_ms
866 );
867 state = SwapState::SwapOutCompleted;
868 }
869 }
870 Err(e) => {
871 error!("failed to swap out: {:?}", e);
872 state = SwapState::Failed;
873 *state_transition.lock() = StateTransition::default();
874 }
875 }
876 continue;
877 }
878
879 events
880 }
881 _ => wait_ctx.wait().context("wait poll events")?,
882 };
883
884 for event in events.iter() {
885 match event.token {
886 Token::UffdEvents(id_uffd) => {
887 let uffd = uffd_list
888 .get(id_uffd)
889 .with_context(|| format!("uffd is not found for idx: {}", id_uffd))?;
890 // Userfaultfd does not work as level triggered but as edge triggered. We need
891 // to read all the events in the userfaultfd here.
892 // TODO(kawasin): Use [userfaultfd::Uffd::read_events()] for performance.
893 while let Some(event) = uffd.read_event().context("read userfaultfd event")? {
894 match event {
895 UffdEvent::Pagefault { addr, .. } => {
896 #[cfg(feature = "log_page_fault")]
897 page_fault_logger.log_page_fault(addr as usize, id_uffd);
898 page_handler
899 .handle_page_fault(uffd, addr as usize)
900 .context("handle fault")?;
901 }
902 UffdEvent::Remove { start, end } => {
903 page_handler
904 .handle_page_remove(start as usize, end as usize)
905 .context("handle fault")?;
906 }
907 event => {
908 bail!("unsupported UffdEvent: {:?}", event);
909 }
910 }
911 }
912 }
913 Token::Command => match command_tube
914 .recv::<Command>()
915 .context("recv swap command")?
916 {
917 Command::ProcessForked(raw_descriptor) => {
918 debug!("new fork uffd: {:?}", raw_descriptor);
919 // TODO(b/266898615): The forked processes must wait running until the
920 // regions are registered to the new uffd if vmm-swap is already enabled.
921 // There are currently no use cases for swap + hotplug, so this is currently
922 // not implemented.
923 bail!("child process is forked while swap is enabled");
924 }
925 Command::Enable => {
926 let result = handle_enable_command(
927 state,
928 bg_job_control,
929 page_handler,
930 guest_memory,
931 worker,
932 state_transition,
933 );
934 command_tube
935 .send(&Status::dummy())
936 .context("send enable finish signal")?;
937 state = result?;
938 }
939 Command::Trim => match &state {
940 SwapState::SwapOutPending => {
941 *state_transition.lock() = StateTransition::default();
942 let join_handle = scope.spawn(|| {
943 let mut ctx = page_handler.start_trim();
944 let job = bg_job_control.new_job();
945 let start_time = std::time::Instant::now();
946
947 while !job.is_aborted() {
948 if let Some(trimmed_pages) =
949 ctx.trim_pages(MAX_TRIM_PAGES).context("trim pages")?
950 {
951 let mut state_transition = state_transition.lock();
952 state_transition.pages += trimmed_pages;
953 state_transition.time_ms = start_time.elapsed().as_millis();
954 } else {
955 // Traversed all pages.
956 break;
957 }
958 }
959
960 if job.is_aborted() {
961 info!("trim is aborted");
962 } else {
963 info!(
964 "trimmed {} clean pages and {} zero pages",
965 ctx.trimmed_clean_pages(),
966 ctx.trimmed_zero_pages()
967 );
968 }
969 Ok(())
970 });
971
972 state = SwapState::Trim(join_handle);
973 info!("start trimming staging memory");
974 }
975 state => {
976 warn!("swap trim is not ready. state: {:?}", State::from(state));
977 }
978 },
979 Command::SwapOut => match &state {
980 SwapState::SwapOutPending => {
981 state = SwapState::SwapOutInProgress {
982 started_time: std::time::Instant::now(),
983 };
984 *state_transition.lock() = StateTransition::default();
985 info!("start swapping out");
986 }
987 state => {
988 warn!("swap out is not ready. state: {:?}", State::from(state));
989 }
990 },
991 Command::Disable => {
992 match state {
993 SwapState::Trim(join_handle) => {
994 info!("abort trim");
995 abort_background_job(join_handle, bg_job_control)
996 .context("abort trim")?;
997 }
998 SwapState::SwapOutInProgress { .. } => {
999 info!("swap out is aborted");
1000 }
1001 SwapState::SwapInInProgress(_) => {
1002 info!("swap in is in progress");
1003 continue;
1004 }
1005 _ => {}
1006 }
1007 *state_transition.lock() = StateTransition::default();
1008
1009 let join_handle = scope.spawn(|| {
1010 let mut ctx = page_handler.start_swap_in();
1011 let uffd = uffd_list.main_uffd();
1012 let job = bg_job_control.new_job();
1013 let start_time = std::time::Instant::now();
1014 while !job.is_aborted() {
1015 match ctx.swap_in(uffd, MAX_SWAP_CHUNK_SIZE) {
1016 Ok(num_pages) => {
1017 if num_pages == 0 {
1018 break;
1019 }
1020 let mut state_transition = state_transition.lock();
1021 state_transition.pages += num_pages;
1022 state_transition.time_ms = start_time.elapsed().as_millis();
1023 }
1024 Err(e) => {
1025 bail!("failed to swap in: {:?}", e);
1026 }
1027 }
1028 }
1029 if job.is_aborted() {
1030 info!("swap in is aborted");
1031 }
1032 Ok(())
1033 });
1034 state = SwapState::SwapInInProgress(join_handle);
1035
1036 info!("start swapping in");
1037 }
1038 Command::Exit => {
1039 match state {
1040 SwapState::SwapInInProgress(join_handle) => {
1041 // Wait until swap-in finishes.
1042 if let Err(e) = join_handle.join() {
1043 bail!("failed to join swap in thread: {:?}", e);
1044 }
1045 return Ok(true);
1046 }
1047 SwapState::Trim(join_handle) => {
1048 abort_background_job(join_handle, bg_job_control)
1049 .context("abort trim")?;
1050 }
1051 _ => {}
1052 }
1053 let mut ctx = page_handler.start_swap_in();
1054 let uffd = uffd_list.main_uffd();
1055 // Swap-in all before exit.
1056 while ctx.swap_in(uffd, MAX_SWAP_CHUNK_SIZE).context("swap in")? > 0 {}
1057 return Ok(true);
1058 }
1059 Command::Status => {
1060 let status = Status::new(&state, *state_transition.lock(), page_handler);
1061 command_tube.send(&status).context("send status response")?;
1062 info!("swap status: {:?}", status);
1063 }
1064 },
1065 Token::BackgroundJobCompleted => {
1066 // Reset the completed event.
1067 if !bg_job_control
1068 .reset()
1069 .context("reset background job event")?
1070 {
1071 // When the job is aborted and the event is consumed by reset(), the token
1072 // `Token::BackgroundJobCompleted` may remain in the `events`. Just ignore
1073 // the obsolete token here.
1074 continue;
1075 }
1076 match state {
1077 SwapState::SwapInInProgress(join_handle) => {
1078 join_handle
1079 .join()
1080 .expect("panic on the background job thread")
1081 .context("swap in finish")?;
1082 let state_transition = state_transition.lock();
1083 info!(
1084 "swap in all {} pages in {} ms.",
1085 state_transition.pages, state_transition.time_ms
1086 );
1087 return Ok(false);
1088 }
1089 SwapState::Trim(join_handle) => {
1090 join_handle
1091 .join()
1092 .expect("panic on the background job thread")
1093 .context("trim finish")?;
1094 let state_transition = state_transition.lock();
1095 info!(
1096 "trimmed {} pages in {} ms.",
1097 state_transition.pages, state_transition.time_ms
1098 );
1099 state = SwapState::SwapOutPending;
1100 }
1101 state => {
1102 bail!(
1103 "background job completed but the actual state is {:?}",
1104 State::from(&state)
1105 );
1106 }
1107 }
1108 }
1109 };
1110 }
1111 }
1112 }
1113