• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! crate for the vmm-swap feature.
6 
7 #![cfg(unix)]
8 #![deny(missing_docs)]
9 
10 mod file;
11 mod logger;
12 mod pagesize;
13 mod present_list;
14 // this is public only for integration tests.
15 pub mod page_handler;
16 mod processes;
17 mod staging;
18 // this is public only for integration tests.
19 pub mod userfaultfd;
20 // this is public only for integration tests.
21 pub mod worker;
22 
23 use std::fs::File;
24 use std::fs::OpenOptions;
25 use std::io::stderr;
26 use std::io::stdout;
27 use std::ops::Range;
28 use std::os::unix::fs::OpenOptionsExt;
29 use std::path::Path;
30 use std::thread::Scope;
31 use std::thread::ScopedJoinHandle;
32 use std::time::Duration;
33 use std::time::Instant;
34 
35 use anyhow::bail;
36 use anyhow::Context;
37 use base::debug;
38 use base::error;
39 use base::info;
40 use base::syslog;
41 use base::unix::process::fork_process;
42 use base::unix::process::Child;
43 use base::warn;
44 use base::AsRawDescriptor;
45 use base::AsRawDescriptors;
46 use base::EventToken;
47 use base::FromRawDescriptor;
48 use base::RawDescriptor;
49 use base::SharedMemory;
50 use base::Tube;
51 use base::WaitContext;
52 use jail::create_base_minijail;
53 use jail::create_sandbox_minijail;
54 use jail::JailConfig;
55 use jail::SandboxConfig;
56 use jail::MAX_OPEN_FILES_DEFAULT;
57 use once_cell::sync::Lazy;
58 use serde::Deserialize;
59 use serde::Serialize;
60 use sync::Mutex;
61 use vm_memory::GuestMemory;
62 use vm_memory::MemoryRegionInformation;
63 
64 #[cfg(feature = "log_page_fault")]
65 use crate::logger::PageFaultEventLogger;
66 use crate::page_handler::MoveToStaging;
67 use crate::page_handler::PageHandler;
68 use crate::page_handler::MLOCK_BUDGET;
69 use crate::pagesize::THP_SIZE;
70 use crate::processes::freeze_child_processes;
71 use crate::processes::ProcessesGuard;
72 use crate::userfaultfd::register_regions;
73 use crate::userfaultfd::unregister_regions;
74 use crate::userfaultfd::Factory as UffdFactory;
75 use crate::userfaultfd::UffdEvent;
76 use crate::userfaultfd::Userfaultfd;
77 use crate::worker::BackgroundJobControl;
78 use crate::worker::Worker;
79 
/// The max size of chunks to swap out/in at once.
const MAX_SWAP_CHUNK_SIZE: usize = 2 * 1024 * 1024; // = 2 MiB
/// The max pages to trim at once.
const MAX_TRIM_PAGES: usize = 1024;
84 
85 /// Current state of vmm-swap.
86 ///
87 /// This should not contain fields but be a plain enum because this will be displayed to user using
88 /// `serde_json` crate.
89 #[derive(Serialize, Deserialize, Debug, Clone)]
90 pub enum State {
91     /// vmm-swap is ready. userfaultfd is disabled until vmm-swap is enabled.
92     Ready,
93     /// Pages in guest memory are moved to the staging memory.
94     Pending,
95     /// Trimming staging memory.
96     TrimInProgress,
97     /// swap-out is in progress.
98     SwapOutInProgress,
99     /// swap out succeeded.
100     Active,
101     /// swap-in is in progress.
102     SwapInInProgress,
103     /// swap out failed.
104     Failed,
105 }
106 
107 impl From<&SwapState<'_>> for State {
from(state: &SwapState<'_>) -> Self108     fn from(state: &SwapState<'_>) -> Self {
109         match state {
110             SwapState::SwapOutPending => State::Pending,
111             SwapState::Trim(_) => State::TrimInProgress,
112             SwapState::SwapOutInProgress { .. } => State::SwapOutInProgress,
113             SwapState::SwapOutCompleted => State::Active,
114             SwapState::SwapInInProgress(_) => State::SwapInInProgress,
115             SwapState::Failed => State::Failed,
116         }
117     }
118 }
119 
/// Latency and number of pages of swap operations (move to staging, swap out, swap in).
///
/// The meaning of `StateTransition` depends on `State`.
///
/// | `State`             | `StateTransition`                            |
/// |---------------------|----------------------------------------------|
/// | `Ready`             | empty or transition record of `swap disable` |
/// | `Pending`           | transition record of `swap enable`           |
/// | `SwapOutInProgress` | transition record of `swap out`              |
/// | `Active`            | transition record of `swap out`              |
/// | `SwapInInProgress`  | transition record of `swap disable`          |
/// | `Failed`            | empty                                        |
// NOTE(review): `State::TrimInProgress` has no row in the table above — confirm which record is
// reported while trimming and add it.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Default)]
pub struct StateTransition {
    /// The number of pages moved for the state transition.
    pages: usize,
    /// Time taken for the state transition, in milliseconds.
    time_ms: u128,
}
139 
140 /// Current metrics of vmm-swap.
141 ///
142 /// This is only available while vmm-swap is enabled.
143 #[derive(Serialize, Deserialize, Debug, Clone, Default)]
144 pub struct Metrics {
145     /// count of pages on RAM.
146     resident_pages: usize,
147     /// count of pages copied from the vmm-swap file.
148     copied_from_file_pages: usize,
149     /// count of pages copied from the staging memory.
150     copied_from_staging_pages: usize,
151     /// count of pages initialized with zero.
152     zeroed_pages: usize,
153     /// count of pages which were already initialized on page faults. This can happen when several
154     /// threads/processes access the uninitialized/removed page at the same time.
155     redundant_pages: usize,
156     /// count of pages in staging memory.
157     staging_pages: usize,
158     /// count of pages in swap files.
159     swap_pages: usize,
160 }
161 
162 impl Metrics {
new(page_handler: &PageHandler) -> Self163     fn new(page_handler: &PageHandler) -> Self {
164         Self {
165             resident_pages: page_handler.compute_resident_pages(),
166             copied_from_file_pages: page_handler.compute_copied_from_file_pages(),
167             copied_from_staging_pages: page_handler.compute_copied_from_staging_pages(),
168             zeroed_pages: page_handler.compute_zeroed_pages(),
169             redundant_pages: page_handler.compute_redundant_pages(),
170             staging_pages: page_handler.compute_staging_pages(),
171             swap_pages: page_handler.compute_swap_pages(),
172         }
173     }
174 }
175 
176 /// The response to `crosvm swap status` command.
177 #[derive(Serialize, Deserialize, Debug, Clone)]
178 pub struct Status {
179     state: State,
180     metrics: Metrics,
181     state_transition: StateTransition,
182 }
183 
184 impl Status {
new( state: &SwapState, state_transition: StateTransition, page_handler: &PageHandler, ) -> Self185     fn new(
186         state: &SwapState,
187         state_transition: StateTransition,
188         page_handler: &PageHandler,
189     ) -> Self {
190         Status {
191             state: state.into(),
192             metrics: Metrics::new(page_handler),
193             state_transition,
194         }
195     }
196 
disabled(state_transition: &StateTransition) -> Self197     fn disabled(state_transition: &StateTransition) -> Self {
198         Status {
199             state: State::Ready,
200             metrics: Metrics::default(),
201             state_transition: *state_transition,
202         }
203     }
204 
dummy() -> Self205     fn dummy() -> Self {
206         Status {
207             state: State::Pending,
208             metrics: Metrics::default(),
209             state_transition: StateTransition::default(),
210         }
211     }
212 }
213 
/// Commands used in vmm-swap feature internally sent to the monitor process from the main and other
/// processes.
///
/// This is mainly originated from the `crosvm swap <command>` command line.
#[derive(Serialize, Deserialize, Debug)]
enum Command {
    // Sent by `SwapController::enable()`, which then waits for a `Status` reply.
    Enable,
    // Sent by `SwapController::trim()`.
    Trim,
    // Sent by `SwapController::swap_out()`.
    SwapOut,
    // Sent by `SwapController::disable()`.
    Disable,
    // Sent by `SwapController::exit()`; the monitor's main loop returns when it receives this.
    Exit,
    // Sent by `SwapController::status()`; answered with a `Status`.
    Status,
    // Carries the userfaultfd created in `SwapController::on_process_forked()`.
    #[serde(with = "base::platform::with_raw_descriptor")]
    ProcessForked(RawDescriptor),
}
229 
230 /// [SwapController] provides APIs to control vmm-swap.
231 pub struct SwapController {
232     child_process: Child,
233     uffd_factory: UffdFactory,
234     command_tube: Tube,
235 }
236 
237 impl SwapController {
238     /// Launch a monitor process for vmm-swap and return a controller.
239     ///
240     /// Pages on the [GuestMemory] are registered to userfaultfd to track pagefault events.
241     ///
242     /// # Arguments
243     ///
244     /// * `guest_memory` - fresh new [GuestMemory]. Any pages on the [GuestMemory] must not be
245     ///   touched.
246     /// * `swap_dir` - directory to store swap files.
launch( guest_memory: GuestMemory, swap_dir: &Path, jail_config: &Option<JailConfig>, ) -> anyhow::Result<Self>247     pub fn launch(
248         guest_memory: GuestMemory,
249         swap_dir: &Path,
250         jail_config: &Option<JailConfig>,
251     ) -> anyhow::Result<Self> {
252         info!("vmm-swap is enabled. launch monitor process.");
253 
254         let uffd_factory = UffdFactory::new();
255         let uffd = uffd_factory.create().context("create userfaultfd")?;
256 
257         // The swap file is created as `O_TMPFILE` from the specified directory. As benefits:
258         //
259         // * it has no chance to conflict.
260         // * it has a security benefit that no one (except root) can access the swap file.
261         // * it will be automatically deleted by the kernel when crosvm exits/dies or on reboot if
262         //   the device panics/hard-resets while crosvm is running.
263         let swap_file = OpenOptions::new()
264             .read(true)
265             .write(true)
266             .custom_flags(libc::O_TMPFILE | libc::O_EXCL)
267             .mode(0o000) // other processes with the same uid can't open the file
268             .open(swap_dir)?;
269         // The internal tube in which [Command]s sent from other processes than the monitor process
270         // to the monitor process. The response is `Status` only.
271         let (command_tube_main, command_tube_monitor) =
272             Tube::pair().context("create swap command tube")?;
273 
274         // Allocate eventfd before creating sandbox.
275         let bg_job_control = BackgroundJobControl::new().context("create background job event")?;
276 
277         #[cfg(feature = "log_page_fault")]
278         let page_fault_logger = PageFaultEventLogger::create(&swap_dir, &guest_memory)
279             .context("create page fault logger")?;
280 
281         let mut keep_rds = vec![
282             stdout().as_raw_descriptor(),
283             stderr().as_raw_descriptor(),
284             uffd.as_raw_descriptor(),
285             swap_file.as_raw_descriptor(),
286             command_tube_monitor.as_raw_descriptor(),
287             bg_job_control.get_completion_event().as_raw_descriptor(),
288             #[cfg(feature = "log_page_fault")]
289             page_fault_logger.as_raw_descriptor(),
290         ];
291 
292         syslog::push_descriptors(&mut keep_rds);
293         cros_tracing::push_descriptors!(&mut keep_rds);
294         keep_rds.extend(guest_memory.as_raw_descriptors());
295 
296         keep_rds.extend(uffd_factory.as_raw_descriptors());
297 
298         // Load and cache transparent hugepage size from sysfs before jumping into sandbox.
299         Lazy::force(&THP_SIZE);
300 
301         let mut jail = if let Some(jail_config) = jail_config {
302             let config = SandboxConfig::new(jail_config, "swap_monitor");
303             create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)
304                 .context("create sandbox jail")?
305         } else {
306             create_base_minijail(Path::new("/"), MAX_OPEN_FILES_DEFAULT)
307                 .context("create minijail")?
308         };
309         jail.set_rlimit(
310             libc::RLIMIT_MEMLOCK as libc::c_int,
311             MLOCK_BUDGET as u64,
312             MLOCK_BUDGET as u64,
313         )
314         .context("error setting RLIMIT_MEMLOCK")?;
315 
316         // Start a page fault monitoring process (this will be the first child process of the
317         // current process)
318         let child_process =
319             fork_process(jail, keep_rds, Some(String::from("swap monitor")), || {
320                 if let Err(e) = monitor_process(
321                     command_tube_monitor,
322                     guest_memory,
323                     uffd,
324                     swap_file,
325                     bg_job_control,
326                     #[cfg(feature = "log_page_fault")]
327                     page_fault_logger,
328                 ) {
329                     panic!("page_fault_handler_thread exited with error: {:?}", e)
330                 }
331             })
332             .context("fork monitor process")?;
333 
334         // send first status request to the monitor process and wait for the response until setup on
335         // the monitor process completes.
336         command_tube_main.send(&Command::Status)?;
337         match command_tube_main
338             .recv::<Status>()
339             .context("recv initial status")?
340             .state
341         {
342             State::Ready => {
343                 // The initial state of swap status is Ready and this is a signal that the
344                 // monitoring process completes setup and is running.
345             }
346             status => {
347                 bail!("initial state is not Ready, but {:?}", status);
348             }
349         };
350 
351         Ok(Self {
352             child_process,
353             uffd_factory,
354             command_tube: command_tube_main,
355         })
356     }
357 
358     /// Enable monitoring page faults and move guest memory to staging memory.
359     ///
360     /// The pages will be swapped in from the staging memory to the guest memory on page faults
361     /// until pages are written into the swap file by [Self::swap_out()].
362     ///
363     /// This waits until enabling vmm-swap finishes on the monitor process.
364     ///
365     /// The caller must guarantee that any contents on the guest memory is not updated during
366     /// enabling vmm-swap.
367     ///
368     /// # Note
369     ///
370     /// Enabling does not write pages to the swap file. User should call [Self::swap_out()]
371     /// after a suitable time.
372     ///
373     /// Just after enabling vmm-swap, some amount of pages are swapped in as soon as guest resumes.
374     /// By splitting the enable/swap_out operation and by delaying write to the swap file operation,
375     /// it has a benefit of reducing file I/O for hot pages.
enable(&self) -> anyhow::Result<()>376     pub fn enable(&self) -> anyhow::Result<()> {
377         self.command_tube
378             .send(&Command::Enable)
379             .context("send swap enable request")?;
380 
381         let _ = self
382             .command_tube
383             .recv::<Status>()
384             .context("receive swap status")?;
385         Ok(())
386     }
387 
388     /// Trim pages in the staging memory which are needless to be written back to the swap file.
389     ///
390     /// * zero pages
391     /// * pages which are the same as the pages in the swap file.
trim(&self) -> anyhow::Result<()>392     pub fn trim(&self) -> anyhow::Result<()> {
393         self.command_tube
394             .send(&Command::Trim)
395             .context("send swap trim request")?;
396         Ok(())
397     }
398 
399     /// Swap out all the pages in the staging memory to the swap files.
400     ///
401     /// This returns as soon as it succeeds to send request to the monitor process.
402     ///
403     /// Users should call [Self::enable()] before this. See the comment of [Self::enable()] as well.
swap_out(&self) -> anyhow::Result<()>404     pub fn swap_out(&self) -> anyhow::Result<()> {
405         self.command_tube
406             .send(&Command::SwapOut)
407             .context("send swap out request")?;
408         Ok(())
409     }
410 
411     /// Swap in all the guest memory and disable monitoring page faults.
412     ///
413     /// This returns as soon as it succeeds to send request to the monitor process.
disable(&self) -> anyhow::Result<()>414     pub fn disable(&self) -> anyhow::Result<()> {
415         self.command_tube
416             .send(&Command::Disable)
417             .context("send swap disable request")?;
418         Ok(())
419     }
420 
421     /// Return current swap status.
422     ///
423     /// This blocks until response from the monitor process arrives to the main process.
status(&self) -> anyhow::Result<Status>424     pub fn status(&self) -> anyhow::Result<Status> {
425         self.command_tube
426             .send(&Command::Status)
427             .context("send swap status request")?;
428         let status = self.command_tube.recv().context("receive swap status")?;
429         Ok(status)
430     }
431 
432     /// Shutdown the monitor process.
433     ///
434     /// This blocks until the monitor process exits.
435     ///
436     /// This should be called once.
exit(self) -> anyhow::Result<()>437     pub fn exit(self) -> anyhow::Result<()> {
438         self.command_tube
439             .send(&Command::Exit)
440             .context("send exit command")?;
441         self.child_process
442             .wait()
443             .context("wait monitor process shutdown")?;
444         Ok(())
445     }
446 
447     /// Create a new userfaultfd and send it to the monitor process.
448     ///
449     /// This must be called as soon as a child process which may touch the guest memory is forked.
450     ///
451     /// Userfaultfd(2) originally has `UFFD_FEATURE_EVENT_FORK`. But it is not applicable to crosvm
452     /// since it does not support non-root user namespace.
on_process_forked(&self) -> anyhow::Result<()>453     pub fn on_process_forked(&self) -> anyhow::Result<()> {
454         let uffd = self.uffd_factory.create().context("create userfaultfd")?;
455         self.command_tube
456             .send(&Command::ProcessForked(uffd.as_raw_descriptor()))
457             .context("send forked event")?;
458         // The fd for Userfaultfd in this process is droped when this method exits, but the
459         // userfaultfd keeps alive in the monitor process which it is sent to.
460         Ok(())
461     }
462 
463     /// Suspend device processes using `SIGSTOP` signal.
464     ///
465     /// When the returned `ProcessesGuard` is dropped, the devices resume.
466     ///
467     /// This must be called from the main process.
suspend_devices(&self) -> anyhow::Result<ProcessesGuard>468     pub fn suspend_devices(&self) -> anyhow::Result<ProcessesGuard> {
469         freeze_child_processes(self.child_process.pid)
470     }
471 }
472 
473 impl AsRawDescriptors for SwapController {
as_raw_descriptors(&self) -> Vec<RawDescriptor>474     fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
475         let mut rds = self.uffd_factory.as_raw_descriptors();
476         rds.push(self.command_tube.as_raw_descriptor());
477         rds
478     }
479 }
480 
// Tokens identifying the event sources polled by the monitor process' `WaitContext`.
#[derive(EventToken)]
enum Token {
    // A userfaultfd is readable; the payload is the index of that uffd in `UffdList`.
    UffdEvents(u32),
    // A `Command` is readable on the command tube.
    Command,
    // The completion event of `BackgroundJobControl` was signaled.
    BackgroundJobCompleted,
}
487 
488 struct UffdList<'a> {
489     list: Vec<Userfaultfd>,
490     wait_ctx: &'a WaitContext<Token>,
491 }
492 
493 impl<'a> UffdList<'a> {
494     const ID_MAIN_UFFD: u32 = 0;
495 
new(main_uffd: Userfaultfd, wait_ctx: &'a WaitContext<Token>) -> Self496     fn new(main_uffd: Userfaultfd, wait_ctx: &'a WaitContext<Token>) -> Self {
497         Self {
498             list: vec![main_uffd],
499             wait_ctx,
500         }
501     }
502 
register(&mut self, uffd: Userfaultfd) -> anyhow::Result<()>503     fn register(&mut self, uffd: Userfaultfd) -> anyhow::Result<()> {
504         let id_uffd = self
505             .list
506             .len()
507             .try_into()
508             .context("too many userfaultfd forked")?;
509 
510         self.wait_ctx
511             .add(&uffd, Token::UffdEvents(id_uffd))
512             .context("add to wait context")?;
513         self.list.push(uffd);
514 
515         Ok(())
516     }
517 
get(&self, id: u32) -> Option<&Userfaultfd>518     fn get(&self, id: u32) -> Option<&Userfaultfd> {
519         self.list.get(id as usize)
520     }
521 
main_uffd(&self) -> &Userfaultfd522     fn main_uffd(&self) -> &Userfaultfd {
523         &self.list[Self::ID_MAIN_UFFD as usize]
524     }
525 
get_list(&self) -> &[Userfaultfd]526     fn get_list(&self) -> &[Userfaultfd] {
527         &self.list
528     }
529 }
530 
regions_from_guest_memory(guest_memory: &GuestMemory) -> Vec<Range<usize>>531 fn regions_from_guest_memory(guest_memory: &GuestMemory) -> Vec<Range<usize>> {
532     let mut regions = Vec::new();
533     guest_memory
534         .with_regions::<_, ()>(
535             |MemoryRegionInformation {
536                  size, host_addr, ..
537              }| {
538                 regions.push(host_addr..(host_addr + size));
539                 Ok(())
540             },
541         )
542         .unwrap(); // the callback never return error.
543     regions
544 }
545 
546 /// The main thread of the monitor process.
monitor_process( command_tube: Tube, guest_memory: GuestMemory, uffd: Userfaultfd, swap_file: File, bg_job_control: BackgroundJobControl, #[cfg(feature = "log_page_fault")] mut page_fault_logger: PageFaultEventLogger, ) -> anyhow::Result<()>547 fn monitor_process(
548     command_tube: Tube,
549     guest_memory: GuestMemory,
550     uffd: Userfaultfd,
551     swap_file: File,
552     bg_job_control: BackgroundJobControl,
553     #[cfg(feature = "log_page_fault")] mut page_fault_logger: PageFaultEventLogger,
554 ) -> anyhow::Result<()> {
555     info!("monitor_process started");
556 
557     let wait_ctx = WaitContext::build_with(&[
558         (&command_tube, Token::Command),
559         // Even though swap isn't enabled until the enable command is received, it's necessary to
560         // start waiting on the main uffd here so that uffd fork events can be processed, because
561         // child processes will block until their corresponding uffd fork event is read.
562         (&uffd, Token::UffdEvents(UffdList::ID_MAIN_UFFD)),
563         (
564             bg_job_control.get_completion_event(),
565             Token::BackgroundJobCompleted,
566         ),
567     ])
568     .context("create wait context")?;
569 
570     let n_worker = num_cpus::get();
571     info!("start {} workers for staging memory move", n_worker);
572     // The worker threads are killed when the main thread of the monitor process dies.
573     let worker = Worker::new(n_worker, n_worker);
574 
575     let mut uffd_list = UffdList::new(uffd, &wait_ctx);
576     let mut state_transition = StateTransition::default();
577 
578     loop {
579         let events = wait_ctx.wait().context("wait poll events")?;
580 
581         for event in events.iter() {
582             match event.token {
583                 Token::UffdEvents(id_uffd) => {
584                     let uffd = uffd_list
585                         .get(id_uffd)
586                         .with_context(|| format!("uffd is not found for idx: {}", id_uffd))?;
587                     // Userfaultfd does not work as level triggered but as edge triggered. We need
588                     // to read all the events in the userfaultfd here.
589                     while let Some(event) = uffd.read_event().context("read userfaultfd event")? {
590                         match event {
591                             UffdEvent::Remove { .. } => {
592                                 // BUG(b/272620051): This is a bug of userfaultfd that
593                                 // UFFD_EVENT_REMOVE can be read even after unregistering memory
594                                 // from the userfaultfd.
595                                 warn!("page remove event while vmm-swap disabled");
596                             }
597                             event => {
598                                 bail!("unexpected uffd event: {:?}", event);
599                             }
600                         }
601                     }
602                 }
603                 Token::Command => match command_tube
604                     .recv::<Command>()
605                     .context("recv swap command")?
606                 {
607                     Command::ProcessForked(raw_descriptor) => {
608                         debug!("new fork uffd: {:?}", raw_descriptor);
609                         // Safe because the raw_descriptor is sent from another process via Tube and
610                         // no one in this process owns it.
611                         let uffd = unsafe { Userfaultfd::from_raw_descriptor(raw_descriptor) };
612                         uffd_list.register(uffd).context("register forked uffd")?;
613                     }
614                     Command::Enable => {
615                         info!("enabling vmm-swap");
616 
617                         let staging_shmem =
618                             SharedMemory::new("swap staging memory", guest_memory.memory_size())
619                                 .context("create staging shmem")?;
620 
621                         let regions = regions_from_guest_memory(&guest_memory);
622 
623                         let page_handler = match PageHandler::create(
624                             &swap_file,
625                             &staging_shmem,
626                             &regions,
627                             worker.channel.clone(),
628                         ) {
629                             Ok(page_handler) => page_handler,
630                             Err(e) => {
631                                 error!("failed to create swap handler: {:?}", e);
632                                 continue;
633                             }
634                         };
635 
636                         // TODO(b/272634283): Should just disable vmm-swap without crash.
637                         // Safe because the regions are from guest memory and uffd_list contains all
638                         // the processes of crosvm.
639                         unsafe { register_regions(&regions, uffd_list.get_list()) }
640                             .context("register regions")?;
641 
642                         // events may contain unprocessed entries, but those pending events will be
643                         // immediately re-created when handle_vmm_swap checks wait_ctx because
644                         // WaitContext is level triggered.
645                         drop(events);
646 
647                         let mutex_transition = Mutex::new(state_transition);
648 
649                         bg_job_control.reset()?;
650                         let exit = std::thread::scope(|scope| {
651                             let exit = handle_vmm_swap(
652                                 scope,
653                                 &wait_ctx,
654                                 &page_handler,
655                                 &uffd_list,
656                                 &guest_memory,
657                                 &command_tube,
658                                 &worker,
659                                 &mutex_transition,
660                                 &bg_job_control,
661                                 #[cfg(feature = "log_page_fault")]
662                                 &mut page_fault_logger,
663                             );
664                             // Abort background jobs to unblock ScopedJoinHandle eariler on a
665                             // failure.
666                             bg_job_control.abort();
667                             exit
668                         })?;
669                         if exit {
670                             return Ok(());
671                         }
672                         state_transition = mutex_transition.into_inner();
673 
674                         unregister_regions(&regions, uffd_list.get_list())
675                             .context("unregister regions")?;
676 
677                         // Truncate the swap file to hold minimum resources while disabled.
678                         if let Err(e) = swap_file.set_len(0) {
679                             error!("failed to clear swap file: {:?}", e);
680                         };
681 
682                         info!("vmm-swap is disabled");
683                         // events are obsolete. Run `WaitContext::wait()` again
684                         break;
685                     }
686                     Command::Trim => {
687                         warn!("swap trim while disabled");
688                     }
689                     Command::SwapOut => {
690                         warn!("swap out while disabled");
691                     }
692                     Command::Disable => {
693                         warn!("swap is already disabled");
694                     }
695                     Command::Exit => {
696                         return Ok(());
697                     }
698                     Command::Status => {
699                         let status = Status::disabled(&state_transition);
700                         command_tube.send(&status).context("send status response")?;
701                         info!("swap status: {:?}", status);
702                     }
703                 },
704                 Token::BackgroundJobCompleted => {
705                     error!("unexpected background job completed event while swap is disabled");
706                     bg_job_control.reset()?;
707                 }
708             };
709         }
710     }
711 }
712 
// Internal state of the monitor process while vmm-swap is enabled. It is converted to the public
// `State` through `From<&SwapState>`.
enum SwapState<'scope> {
    // Guest memory was moved to staging (see `handle_enable_command`); reported as
    // `State::Pending`.
    SwapOutPending,
    // A background job is running; reported as `State::TrimInProgress`.
    Trim(ScopedJoinHandle<'scope, anyhow::Result<()>>),
    // Reported as `State::SwapOutInProgress`; records when the swap-out started.
    SwapOutInProgress { started_time: Instant },
    // Reported as `State::Active`.
    SwapOutCompleted,
    // A background job is running; reported as `State::SwapInInProgress`.
    SwapInInProgress(ScopedJoinHandle<'scope, anyhow::Result<()>>),
    // Reported as `State::Failed`.
    Failed,
}
721 
handle_enable_command<'scope>( state: SwapState, bg_job_control: &BackgroundJobControl, page_handler: &PageHandler, guest_memory: &GuestMemory, worker: &Worker<MoveToStaging>, state_transition: &Mutex<StateTransition>, ) -> anyhow::Result<SwapState<'scope>>722 fn handle_enable_command<'scope>(
723     state: SwapState,
724     bg_job_control: &BackgroundJobControl,
725     page_handler: &PageHandler,
726     guest_memory: &GuestMemory,
727     worker: &Worker<MoveToStaging>,
728     state_transition: &Mutex<StateTransition>,
729 ) -> anyhow::Result<SwapState<'scope>> {
730     match state {
731         SwapState::SwapInInProgress(join_handle) => {
732             info!("abort swap-in");
733             abort_background_job(join_handle, bg_job_control).context("abort swap-in")?;
734         }
735         SwapState::Trim(join_handle) => {
736             info!("abort trim");
737             abort_background_job(join_handle, bg_job_control).context("abort trim")?;
738         }
739         _ => {}
740     }
741 
742     info!("start moving memory to staging");
743     match move_guest_to_staging(page_handler, guest_memory, worker) {
744         Ok(new_state_transition) => {
745             info!(
746                 "move {} pages to staging in {} ms",
747                 new_state_transition.pages, new_state_transition.time_ms
748             );
749             *state_transition.lock() = new_state_transition;
750             Ok(SwapState::SwapOutPending)
751         }
752         Err(e) => {
753             error!("failed to move memory to staging: {}", e);
754             *state_transition.lock() = StateTransition::default();
755             Ok(SwapState::Failed)
756         }
757     }
758 }
759 
move_guest_to_staging( page_handler: &PageHandler, guest_memory: &GuestMemory, worker: &Worker<MoveToStaging>, ) -> anyhow::Result<StateTransition>760 fn move_guest_to_staging(
761     page_handler: &PageHandler,
762     guest_memory: &GuestMemory,
763     worker: &Worker<MoveToStaging>,
764 ) -> anyhow::Result<StateTransition> {
765     let start_time = std::time::Instant::now();
766 
767     let mut pages = 0;
768 
769     let result = guest_memory.with_regions::<_, anyhow::Error>(
770         |MemoryRegionInformation {
771              host_addr,
772              shm,
773              shm_offset,
774              ..
775          }| {
776             // safe because:
777             // * all the regions are registered to all userfaultfd
778             // * no process access the guest memory
779             // * page fault events are handled by PageHandler
780             // * wait for all the copy completed within _processes_guard
781             pages += unsafe { page_handler.move_to_staging(host_addr, shm, shm_offset) }
782                 .context("move to staging")?;
783             Ok(())
784         },
785     );
786     worker.channel.wait_complete();
787 
788     match result {
789         Ok(()) => {
790             if page_handler.compute_resident_pages() > 0 {
791                 error!(
792                     "active page is not zero just after swap out but {} pages",
793                     page_handler.compute_resident_pages()
794                 );
795             }
796             let time_ms = start_time.elapsed().as_millis();
797             Ok(StateTransition { pages, time_ms })
798         }
799         Err(e) => Err(e),
800     }
801 }
802 
abort_background_job<T>( join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>, bg_job_control: &BackgroundJobControl, ) -> anyhow::Result<T>803 fn abort_background_job<T>(
804     join_handle: ScopedJoinHandle<'_, anyhow::Result<T>>,
805     bg_job_control: &BackgroundJobControl,
806 ) -> anyhow::Result<T> {
807     bg_job_control.abort();
808     // Wait until the background job is aborted and the thread finishes.
809     let result = join_handle
810         .join()
811         .expect("panic on the background job thread");
812     bg_job_control.reset().context("reset swap in event")?;
813     result.context("failure on background job thread")
814 }
815 
handle_vmm_swap<'scope, 'env>( scope: &'scope Scope<'scope, 'env>, wait_ctx: &WaitContext<Token>, page_handler: &'env PageHandler<'env>, uffd_list: &'env UffdList, guest_memory: &GuestMemory, command_tube: &Tube, worker: &Worker<MoveToStaging>, state_transition: &'env Mutex<StateTransition>, bg_job_control: &'env BackgroundJobControl, #[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger, ) -> anyhow::Result<bool>816 fn handle_vmm_swap<'scope, 'env>(
817     scope: &'scope Scope<'scope, 'env>,
818     wait_ctx: &WaitContext<Token>,
819     page_handler: &'env PageHandler<'env>,
820     uffd_list: &'env UffdList,
821     guest_memory: &GuestMemory,
822     command_tube: &Tube,
823     worker: &Worker<MoveToStaging>,
824     state_transition: &'env Mutex<StateTransition>,
825     bg_job_control: &'env BackgroundJobControl,
826     #[cfg(feature = "log_page_fault")] page_fault_logger: &mut PageFaultEventLogger,
827 ) -> anyhow::Result<bool> {
828     let mut state = match move_guest_to_staging(page_handler, guest_memory, worker) {
829         Ok(transition) => {
830             info!(
831                 "move {} pages to staging in {} ms",
832                 transition.pages, transition.time_ms
833             );
834             *state_transition.lock() = transition;
835             SwapState::SwapOutPending
836         }
837         Err(e) => {
838             error!("failed to move memory to staging: {}", e);
839             *state_transition.lock() = StateTransition::default();
840             SwapState::Failed
841         }
842     };
843     command_tube
844         .send(&Status::dummy())
845         .context("send enable finish signal")?;
846 
847     loop {
848         let events = match &state {
849             SwapState::SwapOutInProgress { started_time } => {
850                 let events = wait_ctx
851                     .wait_timeout(Duration::ZERO)
852                     .context("wait poll events")?;
853 
854                 // TODO(b/273129441): swap out on a background thread.
855                 // Proceed swap out only when there is no page fault (or other) events.
856                 if events.is_empty() {
857                     match page_handler.swap_out(MAX_SWAP_CHUNK_SIZE) {
858                         Ok(num_pages) => {
859                             let mut state_transition = state_transition.lock();
860                             state_transition.pages += num_pages;
861                             state_transition.time_ms = started_time.elapsed().as_millis();
862                             if num_pages == 0 {
863                                 info!(
864                                     "swap out all {} pages to file in {} ms",
865                                     state_transition.pages, state_transition.time_ms
866                                 );
867                                 state = SwapState::SwapOutCompleted;
868                             }
869                         }
870                         Err(e) => {
871                             error!("failed to swap out: {:?}", e);
872                             state = SwapState::Failed;
873                             *state_transition.lock() = StateTransition::default();
874                         }
875                     }
876                     continue;
877                 }
878 
879                 events
880             }
881             _ => wait_ctx.wait().context("wait poll events")?,
882         };
883 
884         for event in events.iter() {
885             match event.token {
886                 Token::UffdEvents(id_uffd) => {
887                     let uffd = uffd_list
888                         .get(id_uffd)
889                         .with_context(|| format!("uffd is not found for idx: {}", id_uffd))?;
890                     // Userfaultfd does not work as level triggered but as edge triggered. We need
891                     // to read all the events in the userfaultfd here.
892                     // TODO(kawasin): Use [userfaultfd::Uffd::read_events()] for performance.
893                     while let Some(event) = uffd.read_event().context("read userfaultfd event")? {
894                         match event {
895                             UffdEvent::Pagefault { addr, .. } => {
896                                 #[cfg(feature = "log_page_fault")]
897                                 page_fault_logger.log_page_fault(addr as usize, id_uffd);
898                                 page_handler
899                                     .handle_page_fault(uffd, addr as usize)
900                                     .context("handle fault")?;
901                             }
902                             UffdEvent::Remove { start, end } => {
903                                 page_handler
904                                     .handle_page_remove(start as usize, end as usize)
905                                     .context("handle fault")?;
906                             }
907                             event => {
908                                 bail!("unsupported UffdEvent: {:?}", event);
909                             }
910                         }
911                     }
912                 }
913                 Token::Command => match command_tube
914                     .recv::<Command>()
915                     .context("recv swap command")?
916                 {
917                     Command::ProcessForked(raw_descriptor) => {
918                         debug!("new fork uffd: {:?}", raw_descriptor);
919                         // TODO(b/266898615): The forked processes must wait running until the
920                         // regions are registered to the new uffd if vmm-swap is already enabled.
921                         // There are currently no use cases for swap + hotplug, so this is currently
922                         // not implemented.
923                         bail!("child process is forked while swap is enabled");
924                     }
925                     Command::Enable => {
926                         let result = handle_enable_command(
927                             state,
928                             bg_job_control,
929                             page_handler,
930                             guest_memory,
931                             worker,
932                             state_transition,
933                         );
934                         command_tube
935                             .send(&Status::dummy())
936                             .context("send enable finish signal")?;
937                         state = result?;
938                     }
939                     Command::Trim => match &state {
940                         SwapState::SwapOutPending => {
941                             *state_transition.lock() = StateTransition::default();
942                             let join_handle = scope.spawn(|| {
943                                 let mut ctx = page_handler.start_trim();
944                                 let job = bg_job_control.new_job();
945                                 let start_time = std::time::Instant::now();
946 
947                                 while !job.is_aborted() {
948                                     if let Some(trimmed_pages) =
949                                         ctx.trim_pages(MAX_TRIM_PAGES).context("trim pages")?
950                                     {
951                                         let mut state_transition = state_transition.lock();
952                                         state_transition.pages += trimmed_pages;
953                                         state_transition.time_ms = start_time.elapsed().as_millis();
954                                     } else {
955                                         // Traversed all pages.
956                                         break;
957                                     }
958                                 }
959 
960                                 if job.is_aborted() {
961                                     info!("trim is aborted");
962                                 } else {
963                                     info!(
964                                         "trimmed {} clean pages and {} zero pages",
965                                         ctx.trimmed_clean_pages(),
966                                         ctx.trimmed_zero_pages()
967                                     );
968                                 }
969                                 Ok(())
970                             });
971 
972                             state = SwapState::Trim(join_handle);
973                             info!("start trimming staging memory");
974                         }
975                         state => {
976                             warn!("swap trim is not ready. state: {:?}", State::from(state));
977                         }
978                     },
979                     Command::SwapOut => match &state {
980                         SwapState::SwapOutPending => {
981                             state = SwapState::SwapOutInProgress {
982                                 started_time: std::time::Instant::now(),
983                             };
984                             *state_transition.lock() = StateTransition::default();
985                             info!("start swapping out");
986                         }
987                         state => {
988                             warn!("swap out is not ready. state: {:?}", State::from(state));
989                         }
990                     },
991                     Command::Disable => {
992                         match state {
993                             SwapState::Trim(join_handle) => {
994                                 info!("abort trim");
995                                 abort_background_job(join_handle, bg_job_control)
996                                     .context("abort trim")?;
997                             }
998                             SwapState::SwapOutInProgress { .. } => {
999                                 info!("swap out is aborted");
1000                             }
1001                             SwapState::SwapInInProgress(_) => {
1002                                 info!("swap in is in progress");
1003                                 continue;
1004                             }
1005                             _ => {}
1006                         }
1007                         *state_transition.lock() = StateTransition::default();
1008 
1009                         let join_handle = scope.spawn(|| {
1010                             let mut ctx = page_handler.start_swap_in();
1011                             let uffd = uffd_list.main_uffd();
1012                             let job = bg_job_control.new_job();
1013                             let start_time = std::time::Instant::now();
1014                             while !job.is_aborted() {
1015                                 match ctx.swap_in(uffd, MAX_SWAP_CHUNK_SIZE) {
1016                                     Ok(num_pages) => {
1017                                         if num_pages == 0 {
1018                                             break;
1019                                         }
1020                                         let mut state_transition = state_transition.lock();
1021                                         state_transition.pages += num_pages;
1022                                         state_transition.time_ms = start_time.elapsed().as_millis();
1023                                     }
1024                                     Err(e) => {
1025                                         bail!("failed to swap in: {:?}", e);
1026                                     }
1027                                 }
1028                             }
1029                             if job.is_aborted() {
1030                                 info!("swap in is aborted");
1031                             }
1032                             Ok(())
1033                         });
1034                         state = SwapState::SwapInInProgress(join_handle);
1035 
1036                         info!("start swapping in");
1037                     }
1038                     Command::Exit => {
1039                         match state {
1040                             SwapState::SwapInInProgress(join_handle) => {
1041                                 // Wait until swap-in finishes.
1042                                 if let Err(e) = join_handle.join() {
1043                                     bail!("failed to join swap in thread: {:?}", e);
1044                                 }
1045                                 return Ok(true);
1046                             }
1047                             SwapState::Trim(join_handle) => {
1048                                 abort_background_job(join_handle, bg_job_control)
1049                                     .context("abort trim")?;
1050                             }
1051                             _ => {}
1052                         }
1053                         let mut ctx = page_handler.start_swap_in();
1054                         let uffd = uffd_list.main_uffd();
1055                         // Swap-in all before exit.
1056                         while ctx.swap_in(uffd, MAX_SWAP_CHUNK_SIZE).context("swap in")? > 0 {}
1057                         return Ok(true);
1058                     }
1059                     Command::Status => {
1060                         let status = Status::new(&state, *state_transition.lock(), page_handler);
1061                         command_tube.send(&status).context("send status response")?;
1062                         info!("swap status: {:?}", status);
1063                     }
1064                 },
1065                 Token::BackgroundJobCompleted => {
1066                     // Reset the completed event.
1067                     if !bg_job_control
1068                         .reset()
1069                         .context("reset background job event")?
1070                     {
1071                         // When the job is aborted and the event is comsumed by reset(), the token
1072                         // `Token::BackgroundJobCompleted` may remain in the `events`. Just ignore
1073                         // the obsolete token here.
1074                         continue;
1075                     }
1076                     match state {
1077                         SwapState::SwapInInProgress(join_handle) => {
1078                             join_handle
1079                                 .join()
1080                                 .expect("panic on the background job thread")
1081                                 .context("swap in finish")?;
1082                             let state_transition = state_transition.lock();
1083                             info!(
1084                                 "swap in all {} pages in {} ms.",
1085                                 state_transition.pages, state_transition.time_ms
1086                             );
1087                             return Ok(false);
1088                         }
1089                         SwapState::Trim(join_handle) => {
1090                             join_handle
1091                                 .join()
1092                                 .expect("panic on the background job thread")
1093                                 .context("trim finish")?;
1094                             let state_transition = state_transition.lock();
1095                             info!(
1096                                 "trimmed {} pages in {} ms.",
1097                                 state_transition.pages, state_transition.time_ms
1098                             );
1099                             state = SwapState::SwapOutPending;
1100                         }
1101                         state => {
1102                             bail!(
1103                                 "background job completed but the actual state is {:?}",
1104                                 State::from(&state)
1105                             );
1106                         }
1107                     }
1108                 }
1109             };
1110         }
1111     }
1112 }
1113