1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! PageHandler manages the page states of multiple regions.
6 
7 #![deny(missing_docs)]
8 
9 use std::fs::File;
10 use std::mem;
11 use std::ops::Range;
12 use std::sync::Arc;
13 
14 use anyhow::Context;
15 use base::error;
16 use base::sys::find_next_data;
17 use base::unix::FileDataIterator;
18 use base::AsRawDescriptor;
19 use base::SharedMemory;
20 use data_model::VolatileSlice;
21 use sync::Mutex;
22 use thiserror::Error as ThisError;
23 
24 use crate::file::Error as FileError;
25 use crate::file::SwapFile;
26 use crate::pagesize::addr_to_page_idx;
27 use crate::pagesize::bytes_to_pages;
28 use crate::pagesize::is_hugepage_aligned;
29 use crate::pagesize::is_page_aligned;
30 use crate::pagesize::page_base_addr;
31 use crate::pagesize::page_idx_to_addr;
32 use crate::pagesize::pages_to_bytes;
33 use crate::pagesize::round_up_hugepage_size;
34 use crate::pagesize::THP_SIZE;
35 use crate::staging::CopyOp;
36 use crate::staging::Error as StagingError;
37 use crate::staging::StagingMemory;
38 use crate::userfaultfd::Error as UffdError;
39 use crate::userfaultfd::Userfaultfd;
40 use crate::worker::Channel;
41 use crate::worker::Task;
42 
43 pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
44 const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB
45 
46 /// Result for PageHandler
47 pub type Result<T> = std::result::Result<T, Error>;
48 
49 /// Errors for PageHandler
50 #[derive(ThisError, Debug)]
51 pub enum Error {
52     #[error("the address is invalid {0:#018X}")]
53     /// the address is invalid
54     InvalidAddress(usize),
55     #[error("the regions {0:?} and {1:?} overlap")]
56     /// the regions overlap when registering
57     RegionOverlap(Range<usize>, Range<usize>),
58     #[error("failed to create page handler {0:?}")]
59     /// failed to create page handler
60     CreateFailed(anyhow::Error),
61     #[error("file operation failed : {0:?}")]
62     /// file operation failed
63     File(#[from] FileError),
64     #[error("staging operation failed : {0:?}")]
65     /// staging operation failed
66     Staging(#[from] StagingError),
67     #[error("userfaultfd failed : {0:?}")]
68     /// userfaultfd operation failed
69     Userfaultfd(#[from] UffdError),
70 }
71 
72 /// Remove the memory range on the guest memory.
73 ///
74 /// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
75 /// addresses instead of guest addresses.
76 ///
77 /// # Safety
78 ///
79 /// The memory range must be on the guest memory.
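///
/// # Example
///
/// A minimal sketch (not part of the original file), assuming `guest_addr` and `len` describe a
/// page-aligned range that lies entirely within a registered guest memory region:
///
/// ```ignore
/// // SAFETY: guest_addr..(guest_addr + len) is on the guest memory.
/// unsafe { remove_memory(guest_addr, len) }.expect("madvise(MADV_REMOVE) failed");
/// ```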
80 #[deny(unsafe_op_in_unsafe_fn)]
81 unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
82     // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
83     // managed memory.
84     let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
85     if ret < 0 {
86         base::errno_result()
87     } else {
88         Ok(())
89     }
90 }
91 
92 fn uffd_copy_all(
93     uffd: &Userfaultfd,
94     mut page_addr: usize,
95     mut data_slice: VolatileSlice,
96     wake: bool,
97 ) -> std::result::Result<(), UffdError> {
98     loop {
99         let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
100         match result {
101             Err(UffdError::PartiallyCopied(copied)) => {
102                 page_addr += copied;
103                 data_slice.advance(copied);
104             }
105             other => {
106             // Even EEXIST from the copy operation should be treated as an error for page fault
107             // handling. If the page was swapped in before, the page should have been cleared from
108             // the swap file and `Userfaultfd::zero()` used instead.
109                 return other.map(|_| ());
110             }
111         }
112     }
113 }
114 
115 /// [Region] represents a memory region and corresponding [SwapFile].
116 struct Region<'a> {
117     /// the head page index of the region.
118     head_page_idx: usize,
119     file: SwapFile<'a>,
120     staging_memory: StagingMemory,
121     copied_from_file_pages: usize,
122     copied_from_staging_pages: usize,
123     zeroed_pages: usize,
124     swap_in_pages: usize,
125     /// the amount of pages which were already initialized on page faults.
126     redundant_pages: usize,
127     swap_active: bool,
128 }
129 
130 /// MoveToStaging copies chunks of pages which are consecutive on the guest memory to the staging
131 /// memory and then removes those chunks from the guest memory.
132 pub struct MoveToStaging {
133     remove_area: Range<usize>,
134     copies: Vec<CopyOp>,
135 }
136 
137 impl Task for MoveToStaging {
138     fn execute(self) {
139         for copy_op in self.copies {
140             copy_op.execute();
141         }
142         // Remove chunks of pages at once to reduce madvise(2) syscalls.
143         // Safe because the region is already backed by the file and the content will be
144         // swapped in on a page fault.
145         let result = unsafe {
146             remove_memory(
147                 self.remove_area.start,
148                 self.remove_area.end - self.remove_area.start,
149             )
150         };
151         if let Err(e) = result {
152             panic!("failed to remove memory: {:?}", e);
153         }
154     }
155 }
156 
157 struct PageHandleContext<'a> {
158     regions: Vec<Region<'a>>,
159     mlock_budget_pages: usize,
160 }
161 
162 /// PageHandler manages the page states of multiple regions.
163 ///
164 /// Handles multiple events derived from userfaultfd and swap out requests.
165 /// All the addresses and sizes in bytes are converted to page indices internally.
166 pub struct PageHandler<'a> {
167     ctx: Mutex<PageHandleContext<'a>>,
168     channel: Arc<Channel<MoveToStaging>>,
169     swap_raw_file: &'a File,
170 }
171 
172 impl<'a> PageHandler<'a> {
173     /// Creates a [PageHandler] for the given regions.
174     ///
175     /// If any of the regions overlap, this returns [Error::RegionOverlap].
176     ///
177     /// # Arguments
178     ///
179     /// * `swap_raw_file` - The swap file.
180     /// * `staging_shmem` - The staging memory. It must be large enough to hold the guest memory;
181     ///   otherwise the monitor process crashes when creating the mmap.
182     /// * `address_ranges` - The list of address ranges of the regions. Each start address must be
183     ///   page-aligned and each size must be a multiple of the page size.
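    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `swap_file`, `staging_shmem`, `address_ranges` and `channel`
    /// (an `Arc<Channel<MoveToStaging>>` shared with the worker thread) are prepared by the
    /// caller:
    ///
    /// ```ignore
    /// let page_handler =
    ///     PageHandler::create(&swap_file, &staging_shmem, &address_ranges, channel.clone())?;
    /// ```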
184     pub fn create(
185         swap_raw_file: &'a File,
186         staging_shmem: &'a SharedMemory,
187         address_ranges: &[Range<usize>],
188         staging_move_context: Arc<Channel<MoveToStaging>>,
189     ) -> Result<Self> {
190         // Resize the file so that it can hold all regions; otherwise access beyond the end of the
191         // file may cause SIGBUS.
192         swap_raw_file
193             .set_len(
194                 address_ranges
195                     .iter()
196                     .map(|r| (r.end.saturating_sub(r.start)) as u64)
197                     .sum(),
198             )
199             .context("truncate swap file")
200             .map_err(Error::CreateFailed)?;
201 
202         let mut regions: Vec<Region> = Vec::new();
203         let mut offset_pages = 0;
204         for address_range in address_ranges {
205             let head_page_idx = addr_to_page_idx(address_range.start);
206             if address_range.end < address_range.start {
207                 return Err(Error::CreateFailed(anyhow::anyhow!(
208                     "invalid region end < start"
209                 )));
210             }
211             let region_size = address_range.end - address_range.start;
212             let num_of_pages = bytes_to_pages(region_size);
213 
214             // Find an overlapping region
215             match regions.iter().position(|region| {
216                 if region.head_page_idx < head_page_idx {
217                     region.head_page_idx + region.file.num_pages() > head_page_idx
218                 } else {
219                     region.head_page_idx < head_page_idx + num_of_pages
220                 }
221             }) {
222                 Some(i) => {
223                     let region = &regions[i];
224 
225                     return Err(Error::RegionOverlap(
226                         address_range.clone(),
227                         page_idx_to_addr(region.head_page_idx)
228                             ..(page_idx_to_addr(region.head_page_idx + region.file.num_pages())),
229                     ));
230                 }
231                 None => {
232                     let base_addr = address_range.start;
233                     assert!(is_page_aligned(base_addr));
234                     assert!(is_page_aligned(region_size));
235 
236                     let file = SwapFile::new(swap_raw_file, offset_pages, num_of_pages)?;
237                     let staging_memory = StagingMemory::new(
238                         staging_shmem,
239                         pages_to_bytes(offset_pages) as u64,
240                         num_of_pages,
241                     )?;
242                     regions.push(Region {
243                         head_page_idx,
244                         file,
245                         staging_memory,
246                         copied_from_file_pages: 0,
247                         copied_from_staging_pages: 0,
248                         zeroed_pages: 0,
249                         swap_in_pages: 0,
250                         redundant_pages: 0,
251                         swap_active: false,
252                     });
253                     offset_pages += num_of_pages;
254                 }
255             }
256         }
257 
258         Ok(Self {
259             ctx: Mutex::new(PageHandleContext {
260                 regions,
261                 mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
262             }),
263             channel: staging_move_context,
264             swap_raw_file,
265         })
266     }
267 
268     fn find_region<'b>(
269         regions: &'b mut [Region<'a>],
270         page_idx: usize,
271     ) -> Option<&'b mut Region<'a>> {
272         // Sequentially search for the region containing the page in the list. This should be fast
273         // enough because there are only a few regions (usually just 1).
274         regions.iter_mut().find(|region| {
275             region.head_page_idx <= page_idx
276                 && page_idx < region.head_page_idx + region.file.num_pages()
277         })
278     }
279 
280     /// Fills the faulted page with zeros if the page is not initialized, or with its content from
281     /// the staging memory or the swap file if the page has been swapped out.
282     ///
283     /// # Arguments
284     ///
285     /// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
286     /// * `address` - the address that triggered the page fault.
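    ///
    /// # Example
    ///
    /// A minimal sketch of a fault-handling loop; `wait_for_page_fault()` is a hypothetical
    /// helper that extracts the faulting address from a userfaultfd event:
    ///
    /// ```ignore
    /// while let Some(fault_addr) = wait_for_page_fault(&uffd) {
    ///     page_handler.handle_page_fault(&uffd, fault_addr)?;
    /// }
    /// ```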
287     pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
288         let page_idx = addr_to_page_idx(address);
289         // the head address of the page.
290         let page_addr = page_base_addr(address);
291         let page_size = pages_to_bytes(1);
292         let mut ctx = self.ctx.lock();
293         let region =
294             Self::find_region(&mut ctx.regions, page_idx).ok_or(Error::InvalidAddress(address))?;
295 
296         let idx_in_region = page_idx - region.head_page_idx;
297         if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
298             uffd_copy_all(uffd, page_addr, page_slice, true)?;
299             // TODO(b/265758094): optimize clear operation.
300             region
301                 .staging_memory
302                 .clear_range(idx_in_region..idx_in_region + 1)?;
303             region.copied_from_staging_pages += 1;
304             Ok(())
305         } else if let Some(page_slice) = region.file.page_content(idx_in_region)? {
306             // TODO(kawasin): Unlock regions so that the swap-in operation can proceed in the background.
307             uffd_copy_all(uffd, page_addr, page_slice, true)?;
308             // TODO(b/265758094): optimize clear operation.
309             // Do not erase the page from the disk for trimming optimization on next swap out.
310             let munlocked_pages = region.file.clear_range(idx_in_region..idx_in_region + 1)?;
311             region.copied_from_file_pages += 1;
312             ctx.mlock_budget_pages += munlocked_pages;
313             Ok(())
314         } else {
315             // Map a zero page since the page has no content in the staging memory or the swap
316             // file but a fault happened.
317             // Safe because the faulting page is reported by uffd.
318             let result = uffd.zero(page_addr, page_size, true);
319             match result {
320                 Ok(_) => {
321                     region.zeroed_pages += 1;
322                     Ok(())
323                 }
324                 Err(UffdError::PageExist) => {
325                     // This case can happen if page faults for the same page are triggered from
326                     // different processes.
327                     uffd.wake(page_addr, page_size)?;
328                     region.redundant_pages += 1;
329                     Ok(())
330                 }
331                 Err(e) => Err(e.into()),
332             }
333         }
334     }
335 
336     /// Clear the internal state for the pages.
337     ///
338     /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
339     /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
340     ///
341     /// In crosvm, the balloon device frees guest memory, which causes `UFFD_EVENT_REMOVE`.
342     ///
343     /// # Arguments
344     ///
345     /// * `start_addr` - the head address of the memory area to be freed.
346     /// * `end_addr` - the end address (exclusive) of the memory area to be freed.
347     ///   `UFFD_EVENT_REMOVE` reports the head address of the memory area following the freed area,
348     ///   so the last freed byte is at `end_addr - 1`.
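    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `start` and `end` are the page-aligned addresses taken from a
    /// `UFFD_EVENT_REMOVE` event (`end` is exclusive):
    ///
    /// ```ignore
    /// page_handler.handle_page_remove(start, end)?;
    /// ```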
349     pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
350         if !is_page_aligned(start_addr) {
351             return Err(Error::InvalidAddress(start_addr));
352         } else if !is_page_aligned(end_addr) {
353             return Err(Error::InvalidAddress(end_addr));
354         }
355         let start_page_idx = addr_to_page_idx(start_addr);
356         let last_page_idx = addr_to_page_idx(end_addr);
357         let mut ctx = self.ctx.lock();
358         // TODO(b/269983521): Clear multiple pages in the same region at once.
359         for page_idx in start_page_idx..(last_page_idx) {
360             let page_addr = page_idx_to_addr(page_idx);
361             // TODO(kawasin): Cache the position if the range does not span multiple regions.
362             let region = Self::find_region(&mut ctx.regions, page_idx)
363                 .ok_or(Error::InvalidAddress(page_addr))?;
364             let idx_in_region = page_idx - region.head_page_idx;
365             let idx_range = idx_in_region..idx_in_region + 1;
366             if let Err(e) = region.staging_memory.clear_range(idx_range.clone()) {
367                 error!("failed to clear removed page from staging: {:?}", e);
368             }
369             // Erase the pages from the disk because the pages are removed from the guest memory.
370             let munlocked_pages = region.file.erase_from_disk(idx_range)?;
371             ctx.mlock_budget_pages += munlocked_pages;
372         }
373         Ok(())
374     }
375 
376     /// Move active pages in the memory region to the staging memory.
377     ///
378     /// It only moves active content in the guest memory to the staging memory and skips empty
379     /// pages (e.g. pages never touched or freed by the balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
380     ///
381     /// Returns the count of moved out pages.
382     ///
383     /// # Arguments
384     ///
385     /// * `base_addr` - the head address of the memory region.
386     /// * `memfd` - the file descriptor of the memfd backing the guest memory region.
387     /// * `base_offset` - the offset of the memory region in the memfd.
388     ///
389     /// # Safety
390     ///
391     /// The region must have been registered to all userfaultfd of processes which may touch the
392     /// region.
393     ///
394     /// The memory must be protected from updates while it is being moved.
395     ///
396     /// The page fault events for the region from the userfaultfd must be handled by
397     /// [Self::handle_page_fault()].
398     ///
399     /// The caller must call [Channel::wait_complete()] to wait for all the copy operations to
400     /// complete within the memory protection period.
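    ///
    /// # Example
    ///
    /// A sketch of the expected flow; `regions` (with hypothetical `base_addr`, `memfd` and
    /// `offset` fields) describes the guest memory layout, and the caller is assumed to have
    /// protected the guest memory beforehand:
    ///
    /// ```ignore
    /// let mut moved_pages = 0;
    /// for r in &regions {
    ///     // SAFETY: the region is registered to every userfaultfd and is protected from updates.
    ///     moved_pages += unsafe { page_handler.move_to_staging(r.base_addr, &r.memfd, r.offset) }?;
    /// }
    /// // Wait for the copy tasks to finish before lifting the memory protection.
    /// channel.wait_complete();
    /// ```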
401     #[deny(unsafe_op_in_unsafe_fn)]
402     pub unsafe fn move_to_staging<T>(
403         &self,
404         base_addr: usize,
405         memfd: &T,
406         base_offset: u64,
407     ) -> Result<usize>
408     where
409         T: AsRawDescriptor,
410     {
411         let hugepage_size = *THP_SIZE;
412         let mut ctx = self.ctx.lock();
413         let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
414             .ok_or(Error::InvalidAddress(base_addr))?;
415 
416         if page_idx_to_addr(region.head_page_idx) != base_addr {
417             return Err(Error::InvalidAddress(base_addr));
418         }
419         let region_size = pages_to_bytes(region.file.num_pages());
420         let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
421         let mut moved_size = 0;
422         let mut copies = Vec::new();
423         let mut remaining_batch_size = hugepage_size;
424         let mut batch_head_offset = 0;
425         let mut cur_data = None;
426         while let Some(data_range) = cur_data.take().or_else(|| file_data.next()) {
427             // Assert offset is page aligned
428             let offset = (data_range.start - base_offset) as usize;
429             assert!(is_page_aligned(offset));
430 
431             // The chunk size must be within usize since the chunk is within the guest memory.
432             let chunk_size = (data_range.end - data_range.start) as usize;
433             let data_range = if chunk_size > remaining_batch_size {
434                 // Split the chunk if it is bigger than remaining_batch_size.
435 
436                 let split_size = if chunk_size >= hugepage_size {
437                     // If the chunk size is bigger than or equal to the huge page size, the chunk
438                     // may contain a huge page. If we MADV_REMOVE a huge page partially, it can cause
439                     // inconsistency between the actual page table and the vmm-swap internal state.
440                     let chunk_addr = base_addr + offset;
441                     if !is_hugepage_aligned(chunk_addr) {
442                         // Split the chunk just before the address where a huge page could start.
443                         std::cmp::min(
444                             round_up_hugepage_size(chunk_addr) - chunk_addr,
445                             remaining_batch_size,
446                         )
447                     } else {
448                         if remaining_batch_size < hugepage_size {
449                             // Remove the batch since it does not have enough room for a huge page.
450                             self.channel.push(MoveToStaging {
451                                 remove_area: base_addr + batch_head_offset..base_addr + offset,
452                                 copies: mem::take(&mut copies),
453                             });
454                             remaining_batch_size = hugepage_size;
455                             batch_head_offset = offset;
456                         }
457                         hugepage_size
458                     }
459                 } else {
460                     remaining_batch_size
461                 };
462                 // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
463                 cur_data = Some(data_range.start + split_size as u64..data_range.end);
464                 data_range.start..data_range.start + split_size as u64
465             } else {
466                 data_range
467             };
468 
469             let size = (data_range.end - data_range.start) as usize;
470             assert!(is_page_aligned(size));
471 
472             // Safe because:
473             // * src_addr is aligned with page size
474             // * the data_range starting from src_addr is on the guest memory.
475             let copy_op = unsafe {
476                 region.staging_memory.copy(
477                     (base_addr + offset) as *const u8,
478                     bytes_to_pages(offset),
479                     bytes_to_pages(size),
480                 )?
481             };
482             copies.push(copy_op);
483 
484             moved_size += size;
485             // The size must be smaller than or equal to remaining_batch_size.
486             remaining_batch_size -= size;
487 
488             if remaining_batch_size == 0 {
489                 // Remove the batch of pages at once to reduce madvise(2) syscalls.
490                 self.channel.push(MoveToStaging {
491                     remove_area: base_addr + batch_head_offset..base_addr + offset + size,
492                     copies: mem::take(&mut copies),
493                 });
494                 remaining_batch_size = hugepage_size;
495                 batch_head_offset = offset + size;
496             }
497         }
498         // Remove the final batch of pages.
499         self.channel.push(MoveToStaging {
500             remove_area: base_addr + batch_head_offset..base_addr + region_size,
501             copies,
502         });
503 
504         let moved_pages = bytes_to_pages(moved_size);
505         // Suppress the error log on the first swap out, since the page counters are not initialized yet and are all zero.
506         if region.swap_active
507             && moved_pages
508                 != (region.copied_from_file_pages
509                     + region.copied_from_staging_pages
510                     + region.zeroed_pages
511                     + region.swap_in_pages)
512         {
513             error!(
514                 "moved pages ({}) do not match resident pages (copied(file): {}, copied(staging): {}, zeroed: {}, swap_in: {}).",
515                 moved_pages, region.copied_from_file_pages, region.copied_from_staging_pages,
516                 region.zeroed_pages, region.swap_in_pages
517             );
518         }
519         region.copied_from_file_pages = 0;
520         region.copied_from_staging_pages = 0;
521         region.zeroed_pages = 0;
522         region.swap_in_pages = 0;
523         region.redundant_pages = 0;
524         region.swap_active = true;
525 
526         Ok(moved_pages)
527     }
528 
529     /// Write a chunk of consecutive pages in the staging memory to the swap file.
530     ///
531     /// If there are no active pages in the staging memory, this returns `Ok(0)`.
532     ///
533     /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
534     ///
535     /// Returns the count of swapped out pages.
536     ///
537     /// Even if swap_out fails at any internal step, it does not break the page state management,
538     /// and `PageHandler` can keep working with a few pages leaked in the staging memory or the swap
539     /// file. The leaked pages are removed when vmm-swap is disabled and `PageHandler` is dropped.
540     ///
541     /// # Arguments
542     ///
543     /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The
544     ///   chunk is split if it is bigger than `max_size`.
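    ///
    /// # Example
    ///
    /// A minimal sketch of a swap-out loop; the 1 MiB chunk size is an arbitrary example value:
    ///
    /// ```ignore
    /// while page_handler.swap_out(1024 * 1024)? > 0 {
    ///     // Yield here so other work (e.g. page fault handling) can make progress.
    /// }
    /// ```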
545     pub fn swap_out(&self, max_size: usize) -> Result<usize> {
546         let max_pages = bytes_to_pages(max_size);
547         let mut ctx = self.ctx.lock();
548         for region in ctx.regions.iter_mut() {
549             if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
550                 let pages = idx_range.end - idx_range.start;
551                 let slice = region.staging_memory.get_slice(idx_range.clone())?;
552                 // Convert VolatileSlice to &[u8]
553                 // Safe because the range of volatile slice is already validated.
554                 let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
555                 region.file.write_to_file(idx_range.start, slice)?;
556                 // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunks at
557                 // once.
558                 region.staging_memory.clear_range(idx_range)?;
559                 // TODO(kawasin): free the page cache of the swap file.
560                 // TODO(kawasin): use writev() to swap_out several small chunks at once.
561                 return Ok(pages);
562             }
563         }
564         Ok(0)
565     }
566 
567     /// Create a new [SwapInContext].
568     pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
569         SwapInContext {
570             ctx: &self.ctx,
571             cur_populate: 0,
572             cur_staging: 0,
573             cur_file: 0,
574         }
575     }
576 
577     /// Create a new [TrimContext].
578     pub fn start_trim(&'a self) -> TrimContext<'a> {
579         TrimContext {
580             ctx: &self.ctx,
581             swap_raw_file: self.swap_raw_file,
582             cur_page: 0,
583             cur_region: 0,
584             next_data_in_file: 0..0,
585             clean_pages: 0,
586             zero_pages: 0,
587         }
588     }
589 
590     /// Returns the count of pages active in the guest memory.
591     pub fn compute_resident_pages(&self) -> usize {
592         self.ctx
593             .lock()
594             .regions
595             .iter()
596             .map(|r| r.copied_from_file_pages + r.copied_from_staging_pages + r.zeroed_pages)
597             .sum()
598     }
599 
600     /// Returns count of pages copied from vmm-swap file to the guest memory.
601     pub fn compute_copied_from_file_pages(&self) -> usize {
602         self.ctx
603             .lock()
604             .regions
605             .iter()
606             .map(|r| r.copied_from_file_pages)
607             .sum()
608     }
609 
610     /// Returns count of pages copied from staging memory to the guest memory.
611     pub fn compute_copied_from_staging_pages(&self) -> usize {
612         self.ctx
613             .lock()
614             .regions
615             .iter()
616             .map(|r| r.copied_from_staging_pages)
617             .sum()
618     }
619 
620     /// Returns count of pages initialized with zero.
621     pub fn compute_zeroed_pages(&self) -> usize {
622         self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
623     }
624 
625     /// Returns count of pages which were already initialized on page faults.
626     pub fn compute_redundant_pages(&self) -> usize {
627         self.ctx
628             .lock()
629             .regions
630             .iter()
631             .map(|r| r.redundant_pages)
632             .sum()
633     }
634 
635     /// Returns count of pages present in the staging memory.
636     pub fn compute_staging_pages(&self) -> usize {
637         self.ctx
638             .lock()
639             .regions
640             .iter()
641             .map(|r| r.staging_memory.present_pages())
642             .sum()
643     }
644 
645     /// Returns count of pages present in the swap files.
646     pub fn compute_swap_pages(&self) -> usize {
647         self.ctx
648             .lock()
649             .regions
650             .iter()
651             .map(|r| r.file.present_pages())
652             .sum()
653     }
654 }
655 
656 /// Context for swap-in operation.
657 ///
658 /// This holds cursors of indices into the regions for each step, as an optimization.
659 pub struct SwapInContext<'a> {
660     ctx: &'a Mutex<PageHandleContext<'a>>,
661     cur_populate: usize,
662     cur_staging: usize,
663     cur_file: usize,
664 }
665 
666 impl SwapInContext<'_> {
667     /// Swap in a chunk of consecutive pages from the staging memory and the swap file.
668     ///
669     /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
670     ///
671     /// Returns the count of swapped in pages.
672     ///
673     /// # Arguments
674     ///
675     /// * `uffd` - the main [Userfaultfd].
676     /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once. The
677     ///   chunk is split if it is bigger than `max_size`.
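    ///
    /// # Example
    ///
    /// A minimal sketch of a swap-in loop; the 4 MiB chunk size is an arbitrary example value:
    ///
    /// ```ignore
    /// let mut swap_in_ctx = page_handler.start_swap_in();
    /// while swap_in_ctx.swap_in(&uffd, 4 * 1024 * 1024)? > 0 {}
    /// ```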
678     pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
679         let mut ctx = self.ctx.lock();
680         // Request the kernel to pre-populate the present pages in the swap file into the page
681         // cache in the background. At most 16MB of pages will be populated.
682         // The threshold ensures MADV_WILLNEED is applied to bigger chunks of pages. The kernel
683         // populates consecutive pages at once on MADV_WILLNEED.
684         if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
685             let PageHandleContext {
686                 regions,
687                 mlock_budget_pages,
688             } = &mut *ctx;
689             'prefetch_loop: for region in regions[self.cur_populate..].iter_mut() {
690                 loop {
691                     let locked_pages = region.file.lock_and_async_prefetch(*mlock_budget_pages)?;
692                     if locked_pages > 0 {
693                         *mlock_budget_pages -= locked_pages;
694                         if *mlock_budget_pages == 0 {
695                             break 'prefetch_loop;
696                         }
697                     } else {
698                         // next region.
699                         self.cur_populate += 1;
700                         break;
701                     }
702                 }
703             }
704         }
705 
706         let max_pages = bytes_to_pages(max_size);
707         for region in ctx.regions[self.cur_staging..].iter_mut() {
708             // TODO(kawasin): swap_in multiple chunks less than max_size at once.
709             if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
710                 let pages = idx_range.end - idx_range.start;
711                 let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
712                 let slice = region.staging_memory.get_slice(idx_range.clone())?;
713                 uffd_copy_all(uffd, page_addr, slice, false)?;
714                 // Clear the staging memory to avoid a memory spike.
715                 // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several chunks of
716                 // data at once.
717                 region.staging_memory.clear_range(idx_range)?;
718                 region.swap_in_pages += pages;
719                 return Ok(pages);
720             }
721             self.cur_staging += 1;
722         }
723 
724         for region in ctx.regions[self.cur_file..].iter_mut() {
725             if let Some(idx_range) = region.file.first_data_range(max_pages) {
726                 let pages = idx_range.end - idx_range.start;
727                 let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
728                 let slice = region.file.get_slice(idx_range.clone())?;
729                 // TODO(kawasin): Unlock regions to proceed page fault handling on the main thread.
730                 //                We also need to handle the EEXIST error from UFFD_COPY.
731                 uffd_copy_all(uffd, page_addr, slice, false)?;
732                 // Do not erase each chunk of pages from disk on swap_in. The whole file will be
733                 // truncated when swap_in is completed. Even if swap_in is aborted, the remaining
734                 // disk contents help the trimming optimization on swap_out.
735                 let munlocked_pages = region.file.clear_range(idx_range)?;
736                 region.swap_in_pages += pages;
737                 ctx.mlock_budget_pages += munlocked_pages;
738                 return Ok(pages);
739             }
740             self.cur_file += 1;
741         }
742         Ok(0)
743     }
744 }
745 
746 impl Drop for SwapInContext<'_> {
747     fn drop(&mut self) {
748         let mut ctx = self.ctx.lock();
749         for region in ctx.regions.iter_mut() {
750             if let Err(e) = region.file.clear_mlock() {
751                 panic!("failed to clear mlock: {:?}", e);
752             }
753         }
754         ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
755     }
756 }
757 
758 /// Context for trim operation.
759 ///
760 /// This drops 2 types of pages in the staging memory to reduce disk writes.
761 ///
762 /// * Clean pages
763 ///   * The pages which have been swapped out to the disk and have not been changed since.
764 ///   * Drop the pages in the staging memory and mark them as present in the swap file.
765 /// * Zero pages
766 ///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
767 pub struct TrimContext<'a> {
768     ctx: &'a Mutex<PageHandleContext<'a>>,
769     swap_raw_file: &'a File,
770     cur_region: usize,
771     cur_page: usize,
772     /// The page idx range of pages which have been stored in the swap file.
773     next_data_in_file: Range<usize>,
774     clean_pages: usize,
775     zero_pages: usize,
776 }
777 
778 impl TrimContext<'_> {
779     /// Trim pages in the staging memory.
780     ///
781     /// This returns the number of pages trimmed, or `None` once it has traversed all pages in the
782     /// staging memory.
783     ///
784     /// # Arguments
785     ///
786     /// * `max_pages` - The maximum number of pages to be examined.
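    ///
    /// # Example
    ///
    /// A minimal sketch of a trim loop; the 256-page batch size is an arbitrary example value:
    ///
    /// ```ignore
    /// let mut trim_ctx = page_handler.start_trim();
    /// while let Some(_trimmed) = trim_ctx.trim_pages(256)? {
    ///     // Yield between batches.
    /// }
    /// ```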
787     pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
788         let mut ctx = self.ctx.lock();
789         if self.cur_region >= ctx.regions.len() {
790             return Ok(None);
791         }
792         let region = &mut ctx.regions[self.cur_region];
793         let region_size_bytes = pages_to_bytes(region.file.num_pages()) as u64;
794         let mut n_trimmed = 0;
795 
796         for _ in 0..max_pages {
797             if let Some(slice_in_staging) = region
798                 .staging_memory
799                 .page_content(self.cur_page)
800                 .context("get page of staging memory")?
801             {
802                 let idx_range = self.cur_page..self.cur_page + 1;
803 
804                 if self.cur_page >= self.next_data_in_file.end {
805                     let offset_in_region = pages_to_bytes(self.cur_page) as u64;
806                     let offset = region.file.base_offset() + offset_in_region;
807                     if let Some(offset_range) = find_next_data(
808                         self.swap_raw_file,
809                         offset,
810                         region_size_bytes - offset_in_region,
811                     )
812                     .context("find next data in swap file")?
813                     {
814                         let start = bytes_to_pages(
815                             (offset_range.start - region.file.base_offset()) as usize,
816                         );
817                         let end =
818                             bytes_to_pages((offset_range.end - region.file.base_offset()) as usize);
819                         self.next_data_in_file = start..end;
820                     } else {
821                         self.next_data_in_file = region.file.num_pages()..region.file.num_pages();
822                     }
823                 }
824 
825                 // Check for a zero page in the staging memory first. If the page is non-zero and
826                 // has not been changed, the zero check is wasted, but it costs less than the file
827                 // I/O needed for pages which were in the swap file and are now zero.
828                 // Check both types of page in the same loop to utilize the CPU cache for the staging memory.
829                 if slice_in_staging.is_all_zero() {
830                     region
831                         .staging_memory
832                         .clear_range(idx_range.clone())
833                         .context("clear a page in staging memory")?;
834                     if self.cur_page >= self.next_data_in_file.start {
835                         // The page is on the swap file as well.
836                         let munlocked_pages = region
837                             .file
838                             .erase_from_disk(idx_range)
839                             .context("clear a page in swap file")?;
840                         if munlocked_pages != 0 {
841                             // Only one of swap-in or trimming runs at a time, so this is not an
842                             // expected path. Just log an error because leaking
843                             // mlock_budget_pages is not fatal.
844                             error!("pages are mlock(2)ed while trimming");
845                         }
846                     }
847                     n_trimmed += 1;
848                     self.zero_pages += 1;
849                 } else if self.cur_page >= self.next_data_in_file.start {
850                     // The previous content of the page is on the disk.
851                     let slice_in_file = region
852                         .file
853                         .get_slice(idx_range.clone())
854                         .context("get slice in swap file")?;
855 
856                     if slice_in_staging == slice_in_file {
857                         region
858                             .staging_memory
859                             .clear_range(idx_range.clone())
860                             .context("clear a page in staging memory")?;
861                         region.file.mark_as_present(self.cur_page);
862                         n_trimmed += 1;
863                         self.clean_pages += 1;
864                     }
865                 }
866             }
867 
868             self.cur_page += 1;
869             if self.cur_page >= region.file.num_pages() {
870                 self.cur_region += 1;
871                 self.cur_page = 0;
872                 self.next_data_in_file = 0..0;
873                 break;
874             }
875         }
876 
877         Ok(Some(n_trimmed))
878     }
879 
880     /// Total trimmed clean pages.
881     pub fn trimmed_clean_pages(&self) -> usize {
882         self.clean_pages
883     }
884 
885     /// Total trimmed zero pages.
886     pub fn trimmed_zero_pages(&self) -> usize {
887         self.zero_pages
888     }
889 }
890