// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! PageHandler manages the page states of multiple regions.

#![deny(missing_docs)]

use std::fs::File;
use std::mem;
use std::ops::Range;
use std::sync::Arc;

use anyhow::Context;
use base::error;
use base::sys::find_next_data;
use base::unix::FileDataIterator;
use base::AsRawDescriptor;
use base::SharedMemory;
use data_model::VolatileSlice;
use sync::Mutex;
use thiserror::Error as ThisError;

use crate::file::Error as FileError;
use crate::file::SwapFile;
use crate::pagesize::addr_to_page_idx;
use crate::pagesize::bytes_to_pages;
use crate::pagesize::is_hugepage_aligned;
use crate::pagesize::is_page_aligned;
use crate::pagesize::page_base_addr;
use crate::pagesize::page_idx_to_addr;
use crate::pagesize::pages_to_bytes;
use crate::pagesize::round_up_hugepage_size;
use crate::pagesize::THP_SIZE;
use crate::staging::CopyOp;
use crate::staging::Error as StagingError;
use crate::staging::StagingMemory;
use crate::userfaultfd::Error as UffdError;
use crate::userfaultfd::Userfaultfd;
use crate::worker::Channel;
use crate::worker::Task;

/// The maximum amount of memory that may be mlock(2)ed for prefetching during swap-in.
pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
/// Prefetching on swap-in is requested only while the remaining mlock budget exceeds this
/// threshold, so that MADV_WILLNEED is applied to larger chunks of pages.
const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB

/// Result for PageHandler
pub type Result<T> = std::result::Result<T, Error>;

/// Errors for PageHandler
#[derive(ThisError, Debug)]
pub enum Error {
    #[error("the address is invalid {0:#018X}")]
    /// the address is invalid
    InvalidAddress(usize),
    #[error("the regions {0:?} and {1:?} overlap")]
    /// regions overlap when registering
    RegionOverlap(Range<usize>, Range<usize>),
    #[error("failed to create page handler {0:?}")]
    /// failed to create page handler
    CreateFailed(anyhow::Error),
    #[error("file operation failed : {0:?}")]
    /// file operation failed
    File(#[from] FileError),
    #[error("staging operation failed : {0:?}")]
    /// staging operation failed
    Staging(#[from] StagingError),
    #[error("userfaultfd failed : {0:?}")]
    /// userfaultfd operation failed
    Userfaultfd(#[from] UffdError),
}

/// Remove the memory range on the guest memory.
///
/// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
/// addresses instead of guest addresses.
///
/// # Safety
///
/// The memory range must be on the guest memory.
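///
/// # Example (sketch)
///
/// A minimal, non-compiled sketch; `guest_addr` is a placeholder for a page-aligned host address
/// inside a registered guest memory region.
///
/// ```ignore
/// // Drop the backing of 16 pages; they are faulted back in on the next access.
/// unsafe { remove_memory(guest_addr, pages_to_bytes(16)) }?;
/// ```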
#[deny(unsafe_op_in_unsafe_fn)]
unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
    // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
    // managed memory.
    let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
    if ret < 0 {
        base::errno_result()
    } else {
        Ok(())
    }
}

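/// Copies the whole `data_slice` to `page_addr` in the faulting process with
/// [Userfaultfd::copy()], retrying from the next uncopied page whenever the kernel reports a
/// partial copy.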
fn uffd_copy_all(
    uffd: &Userfaultfd,
    mut page_addr: usize,
    mut data_slice: VolatileSlice,
    wake: bool,
) -> std::result::Result<(), UffdError> {
    loop {
        let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
        match result {
            Err(UffdError::PartiallyCopied(copied)) => {
                page_addr += copied;
                data_slice.advance(copied);
            }
            other => {
                // Even EEXIST for the copy operation should be an error for page fault handling.
                // If the page was swapped in before, it should have been cleared from the swap
                // file and `Userfaultfd::zero()` used instead.
                return other.map(|_| ());
            }
        }
    }
}

/// [Region] represents a memory region and corresponding [SwapFile].
struct Region<'a> {
    /// the head page index of the region.
    head_page_idx: usize,
    file: SwapFile<'a>,
    staging_memory: StagingMemory,
    /// the number of pages copied from the swap file on page faults.
    copied_from_file_pages: usize,
    /// the number of pages copied from the staging memory on page faults.
    copied_from_staging_pages: usize,
    /// the number of pages initialized with zero on page faults.
    zeroed_pages: usize,
    /// the number of pages copied back to the guest memory by swap-in.
    swap_in_pages: usize,
    /// the number of pages which were already initialized on page faults.
    redundant_pages: usize,
    /// whether the region has been moved to the staging memory at least once.
    swap_active: bool,
}

/// MoveToStaging copies chunks of consecutive pages next to each other on the guest memory to the
/// staging memory and removes the chunks on the guest memory.
pub struct MoveToStaging {
    remove_area: Range<usize>,
    copies: Vec<CopyOp>,
}

impl Task for MoveToStaging {
    fn execute(self) {
        for copy_op in self.copies {
            copy_op.execute();
        }
        // Remove chunks of pages at once to reduce madvise(2) syscalls.
        // Safe because the region is already backed by the file and the content will be
        // swapped in on a page fault.
        let result = unsafe {
            remove_memory(
                self.remove_area.start,
                self.remove_area.end - self.remove_area.start,
            )
        };
        if let Err(e) = result {
            panic!("failed to remove memory: {:?}", e);
        }
    }
}

struct PageHandleContext<'a> {
    regions: Vec<Region<'a>>,
    mlock_budget_pages: usize,
}

/// PageHandler manages the page states of multiple regions.
///
/// Handles multiple events derived from userfaultfd and swap-out requests.
/// All the addresses and sizes in bytes are converted to page indices internally.
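///
/// # Example (sketch)
///
/// A minimal, non-compiled lifecycle sketch. `swap_file`, `staging_shmem`, `ranges`, `channel`,
/// `uffd`, and `fault_addr` are placeholders assumed to be created by the monitor process.
///
/// ```ignore
/// let handler = PageHandler::create(&swap_file, &staging_shmem, &ranges, channel.clone())?;
/// // While vmm-swap is enabled, userfaultfd page faults are resolved on demand.
/// handler.handle_page_fault(&uffd, fault_addr)?;
/// // Pages moved to the staging memory are flushed to the swap file in chunks.
/// while handler.swap_out(1024 * 1024)? > 0 {}
/// ```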
pub struct PageHandler<'a> {
    ctx: Mutex<PageHandleContext<'a>>,
    channel: Arc<Channel<MoveToStaging>>,
    swap_raw_file: &'a File,
}

impl<'a> PageHandler<'a> {
    /// Creates [PageHandler] for the given region.
    ///
    /// If any of the regions overlap, this returns [Error::RegionOverlap].
    ///
    /// # Arguments
    ///
    /// * `swap_raw_file` - The swap file.
    /// * `staging_shmem` - The staging memory. It must be large enough to hold the guest memory.
    ///   Otherwise the monitor process crashes when creating a mmap.
    /// * `address_ranges` - The list of address ranges of the regions. The start address must be
    ///   page-aligned and the size must be a multiple of the page size.
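    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch; `guest_base`, `swap_file`, `staging_shmem`, and `channel` are
    /// placeholders assumed to be created by the caller.
    ///
    /// ```ignore
    /// // One page-aligned 256 MiB region starting at `guest_base`.
    /// let ranges = [guest_base..guest_base + 256 * 1024 * 1024];
    /// let handler = PageHandler::create(&swap_file, &staging_shmem, &ranges, channel)?;
    /// ```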
    pub fn create(
        swap_raw_file: &'a File,
        staging_shmem: &'a SharedMemory,
        address_ranges: &[Range<usize>],
        staging_move_context: Arc<Channel<MoveToStaging>>,
    ) -> Result<Self> {
        // Truncate the file to the size needed to hold all regions, otherwise access beyond the
        // end of the file may cause SIGBUS.
        swap_raw_file
            .set_len(
                address_ranges
                    .iter()
                    .map(|r| (r.end.saturating_sub(r.start)) as u64)
                    .sum(),
            )
            .context("truncate swap file")
            .map_err(Error::CreateFailed)?;

        let mut regions: Vec<Region> = Vec::new();
        let mut offset_pages = 0;
        for address_range in address_ranges {
            let head_page_idx = addr_to_page_idx(address_range.start);
            if address_range.end < address_range.start {
                return Err(Error::CreateFailed(anyhow::anyhow!(
                    "invalid region end < start"
                )));
            }
            let region_size = address_range.end - address_range.start;
            let num_of_pages = bytes_to_pages(region_size);

            // Find an overlapping region
            match regions.iter().position(|region| {
                if region.head_page_idx < head_page_idx {
                    region.head_page_idx + region.file.num_pages() > head_page_idx
                } else {
                    region.head_page_idx < head_page_idx + num_of_pages
                }
            }) {
                Some(i) => {
                    let region = &regions[i];

                    return Err(Error::RegionOverlap(
                        address_range.clone(),
                        page_idx_to_addr(region.head_page_idx)
                            ..(page_idx_to_addr(region.head_page_idx + region.file.num_pages())),
                    ));
                }
                None => {
                    let base_addr = address_range.start;
                    assert!(is_page_aligned(base_addr));
                    assert!(is_page_aligned(region_size));

                    let file = SwapFile::new(swap_raw_file, offset_pages, num_of_pages)?;
                    let staging_memory = StagingMemory::new(
                        staging_shmem,
                        pages_to_bytes(offset_pages) as u64,
                        num_of_pages,
                    )?;
                    regions.push(Region {
                        head_page_idx,
                        file,
                        staging_memory,
                        copied_from_file_pages: 0,
                        copied_from_staging_pages: 0,
                        zeroed_pages: 0,
                        swap_in_pages: 0,
                        redundant_pages: 0,
                        swap_active: false,
                    });
                    offset_pages += num_of_pages;
                }
            }
        }

        Ok(Self {
            ctx: Mutex::new(PageHandleContext {
                regions,
                mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
            }),
            channel: staging_move_context,
            swap_raw_file,
        })
    }

    fn find_region<'b>(
        regions: &'b mut [Region<'a>],
        page_idx: usize,
    ) -> Option<&'b mut Region<'a>> {
        // Sequentially search for the corresponding region in the list. It should be fast enough
        // because there are only a few regions (usually only 1).
        regions.iter_mut().find(|region| {
            region.head_page_idx <= page_idx
                && page_idx < region.head_page_idx + region.file.num_pages()
        })
    }

    /// Fills the faulted page with zero if the page is not initialized, or with its content from
    /// the staging memory or the swap file if the page has been moved out of the guest memory.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
    /// * `address` - the address that triggered the page fault.
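    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch of the monitor's fault loop; reading `UFFD_EVENT_PAGEFAULT` events
    /// from the userfaultfd is assumed to happen elsewhere.
    ///
    /// ```ignore
    /// // `fault_addr` comes from a UFFD_EVENT_PAGEFAULT event on `uffd`.
    /// handler.handle_page_fault(&uffd, fault_addr)?;
    /// ```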
    pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
        let page_idx = addr_to_page_idx(address);
        // the head address of the page.
        let page_addr = page_base_addr(address);
        let page_size = pages_to_bytes(1);
        let mut ctx = self.ctx.lock();
        let region =
            Self::find_region(&mut ctx.regions, page_idx).ok_or(Error::InvalidAddress(address))?;

        let idx_in_region = page_idx - region.head_page_idx;
        if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            region
                .staging_memory
                .clear_range(idx_in_region..idx_in_region + 1)?;
            region.copied_from_staging_pages += 1;
            Ok(())
        } else if let Some(page_slice) = region.file.page_content(idx_in_region)? {
            // TODO(kawasin): Unlock regions to proceed with the swap-in operation in the
            // background.
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            // Do not erase the page from the disk for trimming optimization on next swap out.
            let munlocked_pages = region.file.clear_range(idx_in_region..idx_in_region + 1)?;
            region.copied_from_file_pages += 1;
            ctx.mlock_budget_pages += munlocked_pages;
            Ok(())
        } else {
            // Map a zero page since no swap file has been created yet but the fault
            // happened.
            // Safe because the faulted page is notified by uffd.
            let result = uffd.zero(page_addr, page_size, true);
            match result {
                Ok(_) => {
                    region.zeroed_pages += 1;
                    Ok(())
                }
                Err(UffdError::PageExist) => {
                    // This case can happen if page faults on the same page happen on different
                    // processes.
                    uffd.wake(page_addr, page_size)?;
                    region.redundant_pages += 1;
                    Ok(())
                }
                Err(e) => Err(e.into()),
            }
        }
    }

    /// Clear the internal state for the pages.
    ///
    /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
    /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
    ///
    /// In crosvm, the balloon frees the guest memory and causes `UFFD_EVENT_REMOVE`.
    ///
    /// # Arguments
    ///
    /// * `start_addr` - the head address of the memory area to be freed.
    /// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE` reports
    ///   the head address of the memory area following the freed area (i.e. the exact tail
    ///   address of the freed area is `end_addr - 1`).
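    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch; `remove_event` stands for a `UFFD_EVENT_REMOVE` event read from the
    /// userfaultfd, with hypothetical `start`/`end` fields.
    ///
    /// ```ignore
    /// handler.handle_page_remove(remove_event.start, remove_event.end)?;
    /// ```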
    pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
        if !is_page_aligned(start_addr) {
            return Err(Error::InvalidAddress(start_addr));
        } else if !is_page_aligned(end_addr) {
            return Err(Error::InvalidAddress(end_addr));
        }
        let start_page_idx = addr_to_page_idx(start_addr);
        let last_page_idx = addr_to_page_idx(end_addr);
        let mut ctx = self.ctx.lock();
        // TODO(b/269983521): Clear multiple pages in the same region at once.
        for page_idx in start_page_idx..(last_page_idx) {
            let page_addr = page_idx_to_addr(page_idx);
            // TODO(kawasin): Cache the position if the range does not span multiple regions.
            let region = Self::find_region(&mut ctx.regions, page_idx)
                .ok_or(Error::InvalidAddress(page_addr))?;
            let idx_in_region = page_idx - region.head_page_idx;
            let idx_range = idx_in_region..idx_in_region + 1;
            if let Err(e) = region.staging_memory.clear_range(idx_range.clone()) {
                error!("failed to clear removed page from staging: {:?}", e);
            }
            // Erase the pages from the disk because the pages are removed from the guest memory.
            let munlocked_pages = region.file.erase_from_disk(idx_range)?;
            ctx.mlock_budget_pages += munlocked_pages;
        }
        Ok(())
    }

    /// Move active pages in the memory region to the staging memory.
    ///
    /// It only moves active contents in the guest memory to the staging memory and skips empty
    /// pages (e.g. pages not touched, freed by balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
    ///
    /// Returns the count of moved out pages.
    ///
    /// # Arguments
    ///
    /// * `base_addr` - the head address of the memory region.
    /// * `memfd` - the file descriptor of the memfd backing the guest memory region.
    /// * `base_offset` - the offset of the memory region in the memfd.
    ///
    /// # Safety
    ///
    /// The region must have been registered to all userfaultfd of processes which may touch the
    /// region.
    ///
    /// The memory must be protected not to be updated while moving.
    ///
    /// The page fault events for the region from the userfaultfd must be handled by
    /// [Self::handle_page_fault()].
    ///
    /// Must call [Channel::wait_complete()] to wait for all the copy operations to complete
    /// within the memory protection period.
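    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch; write-protecting the guest memory and obtaining `memfd` /
    /// `base_offset` for the region are assumed to be done by the caller.
    ///
    /// ```ignore
    /// let moved_pages = unsafe { handler.move_to_staging(region_base_addr, &memfd, base_offset)? };
    /// // The copy tasks run on a worker; wait for them before lifting the memory protection.
    /// channel.wait_complete();
    /// ```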
    #[deny(unsafe_op_in_unsafe_fn)]
    pub unsafe fn move_to_staging<T>(
        &self,
        base_addr: usize,
        memfd: &T,
        base_offset: u64,
    ) -> Result<usize>
    where
        T: AsRawDescriptor,
    {
        let hugepage_size = *THP_SIZE;
        let mut ctx = self.ctx.lock();
        let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
            .ok_or(Error::InvalidAddress(base_addr))?;

        if page_idx_to_addr(region.head_page_idx) != base_addr {
            return Err(Error::InvalidAddress(base_addr));
        }
        let region_size = pages_to_bytes(region.file.num_pages());
        let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
        let mut moved_size = 0;
        let mut copies = Vec::new();
        let mut remaining_batch_size = hugepage_size;
        let mut batch_head_offset = 0;
        let mut cur_data = None;
        while let Some(data_range) = cur_data.take().or_else(|| file_data.next()) {
            // Assert offset is page aligned
            let offset = (data_range.start - base_offset) as usize;
            assert!(is_page_aligned(offset));

            // The chunk size must be within usize since the chunk is within the guest memory.
            let chunk_size = (data_range.end - data_range.start) as usize;
            let data_range = if chunk_size > remaining_batch_size {
                // Split the chunk if it is bigger than remaining_batch_size.

                let split_size = if chunk_size >= hugepage_size {
                    // If the chunk size is bigger than or equal to the huge page size, the chunk
                    // may contain a huge page. If we MADV_REMOVE a huge page partially, it can
                    // cause inconsistency between the actual page table and vmm-swap internal
                    // state.
                    let chunk_addr = base_addr + offset;
                    if !is_hugepage_aligned(chunk_addr) {
                        // Split the chunk before where a huge page could start.
                        std::cmp::min(
                            round_up_hugepage_size(chunk_addr) - chunk_addr,
                            remaining_batch_size,
                        )
                    } else {
                        if remaining_batch_size < hugepage_size {
                            // Remove the batch since it does not have enough room for a huge page.
                            self.channel.push(MoveToStaging {
                                remove_area: base_addr + batch_head_offset..base_addr + offset,
                                copies: mem::take(&mut copies),
                            });
                            remaining_batch_size = hugepage_size;
                            batch_head_offset = offset;
                        }
                        hugepage_size
                    }
                } else {
                    remaining_batch_size
                };
                // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
                cur_data = Some(data_range.start + split_size as u64..data_range.end);
                data_range.start..data_range.start + split_size as u64
            } else {
                data_range
            };

            let size = (data_range.end - data_range.start) as usize;
            assert!(is_page_aligned(size));

            // Safe because:
            // * src_addr is aligned with page size
            // * the data_range starting from src_addr is on the guest memory.
            let copy_op = unsafe {
                region.staging_memory.copy(
                    (base_addr + offset) as *const u8,
                    bytes_to_pages(offset),
                    bytes_to_pages(size),
                )?
            };
            copies.push(copy_op);

            moved_size += size;
            // The size must be smaller than or equal to remaining_batch_size.
            remaining_batch_size -= size;

            if remaining_batch_size == 0 {
                // Remove the batch of pages at once to reduce madvise(2) syscalls.
                self.channel.push(MoveToStaging {
                    remove_area: base_addr + batch_head_offset..base_addr + offset + size,
                    copies: mem::take(&mut copies),
                });
                remaining_batch_size = hugepage_size;
                batch_head_offset = offset + size;
            }
        }
        // Remove the final batch of pages.
        self.channel.push(MoveToStaging {
            remove_area: base_addr + batch_head_offset..base_addr + region_size,
            copies,
        });

        let moved_pages = bytes_to_pages(moved_size);
        // Suppress the error log on the first swap out, since the page counters have not been
        // initialized yet and are all zero.
        if region.swap_active
            && moved_pages
                != (region.copied_from_file_pages
                    + region.copied_from_staging_pages
                    + region.zeroed_pages
                    + region.swap_in_pages)
        {
            error!(
                "moved pages ({}) does not match with resident pages (copied(file): {}, copied(staging): {}, zeroed: {}, swap_in: {}).",
                moved_pages, region.copied_from_file_pages, region.copied_from_staging_pages,
                region.zeroed_pages, region.swap_in_pages
            );
        }
        region.copied_from_file_pages = 0;
        region.copied_from_staging_pages = 0;
        region.zeroed_pages = 0;
        region.swap_in_pages = 0;
        region.redundant_pages = 0;
        region.swap_active = true;

        Ok(moved_pages)
    }

    /// Write a chunk of consecutive pages in the staging memory to the swap file.
    ///
    /// If there are no active pages in the staging memory, this returns `Ok(0)`.
    ///
    /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
    ///
    /// Returns the count of swapped out pages.
    ///
    /// Even if swap_out fails on any internal steps, it does not break the page state management
    /// and `PageHandler` can continue working with a few pages leaked in the staging memory or
    /// the swap file. The leaked pages are removed when vmm-swap is disabled and `PageHandler` is
    /// dropped.
    ///
    /// # Arguments
    ///
    /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The
    ///   chunk is split if it is bigger than `max_size`.
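    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch that drains the staging memory; the 1 MiB chunk size is an arbitrary
    /// placeholder.
    ///
    /// ```ignore
    /// while handler.swap_out(1024 * 1024)? > 0 {
    ///     // Other work (e.g. checking for abort requests) can be interleaved here.
    /// }
    /// ```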
    pub fn swap_out(&self, max_size: usize) -> Result<usize> {
        let max_pages = bytes_to_pages(max_size);
        let mut ctx = self.ctx.lock();
        for region in ctx.regions.iter_mut() {
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let pages = idx_range.end - idx_range.start;
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                // Convert VolatileSlice to &[u8]
                // Safe because the range of the volatile slice is already validated.
                let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
                region.file.write_to_file(idx_range.start, slice)?;
                // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunks at
                // once.
                region.staging_memory.clear_range(idx_range)?;
                // TODO(kawasin): free the page cache of the swap file.
                // TODO(kawasin): use writev() to swap_out several small chunks at once.
                return Ok(pages);
            }
        }
        Ok(0)
    }

    /// Create a new [SwapInContext].
    pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
        SwapInContext {
            ctx: &self.ctx,
            cur_populate: 0,
            cur_staging: 0,
            cur_file: 0,
        }
    }

    /// Create a new [TrimContext].
    pub fn start_trim(&'a self) -> TrimContext<'a> {
        TrimContext {
            ctx: &self.ctx,
            swap_raw_file: self.swap_raw_file,
            cur_page: 0,
            cur_region: 0,
            next_data_in_file: 0..0,
            clean_pages: 0,
            zero_pages: 0,
        }
    }

    /// Returns count of pages active on the memory.
    pub fn compute_resident_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_file_pages + r.copied_from_staging_pages + r.zeroed_pages)
            .sum()
    }

    /// Returns count of pages copied from vmm-swap file to the guest memory.
    pub fn compute_copied_from_file_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_file_pages)
            .sum()
    }

    /// Returns count of pages copied from staging memory to the guest memory.
    pub fn compute_copied_from_staging_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_staging_pages)
            .sum()
    }

    /// Returns count of pages initialized with zero.
    pub fn compute_zeroed_pages(&self) -> usize {
        self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
    }

    /// Returns count of pages which were already initialized on page faults.
    pub fn compute_redundant_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.redundant_pages)
            .sum()
    }

    /// Returns count of pages present in the staging memory.
    pub fn compute_staging_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.staging_memory.present_pages())
            .sum()
    }

    /// Returns count of pages present in the swap files.
    pub fn compute_swap_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.file.present_pages())
            .sum()
    }
}

/// Context for swap-in operation.
///
/// This holds cursors of region indices for each step as an optimization.
pub struct SwapInContext<'a> {
    ctx: &'a Mutex<PageHandleContext<'a>>,
    /// the index of the next region to prefetch pages from the swap file.
    cur_populate: usize,
    /// the index of the next region to swap in from the staging memory.
    cur_staging: usize,
    /// the index of the next region to swap in from the swap file.
    cur_file: usize,
}

impl SwapInContext<'_> {
    /// Swap in a chunk of consecutive pages from the staging memory and the swap file.
    ///
    /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
    ///
    /// Returns the count of swapped in pages.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the main [Userfaultfd].
    /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once.
    ///   The chunk is split if it is bigger than `max_size`.
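    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch of swapping everything back in; the 1 MiB chunk size is an arbitrary
    /// placeholder.
    ///
    /// ```ignore
    /// let mut swap_in = handler.start_swap_in();
    /// while swap_in.swap_in(&uffd, 1024 * 1024)? > 0 {}
    /// ```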
    pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
        let mut ctx = self.ctx.lock();
        // Request the kernel to pre-populate the present pages in the swap file to the page cache
        // in the background. At most 16MB of pages will be populated.
        // The threshold is to apply MADV_WILLNEED to a bigger chunk of pages. The kernel populates
        // consecutive pages at once on MADV_WILLNEED.
        if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
            let PageHandleContext {
                regions,
                mlock_budget_pages,
            } = &mut *ctx;
            'prefetch_loop: for region in regions[self.cur_populate..].iter_mut() {
                loop {
                    let locked_pages = region.file.lock_and_async_prefetch(*mlock_budget_pages)?;
                    if locked_pages > 0 {
                        *mlock_budget_pages -= locked_pages;
                        if *mlock_budget_pages == 0 {
                            break 'prefetch_loop;
                        }
                    } else {
                        // next region.
                        self.cur_populate += 1;
                        break;
                    }
                }
            }
        }

        let max_pages = bytes_to_pages(max_size);
        for region in ctx.regions[self.cur_staging..].iter_mut() {
            // TODO(kawasin): swap_in multiple chunks less than max_size at once.
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let pages = idx_range.end - idx_range.start;
                let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Clear the staging memory to avoid a memory spike.
                // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data
                // at once.
                region.staging_memory.clear_range(idx_range)?;
                region.swap_in_pages += pages;
                return Ok(pages);
            }
            self.cur_staging += 1;
        }

        // Split the borrow of the context so that the mlock budget can be updated while the
        // regions are mutably borrowed by the iterator.
        let PageHandleContext {
            regions,
            mlock_budget_pages,
        } = &mut *ctx;
        for region in regions[self.cur_file..].iter_mut() {
            if let Some(idx_range) = region.file.first_data_range(max_pages) {
                let pages = idx_range.end - idx_range.start;
                let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
                let slice = region.file.get_slice(idx_range.clone())?;
                // TODO(kawasin): Unlock regions to proceed page fault handling on the main thread.
                // We also need to handle the EEXIST error from UFFD_COPY.
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Do not erase each chunk of pages from disk on swap_in. The whole file will be
                // truncated when swap_in is completed. Even if swap_in is aborted, the remaining
                // disk contents help the trimming optimization on swap_out.
                let munlocked_pages = region.file.clear_range(idx_range)?;
                region.swap_in_pages += pages;
                *mlock_budget_pages += munlocked_pages;
                return Ok(pages);
            }
            self.cur_file += 1;
        }
        Ok(0)
    }
}

impl Drop for SwapInContext<'_> {
    fn drop(&mut self) {
        let mut ctx = self.ctx.lock();
        for region in ctx.regions.iter_mut() {
            if let Err(e) = region.file.clear_mlock() {
                panic!("failed to clear mlock: {:?}", e);
            }
        }
        ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
    }
}

/// Context for trim operation.
///
/// This drops 2 types of pages in the staging memory to reduce disk writes.
///
/// * Clean pages
///   * The pages which have been swapped out to the disk and have not been changed.
///   * Drop the pages in the staging memory and mark them as present on the swap file.
/// * Zero pages
///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
pub struct TrimContext<'a> {
    ctx: &'a Mutex<PageHandleContext<'a>>,
    swap_raw_file: &'a File,
    cur_region: usize,
    cur_page: usize,
    /// The page idx range of pages which have been stored in the swap file.
    next_data_in_file: Range<usize>,
    clean_pages: usize,
    zero_pages: usize,
}

impl TrimContext<'_> {
    /// Trim pages in the staging memory.
    ///
    /// This returns the number of pages trimmed. It returns `None` once it has traversed all
    /// pages in the staging memory.
    ///
    /// # Arguments
    ///
    /// `max_pages` - The maximum number of pages to be compared.
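    ///
    /// # Example (sketch)
    ///
    /// A non-compiled sketch that trims the whole staging memory in batches of 256 pages (an
    /// arbitrary placeholder).
    ///
    /// ```ignore
    /// let mut trim = handler.start_trim();
    /// while trim.trim_pages(256)?.is_some() {}
    /// // trim.trimmed_clean_pages() and trim.trimmed_zero_pages() report the totals.
    /// ```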
    pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
        let mut ctx = self.ctx.lock();
        if self.cur_region >= ctx.regions.len() {
            return Ok(None);
        }
        let region = &mut ctx.regions[self.cur_region];
        let region_size_bytes = pages_to_bytes(region.file.num_pages()) as u64;
        let mut n_trimmed = 0;

        for _ in 0..max_pages {
            if let Some(slice_in_staging) = region
                .staging_memory
                .page_content(self.cur_page)
                .context("get page of staging memory")?
            {
                let idx_range = self.cur_page..self.cur_page + 1;

                if self.cur_page >= self.next_data_in_file.end {
                    let offset_in_region = pages_to_bytes(self.cur_page) as u64;
                    let offset = region.file.base_offset() + offset_in_region;
                    if let Some(offset_range) = find_next_data(
                        self.swap_raw_file,
                        offset,
                        region_size_bytes - offset_in_region,
                    )
                    .context("find next data in swap file")?
                    {
                        let start = bytes_to_pages(
                            (offset_range.start - region.file.base_offset()) as usize,
                        );
                        let end =
                            bytes_to_pages((offset_range.end - region.file.base_offset()) as usize);
                        self.next_data_in_file = start..end;
                    } else {
                        self.next_data_in_file = region.file.num_pages()..region.file.num_pages();
                    }
                }

                // Check for a zero page on the staging memory first. If the page is non-zero and
                // has not been changed, the zero check is useless, but it costs less than the
                // file I/O needed for pages which were in the swap file and are now zero.
                // Check the 2 types of page in the same loop to utilize the CPU cache for the
                // staging memory.
                if slice_in_staging.is_all_zero() {
                    region
                        .staging_memory
                        .clear_range(idx_range.clone())
                        .context("clear a page in staging memory")?;
                    if self.cur_page >= self.next_data_in_file.start {
                        // The page is on the swap file as well.
                        let munlocked_pages = region
                            .file
                            .erase_from_disk(idx_range)
                            .context("clear a page in swap file")?;
                        if munlocked_pages != 0 {
                            // Only one of swap-in or trimming runs at a time, so this is not an
                            // expected path. Just log an error because leaking mlock_budget_pages
                            // is not fatal.
                            error!("pages are mlock(2)ed while trimming");
                        }
                    }
                    n_trimmed += 1;
                    self.zero_pages += 1;
                } else if self.cur_page >= self.next_data_in_file.start {
                    // The previous content of the page is on the disk.
                    let slice_in_file = region
                        .file
                        .get_slice(idx_range.clone())
                        .context("get slice in swap file")?;

                    if slice_in_staging == slice_in_file {
                        region
                            .staging_memory
                            .clear_range(idx_range.clone())
                            .context("clear a page in staging memory")?;
                        region.file.mark_as_present(self.cur_page);
                        n_trimmed += 1;
                        self.clean_pages += 1;
                    }
                }
            }

            self.cur_page += 1;
            if self.cur_page >= region.file.num_pages() {
                self.cur_region += 1;
                self.cur_page = 0;
                self.next_data_in_file = 0..0;
                break;
            }
        }

        Ok(Some(n_trimmed))
    }

    /// Total trimmed clean pages.
    pub fn trimmed_clean_pages(&self) -> usize {
        self.clean_pages
    }

    /// Total trimmed zero pages.
    pub fn trimmed_zero_pages(&self) -> usize {
        self.zero_pages
    }
}