1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
//! Provides a wrapper around the `userfaultfd` crate for the vmm-swap feature.
6 
7 #![deny(missing_docs)]
8 
9 use std::convert::From;
10 use std::fs::File;
11 use std::fs::OpenOptions;
12 use std::ops::Range;
13 use std::os::unix::io::AsRawFd;
14 use std::os::unix::prelude::FromRawFd;
15 use std::os::unix::prelude::OpenOptionsExt;
16 
17 use anyhow::Context;
18 use base::errno_result;
19 use base::info;
20 use base::ioctl_io_nr;
21 use base::ioctl_iowr_nr;
22 use base::ioctl_with_mut_ref;
23 use base::ioctl_with_val;
24 use base::AsRawDescriptor;
25 use base::AsRawDescriptors;
26 use base::FromRawDescriptor;
27 use base::RawDescriptor;
28 use thiserror::Error as ThisError;
29 use userfaultfd::Error as UffdError;
30 pub use userfaultfd::Event as UffdEvent;
31 use userfaultfd::FeatureFlags;
32 use userfaultfd::IoctlFlags;
33 use userfaultfd::Uffd;
34 use userfaultfd::UffdBuilder;
35 
/// Path of the userfaultfd device file that `Factory::new` tries to open.
const DEV_USERFAULTFD_PATH: &str = "/dev/userfaultfd";
/// ioctl type value used to build the `USERFAULTFD_IOC_NEW` request below.
const USERFAULTFD_IOC: u32 = 0xAA;
// Request issued by `Factory::create` on the opened device file to obtain a new userfaultfd.
ioctl_io_nr!(USERFAULTFD_IOC_NEW, USERFAULTFD_IOC, 0x00);
// Request issued by `Factory::create` on the freshly created userfaultfd; it carries a
// `userfaultfd_sys::uffdio_api` structure describing the API version and requested features.
ioctl_iowr_nr!(
    UFFDIO_API,
    userfaultfd_sys::UFFDIO,
    userfaultfd_sys::_UFFDIO_API,
    userfaultfd_sys::uffdio_api
);
45 
/// Result type for userfaultfd operations; the error side is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
48 
/// Errors for Userfaultfd
#[derive(ThisError, Debug)]
pub enum Error {
    /// Unrecoverable userfaultfd error.
    #[error("userfaultfd error: {0:?}")]
    Userfaultfd(UffdError),
    /// `UFFDIO_COPY` partially succeeded; the payload is the number of bytes copied.
    #[error("copy partially succeeded: {0:?} bytes copied")]
    PartiallyCopied(usize),
    /// The page is already filled.
    #[error("the page is already filled")]
    PageExist,
    /// The corresponding process is already dead or has run exec(2).
    #[error("the uffd in the corresponding process is already closed")]
    UffdClosed,
}
65 
66 impl From<UffdError> for Error {
from(e: UffdError) -> Self67     fn from(e: UffdError) -> Self {
68         match e {
69             UffdError::PartiallyCopied(copied) => Self::PartiallyCopied(copied),
70             UffdError::ZeropageFailed(errno) if errno as i32 == libc::EEXIST => Self::PageExist,
71             other => Self::Userfaultfd(other),
72         }
73     }
74 }
75 
76 /// Register all the regions to all the userfaultfd
77 ///
78 /// # Arguments
79 ///
80 /// * `regions` - the list of address range of regions.
81 /// * `uffds` - the reference to the list of [Userfaultfd] for all the processes which may touch the
82 ///   `address_range` to be registered.
83 ///
84 /// # Safety
85 ///
86 /// Each address range in `regions` must be from guest memory.
87 ///
88 /// The `uffds` must cover all the processes which may touch the `address_range`. otherwise some
89 /// pages are zeroed by kernel on the unregistered process instead of swapping in from the swap
90 /// file.
91 #[deny(unsafe_op_in_unsafe_fn)]
register_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()>92 pub unsafe fn register_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()> {
93     for address_range in regions {
94         for uffd in uffds {
95             // Safe because the range is from the guest memory region.
96             let result = unsafe {
97                 uffd.register(address_range.start, address_range.end - address_range.start)
98             };
99             match result {
100                 Ok(_) => {}
101                 // Skip the userfaultfd for dead processes.
102                 Err(Error::UffdClosed) => {}
103                 Err(e) => {
104                     return Err(e);
105                 }
106             };
107         }
108     }
109     Ok(())
110 }
111 
112 /// Unregister all the regions from all the userfaultfd.
113 ///
114 /// `UFFDIO_UNREGISTER` unblocks any threads currently waiting on the region and remove page fault
115 /// events on the region from the userfaultfd event queue.
116 ///
117 /// # Arguments
118 ///
119 /// * `regions` - the list of address range of regions.
120 /// * `uffds` - the reference to the list of registered [Userfaultfd].
unregister_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()>121 pub fn unregister_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()> {
122     for address_range in regions {
123         for uffd in uffds {
124             let result =
125                 uffd.unregister(address_range.start, address_range.end - address_range.start);
126             match result {
127                 Ok(_) => {}
128                 // Skip the userfaultfd for dead processes.
129                 Err(Error::UffdClosed) => {}
130                 Err(e) => {
131                     return Err(e);
132                 }
133             };
134         }
135     }
136     Ok(())
137 }
138 
/// Factory for [Userfaultfd].
///
/// If `/dev/userfaultfd` (introduced in Linux 6.1) exists, creates the userfaultfd from the dev
/// file. Otherwise uses `userfaultfd(2)` to create a userfaultfd.
pub struct Factory {
    // Handle to `/dev/userfaultfd`. `None` when the device file could not be opened, in which
    // case `create` falls back to `Userfaultfd::new`.
    dev_file: Option<File>,
}
146 
147 impl Factory {
148     /// Create [Factory] and try open `/dev/userfaultfd`.
149     ///
150     /// If it fails to open `/dev/userfaultfd`, userfaultfd creation fallback to `userfaultfd(2)`
151     /// syscall.
new() -> Self152     pub fn new() -> Self {
153         let dev_file = OpenOptions::new()
154             .read(true)
155             .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK)
156             .open(DEV_USERFAULTFD_PATH);
157         match dev_file {
158             Ok(dev_file) => Self {
159                 dev_file: Some(dev_file),
160             },
161             Err(e) => {
162                 info!(
163                     "Failed to open /dev/userfaultfd ({:?}), will fall back to userfaultfd(2)",
164                     e
165                 );
166                 Self { dev_file: None }
167             }
168         }
169     }
170 
171     /// Creates a new [Userfaultfd] for this process.
create(&self) -> anyhow::Result<Userfaultfd>172     pub fn create(&self) -> anyhow::Result<Userfaultfd> {
173         if let Some(dev_file) = &self.dev_file {
174             // Safe because ioctl(2) USERFAULTFD_IOC_NEW with does not change Rust memory safety.
175             let res = unsafe {
176                 ioctl_with_val(
177                     dev_file,
178                     USERFAULTFD_IOC_NEW(),
179                     (libc::O_CLOEXEC | libc::O_NONBLOCK) as libc::c_ulong,
180                 )
181             };
182             let uffd = if res < 0 {
183                 return errno_result().context("USERFAULTFD_IOC_NEW");
184             } else {
185                 // Safe because the uffd is not owned by anyone in this process.
186                 unsafe { Userfaultfd::from_raw_descriptor(res) }
187             };
188             let mut api = userfaultfd_sys::uffdio_api {
189                 api: userfaultfd_sys::UFFD_API,
190                 features: (FeatureFlags::MISSING_SHMEM | FeatureFlags::EVENT_REMOVE).bits(),
191                 ioctls: 0,
192             };
193             // Safe because ioctl(2) UFFDIO_API with does not change Rust memory safety.
194             let res = unsafe { ioctl_with_mut_ref(&uffd, UFFDIO_API(), &mut api) };
195             if res < 0 {
196                 errno_result().context("UFFDIO_API")
197             } else {
198                 Ok(uffd)
199             }
200         } else {
201             Userfaultfd::new().context("create userfaultfd")
202         }
203     }
204 }
205 
206 impl AsRawDescriptors for Factory {
as_raw_descriptors(&self) -> Vec<RawDescriptor>207     fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
208         if let Some(dev_file) = &self.dev_file {
209             vec![dev_file.as_raw_descriptor()]
210         } else {
211             Vec::new()
212         }
213     }
214 }
215 
/// Wrapper for [`userfaultfd::Uffd`] to be used in the vmm-swap feature.
///
/// # Safety
///
/// The userfaultfd operations (`UFFDIO_COPY` and `UFFDIO_ZEROPAGE`) look unsafe since they fill
/// memory contents directly. However, they are not actually unsafe operations; it is
/// `UFFDIO_REGISTER` that should be the unsafe operation with respect to Rust memory safety.
///
/// According to [the Rust document](https://doc.rust-lang.org/nomicon/uninitialized.html),
///
/// > All runtime-allocated memory in a Rust program begins its life as uninitialized.
///
/// The userfaultfd operations do not change or overwrite existing memory contents; they only set
/// up "uninitialized" pages. If the page was already initialized, the userfaultfd operations fail
/// and return an EEXIST error (which is unfortunately not documented). So by themselves they do
/// not affect Rust memory safety.
///
/// An "uninitialized" page in this context is one of the following:
///
/// 1. a page which has never been touched, or
/// 2. a page which has never been touched after `MADV_REMOVE`.
///
/// Filling pages of kind (1) with any contents should not affect Rust memory safety.
///
/// Filling pages of kind (2) may potentially break memory used by Rust. But that safety should be
/// examined at `MADV_REMOVE` and `UFFDIO_REGISTER` time.
pub struct Userfaultfd {
    // The wrapped handle from the `userfaultfd` crate; every method delegates to it.
    uffd: Uffd,
}
245 
246 impl Userfaultfd {
247     /// Creates a new userfaultfd using userfaultfd(2) syscall.
248     ///
249     /// This is public for tests.
new() -> Result<Self>250     pub fn new() -> Result<Self> {
251         let uffd = UffdBuilder::new()
252             .close_on_exec(true)
253             .non_blocking(true)
254             .user_mode_only(false)
255             .require_features(FeatureFlags::MISSING_SHMEM | FeatureFlags::EVENT_REMOVE)
256             .create()?;
257         Ok(Self { uffd })
258     }
259 
260     /// Register a range of memory to the userfaultfd.
261     ///
262     /// After this registration, any page faults on the range will be caught by the userfaultfd.
263     ///
264     /// # Arguments
265     ///
266     /// * `addr` - the starting address of the range of memory.
267     /// * `len` - the length in bytes of the range of memory.
268     ///
269     /// # Safety
270     ///
271     /// [addr, addr+len) must lie within a [MemoryMapping](base::MemoryMapping), and that mapping
272     /// must live for the lifespan of the userfaultfd kernel object (which may be distinct from the
273     /// `Userfaultfd` rust object in this process).
register(&self, addr: usize, len: usize) -> Result<IoctlFlags>274     pub unsafe fn register(&self, addr: usize, len: usize) -> Result<IoctlFlags> {
275         match self.uffd.register(addr as *mut libc::c_void, len) {
276             Ok(flags) => Ok(flags),
277             Err(UffdError::SystemError(errno)) if errno as i32 == libc::ENOMEM => {
278                 // Userfaultfd returns `ENOMEM` if the corresponding process dies or run as another
279                 // program by `exec` system call.
280                 // TODO(b/267124393): Verify UFFDIO_ZEROPAGE + ESRCH as well since ENOMEM may be for
281                 // other reasons.
282                 Err(Error::UffdClosed)
283             }
284             Err(e) => Err(e.into()),
285         }
286     }
287 
288     /// Unregister a range of memory from the userfaultfd.
289     ///
290     /// # Arguments
291     ///
292     /// * `addr` - the starting address of the range of memory.
293     /// * `len` - the length in bytes of the range of memory.
unregister(&self, addr: usize, len: usize) -> Result<()>294     pub fn unregister(&self, addr: usize, len: usize) -> Result<()> {
295         match self.uffd.unregister(addr as *mut libc::c_void, len) {
296             Ok(_) => Ok(()),
297             Err(UffdError::SystemError(errno)) if errno as i32 == libc::ENOMEM => {
298                 // Userfaultfd returns `ENOMEM` if the corresponding process dies or run as another
299                 // program by `exec` system call.
300                 // TODO(b/267124393): Verify UFFDIO_ZEROPAGE + ESRCH as well since ENOMEM may be for
301                 // other reasons.
302                 Err(Error::UffdClosed)
303             }
304             Err(e) => Err(e.into()),
305         }
306     }
307 
308     /// Initialize page(s) and fill it with zero.
309     ///
310     /// # Arguments
311     ///
312     /// * `addr` - the starting address of the page(s) to be initialzed with zero.
313     /// * `len` - the length in bytes of the page(s).
314     /// * `wake` - whether or not to unblock the faulting thread.
zero(&self, addr: usize, len: usize, wake: bool) -> Result<usize>315     pub fn zero(&self, addr: usize, len: usize, wake: bool) -> Result<usize> {
316         // safe because zeroing untouched pages does not break the Rust memory safety since "All
317         // runtime-allocated memory in a Rust program begins its life as uninitialized."
318         // https://doc.rust-lang.org/nomicon/uninitialized.html
319         Ok(unsafe { self.uffd.zeropage(addr as *mut libc::c_void, len, wake) }?)
320     }
321 
322     /// Copy the `data` to the page(s) starting from `addr`.
323     ///
324     /// # Arguments
325     ///
326     /// * `addr` - the starting address of the page(s) to be initialzed with data.
327     /// * `len` - the length in bytes of the page(s).
328     /// * `data` - the starting address of the content.
329     /// * `wake` - whether or not to unblock the faulting thread.
copy(&self, addr: usize, len: usize, data: *const u8, wake: bool) -> Result<usize>330     pub fn copy(&self, addr: usize, len: usize, data: *const u8, wake: bool) -> Result<usize> {
331         // safe because filling untouched pages with data does not break the Rust memory safety
332         // since "All runtime-allocated memory in a Rust program begins its life as uninitialized."
333         // https://doc.rust-lang.org/nomicon/uninitialized.html
334         Ok(unsafe {
335             self.uffd.copy(
336                 data as *const libc::c_void,
337                 addr as *mut libc::c_void,
338                 len,
339                 wake,
340             )
341         }?)
342     }
343 
344     /// Wake the faulting thread blocked by the page(s).
345     ///
346     /// If the page is not initialized, the thread causes a page fault again.
347     ///
348     /// # Arguments
349     ///
350     /// * `addr` - the starting address of the page(s).
351     /// * `len` - the length in bytes of the page(s).
wake(&self, addr: usize, len: usize) -> Result<()>352     pub fn wake(&self, addr: usize, len: usize) -> Result<()> {
353         Ok(self.uffd.wake(addr as *mut libc::c_void, len)?)
354     }
355 
356     /// Read an event from the userfaultfd.
357     ///
358     /// Return `None` immediately if no events is ready to read.
read_event(&self) -> Result<Option<UffdEvent>>359     pub fn read_event(&self) -> Result<Option<UffdEvent>> {
360         Ok(self.uffd.read_event()?)
361     }
362 }
363 
364 impl From<Uffd> for Userfaultfd {
from(uffd: Uffd) -> Self365     fn from(uffd: Uffd) -> Self {
366         Self { uffd }
367     }
368 }
369 
370 impl FromRawDescriptor for Userfaultfd {
from_raw_descriptor(descriptor: RawDescriptor) -> Self371     unsafe fn from_raw_descriptor(descriptor: RawDescriptor) -> Self {
372         Self::from(Uffd::from_raw_fd(descriptor))
373     }
374 }
375 
376 impl AsRawDescriptor for Userfaultfd {
as_raw_descriptor(&self) -> RawDescriptor377     fn as_raw_descriptor(&self) -> RawDescriptor {
378         self.uffd.as_raw_fd()
379     }
380 }
381