1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::borrow::Cow;
6 use std::cell::RefCell;
7 use std::cmp;
8 use std::collections::btree_map;
9 use std::collections::BTreeMap;
10 use std::ffi::CStr;
11 use std::ffi::CString;
12 use std::fs::File;
13 use std::io;
14 use std::mem;
15 use std::mem::size_of;
16 use std::mem::MaybeUninit;
17 use std::os::raw::c_int;
18 use std::os::raw::c_long;
19 use std::ptr;
20 use std::ptr::addr_of;
21 use std::ptr::addr_of_mut;
22 use std::sync::atomic::AtomicBool;
23 use std::sync::atomic::AtomicU64;
24 use std::sync::atomic::Ordering;
25 use std::sync::Arc;
26 use std::sync::MutexGuard;
27 use std::time::Duration;
28
29 use base::error;
30 use base::ioctl_ior_nr;
31 use base::ioctl_iow_nr;
32 use base::ioctl_iowr_nr;
33 use base::ioctl_with_mut_ptr;
34 use base::ioctl_with_ptr;
35 use base::syscall;
36 use base::unix::FileFlags;
37 use base::warn;
38 use base::AsRawDescriptor;
39 use base::FromRawDescriptor;
40 use base::Protection;
41 use base::RawDescriptor;
42 use fuse::filesystem::Context;
43 use fuse::filesystem::DirectoryIterator;
44 use fuse::filesystem::Entry;
45 use fuse::filesystem::FileSystem;
46 use fuse::filesystem::FsOptions;
47 use fuse::filesystem::GetxattrReply;
48 use fuse::filesystem::IoctlFlags;
49 use fuse::filesystem::IoctlReply;
50 use fuse::filesystem::ListxattrReply;
51 use fuse::filesystem::OpenOptions;
52 use fuse::filesystem::RemoveMappingOne;
53 use fuse::filesystem::SetattrValid;
54 use fuse::filesystem::ZeroCopyReader;
55 use fuse::filesystem::ZeroCopyWriter;
56 use fuse::filesystem::ROOT_ID;
57 use fuse::sys::WRITE_KILL_PRIV;
58 use fuse::Mapper;
59 #[cfg(feature = "arc_quota")]
60 use protobuf::Message;
61 use sync::Mutex;
62 #[cfg(feature = "arc_quota")]
63 use system_api::client::OrgChromiumSpaced;
64 #[cfg(feature = "arc_quota")]
65 use system_api::spaced::SetProjectIdReply;
66 #[cfg(feature = "arc_quota")]
67 use system_api::spaced::SetProjectInheritanceFlagReply;
68 use zerocopy::AsBytes;
69 use zerocopy::FromBytes;
70 use zerocopy::FromZeroes;
71
72 use crate::virtio::fs::caps::Capability;
73 use crate::virtio::fs::caps::Caps;
74 use crate::virtio::fs::caps::Set as CapSet;
75 use crate::virtio::fs::caps::Value as CapValue;
76 use crate::virtio::fs::config::CachePolicy;
77 use crate::virtio::fs::config::Config;
78 use crate::virtio::fs::expiring_map::ExpiringMap;
79 use crate::virtio::fs::multikey::MultikeyBTreeMap;
80 use crate::virtio::fs::read_dir::ReadDir;
81
82 const EMPTY_CSTR: &[u8] = b"\0";
83 const ROOT_CSTR: &[u8] = b"/\0";
84 const PROC_CSTR: &[u8] = b"/proc\0";
85 const UNLABELED_CSTR: &[u8] = b"unlabeled\0";
86
87 const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
88 const SECURITY_XATTR: &[u8] = b"security.";
89 const SELINUX_XATTR: &[u8] = b"security.selinux";
90
91 const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
92 const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;
93
94 #[cfg(feature = "arc_quota")]
95 const FS_PROJINHERIT_FL: c_int = 0x20000000;
96
97 // 25 seconds is the default timeout for dbus-send.
98 #[cfg(feature = "arc_quota")]
99 const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
100
101 /// Internal utility wrapper for `cros_tracing::trace_event!()` macro with VirtioFS calls.
102 macro_rules! fs_trace {
103 ($tag:expr, $name:expr, $($arg:expr),+) => {
104 cros_tracing::trace_event!(VirtioFs, $name, $tag, $($arg),*)
105 };
106 }
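// Illustrative use of `fs_trace!` (a hypothetical call site, shown only to document the
// argument order; the real call sites live in the trait implementation further below):
//
//     fs_trace!(self.tag, "getattr", inode, handle);
//
// expands to `cros_tracing::trace_event!(VirtioFs, "getattr", self.tag, inode, handle)`.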
107
108 #[repr(C)]
109 #[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
110 struct fscrypt_policy_v1 {
111 _version: u8,
112 _contents_encryption_mode: u8,
113 _filenames_encryption_mode: u8,
114 _flags: u8,
115 _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
116 }
117
118 #[repr(C)]
119 #[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
120 struct fscrypt_policy_v2 {
121 _version: u8,
122 _contents_encryption_mode: u8,
123 _filenames_encryption_mode: u8,
124 _flags: u8,
125 __reserved: [u8; 4],
126 master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
127 }
128
129 #[repr(C)]
130 #[derive(Copy, Clone, FromZeroes, FromBytes)]
131 union fscrypt_policy {
132 _version: u8,
133 _v1: fscrypt_policy_v1,
134 _v2: fscrypt_policy_v2,
135 }
136
137 #[repr(C)]
138 #[derive(Copy, Clone, FromZeroes, FromBytes)]
139 struct fscrypt_get_policy_ex_arg {
140 policy_size: u64, /* input/output */
141 policy: fscrypt_policy, /* output */
142 }
143
144 impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
145 fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
146 assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
147 let data_raw: *const fscrypt_get_policy_ex_arg = value;
148 // SAFETY: the length of the output slice is asserted to be within the struct it points to
149 unsafe {
150 std::slice::from_raw_parts(
151 data_raw.cast(),
152 value.policy_size as usize + size_of::<u64>(),
153 )
154 }
155 }
156 }
157
158 ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
159
160 #[repr(C)]
161 #[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
162 struct fsxattr {
163 fsx_xflags: u32, /* xflags field value (get/set) */
164 fsx_extsize: u32, /* extsize field value (get/set) */
165 fsx_nextents: u32, /* nextents field value (get) */
166 fsx_projid: u32, /* project identifier (get/set) */
167 fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
168 fsx_pad: [u8; 8],
169 }
170
171 ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
172 ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);
173
174 ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
175 ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);
176
177 ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
178 ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);
179
180 ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
181 ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);
182
183 #[repr(C)]
184 #[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
185 struct fsverity_enable_arg {
186 _version: u32,
187 _hash_algorithm: u32,
188 _block_size: u32,
189 salt_size: u32,
190 salt_ptr: u64,
191 sig_size: u32,
192 __reserved1: u32,
193 sig_ptr: u64,
194 __reserved2: [u64; 11],
195 }
196
197 #[repr(C)]
198 #[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
199 struct fsverity_digest {
200 _digest_algorithm: u16,
201 digest_size: u16,
202 // __u8 digest[];
203 }
204
205 ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
206 ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
207
208 pub type Inode = u64;
209 type Handle = u64;
210
211 #[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
212 struct InodeAltKey {
213 ino: libc::ino64_t,
214 dev: libc::dev_t,
215 }
216
217 #[derive(PartialEq, Eq, Debug)]
218 enum FileType {
219 Regular,
220 Directory,
221 Other,
222 }
223
224 impl From<libc::mode_t> for FileType {
225 fn from(mode: libc::mode_t) -> Self {
226 match mode & libc::S_IFMT {
227 libc::S_IFREG => FileType::Regular,
228 libc::S_IFDIR => FileType::Directory,
229 _ => FileType::Other,
230 }
231 }
232 }
233
234 #[derive(Debug)]
235 struct InodeData {
236 inode: Inode,
237 // (File, open_flags)
238 file: Mutex<(File, libc::c_int)>,
239 refcount: AtomicU64,
240 filetype: FileType,
241 path: String,
242 }
243
244 impl AsRawDescriptor for InodeData {
245 fn as_raw_descriptor(&self) -> RawDescriptor {
246 self.file.lock().0.as_raw_descriptor()
247 }
248 }
249
250 #[derive(Debug)]
251 struct HandleData {
252 inode: Inode,
253 file: Mutex<File>,
254 }
255
256 impl AsRawDescriptor for HandleData {
257 fn as_raw_descriptor(&self) -> RawDescriptor {
258 self.file.lock().as_raw_descriptor()
259 }
260 }
261
262 macro_rules! scoped_cred {
263 ($name:ident, $ty:ty, $syscall_nr:expr) => {
264 #[derive(Debug)]
265 struct $name {
266 old: $ty,
267 }
268
269 impl $name {
270 // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
271 // credentials back to `old` when the returned struct is dropped.
272 fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
273 if val == old {
274 // Nothing to do since we already have the correct value.
275 return Ok(None);
276 }
277
278 // We want credential changes to be per-thread because otherwise
279 // we might interfere with operations being carried out on other
280 // threads with different uids/gids. However, POSIX requires that
281 // all threads in a process share the same credentials. To do this
282 // libc uses signals to ensure that when one thread changes its
283 // credentials the other threads do the same thing.
284 //
285 // So instead we invoke the syscall directly in order to get around
286 // this limitation. Another option is to use the setfsuid and
287 // setfsgid system calls. However, since those calls have no way to
288 // return an error, it's preferable to do this instead.
289
290 // SAFETY: this call is safe because it doesn't modify any memory and we
291 // check the return value.
292 let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
293 if res == 0 {
294 Ok(Some($name { old }))
295 } else {
296 Err(io::Error::last_os_error())
297 }
298 }
299 }
300
301 impl Drop for $name {
302 fn drop(&mut self) {
303 // SAFETY: trivially safe
304 let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
305 if res < 0 {
306 error!(
307 "failed to change credentials back to {}: {}",
308 self.old,
309 io::Error::last_os_error(),
310 );
311 }
312 }
313 }
314 };
315 }
316 #[cfg(not(target_arch = "arm"))]
317 scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
318 #[cfg(target_arch = "arm")]
319 scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);
320
321 #[cfg(not(target_arch = "arm"))]
322 scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
323 #[cfg(target_arch = "arm")]
324 scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);
325
326 #[cfg(not(target_arch = "arm"))]
327 const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
328 #[cfg(target_arch = "arm")]
329 const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;
330
331 #[cfg(not(target_arch = "arm"))]
332 const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
333 #[cfg(target_arch = "arm")]
334 const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
335
336 thread_local! {
337 // SAFETY: both calls take no parameters and only return an integer value. The kernel also
338 // guarantees that they can never fail.
339 static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
340 // SAFETY: both calls take no parameters and only return an integer value. The kernel also
341 // guarantees that they can never fail.
342 static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
343 }
344
345 fn set_creds(
346 uid: libc::uid_t,
347 gid: libc::gid_t,
348 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
349 let olduid = THREAD_EUID.with(|uid| *uid);
350 let oldgid = THREAD_EGID.with(|gid| *gid);
351
352 // We have to change the gid before we change the uid because if we change the uid first then we
353 // lose the capability to change the gid. However, changing back can happen in any order.
354 ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
355 }
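// Sketch of the intended usage of the guards returned by `set_creds` (illustrative only;
// `ctx` stands for a hypothetical fuse::filesystem::Context from the current request):
//
//     let (_uid_guard, _gid_guard) = set_creds(ctx.uid, ctx.gid)?;
//     // ... create or modify files on behalf of the guest-supplied uid/gid ...
//     // When both guards drop at the end of the scope, the thread's effective uid/gid
//     // are restored via the same setresuid/setresgid syscalls.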
356
357 thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = RefCell::new(None));
358
359 // Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
360 // open the file.
361 fn open_fscreate(proc: &File) -> File {
362 // SAFETY: This string is nul-terminated and does not contain any interior nul bytes
363 let fscreate = unsafe { CStr::from_bytes_with_nul_unchecked(b"thread-self/attr/fscreate\0") };
364
365 // SAFETY: this doesn't modify any memory and we check the return value.
366 let raw_descriptor = unsafe {
367 libc::openat(
368 proc.as_raw_descriptor(),
369 fscreate.as_ptr(),
370 libc::O_CLOEXEC | libc::O_WRONLY,
371 )
372 };
373
374 // We don't expect this to fail and we're not in a position to return an error here so just
375 // panic.
376 if raw_descriptor < 0 {
377 panic!(
378 "Failed to open /proc/thread-self/attr/fscreate: {}",
379 io::Error::last_os_error()
380 );
381 }
382
383 // SAFETY: safe because we just opened this descriptor.
384 unsafe { File::from_raw_descriptor(raw_descriptor) }
385 }
386
387 struct ScopedSecurityContext;
388
389 impl ScopedSecurityContext {
390 fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
391 THREAD_FSCREATE.with(|thread_fscreate| {
392 let mut fscreate = thread_fscreate.borrow_mut();
393 let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
394 // SAFETY: this doesn't modify any memory and we check the return value.
395 let ret = unsafe {
396 libc::write(
397 file.as_raw_descriptor(),
398 ctx.as_ptr() as *const libc::c_void,
399 ctx.to_bytes_with_nul().len(),
400 )
401 };
402 if ret < 0 {
403 Err(io::Error::last_os_error())
404 } else {
405 Ok(ScopedSecurityContext)
406 }
407 })
408 }
409 }
410
411 impl Drop for ScopedSecurityContext {
412 fn drop(&mut self) {
413 THREAD_FSCREATE.with(|thread_fscreate| {
414 // expect is safe here because the thread local would have been initialized by the call
415 // to `new` above.
416 let fscreate = thread_fscreate.borrow();
417 let file = fscreate
418 .as_ref()
419 .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
420
421 // SAFETY: this doesn't modify any memory and we check the return value.
422 let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
423
424 if ret < 0 {
425 warn!(
426 "Failed to restore security context: {}",
427 io::Error::last_os_error()
428 );
429 }
430 })
431 }
432 }
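// Sketch of how `ScopedSecurityContext` is meant to be used (illustrative; `security_ctx`
// would come from the security-context bytes supplied with a FUSE create request):
//
//     let _ctx_guard = ScopedSecurityContext::new(&self.proc, security_ctx)?;
//     // ... files created here are labeled with `security_ctx` ...
//     // Dropping the guard writes an empty string to /proc/thread-self/attr/fscreate,
//     // restoring the default labeling behaviour.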
433
434 struct ScopedUmask {
435 old: libc::mode_t,
436 mask: libc::mode_t,
437 }
438
439 impl ScopedUmask {
440 fn new(mask: libc::mode_t) -> ScopedUmask {
441 ScopedUmask {
442 // SAFETY: this doesn't modify any memory and always succeeds.
443 old: unsafe { libc::umask(mask) },
444 mask,
445 }
446 }
447 }
448
449 impl Drop for ScopedUmask {
450 fn drop(&mut self) {
451 // SAFETY: this doesn't modify any memory and always succeeds.
452 let previous = unsafe { libc::umask(self.old) };
453 debug_assert_eq!(
454 previous, self.mask,
455 "umask changed while holding ScopedUmask"
456 );
457 }
458 }
459
460 struct ScopedFsetid(Caps);
461 impl Drop for ScopedFsetid {
462 fn drop(&mut self) {
463 if let Err(e) = raise_cap_fsetid(&mut self.0) {
464 error!(
465 "Failed to restore CAP_FSETID: {}. Some operations may be broken.",
466 e
467 )
468 }
469 }
470 }
471
472 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
473 c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
474 c.apply()
475 }
476
477 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
478 // adds the capability back when it is dropped.
479 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
480 let mut caps = Caps::for_current_thread()?;
481 caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
482 caps.apply()?;
483 Ok(ScopedFsetid(caps))
484 }
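// Illustrative usage (a sketch, not a verbatim call site): the guard is taken before
// handling a write that carries `WRITE_KILL_PRIV`, so the kernel clears setuid/setgid
// bits on the target file just as it would for a local write:
//
//     let _killpriv = drop_cap_fsetid()?;
//     // ... perform the write; CAP_FSETID is raised again when the guard drops ...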
485
486 fn ebadf() -> io::Error {
487 io::Error::from_raw_os_error(libc::EBADF)
488 }
489
490 fn eexist() -> io::Error {
491 io::Error::from_raw_os_error(libc::EEXIST)
492 }
493
494 fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
495 let mut st = MaybeUninit::<libc::stat64>::zeroed();
496
497 // SAFETY: this is a constant value that is a nul-terminated string without interior nul bytes.
498 let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
499
500 // SAFETY: the kernel will only write data in `st` and we check the return value.
501 syscall!(unsafe {
502 libc::fstatat64(
503 f.as_raw_descriptor(),
504 pathname.as_ptr(),
505 st.as_mut_ptr(),
506 libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
507 )
508 })?;
509
510 // SAFETY: the kernel guarantees that the struct is now fully initialized.
511 Ok(unsafe { st.assume_init() })
512 }
513
514 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
515 let mut st = MaybeUninit::<libc::stat64>::zeroed();
516
517 // SAFETY: the kernel will only write data in `st` and we check the return value.
518 syscall!(unsafe {
519 libc::fstatat64(
520 dir.as_raw_descriptor(),
521 name.as_ptr(),
522 st.as_mut_ptr(),
523 libc::AT_SYMLINK_NOFOLLOW,
524 )
525 })?;
526
527 // SAFETY: the kernel guarantees that the struct is now fully initialized.
528 Ok(unsafe { st.assume_init() })
529 }
530
531 #[cfg(feature = "arc_quota")]
532 fn is_android_project_id(project_id: u32) -> bool {
533 // The following constants define the valid ranges of project IDs used by
534 // Android; they are taken from android_filesystem_config.h in the Android
535 // codebase.
536 //
537 // Project IDs reserved for Android files on external storage. Total 100 IDs
538 // from PROJECT_ID_EXT_DEFAULT (1000) are reserved.
539 const PROJECT_ID_FOR_ANDROID_FILES: std::ops::RangeInclusive<u32> = 1000..=1099;
540 // Project IDs reserved for Android apps.
541 // The lower-limit of the range is PROJECT_ID_EXT_DATA_START.
542 // The upper-limit of the range differs before and after T. Here we use that
543 // of T (PROJECT_ID_APP_CACHE_END) as it is larger.
544 const PROJECT_ID_FOR_ANDROID_APPS: std::ops::RangeInclusive<u32> = 20000..=69999;
545
546 PROJECT_ID_FOR_ANDROID_FILES.contains(&project_id)
547 || PROJECT_ID_FOR_ANDROID_APPS.contains(&project_id)
548 }
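// For example (illustrative): `is_android_project_id(1000)` and
// `is_android_project_id(20000)` return true, while `is_android_project_id(0)` and
// `is_android_project_id(70000)` return false.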
549
550 /// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
551 ///
552 /// The key of the underlying `BTreeMap` is a lower-cased file name in the directory.
553 /// The value is the case-sensitive file name stored in the host file system.
554 /// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
555 /// covers all file names that exist within the directory.
556 /// So every `PassthroughFs` handler that adds or removes files in the directory is expected to
557 /// update this cache.
558 struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
559
560 impl CasefoldCache {
561 fn new(dir: &InodeData) -> io::Result<Self> {
562 let mut mp = BTreeMap::new();
563
564 let mut buf = [0u8; 1024];
565 let mut offset = 0;
566 loop {
567 let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
568 if read_dir.remaining() == 0 {
569 break;
570 }
571
572 while let Some(entry) = read_dir.next() {
573 offset = entry.offset as libc::off64_t;
574 let entry_name = entry.name;
575 mp.insert(
576 entry_name.to_bytes().to_ascii_lowercase(),
577 entry_name.to_owned(),
578 );
579 }
580 }
581 Ok(Self(mp))
582 }
583
584 fn insert(&mut self, name: &CStr) {
585 let lower_case = name.to_bytes().to_ascii_lowercase();
586 self.0.insert(lower_case, name.into());
587 }
588
589 fn lookup(&self, name: &[u8]) -> Option<CString> {
590 let lower = name.to_ascii_lowercase();
591 self.0.get(&lower).cloned()
592 }
593
594 fn remove(&mut self, name: &CStr) {
595 let lower_case = name.to_bytes().to_ascii_lowercase();
596 self.0.remove(&lower_case);
597 }
598 }
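// A minimal sketch of the cache behaviour (hypothetical file names):
//
//     let mut cache = CasefoldCache::new(&dir_data)?; // scans the directory via getdents64
//     cache.insert(CStr::from_bytes_with_nul(b"Foo.TXT\0").unwrap());
//     assert_eq!(
//         cache.lookup(b"foo.txt"),
//         Some(CString::new("Foo.TXT").unwrap())
//     );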
599
600 /// Time-expiring mapping from a directory's inode to the `CasefoldCache` for that directory.
601 /// Each entry expires after `timeout`.
602 /// When ascii_casefold is disabled, this struct does nothing.
603 struct ExpiringCasefoldLookupCaches {
604 inner: ExpiringMap<Inode, CasefoldCache>,
605 }
606
607 impl ExpiringCasefoldLookupCaches {
608 fn new(timeout: Duration) -> Self {
609 Self {
610 inner: ExpiringMap::new(timeout),
611 }
612 }
613
614 fn insert(&mut self, parent: Inode, name: &CStr) {
615 if let Some(dir_cache) = self.inner.get_mut(&parent) {
616 dir_cache.insert(name);
617 }
618 }
619
620 fn remove(&mut self, parent: Inode, name: &CStr) {
621 if let Some(dir_cache) = self.inner.get_mut(&parent) {
622 dir_cache.remove(name);
623 }
624 }
625
626 fn forget(&mut self, parent: Inode) {
627 self.inner.remove(&parent);
628 }
629
630 /// Get `CasefoldCache` for the given directory.
631 /// If the cache doesn't exist, generate it by fetching directory information with
632 /// `getdents64()`.
633 fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
634 self.inner
635 .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
636 }
637
638 #[cfg(test)]
639 fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
640 if let Some(dir_cache) = self.inner.get(&parent) {
641 dir_cache.lookup(name.to_bytes()).is_some()
642 } else {
643 false
644 }
645 }
646 }
647
648 /// A file system that simply "passes through" all requests it receives to the underlying file
649 /// system. To keep the implementation simple, it serves the contents of its root directory. Users
650 /// that wish to serve only a specific directory should set up the environment so that that
651 /// directory ends up as the root of the file system process. One way to accomplish this is via a
652 /// combination of mount namespaces and the pivot_root system call.
653 pub struct PassthroughFs {
654 // Mutex that must be acquired before executing a process-wide operation such as fchdir.
655 process_lock: Mutex<()>,
656 // virtio-fs tag that the guest uses when mounting. This is only used for debugging
657 // when tracing is enabled.
658 #[cfg_attr(not(feature = "trace_marker"), allow(dead_code))]
659 tag: String,
660
661 // File descriptors for various points in the file system tree.
662 inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
663 next_inode: AtomicU64,
664
665 // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
666 // used for reading and writing data.
667 handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
668 next_handle: AtomicU64,
669
670 // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
671 // `inodes` into one that can go into `handles`. This is accomplished by reading the
672 // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
673 // to be serving doesn't have access to `/proc`.
674 proc: File,
675
676 // Whether writeback caching is enabled for this directory. This will only be true when
677 // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
678 writeback: AtomicBool,
679
680 // Whether zero message opens are supported by the kernel driver.
681 zero_message_open: AtomicBool,
682
683 // Whether zero message opendir is supported by the kernel driver.
684 zero_message_opendir: AtomicBool,
685
686 // Used to communicate with other processes using D-Bus.
687 #[cfg(feature = "arc_quota")]
688 dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
689 #[cfg(feature = "arc_quota")]
690 dbus_fd: Option<std::os::unix::io::RawFd>,
691
692 // Time-expiring cache for `ascii_casefold_lookup()`.
693 // The key is an inode of a directory, and the value is a cache for the directory.
694 // Each value will be expired `cfg.timeout` after it's created.
695 //
696 // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
697 // if we use PassthroughFs in multi-threaded environments.
698 expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,
699
700 cfg: Config,
701 }
702
703 impl std::fmt::Debug for PassthroughFs {
704 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
705 f.debug_struct("PassthroughFs")
706 .field("tag", &self.tag)
707 .field("next_inode", &self.next_inode)
708 .field("next_handle", &self.next_handle)
709 .field("proc", &self.proc)
710 .field("writeback", &self.writeback)
711 .field("zero_message_open", &self.zero_message_open)
712 .field("zero_message_opendir", &self.zero_message_opendir)
713 .field("cfg", &self.cfg)
714 .finish()
715 }
716 }
717
718 impl PassthroughFs {
719 pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
720 // SAFETY: this is a constant value that is a nul-terminated string without interior
721 // nul bytes.
722 let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) };
723
724 // SAFETY: this doesn't modify any memory and we check the return value.
725 let raw_descriptor = syscall!(unsafe {
726 libc::openat64(
727 libc::AT_FDCWD,
728 proc_cstr.as_ptr(),
729 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
730 )
731 })?;
732
733 // Privileged UIDs can use D-Bus to perform some operations.
734 #[cfg(feature = "arc_quota")]
735 let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
736 (None, None)
737 } else {
738 let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
739 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
740 channel.set_watch_enabled(true);
741 let dbus_fd = channel.watch().fd;
742 channel.set_watch_enabled(false);
743 (
744 Some(Mutex::new(dbus::blocking::Connection::from(channel))),
745 Some(dbus_fd),
746 )
747 };
748
749 // SAFETY: safe because we just opened this descriptor.
750 let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
751
752 let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
753 Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
754 } else {
755 None
756 };
757
758 let passthroughfs = PassthroughFs {
759 process_lock: Mutex::new(()),
760 tag: tag.to_string(),
761 inodes: Mutex::new(MultikeyBTreeMap::new()),
762 next_inode: AtomicU64::new(ROOT_ID + 1),
763
764 handles: Mutex::new(BTreeMap::new()),
765 next_handle: AtomicU64::new(1),
766
767 proc,
768
769 writeback: AtomicBool::new(false),
770 zero_message_open: AtomicBool::new(false),
771 zero_message_opendir: AtomicBool::new(false),
772
773 #[cfg(feature = "arc_quota")]
774 dbus_connection,
775 #[cfg(feature = "arc_quota")]
776 dbus_fd,
777 expiring_casefold_lookup_caches,
778 cfg,
779 };
780
781 cros_tracing::trace_simple_print!(
782 VirtioFs,
783 "New PassthroughFS initialized: {:?}",
784 passthroughfs
785 );
786 Ok(passthroughfs)
787 }
788
789 pub fn cfg(&self) -> &Config {
790 &self.cfg
791 }
792
793 pub fn keep_rds(&self) -> Vec<RawDescriptor> {
794 #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
795 let mut keep_rds = vec![self.proc.as_raw_descriptor()];
796 #[cfg(feature = "arc_quota")]
797 if let Some(fd) = self.dbus_fd {
798 keep_rds.push(fd);
799 }
800 keep_rds
801 }
802
803 fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
804 if !self.cfg.rewrite_security_xattrs {
805 return Cow::Borrowed(name);
806 }
807
808 // Does not include nul-terminator.
809 let buf = name.to_bytes();
810 if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
811 return Cow::Borrowed(name);
812 }
813
814 let mut newname = USER_VIRTIOFS_XATTR.to_vec();
815 newname.extend_from_slice(buf);
816
817 // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
818 // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
819 Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
820 }
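// For example (illustrative): with `rewrite_security_xattrs` enabled, "security.sehash"
// becomes "user.virtiofs.security.sehash", while "security.selinux" and names outside the
// "security." namespace are passed through unchanged.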
821
822 fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
823 self.inodes
824 .lock()
825 .get(&inode)
826 .map(Arc::clone)
827 .ok_or_else(ebadf)
828 }
829
830 fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
831 self.handles
832 .lock()
833 .get(&handle)
834 .filter(|hd| hd.inode == inode)
835 .map(Arc::clone)
836 .ok_or_else(ebadf)
837 }
838
839 fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
840 let pathname = CString::new(format!("self/fd/{}", fd))
841 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
842
843 // SAFETY: this doesn't modify any memory and we check the return value. We don't really
844 // check `flags` because if the kernel can't handle poorly specified flags then we have
845 // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
846 // to follow the `/proc/self/fd` symlink to get the file.
847 let raw_descriptor = syscall!(unsafe {
848 libc::openat64(
849 self.proc.as_raw_descriptor(),
850 pathname.as_ptr(),
851 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
852 )
853 })?;
854
855 // SAFETY: safe because we just opened this descriptor.
856 Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
857 }
858
859 /// Modifies the provided open flags based on the writeback caching configuration.
860 /// Returns the updated open flags.
861 fn update_open_flags(&self, mut flags: i32) -> i32 {
862 // When writeback caching is enabled, the kernel may send read requests even if the
863 // userspace program opened the file write-only. So we need to ensure that we have opened
864 // the file for reading as well as writing.
865 let writeback = self.writeback.load(Ordering::Relaxed);
866 if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
867 flags &= !libc::O_ACCMODE;
868 flags |= libc::O_RDWR;
869 }
870
871 // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
872 // However, this breaks atomicity as the file may have changed on disk, invalidating the
873 // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
874 // the file. Just allow this for now as it is the user's responsibility to enable writeback
875 // caching only for directories that are not shared. It also means that we need to clear the
876 // `O_APPEND` flag.
877 if writeback && flags & libc::O_APPEND != 0 {
878 flags &= !libc::O_APPEND;
879 }
880
881 flags
882 }
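// Worked example of the rewriting above (illustrative): if writeback caching is active and
// the guest opens a file with `O_WRONLY | O_APPEND`, the flags passed to the host become
// `O_RDWR` with `O_APPEND` cleared, so the host fd can also serve the kernel's read
// requests and the guest kernel manages the append offset itself.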
883
884 fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
885 // handle writeback caching cases
886 flags = self.update_open_flags(flags);
887
888 self.open_fd(inode.as_raw_descriptor(), flags)
889 }
890
891 // Increases the inode refcount and returns the inode.
892 fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
893 // Matches with the release store in `forget`.
894 inode_data.refcount.fetch_add(1, Ordering::Acquire);
895 inode_data.inode
896 }
897
898 // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
899 // The inodes mutex lock must not already be taken by the same thread, otherwise this
900 // will deadlock.
901 fn add_entry(&self, f: File, st: libc::stat64, open_flags: libc::c_int, path: String) -> Entry {
902 let mut inodes = self.inodes.lock();
903
904 let altkey = InodeAltKey {
905 ino: st.st_ino,
906 dev: st.st_dev,
907 };
908
909 let inode = if let Some(data) = inodes.get_alt(&altkey) {
910 self.increase_inode_refcount(data)
911 } else {
912 let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
913 inodes.insert(
914 inode,
915 altkey,
916 Arc::new(InodeData {
917 inode,
918 file: Mutex::new((f, open_flags)),
919 refcount: AtomicU64::new(1),
920 filetype: st.st_mode.into(),
921 path,
922 }),
923 );
924
925 inode
926 };
927
928 Entry {
929 inode,
930 generation: 0,
931 attr: st,
932 // We use the same timeout for the attribute and the entry.
933 attr_timeout: self.cfg.timeout,
934 entry_timeout: self.cfg.timeout,
935 }
936 }
937
938 /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
939 fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
940 self.expiring_casefold_lookup_caches
941 .as_ref()
942 .map(|c| c.lock())
943 }
944
945 // Returns the actual case-sensitive file name that matches the given `name`.
946 // Returns `Ok(None)` if no file matches the given `name`.
947 // This function will panic if casefold is not enabled.
948 fn get_case_unfolded_name(
949 &self,
950 parent: &InodeData,
951 name: &[u8],
952 ) -> io::Result<Option<CString>> {
953 let mut caches = self
954 .lock_casefold_lookup_caches()
955 .expect("casefold must be enabled");
956 let dir_cache = caches.get(parent)?;
957 Ok(dir_cache.lookup(name))
958 }
959
960 // Performs an ascii case insensitive lookup.
961 fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
962 match self.get_case_unfolded_name(parent, name)? {
963 None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
964 Some(actual_name) => self.do_lookup(parent, &actual_name),
965 }
966 }
967
968 #[cfg(test)]
969 fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
970 let mut cache = self
971 .lock_casefold_lookup_caches()
972 .expect("casefold must be enabled");
973 cache.exists_in_cache(parent, name)
974 }
975
976 fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
977 let st = statat(parent, name)?;
978
979 let altkey = InodeAltKey {
980 ino: st.st_ino,
981 dev: st.st_dev,
982 };
983
984 // Check if we already have an entry before opening a new file.
985 if let Some(data) = self.inodes.lock().get_alt(&altkey) {
986 // Return the same inode with the reference counter increased.
987 return Ok(Entry {
988 inode: self.increase_inode_refcount(data),
989 generation: 0,
990 attr: st,
991 // We use the same timeout for the attribute and the entry.
992 attr_timeout: self.cfg.timeout,
993 entry_timeout: self.cfg.timeout,
994 });
995 }
996
997 // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
998 // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
999 // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
1000 let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1001 match FileType::from(st.st_mode) {
1002 FileType::Regular => {}
1003 FileType::Directory => flags |= libc::O_DIRECTORY,
1004 FileType::Other => flags |= libc::O_PATH,
1005 };
1006
1007 // SAFETY: this doesn't modify any memory and we check the return value.
1008 let fd = match unsafe {
1009 syscall!(libc::openat64(
1010 parent.as_raw_descriptor(),
1011 name.as_ptr(),
1012 flags
1013 ))
1014 } {
1015 Ok(fd) => fd,
1016 Err(e) if e.errno() == libc::EACCES => {
1017 // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
1018 // `InodeData`.
1019 // Note that some operations which should be allowed without read permissions
1020 // require syscalls that don't support O_PATH fds. For those syscalls, we will
1021 // need to fall back to their path-based equivalents with /self/fd/${FD}.
1022 // e.g. `fgetxattr()` for an O_PATH FD fails while `getxattr()` for /self/fd/${FD}
1023 // works.
1024 flags |= libc::O_PATH;
1025 // SAFETY: this doesn't modify any memory and we check the return value.
1026 unsafe {
1027 syscall!(libc::openat64(
1028 parent.as_raw_descriptor(),
1029 name.as_ptr(),
1030 flags
1031 ))
1032 }?
1033 }
1034 Err(e) => {
1035 return Err(e.into());
1036 }
1037 };
1038
1039 // SAFETY: safe because we own the fd.
1040 let f = unsafe { File::from_raw_descriptor(fd) };
1041 let path = format!(
1042 "{}/{}",
1043 parent.path.clone(),
1044 name.to_str().unwrap_or("<non UTF-8 str>")
1045 );
1046 // We made sure the lock acquired for `self.inodes` is released automatically when
1047 // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
1048 // here. This would not be the case if this were executed in an else block instead.
1049 Ok(self.add_entry(f, st, flags, path))
1050 }
1051
1052 fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1053 let mut opts = OpenOptions::empty();
1054 match self.cfg.cache_policy {
1055 // We only set the direct I/O option on files.
1056 CachePolicy::Never => opts.set(
1057 OpenOptions::DIRECT_IO,
1058 flags & (libc::O_DIRECTORY as u32) == 0,
1059 ),
1060 CachePolicy::Always => {
1061 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1062 OpenOptions::KEEP_CACHE
1063 } else {
1064 OpenOptions::CACHE_DIR
1065 }
1066 }
1067 _ => {}
1068 };
1069 opts
1070 }
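// Summary of the mapping above (illustrative): CachePolicy::Never marks file opens with
// DIRECT_IO, CachePolicy::Always marks files with KEEP_CACHE and directories with
// CACHE_DIR, and any other configured policy returns empty options and leaves caching
// decisions to the guest kernel.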
1071
1072 // Performs a lookup using the original name first. If that fails and ascii_casefold is enabled,
1073 // it resolves the case-insensitive name to its on-disk spelling and retries the lookup.
1074 fn do_lookup_with_casefold_fallback(
1075 &self,
1076 parent: &InodeData,
1077 name: &CStr,
1078 ) -> io::Result<Entry> {
1079 let mut res = self.do_lookup(parent, name);
1080 // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1081 if res.is_err() && self.cfg.ascii_casefold {
1082 res = self.ascii_casefold_lookup(parent, name.to_bytes());
1083 }
1084 res
1085 }
1086
1087 fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1088 let inode_data = self.find_inode(inode)?;
1089
1090 let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
1091
1092 let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1093 let data = HandleData { inode, file };
1094
1095 self.handles.lock().insert(handle, Arc::new(data));
1096
1097 let opts = self.get_cache_open_options(flags);
1098
1099 Ok((Some(handle), opts))
1100 }
1101
1102 fn do_open_at(
1103 &self,
1104 parent_data: Arc<InodeData>,
1105 name: &CStr,
1106 inode: Inode,
1107 flags: u32,
1108 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1109 let open_flags = self.update_open_flags(flags as i32);
1110
1111 let fd_open = syscall!(
1112 // SAFETY: return value is checked.
1113 unsafe {
1114 libc::openat64(
1115 parent_data.as_raw_descriptor(),
1116 name.as_ptr(),
1117 (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1118 )
1119 }
1120 )?;
1121
1122 // SAFETY: fd_open is valid
1123 let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1124 let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1125 let data = HandleData {
1126 inode,
1127 file: Mutex::new(file_open),
1128 };
1129
1130 self.handles.lock().insert(handle, Arc::new(data));
1131
1132 let opts = self.get_cache_open_options(open_flags as u32);
1133 Ok((Some(handle), opts))
1134 }
1135
1136 fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1137 let mut handles = self.handles.lock();
1138
1139 if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1140 if e.get().inode == inode {
1141 // We don't need to close the file here because that will happen automatically when
1142 // the last `Arc` is dropped.
1143 e.remove();
1144 return Ok(());
1145 }
1146 }
1147
1148 Err(ebadf())
1149 }
1150
1151 fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1152 let st = stat(inode)?;
1153
1154 Ok((st, self.cfg.timeout))
1155 }
1156
1157 fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1158 // SAFETY: this doesn't modify any memory and we check the return value.
1159 syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1160 Ok(())
1161 }
1162
1163 fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1164 // SAFETY: this doesn't modify any memory and we check the return value.
1165 syscall!(unsafe {
1166 if datasync {
1167 libc::fdatasync(file.as_raw_descriptor())
1168 } else {
1169 libc::fsync(file.as_raw_descriptor())
1170 }
1171 })?;
1172
1173 Ok(())
1174 }
1175
1176 // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
1177 // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
1178 // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
1179 // root inode.
1180 //
1181 // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
1182 // be taken to avoid the risk of deadlocks.
1183 fn with_proc_chdir<F, T>(&self, f: F) -> T
1184 where
1185 F: FnOnce() -> T,
1186 {
1187 let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
1188
1189 // Acquire a lock for `fchdir`.
1190 let _proc_lock = self.process_lock.lock();
1191 // SAFETY: this doesn't modify any memory and we check the return value. Since the
1192 // fchdir should never fail we just use debug_asserts.
1193 let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
1194 debug_assert_eq!(
1195 proc_cwd,
1196 0,
1197 "failed to fchdir to /proc: {}",
1198 io::Error::last_os_error()
1199 );
1200
1201 let res = f();
1202
1203 // SAFETY: this doesn't modify any memory and we check the return value. Since the
1204 // fchdir should never fail we just use debug_asserts.
1205 let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
1206 debug_assert_eq!(
1207 root_cwd,
1208 0,
1209 "failed to fchdir back to root directory: {}",
1210 io::Error::last_os_error()
1211 );
1212
1213 res
1214 }
1215
1216 fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1217 let file = inode.file.lock();
1218 let o_path_file = (file.1 & libc::O_PATH) != 0;
1219 let res = if o_path_file {
1220 // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1221 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1222 // and then setting the CWD back to the root directory.
1223 let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
1224 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1225
1226 // SAFETY: this will only modify `value` and we check the return value.
1227 self.with_proc_chdir(|| unsafe {
1228 libc::getxattr(
1229 path.as_ptr(),
1230 name.as_ptr(),
1231 value.as_mut_ptr() as *mut libc::c_void,
1232 value.len() as libc::size_t,
1233 )
1234 })
1235 } else {
1236 // For regular files and directories, we can just use fgetxattr.
1237 // SAFETY: this will only write to `value` and we check the return value.
1238 unsafe {
1239 libc::fgetxattr(
1240 file.0.as_raw_descriptor(),
1241 name.as_ptr(),
1242 value.as_mut_ptr() as *mut libc::c_void,
1243 value.len() as libc::size_t,
1244 )
1245 }
1246 };
1247
1248 if res < 0 {
1249 Err(io::Error::last_os_error())
1250 } else {
1251 Ok(res as usize)
1252 }
1253 }
1254
1255 fn get_encryption_policy_ex<R: io::Read>(
1256 &self,
1257 inode: Inode,
1258 handle: Handle,
1259 mut r: R,
1260 ) -> io::Result<IoctlReply> {
1261 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1262 self.find_inode(inode)?
1263 } else {
1264 self.find_handle(handle, inode)?
1265 };
1266
1267 // SAFETY: this struct only has integer fields and any value is valid.
1268 let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1269 r.read_exact(arg.policy_size.as_bytes_mut())?;
1270
1271 let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1272 arg.policy_size = policy_size;
1273
1274 let res =
1275 // SAFETY: the kernel will only write to `arg` and we check the return value.
1276 unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX(), &mut arg) };
1277 if res < 0 {
1278 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1279 } else {
1280 let len = size_of::<u64>() + arg.policy_size as usize;
1281 Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
1282 }
1283 }
1284
1285 fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1286 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1287 self.find_inode(inode)?
1288 } else {
1289 self.find_handle(handle, inode)?
1290 };
1291
1292 let mut buf = MaybeUninit::<fsxattr>::zeroed();
1293
1294 // SAFETY: the kernel will only write to `buf` and we check the return value.
1295 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
1296 if res < 0 {
1297 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1298 } else {
1299 // SAFETY: the kernel guarantees that the policy is now initialized.
1300 let xattr = unsafe { buf.assume_init() };
1301 Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1302 }
1303 }
1304
1305 fn set_fsxattr<R: io::Read>(
1306 &self,
1307 #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1308 inode: Inode,
1309 handle: Handle,
1310 mut r: R,
1311 ) -> io::Result<IoctlReply> {
1312 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1313 self.find_inode(inode)?
1314 } else {
1315 self.find_handle(handle, inode)?
1316 };
1317
1318 let mut in_attr = fsxattr::new_zeroed();
1319 r.read_exact(in_attr.as_bytes_mut())?;
1320
1321 #[cfg(feature = "arc_quota")]
1322 let st = stat(&*data)?;
1323
1324 // Changing quota project ID requires CAP_FOWNER or being file owner.
1325 // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1326 #[cfg(feature = "arc_quota")]
1327 if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1328 // Get the current fsxattr.
1329 let mut buf = MaybeUninit::<fsxattr>::zeroed();
1330 // SAFETY: the kernel will only write to `buf` and we check the return value.
1331 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR(), buf.as_mut_ptr()) };
1332 if res < 0 {
1333 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1334 }
1335 // SAFETY: the kernel guarantees that the policy is now initialized.
1336 let current_attr = unsafe { buf.assume_init() };
1337
1338 // Project ID cannot be changed inside a user namespace.
1339 // Use Spaced to avoid this restriction.
1340 if current_attr.fsx_projid != in_attr.fsx_projid {
1341 let connection = self.dbus_connection.as_ref().unwrap().lock();
1342 let proxy = connection.with_proxy(
1343 "org.chromium.Spaced",
1344 "/org/chromium/Spaced",
1345 DEFAULT_DBUS_TIMEOUT,
1346 );
1347 let project_id = in_attr.fsx_projid;
1348 if !is_android_project_id(project_id) {
1349 return Err(io::Error::from_raw_os_error(libc::EINVAL));
1350 }
1351 let file_clone = base::SafeDescriptor::try_from(&*data)?;
1352 match proxy.set_project_id(file_clone.into(), project_id) {
1353 Ok(r) => {
1354 let r = SetProjectIdReply::parse_from_bytes(&r)
1355 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1356 if !r.success {
1357 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1358 r.error,
1359 ))));
1360 }
1361 }
1362 Err(e) => {
1363 return Err(io::Error::new(io::ErrorKind::Other, e));
1364 }
1365 };
1366 }
1367 }
1368
1369 // SAFETY: this doesn't modify any memory and we check the return value.
1370 let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR(), &in_attr) };
1371 if res < 0 {
1372 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1373 } else {
1374 Ok(IoctlReply::Done(Ok(Vec::new())))
1375 }
1376 }
1377
1378 fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1379 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1380 self.find_inode(inode)?
1381 } else {
1382 self.find_handle(handle, inode)?
1383 };
1384
1385 // The ioctl encoding is a long but the parameter is actually an int.
1386 let mut flags: c_int = 0;
1387
1388 // SAFETY: the kernel will only write to `flags` and we check the return value.
1389 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), &mut flags) };
1390 if res < 0 {
1391 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1392 } else {
1393 Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1394 }
1395 }
1396
1397 fn set_flags<R: io::Read>(
1398 &self,
1399 #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1400 inode: Inode,
1401 handle: Handle,
1402 mut r: R,
1403 ) -> io::Result<IoctlReply> {
1404 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1405 self.find_inode(inode)?
1406 } else {
1407 self.find_handle(handle, inode)?
1408 };
1409
1410 // The ioctl encoding is a long but the parameter is actually an int.
1411 let mut in_flags: c_int = 0;
1412 r.read_exact(in_flags.as_bytes_mut())?;
1413
1414 #[cfg(feature = "arc_quota")]
1415 let st = stat(&*data)?;
1416
1417 // Only a privileged uid can perform FS_IOC_SETFLAGS through cryptohome.
1418 #[cfg(feature = "arc_quota")]
1419 if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1420 // Get the current flag.
1421 let mut buf = MaybeUninit::<c_int>::zeroed();
1422 // SAFETY: the kernel will only write to `buf` and we check the return value.
1423 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS(), buf.as_mut_ptr()) };
1424 if res < 0 {
1425 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1426 }
1427 // SAFETY: the kernel guarantees that the policy is now initialized.
1428 let current_flags = unsafe { buf.assume_init() };
1429
1430 // Project inheritance flag cannot be changed inside a user namespace.
1431 // Use Spaced to avoid this restriction.
1432 if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1433 let connection = self.dbus_connection.as_ref().unwrap().lock();
1434 let proxy = connection.with_proxy(
1435 "org.chromium.Spaced",
1436 "/org/chromium/Spaced",
1437 DEFAULT_DBUS_TIMEOUT,
1438 );
1439 // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1440 // reset.
1441 let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1442 let file_clone = base::SafeDescriptor::try_from(&*data)?;
1443 match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
1444 Ok(r) => {
1445 let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
1446 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1447 if !r.success {
1448 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1449 r.error,
1450 ))));
1451 }
1452 }
1453 Err(e) => {
1454 return Err(io::Error::new(io::ErrorKind::Other, e));
1455 }
1456 };
1457 }
1458 }
1459
1460 // SAFETY: this doesn't modify any memory and we check the return value.
1461 let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS(), &in_flags) };
1462 if res < 0 {
1463 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1464 } else {
1465 Ok(IoctlReply::Done(Ok(Vec::new())))
1466 }
1467 }
1468
1469 fn enable_verity<R: io::Read>(
1470 &self,
1471 inode: Inode,
1472 handle: Handle,
1473 mut r: R,
1474 ) -> io::Result<IoctlReply> {
1475 let inode_data = self.find_inode(inode)?;
1476
1477 // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1478 match inode_data.filetype {
1479 FileType::Regular => {}
1480 FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1481 FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1482 }
1483
1484 {
1485 // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1486 let mut file = inode_data.file.lock();
1487 let mut flags = file.1;
1488 match flags & libc::O_ACCMODE {
1489 libc::O_WRONLY | libc::O_RDWR => {
1490 flags &= !libc::O_ACCMODE;
1491 flags |= libc::O_RDONLY;
1492
1493 // We need to get a read-only handle for this file.
1494 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
1495 *file = (newfile, flags);
1496 }
1497 libc::O_RDONLY => {}
1498 _ => panic!("Unexpected flags: {:#x}", flags),
1499 }
1500 }
1501
1502 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1503 inode_data
1504 } else {
1505 let data = self.find_handle(handle, inode)?;
1506
1507 {
1508 // We can't enable verity while holding a writable fd. We don't know whether the
1509 // file was opened for writing so check it here. We don't expect
1510 // this to be a frequent operation so the extra latency should be
1511 // fine.
1512 let mut file = data.file.lock();
1513 let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1514 match flags {
1515 FileFlags::ReadWrite | FileFlags::Write => {
1516 // We need to get a read-only handle for this file.
1517 *file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1518 }
1519 FileFlags::Read => {}
1520 }
1521 }
1522
1523 data
1524 };
1525
1526 let mut arg = fsverity_enable_arg::new_zeroed();
1527 r.read_exact(arg.as_bytes_mut())?;
1528
1529 let mut salt;
1530 if arg.salt_size > 0 {
1531 if arg.salt_size > self.max_buffer_size() {
1532 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1533 libc::ENOMEM,
1534 ))));
1535 }
1536 salt = vec![0; arg.salt_size as usize];
1537 r.read_exact(&mut salt)?;
1538 arg.salt_ptr = salt.as_ptr() as usize as u64;
1539 } else {
1540 arg.salt_ptr = 0;
1541 }
1542
1543 let mut sig;
1544 if arg.sig_size > 0 {
1545 if arg.sig_size > self.max_buffer_size() {
1546 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1547 libc::ENOMEM,
1548 ))));
1549 }
1550 sig = vec![0; arg.sig_size as usize];
1551 r.read_exact(&mut sig)?;
1552 arg.sig_ptr = sig.as_ptr() as usize as u64;
1553 } else {
1554 arg.sig_ptr = 0;
1555 }
1556
1557 // SAFETY: this doesn't modify any memory and we check the return value.
1558 let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY(), &arg) };
1559 if res < 0 {
1560 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1561 } else {
1562 Ok(IoctlReply::Done(Ok(Vec::new())))
1563 }
1564 }
1565
1566 fn measure_verity<R: io::Read>(
1567 &self,
1568 inode: Inode,
1569 handle: Handle,
1570 mut r: R,
1571 out_size: u32,
1572 ) -> io::Result<IoctlReply> {
1573 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1574 self.find_inode(inode)?
1575 } else {
1576 self.find_handle(handle, inode)?
1577 };
1578
1579 let mut digest = fsverity_digest::new_zeroed();
1580 r.read_exact(digest.as_bytes_mut())?;
1581
1582 // Taken from fs/verity/fsverity_private.h.
1583 const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1584
1585 // This digest size is what the fsverity command line utility uses.
1586 const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1587 const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1588 const ROUNDED_LEN: usize =
1589 (BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();
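// i.e. the number of `fsverity_digest`-sized elements needed to cover `BUFLEN` bytes
// (a ceiling division), so the buffer below is both large enough and properly aligned.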
1590
1591 // Make sure we get a properly aligned allocation.
1592 let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1593
1594 // SAFETY: we are only writing data and not reading uninitialized memory.
1595 unsafe {
1596 // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1597 addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1598 .write(DIGEST_SIZE)
1599 };
1600
1601 // SAFETY: this will only modify `buf` and we check the return value.
1602 let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY(), buf.as_mut_ptr()) };
1603 if res < 0 {
1604 Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1605 } else {
1606 let digest_size =
1607 // SAFETY: this value was initialized by us already and then overwritten by the kernel.
1608 // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1609 unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1610 let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1611
1612 // The kernel guarantees this but it doesn't hurt to be paranoid.
1613 debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1614 if digest.digest_size < digest_size || out_size < outlen {
1615 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1616 libc::EOVERFLOW,
1617 ))));
1618 }
1619
1620 let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1621 // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1622 // doesn't contain any references.
1623 unsafe { mem::transmute(buf) };
1624
1625 let buf =
1626 // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
1627 // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
1628 // to have the same layout as `u8`.
1629 // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1630 unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1631 Ok(IoctlReply::Done(Ok(buf.to_vec())))
1632 }
1633 }
1634 }
1635
1636 /// Decrements the refcount of the inode.
1637 /// Returns `true` if the refcount became 0.
1638 fn forget_one(
1639 inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
1640 inode: Inode,
1641 count: u64,
1642 ) -> bool {
1643 if let Some(data) = inodes.get(&inode) {
1644 // Acquiring the write lock on the inode map prevents new lookups from incrementing the
1645 // refcount but there is the possibility that a previous lookup already acquired a
1646 // reference to the inode data and is in the process of updating the refcount so we need
1647 // to loop here until we can decrement successfully.
1648 loop {
1649 let refcount = data.refcount.load(Ordering::Relaxed);
1650
1651 // Saturating sub because it doesn't make sense for a refcount to go below zero and
1652 // we don't want misbehaving clients to cause integer overflow.
1653 let new_count = refcount.saturating_sub(count);
1654
1655 // Synchronizes with the acquire load in `do_lookup`.
1656 if data
1657 .refcount
1658 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
1659 .is_ok()
1660 {
1661 if new_count == 0 {
1662 // We just removed the last refcount for this inode. There's no need for an
1663 // acquire fence here because we hold a write lock on the inode map and any
1664 // thread that is waiting to do a forget on the same inode will have to wait
1665 // until we release the lock. So there is no other release store for us to
1666 // synchronize with before deleting the entry.
1667 inodes.remove(&inode);
1668 return true;
1669 }
1670 break;
1671 }
1672 }
1673 }
1674 false
1675 }
1676
1677 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
1678 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
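// For example, `b"user.virtiofs.security.sehash\0security.selinux\0"` becomes
// `b"security.sehash\0security.selinux\0"`; names without the prefix are left untouched.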
1679 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
1680 fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
1681 if start >= b.len() {
1682 return None;
1683 }
1684
1685 let end = b[start..]
1686 .iter()
1687 .position(|&c| c == b'\0')
1688 .map(|p| start + p + 1)
1689 .unwrap_or(b.len());
1690
1691 Some(&b[start..end])
1692 }
1693
1694 let mut pos = 0;
1695 while let Some(name) = next_cstr(buf, pos) {
1696 if !name.starts_with(USER_VIRTIOFS_XATTR) {
1697 pos += name.len();
1698 continue;
1699 }
1700
1701 let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
1702 buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
1703 pos += newlen;
1704 }
1705 }
1706
1707 impl FileSystem for PassthroughFs {
1708 type Inode = Inode;
1709 type Handle = Handle;
1710 type DirIter = ReadDir<Box<[u8]>>;
1711
1712 fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
1713 // SAFETY: this is a constant value that is a nul-terminated string without interior
1714 // nul bytes.
1715 let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) };
1716
1717 let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1718 // SAFETY: this doesn't modify any memory and we check the return value.
1719 let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
1720 if raw_descriptor < 0 {
1721 return Err(io::Error::last_os_error());
1722 }
1723
1724 // SAFETY: safe because we just opened this descriptor above.
1725 let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
1726
1727 let st = stat(&f)?;
1728
1729 // SAFETY: this doesn't modify any memory and there is no need to check the return
1730 // value because this system call always succeeds. We need to clear the umask here because
1731 // we want the client to be able to set all the bits in the mode.
1732 unsafe { libc::umask(0o000) };
1733
1734 let mut inodes = self.inodes.lock();
1735
1736 // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1737 inodes.insert(
1738 ROOT_ID,
1739 InodeAltKey {
1740 ino: st.st_ino,
1741 dev: st.st_dev,
1742 },
1743 Arc::new(InodeData {
1744 inode: ROOT_ID,
1745 file: Mutex::new((f, flags)),
1746 refcount: AtomicU64::new(2),
1747 filetype: st.st_mode.into(),
1748 path: "".to_string(),
1749 }),
1750 );
1751
1752 let mut opts = FsOptions::DO_READDIRPLUS
1753 | FsOptions::READDIRPLUS_AUTO
1754 | FsOptions::EXPORT_SUPPORT
1755 | FsOptions::DONT_MASK
1756 | FsOptions::CACHE_SYMLINKS
1757 | FsOptions::SECURITY_CONTEXT;
1758 if self.cfg.posix_acl {
1759 opts |= FsOptions::POSIX_ACL;
1760 }
1761 if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
1762 opts |= FsOptions::WRITEBACK_CACHE;
1763 self.writeback.store(true, Ordering::Relaxed);
1764 }
1765 if self.cfg.cache_policy == CachePolicy::Always {
1766 if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
1767 opts |= FsOptions::ZERO_MESSAGE_OPEN;
1768 self.zero_message_open.store(true, Ordering::Relaxed);
1769 }
1770 if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
1771 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
1772 self.zero_message_opendir.store(true, Ordering::Relaxed);
1773 }
1774 }
1775 Ok(opts)
1776 }
1777
1778 fn destroy(&self) {
1779 cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
1780 self.handles.lock().clear();
1781 self.inodes.lock().clear();
1782 }
1783
1784 fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
1785 let _trace = fs_trace!(self.tag, "statfs", inode);
1786 let data = self.find_inode(inode)?;
1787
1788 let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1789
1790 // SAFETY: this will only modify `out` and we check the return value.
1791 syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
1792
1793 // SAFETY: the kernel guarantees that `out` has been initialized.
1794 Ok(unsafe { out.assume_init() })
1795 }
1796
1797 fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
1798 let data = self.find_inode(parent)?;
1799 #[allow(unused_variables)]
1800 let path = format!(
1801 "{}/{}",
1802 data.path,
1803 name.to_str().unwrap_or("<non UTF-8 path>")
1804 );
1805 let _trace = fs_trace!(self.tag, "lookup", parent, path);
1806
1807 let mut res = self.do_lookup_with_casefold_fallback(&data, name);
1808
1809 // FUSE treats inode=0 as a request to cache a negative dentry.
1810 // So, if `negative_timeout` is set, return success with that timeout value and inode=0 as
1811 // the response.
1812 if let Err(e) = &res {
1813 if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
1814 res = Ok(Entry::new_negative(self.cfg.negative_timeout));
1815 }
1816 }
1817
1818 res
1819 }
1820
1821 fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
1822 let _trace = fs_trace!(self.tag, "forget", inode, count);
1823 let mut inodes = self.inodes.lock();
1824 let caches = self.lock_casefold_lookup_caches();
1825 if forget_one(&mut inodes, inode, count) {
1826 if let Some(mut c) = caches {
1827 c.forget(inode);
1828 }
1829 }
1830 }
1831
1832 fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
1833 let mut inodes = self.inodes.lock();
1834 let mut caches = self.lock_casefold_lookup_caches();
1835 for (inode, count) in requests {
1836 if forget_one(&mut inodes, inode, count) {
1837 if let Some(c) = caches.as_mut() {
1838 c.forget(inode);
1839 }
1840 }
1841 }
1842 }
1843
1844 fn opendir(
1845 &self,
1846 _ctx: Context,
1847 inode: Inode,
1848 flags: u32,
1849 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1850 let _trace = fs_trace!(self.tag, "opendir", inode, flags);
1851 if self.zero_message_opendir.load(Ordering::Relaxed) {
1852 Err(io::Error::from_raw_os_error(libc::ENOSYS))
1853 } else {
1854 self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
1855 }
1856 }
1857
1858 fn releasedir(
1859 &self,
1860 _ctx: Context,
1861 inode: Inode,
1862 _flags: u32,
1863 handle: Handle,
1864 ) -> io::Result<()> {
1865 let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
1866 if self.zero_message_opendir.load(Ordering::Relaxed) {
1867 Ok(())
1868 } else {
1869 self.do_release(inode, handle)
1870 }
1871 }
1872
1873 fn mkdir(
1874 &self,
1875 ctx: Context,
1876 parent: Inode,
1877 name: &CStr,
1878 mode: u32,
1879 umask: u32,
1880 security_ctx: Option<&CStr>,
1881 ) -> io::Result<Entry> {
1882 let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
1883 let data = self.find_inode(parent)?;
1884
1885 let _ctx = security_ctx
1886 .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
1887 .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
1888 .transpose()?;
1889
1890 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1891 {
1892 let casefold_cache = self.lock_casefold_lookup_caches();
1893 let _scoped_umask = ScopedUmask::new(umask);
1894
1895 // SAFETY: this doesn't modify any memory and we check the return value.
1896 syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
1897 if let Some(mut c) = casefold_cache {
1898 c.insert(data.inode, name);
1899 }
1900 }
1901 self.do_lookup(&data, name)
1902 }
1903
1904 fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
1905 let _trace = fs_trace!(self.tag, "rmdir", parent, name);
1906 let data = self.find_inode(parent)?;
1907 let casefold_cache = self.lock_casefold_lookup_caches();
1908 // TODO(b/278691962): If ascii_casefold is enabled, we need to call
1909 // `get_case_unfolded_name()` to get the actual name to be unlinked.
1910 self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
1911 if let Some(mut c) = casefold_cache {
1912 c.remove(data.inode, name);
1913 }
1914 Ok(())
1915 }
1916
1917 fn readdir(
1918 &self,
1919 _ctx: Context,
1920 inode: Inode,
1921 handle: Handle,
1922 size: u32,
1923 offset: u64,
1924 ) -> io::Result<Self::DirIter> {
1925 let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
1926 let buf = vec![0; size as usize].into_boxed_slice();
1927
1928 if self.zero_message_opendir.load(Ordering::Relaxed) {
1929 let data = self.find_inode(inode)?;
1930 ReadDir::new(&*data, offset as libc::off64_t, buf)
1931 } else {
1932 let data = self.find_handle(handle, inode)?;
1933
1934 let dir = data.file.lock();
1935
1936 ReadDir::new(&*dir, offset as libc::off64_t, buf)
1937 }
1938 }
1939
1940 fn open(
1941 &self,
1942 _ctx: Context,
1943 inode: Inode,
1944 flags: u32,
1945 ) -> io::Result<(Option<Handle>, OpenOptions)> {
1946 if self.zero_message_open.load(Ordering::Relaxed) {
1947 let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
1948 Err(io::Error::from_raw_os_error(libc::ENOSYS))
1949 } else {
1950 let _trace = fs_trace!(self.tag, "open", inode, flags);
1951 self.do_open(inode, flags)
1952 }
1953 }
1954
1955 fn release(
1956 &self,
1957 _ctx: Context,
1958 inode: Inode,
1959 _flags: u32,
1960 handle: Handle,
1961 _flush: bool,
1962 _flock_release: bool,
1963 _lock_owner: Option<u64>,
1964 ) -> io::Result<()> {
1965 if self.zero_message_open.load(Ordering::Relaxed) {
1966 let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
1967 Ok(())
1968 } else {
1969 let _trace = fs_trace!(self.tag, "release", inode, handle);
1970 self.do_release(inode, handle)
1971 }
1972 }
1973
1974 fn chromeos_tmpfile(
1975 &self,
1976 ctx: Context,
1977 parent: Self::Inode,
1978 mode: u32,
1979 umask: u32,
1980 security_ctx: Option<&CStr>,
1981 ) -> io::Result<Entry> {
1982 let _trace = fs_trace!(
1983 self.tag,
1984 "chromeos_tempfile",
1985 parent,
1986 mode,
1987 umask,
1988 security_ctx
1989 );
1990 let data = self.find_inode(parent)?;
1991
1992 let _ctx = security_ctx
1993 .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
1994 .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
1995 .transpose()?;
1996
1997 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
1998
1999 let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2000
2001 // SAFETY: This string is nul-terminated and does not contain any interior nul bytes
2002 let current_dir = unsafe { CStr::from_bytes_with_nul_unchecked(b".\0") };
2003
2004 let fd = {
2005 let _scoped_umask = ScopedUmask::new(umask);
2006
2007 // SAFETY: this doesn't modify any memory and we check the return value.
2008 syscall!(unsafe {
2009 libc::openat64(
2010 data.as_raw_descriptor(),
2011 current_dir.as_ptr(),
2012 tmpflags,
2013 mode,
2014 )
2015 })?
2016 };
2017 // No need to update the casefold cache because we created an anonymous file.
2018
2019 // SAFETY: safe because we just opened this fd.
2020 let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2021
2022 let st = stat(&tmpfile)?;
2023 let path = format!(
2024 "{}/{}",
2025 data.path.clone(),
2026 current_dir.to_str().unwrap_or("<non UTF-8 str>")
2027 );
2028 Ok(self.add_entry(tmpfile, st, tmpflags, path))
2029 }
2030
2031 fn create(
2032 &self,
2033 ctx: Context,
2034 parent: Inode,
2035 name: &CStr,
2036 mode: u32,
2037 flags: u32,
2038 umask: u32,
2039 security_ctx: Option<&CStr>,
2040 ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
2041 let _trace = fs_trace!(
2042 self.tag,
2043 "create",
2044 parent,
2045 name,
2046 mode,
2047 flags,
2048 umask,
2049 security_ctx
2050 );
2051 let data = self.find_inode(parent)?;
2052
2053 let _ctx = security_ctx
2054 .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
2055 .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2056 .transpose()?;
2057
2058 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2059
2060 let create_flags =
2061 (flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
2062
2063 let fd = {
2064 let _scoped_umask = ScopedUmask::new(umask);
2065 let casefold_cache = self.lock_casefold_lookup_caches();
2066
2067 // SAFETY: this doesn't modify any memory and we check the return value. We don't really
2068 // check `flags` because if the kernel can't handle poorly specified flags then we have
2069 // much bigger problems.
2070 // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2071 // `get_case_unfolded_name()` to get the actual name to be created.
2072 let fd = syscall!(unsafe {
2073 libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
2074 })?;
2075 if let Some(mut c) = casefold_cache {
2076 c.insert(parent, name);
2077 }
2078 fd
2079 };
2080
2081 // SAFETY: safe because we just opened this fd.
2082 let file = unsafe { File::from_raw_descriptor(fd) };
2083
2084 let st = stat(&file)?;
2085 let path = format!(
2086 "{}/{}",
2087 data.path.clone(),
2088 name.to_str().unwrap_or("<non UTF-8 str>")
2089 );
2090 let entry = self.add_entry(file, st, create_flags, path);
2091
2092 let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
2093 (None, OpenOptions::KEEP_CACHE)
2094 } else {
2095 self.do_open_at(
2096 data,
2097 name,
2098 entry.inode,
2099 flags & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
2100 )
2101 .map_err(|e| {
2102 // Don't leak the entry.
2103 self.forget(ctx, entry.inode, 1);
2104 e
2105 })?
2106 };
2107 Ok((entry, handle, opts))
2108 }
2109
2110 fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2111 let _trace = fs_trace!(self.tag, "unlink", parent, name);
2112 let data = self.find_inode(parent)?;
2113 let casefold_cache = self.lock_casefold_lookup_caches();
2114 // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2115 // `get_case_unfolded_name()` to get the actual name to be unlinked.
2116 self.do_unlink(&data, name, 0)?;
2117 if let Some(mut c) = casefold_cache {
2118 c.remove(data.inode, name);
2119 }
2120 Ok(())
2121 }
2122
2123 fn read<W: io::Write + ZeroCopyWriter>(
2124 &self,
2125 _ctx: Context,
2126 inode: Inode,
2127 handle: Handle,
2128 mut w: W,
2129 size: u32,
2130 offset: u64,
2131 _lock_owner: Option<u64>,
2132 _flags: u32,
2133 ) -> io::Result<usize> {
2134 if self.zero_message_open.load(Ordering::Relaxed) {
2135 let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2136 let data = self.find_inode(inode)?;
2137
2138 let mut file = data.file.lock();
2139 let mut flags = file.1;
2140 match flags & libc::O_ACCMODE {
2141 libc::O_WRONLY => {
2142 flags &= !libc::O_WRONLY;
2143 flags |= libc::O_RDWR;
2144
2145 // We need to get a readable handle for this file.
2146 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2147 *file = (newfile, flags);
2148 }
2149 libc::O_RDONLY | libc::O_RDWR => {}
2150 _ => panic!("Unexpected flags: {:#x}", flags),
2151 }
2152
2153 w.write_from(&mut file.0, size as usize, offset)
2154 } else {
2155 let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2156 let data = self.find_handle(handle, inode)?;
2157
2158 let mut f = data.file.lock();
2159 w.write_from(&mut f, size as usize, offset)
2160 }
2161 }
2162
2163 fn write<R: io::Read + ZeroCopyReader>(
2164 &self,
2165 _ctx: Context,
2166 inode: Inode,
2167 handle: Handle,
2168 mut r: R,
2169 size: u32,
2170 offset: u64,
2171 _lock_owner: Option<u64>,
2172 _delayed_write: bool,
2173 flags: u32,
2174 ) -> io::Result<usize> {
2175 // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2176 // automatically clear the setuid and setgid bits for us.
2177 let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2178 Some(drop_cap_fsetid()?)
2179 } else {
2180 None
2181 };
2182
2183 if self.zero_message_open.load(Ordering::Relaxed) {
2184 let _trace = fs_trace!(
2185 self.tag,
2186 "write (zero-message)",
2187 inode,
2188 handle,
2189 size,
2190 offset
2191 );
2192
2193 let data = self.find_inode(inode)?;
2194
2195 let mut file = data.file.lock();
2196 let mut flags = file.1;
2197 match flags & libc::O_ACCMODE {
2198 libc::O_RDONLY => {
2199 flags &= !libc::O_RDONLY;
2200 flags |= libc::O_RDWR;
2201
2202 // We need to get a writable handle for this file.
2203 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2204 *file = (newfile, flags);
2205 }
2206 libc::O_WRONLY | libc::O_RDWR => {}
2207 _ => panic!("Unexpected flags: {:#x}", flags),
2208 }
2209
2210 r.read_to(&mut file.0, size as usize, offset)
2211 } else {
2212 let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2213
2214 let data = self.find_handle(handle, inode)?;
2215
2216 let mut f = data.file.lock();
2217 r.read_to(&mut f, size as usize, offset)
2218 }
2219 }
2220
2221 fn getattr(
2222 &self,
2223 _ctx: Context,
2224 inode: Inode,
2225 _handle: Option<Handle>,
2226 ) -> io::Result<(libc::stat64, Duration)> {
2227 let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2228
2229 let data = self.find_inode(inode)?;
2230 self.do_getattr(&data)
2231 }
2232
2233 fn setattr(
2234 &self,
2235 _ctx: Context,
2236 inode: Inode,
2237 attr: libc::stat64,
2238 handle: Option<Handle>,
2239 valid: SetattrValid,
2240 ) -> io::Result<(libc::stat64, Duration)> {
2241 let _trace = fs_trace!(self.tag, "setattr", inode, handle);
2242 let inode_data = self.find_inode(inode)?;
2243
2244 enum Data {
2245 Handle(Arc<HandleData>, RawDescriptor),
2246 ProcPath(CString),
2247 }
2248
2249 // If we have a handle then use it otherwise get a new fd from the inode.
2250 let data = if let Some(handle) = handle.filter(|&h| h != 0) {
2251 let hd = self.find_handle(handle, inode)?;
2252
2253 let fd = hd.file.lock().as_raw_descriptor();
2254 Data::Handle(hd, fd)
2255 } else {
2256 let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
2257 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2258 Data::ProcPath(pathname)
2259 };
2260
2261 if valid.contains(SetattrValid::MODE) {
2262 // SAFETY: this doesn't modify any memory and we check the return value.
2263 syscall!(unsafe {
2264 match data {
2265 Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode),
2266 Data::ProcPath(ref p) => {
2267 libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
2268 }
2269 }
2270 })?;
2271 }
2272
2273 if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
2274 let uid = if valid.contains(SetattrValid::UID) {
2275 attr.st_uid
2276 } else {
2277 // Cannot use -1 here because these are unsigned values.
2278 ::std::u32::MAX
2279 };
2280 let gid = if valid.contains(SetattrValid::GID) {
2281 attr.st_gid
2282 } else {
2283 // Cannot use -1 here because these are unsigned values.
2284 ::std::u32::MAX
2285 };
2286
2287 // SAFETY: this is a constant value that is a nul-terminated string without interior
2288 // nul bytes.
2289 let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
2290
2291 // SAFETY: this doesn't modify any memory and we check the return value.
2292 syscall!(unsafe {
2293 libc::fchownat(
2294 inode_data.as_raw_descriptor(),
2295 empty.as_ptr(),
2296 uid,
2297 gid,
2298 libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
2299 )
2300 })?;
2301 }
2302
2303 if valid.contains(SetattrValid::SIZE) {
2304 syscall!(match data {
2305 Data::Handle(_, fd) => {
2306 // SAFETY: this doesn't modify any memory and we check the return value.
2307 unsafe { libc::ftruncate64(fd, attr.st_size) }
2308 }
2309 _ => {
2310 // There is no `ftruncateat` so we need to get a new fd and truncate it.
2311 let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
2312 // SAFETY: this doesn't modify any memory and we check the return value.
2313 unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
2314 }
2315 })?;
2316 }
2317
2318 if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
2319 let mut tvs = [
2320 libc::timespec {
2321 tv_sec: 0,
2322 tv_nsec: libc::UTIME_OMIT,
2323 },
2324 libc::timespec {
2325 tv_sec: 0,
2326 tv_nsec: libc::UTIME_OMIT,
2327 },
2328 ];
2329
2330 if valid.contains(SetattrValid::ATIME_NOW) {
2331 tvs[0].tv_nsec = libc::UTIME_NOW;
2332 } else if valid.contains(SetattrValid::ATIME) {
2333 tvs[0].tv_sec = attr.st_atime;
2334 tvs[0].tv_nsec = attr.st_atime_nsec;
2335 }
2336
2337 if valid.contains(SetattrValid::MTIME_NOW) {
2338 tvs[1].tv_nsec = libc::UTIME_NOW;
2339 } else if valid.contains(SetattrValid::MTIME) {
2340 tvs[1].tv_sec = attr.st_mtime;
2341 tvs[1].tv_nsec = attr.st_mtime_nsec;
2342 }
2343
2344 // SAFETY: this doesn't modify any memory and we check the return value.
2345 syscall!(unsafe {
2346 match data {
2347 Data::Handle(_, fd) => libc::futimens(fd, tvs.as_ptr()),
2348 Data::ProcPath(ref p) => {
2349 libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
2350 }
2351 }
2352 })?;
2353 }
2354
2355 self.do_getattr(&inode_data)
2356 }
2357
2358 fn rename(
2359 &self,
2360 _ctx: Context,
2361 olddir: Inode,
2362 oldname: &CStr,
2363 newdir: Inode,
2364 newname: &CStr,
2365 flags: u32,
2366 ) -> io::Result<()> {
2367 let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
2368
2369 let old_inode = self.find_inode(olddir)?;
2370 let new_inode = self.find_inode(newdir)?;
2371 {
2372 let casefold_cache = self.lock_casefold_lookup_caches();
2373
2374 // SAFETY: this doesn't modify any memory and we check the return value.
2375 // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
2376 // and we have glibc 2.28.
2377 syscall!(unsafe {
2378 libc::syscall(
2379 libc::SYS_renameat2,
2380 old_inode.as_raw_descriptor(),
2381 oldname.as_ptr(),
2382 new_inode.as_raw_descriptor(),
2383 newname.as_ptr(),
2384 flags,
2385 )
2386 })?;
2387 if let Some(mut c) = casefold_cache {
2388 c.remove(olddir, oldname);
2389 c.insert(newdir, newname);
2390 }
2391 }
2392
2393 Ok(())
2394 }
2395
2396 fn mknod(
2397 &self,
2398 ctx: Context,
2399 parent: Inode,
2400 name: &CStr,
2401 mode: u32,
2402 rdev: u32,
2403 umask: u32,
2404 security_ctx: Option<&CStr>,
2405 ) -> io::Result<Entry> {
2406 let _trace = fs_trace!(
2407 self.tag,
2408 "mknod",
2409 parent,
2410 name,
2411 mode,
2412 rdev,
2413 umask,
2414 security_ctx
2415 );
2416 let data = self.find_inode(parent)?;
2417
2418 let _ctx = security_ctx
2419 .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
2420 .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2421 .transpose()?;
2422
2423 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2424 {
2425 let _scoped_umask = ScopedUmask::new(umask);
2426 let casefold_cache = self.lock_casefold_lookup_caches();
2427
2428 // SAFETY: this doesn't modify any memory and we check the return value.
2429 syscall!(unsafe {
2430 libc::mknodat(
2431 data.as_raw_descriptor(),
2432 name.as_ptr(),
2433 mode as libc::mode_t,
2434 rdev as libc::dev_t,
2435 )
2436 })?;
2437 if let Some(mut c) = casefold_cache {
2438 c.insert(parent, name);
2439 }
2440 }
2441
2442 self.do_lookup(&data, name)
2443 }
2444
2445 fn link(
2446 &self,
2447 _ctx: Context,
2448 inode: Inode,
2449 newparent: Inode,
2450 newname: &CStr,
2451 ) -> io::Result<Entry> {
2452 let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
2453 let data = self.find_inode(inode)?;
2454 let new_inode = self.find_inode(newparent)?;
2455
2456 let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2457 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2458
2459 {
2460 let casefold_cache = self.lock_casefold_lookup_caches();
2461 // SAFETY: this doesn't modify any memory and we check the return value.
2462 syscall!(unsafe {
2463 libc::linkat(
2464 self.proc.as_raw_descriptor(),
2465 path.as_ptr(),
2466 new_inode.as_raw_descriptor(),
2467 newname.as_ptr(),
2468 libc::AT_SYMLINK_FOLLOW,
2469 )
2470 })?;
2471 if let Some(mut c) = casefold_cache {
2472 c.insert(newparent, newname);
2473 }
2474 }
2475
2476 self.do_lookup(&new_inode, newname)
2477 }
2478
2479 fn symlink(
2480 &self,
2481 ctx: Context,
2482 linkname: &CStr,
2483 parent: Inode,
2484 name: &CStr,
2485 security_ctx: Option<&CStr>,
2486 ) -> io::Result<Entry> {
2487 let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
2488 let data = self.find_inode(parent)?;
2489
2490 let _ctx = security_ctx
2491 .filter(|ctx| ctx.to_bytes_with_nul() != UNLABELED_CSTR)
2492 .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2493 .transpose()?;
2494
2495 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2496 {
2497 let casefold_cache = self.lock_casefold_lookup_caches();
2498 // SAFETY: this doesn't modify any memory and we check the return value.
2499 syscall!(unsafe {
2500 libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
2501 })?;
2502 if let Some(mut c) = casefold_cache {
2503 c.insert(parent, name);
2504 }
2505 }
2506
2507 self.do_lookup(&data, name)
2508 }
2509
2510 fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
2511 let _trace = fs_trace!(self.tag, "readlink", inode);
2512 let data = self.find_inode(inode)?;
2513
2514 let mut buf = vec![0; libc::PATH_MAX as usize];
2515
2516 // SAFETY: this is a constant value that is a nul-terminated string without interior nul
2517 // bytes.
2518 let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
2519
2520 // SAFETY: this will only modify the contents of `buf` and we check the return value.
2521 let res = syscall!(unsafe {
2522 libc::readlinkat(
2523 data.as_raw_descriptor(),
2524 empty.as_ptr(),
2525 buf.as_mut_ptr() as *mut libc::c_char,
2526 buf.len(),
2527 )
2528 })?;
2529
2530 buf.resize(res as usize, 0);
2531 Ok(buf)
2532 }
2533
2534 fn flush(
2535 &self,
2536 _ctx: Context,
2537 inode: Inode,
2538 handle: Handle,
2539 _lock_owner: u64,
2540 ) -> io::Result<()> {
2541 let _trace = fs_trace!(self.tag, "flush", inode, handle);
2542 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2543 self.find_inode(inode)?
2544 } else {
2545 self.find_handle(handle, inode)?
2546 };
2547
2548 // SAFETY:
2549 // Since this method is called whenever an fd is closed in the client, we can emulate that
2550 // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
2551 // because this doesn't modify any memory and we check the return values.
2552 unsafe {
2553 let newfd = syscall!(libc::fcntl(
2554 data.as_raw_descriptor(),
2555 libc::F_DUPFD_CLOEXEC,
2556 0
2557 ))?;
2558
2559 syscall!(libc::close(newfd))?;
2560 }
2561 Ok(())
2562 }
2563
2564 fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
2565 if self.zero_message_open.load(Ordering::Relaxed) {
2566 let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
2567 let data = self.find_inode(inode)?;
2568 self.do_fsync(&*data, datasync)
2569 } else {
2570 let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
2571 let data = self.find_handle(handle, inode)?;
2572
2573 let file = data.file.lock();
2574 self.do_fsync(&*file, datasync)
2575 }
2576 }
2577
2578 fn fsyncdir(
2579 &self,
2580 _ctx: Context,
2581 inode: Inode,
2582 datasync: bool,
2583 handle: Handle,
2584 ) -> io::Result<()> {
2585 if self.zero_message_opendir.load(Ordering::Relaxed) {
2586 let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
2587 let data = self.find_inode(inode)?;
2588 self.do_fsync(&*data, datasync)
2589 } else {
2590 let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
2591 let data = self.find_handle(handle, inode)?;
2592
2593 let file = data.file.lock();
2594 self.do_fsync(&*file, datasync)
2595 }
2596 }
2597
2598 fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
2599 let _trace = fs_trace!(self.tag, "access", inode, mask);
2600 let data = self.find_inode(inode)?;
2601
2602 let st = stat(&*data)?;
2603 let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
2604
2605 if mode == libc::F_OK {
2606 // The file exists since we were able to call `stat(2)` on it.
2607 return Ok(());
2608 }
2609
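// Emulate access(2): a permission is granted if the caller is root or if the relevant
// owner, group, or other mode bit is set (execute by root is special-cased below).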
2610 if (mode & libc::R_OK) != 0 {
2611 if ctx.uid != 0
2612 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
2613 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
2614 && st.st_mode & 0o004 == 0
2615 {
2616 return Err(io::Error::from_raw_os_error(libc::EACCES));
2617 }
2618 }
2619
2620 if (mode & libc::W_OK) != 0 {
2621 if ctx.uid != 0
2622 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
2623 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
2624 && st.st_mode & 0o002 == 0
2625 {
2626 return Err(io::Error::from_raw_os_error(libc::EACCES));
2627 }
2628 }
2629
2630 // root can only execute something if it is executable by one of the owner, the group, or
2631 // everyone.
2632 if (mode & libc::X_OK) != 0 {
2633 if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
2634 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
2635 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
2636 && st.st_mode & 0o001 == 0
2637 {
2638 return Err(io::Error::from_raw_os_error(libc::EACCES));
2639 }
2640 }
2641
2642 Ok(())
2643 }
2644
2645 fn setxattr(
2646 &self,
2647 _ctx: Context,
2648 inode: Inode,
2649 name: &CStr,
2650 value: &[u8],
2651 flags: u32,
2652 ) -> io::Result<()> {
2653 let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
2654 // We can't allow the VM to set this xattr because an unprivileged process may use it to set
2655 // a privileged xattr.
2656 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2657 return Err(io::Error::from_raw_os_error(libc::EPERM));
2658 }
2659
2660 let data = self.find_inode(inode)?;
2661 let name = self.rewrite_xattr_name(name);
2662 let file = data.file.lock();
2663 let o_path_file = (file.1 & libc::O_PATH) != 0;
2664 if o_path_file {
2665 // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
2666 // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
2667 // setting the CWD back to the root directory.
2668 let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2669 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2670
2671 syscall!(self.with_proc_chdir(|| {
2672 // SAFETY: this doesn't modify any memory and we check the return value.
2673 unsafe {
2674 libc::setxattr(
2675 path.as_ptr(),
2676 name.as_ptr(),
2677 value.as_ptr() as *const libc::c_void,
2678 value.len() as libc::size_t,
2679 flags as c_int,
2680 )
2681 }
2682 }))?;
2683 } else {
2684 syscall!(
2685 // For regular files and directories, we can just use fsetxattr.
2686 // SAFETY: this doesn't modify any memory and we check the return value.
2687 unsafe {
2688 libc::fsetxattr(
2689 file.0.as_raw_descriptor(),
2690 name.as_ptr(),
2691 value.as_ptr() as *const libc::c_void,
2692 value.len() as libc::size_t,
2693 flags as c_int,
2694 )
2695 }
2696 )?;
2697 }
2698
2699 Ok(())
2700 }
2701
2702 fn getxattr(
2703 &self,
2704 _ctx: Context,
2705 inode: Inode,
2706 name: &CStr,
2707 size: u32,
2708 ) -> io::Result<GetxattrReply> {
2709 let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
2710 // We don't allow the VM to set this xattr so we also pretend there is no value associated
2711 // with it.
2712 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2713 return Err(io::Error::from_raw_os_error(libc::ENODATA));
2714 }
2715
2716 let data = self.find_inode(inode)?;
2717 let name = self.rewrite_xattr_name(name);
2718 let mut buf = vec![0u8; size as usize];
2719
2720 // SAFETY: this will only modify the contents of `buf`.
2721 let res = self.do_getxattr(&data, &name, &mut buf[..])?;
2722 if size == 0 {
2723 Ok(GetxattrReply::Count(res as u32))
2724 } else {
2725 buf.truncate(res);
2726 Ok(GetxattrReply::Value(buf))
2727 }
2728 }
2729
2730 fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
2731 let _trace = fs_trace!(self.tag, "listxattr", inode, size);
2732 let data = self.find_inode(inode)?;
2733
2734 let mut buf = vec![0u8; size as usize];
2735
2736 let file = data.file.lock();
2737 let o_path_file = (file.1 & libc::O_PATH) != 0;
2738 let res = if o_path_file {
2739 // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
2740 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2741 // and then setting the CWD back to the root directory.
2742 let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2743 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2744
2745 // SAFETY: this will only modify `buf` and we check the return value.
2746 syscall!(self.with_proc_chdir(|| unsafe {
2747 libc::listxattr(
2748 path.as_ptr(),
2749 buf.as_mut_ptr() as *mut libc::c_char,
2750 buf.len() as libc::size_t,
2751 )
2752 }))?
2753 } else {
2754 // For regular files and directories, we can just use flistxattr.
2755 // SAFETY: this will only write to `buf` and we check the return value.
2756 syscall!(unsafe {
2757 libc::flistxattr(
2758 file.0.as_raw_descriptor(),
2759 buf.as_mut_ptr() as *mut libc::c_char,
2760 buf.len() as libc::size_t,
2761 )
2762 })?
2763 };
2764
2765 if size == 0 {
2766 Ok(ListxattrReply::Count(res as u32))
2767 } else {
2768 buf.truncate(res as usize);
2769
2770 if self.cfg.rewrite_security_xattrs {
2771 strip_xattr_prefix(&mut buf);
2772 }
2773 Ok(ListxattrReply::Names(buf))
2774 }
2775 }
2776
2777 fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
2778 let _trace = fs_trace!(self.tag, "removexattr", inode, name);
2779 // We don't allow the VM to set this xattr so we also pretend there is no value associated
2780 // with it.
2781 if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
2782 return Err(io::Error::from_raw_os_error(libc::ENODATA));
2783 }
2784
2785 let data = self.find_inode(inode)?;
2786 let name = self.rewrite_xattr_name(name);
2787
2788 let file = data.file.lock();
2789 let o_path_file = (file.1 & libc::O_PATH) != 0;
2790 if o_path_file {
2791 // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
2792 // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
2793 // and then setting the CWD back to the root directory.
2794 let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
2795 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2796
2797 syscall!(self.with_proc_chdir(||
2798 // SAFETY: this doesn't modify any memory and we check the return value.
2799 unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
2800 } else {
2801 // For regular files and directories, we can just use fremovexattr.
2802 syscall!(
2803 // SAFETY: this doesn't modify any memory and we check the return value.
2804 unsafe { libc::fremovexattr(file.0.as_raw_descriptor(), name.as_ptr()) }
2805 )?;
2806 }
2807
2808 Ok(())
2809 }
2810
2811 fn fallocate(
2812 &self,
2813 _ctx: Context,
2814 inode: Inode,
2815 handle: Handle,
2816 mode: u32,
2817 offset: u64,
2818 length: u64,
2819 ) -> io::Result<()> {
2820 let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
2821
2822 let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
2823 let data = self.find_inode(inode)?;
2824
2825 {
2826 // fallocate needs a writable fd
2827 let mut file = data.file.lock();
2828 let mut flags = file.1;
2829 match flags & libc::O_ACCMODE {
2830 libc::O_RDONLY => {
2831 flags &= !libc::O_RDONLY;
2832 flags |= libc::O_RDWR;
2833
2834 // We need to get a writable handle for this file.
2835 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2836 *file = (newfile, flags);
2837 }
2838 libc::O_WRONLY | libc::O_RDWR => {}
2839 _ => panic!("Unexpected flags: {:#x}", flags),
2840 }
2841 }
2842
2843 data
2844 } else {
2845 self.find_handle(handle, inode)?
2846 };
2847
2848 let fd = data.as_raw_descriptor();
2849 // SAFETY: this doesn't modify any memory and we check the return value.
2850 syscall!(unsafe {
2851 libc::fallocate64(
2852 fd,
2853 mode as libc::c_int,
2854 offset as libc::off64_t,
2855 length as libc::off64_t,
2856 )
2857 })?;
2858
2859 Ok(())
2860 }
2861
2862 #[allow(clippy::unnecessary_cast)]
2863 fn ioctl<R: io::Read>(
2864 &self,
2865 ctx: Context,
2866 inode: Inode,
2867 handle: Handle,
2868 _flags: IoctlFlags,
2869 cmd: u32,
2870 _arg: u64,
2871 in_size: u32,
2872 out_size: u32,
2873 r: R,
2874 ) -> io::Result<IoctlReply> {
2875 let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
2876
2877 const GET_ENCRYPTION_POLICY_EX: u32 = FS_IOC_GET_ENCRYPTION_POLICY_EX() as u32;
2878 const GET_FSXATTR: u32 = FS_IOC_FSGETXATTR() as u32;
2879 const SET_FSXATTR: u32 = FS_IOC_FSSETXATTR() as u32;
2880 const GET_FLAGS32: u32 = FS_IOC32_GETFLAGS() as u32;
2881 const SET_FLAGS32: u32 = FS_IOC32_SETFLAGS() as u32;
2882 const GET_FLAGS64: u32 = FS_IOC64_GETFLAGS() as u32;
2883 const SET_FLAGS64: u32 = FS_IOC64_SETFLAGS() as u32;
2884 const ENABLE_VERITY: u32 = FS_IOC_ENABLE_VERITY() as u32;
2885 const MEASURE_VERITY: u32 = FS_IOC_MEASURE_VERITY() as u32;
2886
2887 match cmd {
2888 GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
2889 GET_FSXATTR => {
2890 if out_size < size_of::<fsxattr>() as u32 {
2891 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2892 } else {
2893 self.get_fsxattr(inode, handle)
2894 }
2895 }
2896 SET_FSXATTR => {
2897 if in_size < size_of::<fsxattr>() as u32 {
2898 Err(io::Error::from_raw_os_error(libc::EINVAL))
2899 } else {
2900 self.set_fsxattr(ctx, inode, handle, r)
2901 }
2902 }
2903 GET_FLAGS32 | GET_FLAGS64 => {
2904 if out_size < size_of::<c_int>() as u32 {
2905 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2906 } else {
2907 self.get_flags(inode, handle)
2908 }
2909 }
2910 SET_FLAGS32 | SET_FLAGS64 => {
2911 if in_size < size_of::<c_int>() as u32 {
2912 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2913 } else {
2914 self.set_flags(ctx, inode, handle, r)
2915 }
2916 }
2917 ENABLE_VERITY => {
2918 if in_size < size_of::<fsverity_enable_arg>() as u32 {
2919 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2920 } else {
2921 self.enable_verity(inode, handle, r)
2922 }
2923 }
2924 MEASURE_VERITY => {
2925 if in_size < size_of::<fsverity_digest>() as u32
2926 || out_size < size_of::<fsverity_digest>() as u32
2927 {
2928 Err(io::Error::from_raw_os_error(libc::ENOMEM))
2929 } else {
2930 self.measure_verity(inode, handle, r, out_size)
2931 }
2932 }
2933 _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
2934 }
2935 }
2936
2937 fn copy_file_range(
2938 &self,
2939 ctx: Context,
2940 inode_src: Inode,
2941 handle_src: Handle,
2942 offset_src: u64,
2943 inode_dst: Inode,
2944 handle_dst: Handle,
2945 offset_dst: u64,
2946 length: u64,
2947 flags: u64,
2948 ) -> io::Result<usize> {
2949 let _trace = fs_trace!(
2950 self.tag,
2951 "copy_file_range",
2952 inode_src,
2953 handle_src,
2954 offset_src,
2955 inode_dst,
2956 handle_dst,
2957 offset_dst,
2958 length,
2959 flags
2960 );
2961 // We need to change credentials during a write so that the kernel will remove setuid or
2962 // setgid bits from the file if it was written to by someone other than the owner.
2963 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
2964 let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
2965 if self.zero_message_open.load(Ordering::Relaxed) {
2966 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
2967 } else {
2968 (
2969 self.find_handle(handle_src, inode_src)?,
2970 self.find_handle(handle_dst, inode_dst)?,
2971 )
2972 };
2973
2974 let src = src_data.as_raw_descriptor();
2975 let dst = dst_data.as_raw_descriptor();
2976
2977 Ok(syscall!(
2978 // SAFETY: this call is safe because it doesn't modify any memory and we
2979 // check the return value.
2980 unsafe {
2981 libc::syscall(
2982 libc::SYS_copy_file_range,
2983 src,
2984 &offset_src,
2985 dst,
2986 &offset_dst,
2987 length,
2988 flags,
2989 )
2990 }
2991 )? as usize)
2992 }
2993
2994 fn set_up_mapping<M: Mapper>(
2995 &self,
2996 _ctx: Context,
2997 inode: Self::Inode,
2998 _handle: Self::Handle,
2999 file_offset: u64,
3000 mem_offset: u64,
3001 size: usize,
3002 prot: u32,
3003 mapper: M,
3004 ) -> io::Result<()> {
3005 let _trace = fs_trace!(
3006 self.tag,
3007 "set_up_mapping",
3008 inode,
3009 file_offset,
3010 mem_offset,
3011 size,
3012 prot
3013 );
3014 if !self.cfg.use_dax {
3015 return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3016 }
3017
3018 let read = prot & libc::PROT_READ as u32 != 0;
3019 let write = prot & libc::PROT_WRITE as u32 != 0;
3020 let (mmap_flags, prot) = match (read, write) {
3021 (true, true) => (libc::O_RDWR, Protection::read_write()),
3022 (true, false) => (libc::O_RDONLY, Protection::read()),
3023 // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3024 (false, true) => (libc::O_RDWR, Protection::write()),
3025 (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3026 };
3027
3028 let data = self.find_inode(inode)?;
3029
3030 if self.zero_message_open.load(Ordering::Relaxed) {
3031 let mut file = data.file.lock();
3032 let mut open_flags = file.1;
3033 match (mmap_flags, open_flags & libc::O_ACCMODE) {
3034 (libc::O_RDONLY, libc::O_WRONLY)
3035 | (libc::O_RDWR, libc::O_RDONLY)
3036 | (libc::O_RDWR, libc::O_WRONLY) => {
3037 // We have a read-only or write-only fd and we need to upgrade it.
3038 open_flags &= !libc::O_ACCMODE;
3039 open_flags |= libc::O_RDWR;
3040
3041 let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
3042 *file = (newfile, open_flags);
3043 }
3044 (libc::O_RDONLY, libc::O_RDONLY)
3045 | (libc::O_RDONLY, libc::O_RDWR)
3046 | (libc::O_RDWR, libc::O_RDWR) => {}
3047 (m, o) => panic!(
3048 "Unexpected combination of access flags: ({:#x}, {:#x})",
3049 m, o
3050 ),
3051 }
3052 mapper.map(mem_offset, size, &file.0, file_offset, prot)
3053 } else {
3054 let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3055 mapper.map(mem_offset, size, &file, file_offset, prot)
3056 }
3057 }
3058
3059 fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3060 let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3061 if !self.cfg.use_dax {
3062 return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3063 }
3064
3065 for RemoveMappingOne { moffset, len } in msgs {
3066 mapper.unmap(*moffset, *len)?;
3067 }
3068 Ok(())
3069 }
3070
3071 fn atomic_open(
3072 &self,
3073 ctx: Context,
3074 parent: Self::Inode,
3075 name: &CStr,
3076 mode: u32,
3077 flags: u32,
3078 umask: u32,
3079 security_ctx: Option<&CStr>,
3080 ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3081 let _trace = fs_trace!(
3082 self.tag,
3083 "atomic_open",
3084 parent,
3085 name,
3086 mode,
3087 flags,
3088 umask,
3089 security_ctx
3090 );
3091 let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
3092
3093 // Perform the lookup, but do not create a negative dentry.
3094 let data = self.find_inode(parent)?;
3095
3096 // This lookup serves two purposes:
3097 // 1. If the O_CREAT flag is not set, it retrieves the dentry for the file.
3098 // 2. If the O_CREAT flag is set, it checks whether the file exists.
3099 let res = self.do_lookup_with_casefold_fallback(&data, name);
3100
3101 if let Err(e) = res {
3102 if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3103 // If the file does not exist and O_CREAT is set,
3104 // create the file and set the FILE_CREATED bit in the open options.
3105 let (entry, handler, mut opts) =
3106 self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3107 opts |= OpenOptions::FILE_CREATED;
3108 return Ok((entry, handler, opts));
3109 } else if e.kind() == std::io::ErrorKind::NotFound
3110 && !self.cfg.negative_timeout.is_zero()
3111 {
3112 return Ok((
3113 Entry::new_negative(self.cfg.negative_timeout),
3114 None,
3115 OpenOptions::empty(),
3116 ));
3117 }
3118 return Err(e);
3119 }
3120
3121 // `res` was checked above not to be an error, so this unwrap cannot panic.
3122 let entry = res.unwrap();
3123
3124 if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3125 return Ok((entry, None, OpenOptions::empty()));
3126 }
3127
3128 if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3129 return Err(eexist());
3130 }
3131
3132 let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3133 (None, OpenOptions::KEEP_CACHE)
3134 } else {
3135 let (handler, opts) = self.do_open(entry.inode, flags)?;
3136 (handler, opts)
3137 };
3138 Ok((entry, handler, opts))
3139 }
3140 }
3141
3142 #[cfg(test)]
3143 mod tests {
3144 use std::path::Path;
3145
3146 use named_lock::NamedLock;
3147 use tempfile::TempDir;
3148
3149 use super::*;
3150
3151 const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3152
3153 // Create an instance of `Context` with valid uid, gid, and pid.
3154 // The correct ids are necessary for test cases where new files are created.
3155 fn get_context() -> Context {
3156 // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3157 // guarantees that they can never fail.
3158 let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
3159 // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3160 // guarantees that they can never fail.
3161 let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
3162 let pid = std::process::id() as libc::pid_t;
3163 Context { uid, gid, pid }
3164 }
3165
3166 /// Creates the given directories and files under `temp_dir`.
3167 fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
3168 let path = temp_dir.path();
3169
3170 for d in dirs {
3171 std::fs::create_dir_all(path.join(d)).unwrap();
3172 }
3173
3174 for f in files {
3175 File::create(path.join(f)).unwrap();
3176 }
3177 }
3178
3179 /// Looks up the given `path` in `fs`.
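/// Walks the path one component at a time, starting from the root inode (1).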
3180 fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
3181 let mut inode = 1;
3182 let ctx = get_context();
3183 for name in path.iter() {
3184 let name = CString::new(name.to_str().unwrap()).unwrap();
3185 let ent = match fs.lookup(ctx, inode, &name) {
3186 Ok(ent) => ent,
3187 Err(e) => {
3188 return Err(e);
3189 }
3190 };
3191 inode = ent.inode;
3192 }
3193 Ok(inode)
3194 }
3195
3196 /// Creates a file at the given `path`.
3197 fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3198 let parent = path.parent().unwrap();
3199 let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3200 let parent_inode = lookup(fs, parent)?;
3201 let ctx = get_context();
3202 let security_ctx = None;
3203 fs.create(
3204 ctx,
3205 parent_inode,
3206 &filename,
3207 0o666,
3208 libc::O_RDWR as u32,
3209 0,
3210 security_ctx,
3211 )
3212 .map(|(entry, _, _)| entry)
3213 }
3214
3215 /// Removes a file at the given `path`.
3216 fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3217 let parent = path.parent().unwrap();
3218 let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3219 let parent_inode = lookup(fs, parent)?;
3220 let ctx = get_context();
3221 fs.unlink(ctx, parent_inode, &filename)
3222 }
3223
3224 /// Forgets cache.
3225 fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3226 let ctx = get_context();
3227 let inode = lookup(fs, path)?;
3228 // Pass `u64::MAX` to ensure that the refcount goes to 0 and the inode is forgotten.
3229 fs.forget(ctx, inode, u64::MAX);
3230 Ok(())
3231 }
3232
3233 /// Looks up and opens the given `path` in `fs` via atomic_open.
3234 fn atomic_open(
3235 fs: &PassthroughFs,
3236 path: &Path,
3237 mode: u32,
3238 flags: u32,
3239 umask: u32,
3240 security_ctx: Option<&CStr>,
3241 ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
3242 let mut inode = 1;
3243 let ctx = get_context();
3244
3245 let path_vec: Vec<_> = path.iter().collect();
3246 let vec_len = path_vec.len();
3247
3248 // Look up every path component except the last one. This mirrors VFS behavior, where
3249 // atomic_open is invoked only for the final component.
3250 for name in &path_vec[0..vec_len - 1] {
3251 let name = CString::new(name.to_str().unwrap()).unwrap();
3252 let ent = fs.lookup(ctx, inode, &name)?;
3253 inode = ent.inode;
3254 }
3255
3256 let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
3257
3258 fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
3259 }
3260
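/// Creates a symlink named `name` under the filesystem root that points to `linkname`.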
3261 fn symlink(
3262 fs: &PassthroughFs,
3263 linkname: &Path,
3264 name: &Path,
3265 security_ctx: Option<&CStr>,
3266 ) -> io::Result<Entry> {
3267 let inode = 1;
3268 let ctx = get_context();
3269 let name = CString::new(name.to_str().unwrap()).unwrap();
3270 let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
3271 fs.symlink(ctx, &linkname, inode, &name, security_ctx)
3272 }
3273
3274 #[test]
3275 fn rewrite_xattr_names() {
3276 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3277 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3278 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3279 let _guard = lock.lock().expect("acquire named lock");
3280
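// With `rewrite_security_xattrs` enabled, security.* names (other than security.selinux)
// are remapped into the "user.virtiofs." namespace; other namespaces pass through untouched.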
3281 let cfg = Config {
3282 rewrite_security_xattrs: true,
3283 ..Default::default()
3284 };
3285
3286 let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
3287
3288 // The security.selinux xattr name shouldn't get rewritten.
3289 // SAFETY: trivially safe
3290 let selinux = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.selinux\0") };
3291 assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
3292
3293 // user, trusted, and system should not be changed either.
3294 // SAFETY: trivially safe
3295 let user = unsafe { CStr::from_bytes_with_nul_unchecked(b"user.foobar\0") };
3296 assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
3297 // SAFETY: trivially safe
3298 let trusted = unsafe { CStr::from_bytes_with_nul_unchecked(b"trusted.foobar\0") };
3299 assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
3300 // SAFETY: trivially safe
3301 let system = unsafe { CStr::from_bytes_with_nul_unchecked(b"system.foobar\0") };
3302 assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
3303
3304 // sehash should be re-written.
3305 // SAFETY: trivially safe
3306 let sehash = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.sehash\0") };
3307 assert_eq!(
3308 p.rewrite_xattr_name(sehash).to_bytes(),
3309 b"user.virtiofs.security.sehash"
3310 );
3311 }
3312
3313 #[test]
3314 fn strip_xattr_names() {
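// strip_xattr_prefix() strips the "user.virtiofs." prefix from names in a NUL-separated
// xattr name list and leaves all other names untouched.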
3315 let only_nuls = b"\0\0\0\0\0";
3316 let mut actual = only_nuls.to_vec();
3317 strip_xattr_prefix(&mut actual);
3318 assert_eq!(&actual[..], &only_nuls[..]);
3319
3320 let no_nuls = b"security.sehashuser.virtiofs";
3321 let mut actual = no_nuls.to_vec();
3322 strip_xattr_prefix(&mut actual);
3323 assert_eq!(&actual[..], &no_nuls[..]);
3324
3325 let empty = b"";
3326 let mut actual = empty.to_vec();
3327 strip_xattr_prefix(&mut actual);
3328 assert_eq!(&actual[..], &empty[..]);
3329
3330 let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
3331 let mut actual = no_strippable_names.to_vec();
3332 strip_xattr_prefix(&mut actual);
3333 assert_eq!(&actual[..], &no_strippable_names[..]);
3334
3335 let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
3336 let mut actual = only_strippable_names.to_vec();
3337 strip_xattr_prefix(&mut actual);
3338 assert_eq!(&actual[..], b"security.sehash\0security.wat\0");
3339
3340 let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
3341 let mut actual = mixed_names.to_vec();
3342 strip_xattr_prefix(&mut actual);
3343 let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
3344 assert_eq!(&actual[..], &expected[..]);
3345
3346 let no_nul_with_prefix = b"user.virtiofs.security.sehash";
3347 let mut actual = no_nul_with_prefix.to_vec();
3348 strip_xattr_prefix(&mut actual);
3349 assert_eq!(&actual[..], b"security.sehash");
3350 }
3351
3352 #[test]
3353 fn lookup_files() {
3354 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3355 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3356 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3357 let _guard = lock.lock().expect("acquire named lock");
3358
3359 let temp_dir = TempDir::new().unwrap();
3360 create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
3361
3362 let cfg = Default::default();
3363 let fs = PassthroughFs::new("tag", cfg).unwrap();
3364
3365 let capable = FsOptions::empty();
3366 fs.init(capable).unwrap();
3367
3368 assert!(lookup(&fs, &temp_dir.path().join("a.txt")).is_ok());
3369 assert!(lookup(&fs, &temp_dir.path().join("dir")).is_ok());
3370 assert!(lookup(&fs, &temp_dir.path().join("dir/b.txt")).is_ok());
3371
3372 assert_eq!(
3373 lookup(&fs, &temp_dir.path().join("nonexistent-file"))
3374 .expect_err("file must not exist")
3375 .kind(),
3376 io::ErrorKind::NotFound
3377 );
3378 // "A.txt" is different from "a.txt".
3379 assert_eq!(
3380 lookup(&fs, &temp_dir.path().join("A.txt"))
3381 .expect_err("file must not exist")
3382 .kind(),
3383 io::ErrorKind::NotFound
3384 );
3385 }
3386
3387 #[test]
3388 fn lookup_files_ascii_casefold() {
3389 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3390 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3391 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3392 let _guard = lock.lock().expect("acquire named lock");
3393
3394 let temp_dir = TempDir::new().unwrap();
3395 create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
3396
3397 let cfg = Config {
3398 ascii_casefold: true,
3399 ..Default::default()
3400 };
3401 let fs = PassthroughFs::new("tag", cfg).unwrap();
3402
3403 let capable = FsOptions::empty();
3404 fs.init(capable).unwrap();
3405
3406 // Ensure that "A.txt" is equated with "a.txt".
3407 let a_inode = lookup(&fs, &temp_dir.path().join("a.txt")).expect("a.txt must be found");
3408 assert_eq!(
3409 lookup(&fs, &temp_dir.path().join("A.txt")).expect("A.txt must exist"),
3410 a_inode
3411 );
3412
3413 let dir_inode = lookup(&fs, &temp_dir.path().join("dir")).expect("dir must be found");
3414 assert_eq!(
3415 lookup(&fs, &temp_dir.path().join("DiR")).expect("DiR must exist"),
3416 dir_inode
3417 );
3418
3419 let b_inode =
3420 lookup(&fs, &temp_dir.path().join("dir/b.txt")).expect("dir/b.txt must be found");
3421 assert_eq!(
3422 lookup(&fs, &temp_dir.path().join("dIr/B.TxT")).expect("dIr/B.TxT must exist"),
3423 b_inode
3424 );
3425
3426 assert_eq!(
3427 lookup(&fs, &temp_dir.path().join("nonexistent-file"))
3428 .expect_err("file must not exist")
3429 .kind(),
3430 io::ErrorKind::NotFound
3431 );
3432 }
3433
3434 fn test_create_and_remove(ascii_casefold: bool) {
3435 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3436 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3437 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3438 let _guard = lock.lock().expect("acquire named lock");
3439
3440 let temp_dir = TempDir::new().unwrap();
3441 let timeout = Duration::from_millis(10);
3442 let cfg = Config {
3443 timeout,
3444 cache_policy: CachePolicy::Auto,
3445 ascii_casefold,
3446 ..Default::default()
3447 };
3448 let fs = PassthroughFs::new("tag", cfg).unwrap();
3449
3450 let capable = FsOptions::empty();
3451 fs.init(capable).unwrap();
3452
3453 // Create a.txt and b.txt.
3454 let a_path = temp_dir.path().join("a.txt");
3455 let b_path = temp_dir.path().join("b.txt");
3456 let a_entry = create(&fs, &a_path).expect("create a.txt");
3457 let b_entry = create(&fs, &b_path).expect("create b.txt");
3458 assert_eq!(
3459 a_entry.inode,
3460 lookup(&fs, &a_path).expect("lookup a.txt"),
3461 "Created file 'a.txt' must be looked up"
3462 );
3463 assert_eq!(
3464 b_entry.inode,
3465 lookup(&fs, &b_path).expect("lookup b.txt"),
3466 "Created file 'b.txt' must be looked up"
3467 );
3468
3469 // Remove a.txt only
3470 unlink(&fs, &a_path).expect("Remove");
3471 assert_eq!(
3472 lookup(&fs, &a_path)
3473 .expect_err("file must not exist")
3474 .kind(),
3475 io::ErrorKind::NotFound,
3476 "a.txt must be removed"
3477 );
3478 // "A.TXT" must not be found regardless of whether casefold is enabled or not.
3479 let upper_a_path = temp_dir.path().join("A.TXT");
3480 assert_eq!(
3481 lookup(&fs, &upper_a_path)
3482 .expect_err("file must not exist")
3483 .kind(),
3484 io::ErrorKind::NotFound,
3485 "A.txt must be removed"
3486 );
3487
3488 // Check that the host file system no longer has a.txt but still has b.txt.
3489 assert!(!a_path.exists(), "a.txt must be removed");
3490 assert!(b_path.exists(), "b.txt must exist");
3491 }
3492
3493 #[test]
3494 fn create_and_remove() {
3495 test_create_and_remove(false /* casefold */);
3496 }
3497
3498 #[test]
3499 fn create_and_remove_casefold() {
3500 test_create_and_remove(true /* casefold */);
3501 }
3502
3503 fn test_create_and_forget(ascii_casefold: bool) {
3504 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3505 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3506 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3507 let _guard = lock.lock().expect("acquire named lock");
3508
3509 let temp_dir = TempDir::new().unwrap();
3510 let timeout = Duration::from_millis(10);
3511 let cfg = Config {
3512 timeout,
3513 cache_policy: CachePolicy::Auto,
3514 ascii_casefold,
3515 ..Default::default()
3516 };
3517 let fs = PassthroughFs::new("tag", cfg).unwrap();
3518
3519 let capable = FsOptions::empty();
3520 fs.init(capable).unwrap();
3521
3522 // Create a.txt.
3523 let a_path = temp_dir.path().join("a.txt");
3524 let a_entry = create(&fs, &a_path).expect("create a.txt");
3525 assert_eq!(
3526 a_entry.inode,
3527 lookup(&fs, &a_path).expect("lookup a.txt"),
3528 "Created file 'a.txt' must be looked up"
3529 );
3530
3531 // Forget a.txt's inode from PassthroughFs's internal cache.
3532 forget(&fs, &a_path).expect("forget a.txt");
3533
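// After forget(), the next lookup must allocate a new inode number; with casefold enabled,
// "a.txt" and "A.TXT" must still resolve to the same (new) inode.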
3534 if ascii_casefold {
3535 let upper_a_path = temp_dir.path().join("A.TXT");
3536 let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
3537 assert_ne!(
3538 a_entry.inode, new_a_inode,
3539 "inode must be changed after forget()"
3540 );
3541 assert_eq!(
3542 new_a_inode,
3543 lookup(&fs, &a_path).expect("lookup a.txt"),
3544 "inode must be same for a.txt and A.TXT"
3545 );
3546 } else {
3547 assert_ne!(
3548 a_entry.inode,
3549 lookup(&fs, &a_path).expect("lookup a.txt"),
3550 "inode must be changed after forget()"
3551 );
3552 }
3553 }
3554
3555 #[test]
3556 fn create_and_forget() {
3557 test_create_and_forget(false /* ascii_casefold */);
3558 }
3559
3560 #[test]
3561 fn create_and_forget_casefold() {
3562 test_create_and_forget(true /* ascii_casefold */);
3563 }
3564
3565 #[test]
3566 fn casefold_lookup_cache() {
3567 let temp_dir = TempDir::new().unwrap();
3568 // Prepare `a.txt` before starting the test.
3569 create_test_data(&temp_dir, &[], &["a.txt"]);
3570
3571 let cfg = Config {
3572 ascii_casefold: true,
3573 ..Default::default()
3574 };
3575 let fs = PassthroughFs::new("tag", cfg).unwrap();
3576
3577 let capable = FsOptions::empty();
3578 fs.init(capable).unwrap();
3579
3580 let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");
3581
3582 // Since `a.txt` exists, "A.TXT" must exist.
3583 let large_a_path = temp_dir.path().join("A.TXT");
3584 // Looking up "A.TXT" must create a CasefoldCache entry.
3585 lookup(&fs, &large_a_path).expect("A.TXT must exist");
3586 assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));
3587
3588 // Create b.txt.
3589 let b_path = temp_dir.path().join("b.txt");
3590 create(&fs, &b_path).expect("create b.txt");
3591 // Then b.txt must exist in the cache.
3592 assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
3593 // When removing b.txt, it must be removed from the cache as well.
3594 unlink(&fs, &b_path).expect("remove b.txt");
3595 assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
3596 }
3597
3598 #[test]
3599 fn lookup_negative_cache() {
3600 let temp_dir = TempDir::new().unwrap();
3601 // Start with an empty directory; a.txt is created later in the test.
3602 create_test_data(&temp_dir, &[], &[]);
3603
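// With a non-zero negative_timeout, a lookup of a missing entry succeeds with a cacheable
// entry whose inode is 0 instead of failing with ENOENT.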
3604 let cfg = Config {
3605 negative_timeout: Duration::from_secs(5),
3606 ..Default::default()
3607 };
3608 let fs = PassthroughFs::new("tag", cfg).unwrap();
3609
3610 let capable = FsOptions::empty();
3611 fs.init(capable).unwrap();
3612
3613 let a_path = temp_dir.path().join("a.txt");
3614 // a.txt doesn't exist yet.
3615 // Since negative_timeout is enabled, success with inode=0 is expected.
3616 assert_eq!(
3617 0,
3618 lookup(&fs, &a_path).expect("lookup a.txt"),
3619 "Entry with inode=0 is expected for non-existing file 'a.txt'"
3620 );
3621 // Create a.txt
3622 let a_entry = create(&fs, &a_path).expect("create a.txt");
3623 assert_eq!(
3624 a_entry.inode,
3625 lookup(&fs, &a_path).expect("lookup a.txt"),
3626 "Created file 'a.txt' must be looked up"
3627 );
3628 // Remove a.txt
3629 unlink(&fs, &a_path).expect("Remove");
3630 assert_eq!(
3631 0,
3632 lookup(&fs, &a_path).expect("lookup a.txt"),
3633 "Entry with inode=0 is expected for the removed file 'a.txt'"
3634 );
3635 }

3636 #[test]
3637 fn test_atomic_open_existing_file() {
3638 atomic_open_existing_file(false);
3639 }
3640
3641 #[test]
3642 fn test_atomic_open_existing_file_zero_message() {
3643 atomic_open_existing_file(true);
3644 }
3645
3646 fn atomic_open_existing_file(zero_message_open: bool) {
3647 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3648 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3649 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3650 let _guard = lock.lock().expect("acquire named lock");
3651
3652 let temp_dir = TempDir::new().unwrap();
3653 create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
3654
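// Zero-message open takes effect only when the guest advertises ZERO_MESSAGE_OPEN and the
// cache policy is Always; with CachePolicy::Auto a real file handle is returned.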
3655 let cache_policy = match zero_message_open {
3656 true => CachePolicy::Always,
3657 false => CachePolicy::Auto,
3658 };
3659
3660 let cfg = Config {
3661 cache_policy,
3662 ..Default::default()
3663 };
3664 let fs = PassthroughFs::new("tag", cfg).unwrap();
3665
3666 let capable = FsOptions::ZERO_MESSAGE_OPEN;
3667 fs.init(capable).unwrap();
3668
3669 // atomic_open with flag O_RDWR should return a positive dentry and a file handler
// (no handler when zero-message open is enabled).
3670 let res = atomic_open(
3671 &fs,
3672 &temp_dir.path().join("a.txt"),
3673 0o666,
3674 libc::O_RDWR as u32,
3675 0,
3676 None,
3677 );
3678 assert!(res.is_ok());
3679 let (entry, handler, open_options) = res.unwrap();
3680 assert_ne!(entry.inode, 0);
3681
3682 if zero_message_open {
3683 assert!(handler.is_none());
3684 assert_eq!(open_options, OpenOptions::KEEP_CACHE);
3685 } else {
3686 assert!(handler.is_some());
3687 assert_ne!(
3688 open_options & OpenOptions::FILE_CREATED,
3689 OpenOptions::FILE_CREATED
3690 );
3691 }
3692
3693 // atomic_open with flags O_RDWR | O_CREAT on an existing file should return a positive
// dentry and a file handler (no handler when zero-message open is enabled).
3694 let res = atomic_open(
3695 &fs,
3696 &temp_dir.path().join("dir/b.txt"),
3697 0o666,
3698 (libc::O_RDWR | libc::O_CREAT) as u32,
3699 0,
3700 None,
3701 );
3702 assert!(res.is_ok());
3703 let (entry, handler, open_options) = res.unwrap();
3704 assert_ne!(entry.inode, 0);
3705
3706 if zero_message_open {
3707 assert!(handler.is_none());
3708 assert_eq!(open_options, OpenOptions::KEEP_CACHE);
3709 } else {
3710 assert!(handler.is_some());
3711 assert_ne!(
3712 open_options & OpenOptions::FILE_CREATED,
3713 OpenOptions::FILE_CREATED
3714 );
3715 }
3716
3717 // atomic_open with flags O_RDWR | O_CREAT | O_EXCL on an existing file should fail
3718 // with EEXIST (AlreadyExists).
3719 let res = atomic_open(
3720 &fs,
3721 &temp_dir.path().join("dir/c.txt"),
3722 0o666,
3723 (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
3724 0,
3725 None,
3726 );
3727 assert!(res.is_err());
3728 let err_kind = res.unwrap_err().kind();
3729 assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
3730 }
3731
3732 #[test]
3733 fn test_atomic_open_non_existing_file() {
3734 atomic_open_non_existing_file(false);
3735 }
3736
3737 #[test]
3738 fn test_atomic_open_non_existing_file_zero_message() {
3739 atomic_open_non_existing_file(true);
3740 }
3741
3742 fn atomic_open_non_existing_file(zero_message_open: bool) {
3743 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3744 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3745 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3746 let _guard = lock.lock().expect("acquire named lock");
3747
3748 let temp_dir = TempDir::new().unwrap();
3749
3750 let cache_policy = match zero_message_open {
3751 true => CachePolicy::Always,
3752 false => CachePolicy::Auto,
3753 };
3754
3755 let cfg = Config {
3756 cache_policy,
3757 ..Default::default()
3758 };
3759 let fs = PassthroughFs::new("tag", cfg).unwrap();
3760
3761 let capable = FsOptions::ZERO_MESSAGE_OPEN;
3762 fs.init(capable).unwrap();
3763
3764 // atomic_open with flag O_RDWR on a non-existent file should fail with ENOENT (NotFound).
3765 let res = atomic_open(
3766 &fs,
3767 &temp_dir.path().join("a.txt"),
3768 0o666,
3769 libc::O_RDWR as u32,
3770 0,
3771 None,
3772 );
3773 assert!(res.is_err());
3774 let err_kind = res.unwrap_err().kind();
3775 assert_eq!(err_kind, io::ErrorKind::NotFound);
3776
3777 // atomic_open with flags O_RDWR | O_CREAT should create the file and return a positive
// dentry and a file handler (no handler when zero-message open is enabled).
3778 let res = atomic_open(
3779 &fs,
3780 &temp_dir.path().join("b.txt"),
3781 0o666,
3782 (libc::O_RDWR | libc::O_CREAT) as u32,
3783 0,
3784 None,
3785 );
3786 assert!(res.is_ok());
3787 let (entry, handler, open_options) = res.unwrap();
3788 assert_ne!(entry.inode, 0);
3789
3790 if zero_message_open {
3791 assert!(handler.is_none());
3792 assert_eq!(
3793 open_options & OpenOptions::KEEP_CACHE,
3794 OpenOptions::KEEP_CACHE
3795 );
3796 } else {
3797 assert!(handler.is_some());
3798 }
3799 assert_eq!(
3800 open_options & OpenOptions::FILE_CREATED,
3801 OpenOptions::FILE_CREATED
3802 );
3803 }
3804
3805 #[test]
3806 fn atomic_open_symbol_link() {
3807 // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire a
3808 // `NamedLock` before starting each unit test that creates a `PassthroughFs` instance.
3809 let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3810 let _guard = lock.lock().expect("acquire named lock");
3811
3812 let temp_dir = TempDir::new().unwrap();
3813 create_test_data(&temp_dir, &["dir"], &["a.txt"]);
3814
3815 let cfg = Default::default();
3816 let fs = PassthroughFs::new("tag", cfg).unwrap();
3817
3818 let capable = FsOptions::empty();
3819 fs.init(capable).unwrap();
3820
3821 // Atomic-open the symlink's destination file.
3822 let res_dst = atomic_open(
3823 &fs,
3824 &temp_dir.path().join("a.txt"),
3825 0o666,
3826 libc::O_RDWR as u32,
3827 0,
3828 None,
3829 );
3830 assert!(res_dst.is_ok());
3831 let (entry_dst, handler_dst, _) = res_dst.unwrap();
3832 assert_ne!(entry_dst.inode, 0);
3833 assert!(handler_dst.is_some());
3834
3835 // Create a depth-1 symbolic link pointing at a.txt.
3836 let sym1_res = symlink(
3837 &fs,
3838 &temp_dir.path().join("a.txt"),
3839 &temp_dir.path().join("blink"),
3840 None,
3841 );
3842 assert!(sym1_res.is_ok());
3843 let sym1_entry = sym1_res.unwrap();
3844 assert_ne!(sym1_entry.inode, 0);
3845
3846 // atomic_open on the symlink should return the dentry with no handler.
3847 let res = atomic_open(
3848 &fs,
3849 &temp_dir.path().join("blink"),
3850 0o666,
3851 libc::O_RDWR as u32,
3852 0,
3853 None,
3854 );
3855 assert!(res.is_ok());
3856 let (entry, handler, open_options) = res.unwrap();
3857 assert_eq!(entry.inode, sym1_entry.inode);
3858 assert!(handler.is_none());
3859 assert_eq!(open_options, OpenOptions::empty());
3860
3861 // delete link destination
3862 unlink(&fs, &temp_dir.path().join("a.txt")).expect("Remove");
3863 assert_eq!(
3864 lookup(&fs, &temp_dir.path().join("a.txt"))
3865 .expect_err("file must not exist")
3866 .kind(),
3867 io::ErrorKind::NotFound,
3868 "a.txt must be removed"
3869 );
3870
3871 // After the link destination is removed, atomic_open should still return a valid dentry.
3872 let res = atomic_open(
3873 &fs,
3874 &temp_dir.path().join("blink"),
3875 0o666,
3876 libc::O_RDWR as u32,
3877 0,
3878 None,
3879 );
3880 assert!(res.is_ok());
3881 let (entry, handler, open_options) = res.unwrap();
3882 assert_eq!(entry.inode, sym1_entry.inode);
3883 assert!(handler.is_none());
3884 assert_eq!(open_options, OpenOptions::empty());
3885 }
3886 }
3887