// clap/src/osstringext.rs

use std::ffi::OsStr;
#[cfg(not(any(target_os = "windows", target_arch = "wasm32")))]
use std::os::unix::ffi::OsStrExt;
#[cfg(any(target_os = "windows", target_arch = "wasm32"))]
use INVALID_UTF8;

/// Supplies `from_bytes`/`as_bytes` for targets (Windows and wasm32) on which
/// this file does not import `std::os::unix::ffi::OsStrExt`.
#[cfg(any(target_os = "windows", target_arch = "wasm32"))]
pub trait OsStrExt3 {
    /// Borrows `b` as a `Self` reference.
    fn from_bytes(b: &[u8]) -> &Self;
    /// Borrows `self` as a byte slice.
    fn as_bytes(&self) -> &[u8];
}

/// Byte-oriented helpers for `OsStr`; the implementation for `OsStr` lives in
/// this module.
#[doc(hidden)]
pub trait OsStrExt2 {
    /// Returns `true` if `self` begins with `s`.
    ///
    /// On Windows the check compares UTF-16 code units and is `false` whenever
    /// `s` is not valid UTF-8.
    fn starts_with(&self, s: &[u8]) -> bool;
    /// Splits around the first occurrence of byte `b`, which is left out of
    /// both halves. If `b` does not occur, the first half is all of `self` and
    /// the second half is empty.
    fn split_at_byte(&self, b: u8) -> (&OsStr, &OsStr);
    /// Splits at byte index `i`: the bytes before `i`, then the bytes from `i`
    /// onwards.
    fn split_at(&self, i: usize) -> (&OsStr, &OsStr);
    /// Removes every leading occurrence of byte `b`.
    fn trim_left_matches(&self, b: u8) -> &OsStr;
    /// Returns `true` if byte `b` occurs anywhere in `self`.
    fn contains_byte(&self, b: u8) -> bool;
    /// Returns an iterator over the pieces of `self` separated by byte `b`.
    fn split(&self, b: u8) -> OsSplit;
}

// Panic-free prefix test for Windows, where an OsStr may contain invalid
// Unicode.
//
// `prefix` is decoded as UTF-8 and re-encoded as UTF-16; the resulting code
// units are then compared, one by one, against the front of `osstr`. A
// `prefix` that is not valid UTF-8 has no meaningful UTF-16 form, so it never
// matches.
#[cfg(target_os = "windows")]
fn windows_osstr_starts_with(osstr: &OsStr, prefix: &[u8]) -> bool {
    use std::os::windows::ffi::OsStrExt;
    match std::str::from_utf8(prefix) {
        Ok(prefix_str) => {
            let mut haystack = osstr.encode_wide();
            // Every prefix unit must equal the next unit of `osstr`. If
            // `osstr` runs out first, `next()` is `None` and the check fails.
            // An exhausted prefix means everything matched.
            prefix_str
                .encode_utf16()
                .all(|unit| haystack.next() == Some(unit))
        }
        Err(_) => false,
    }
}

#[test]
#[cfg(target_os = "windows")]
fn test_windows_osstr_starts_with() {
    use std::ffi::OsString;
    use std::os::windows::ffi::OsStringExt;

    // Builds an OsString by zero-extending each byte to one UTF-16 code unit.
    fn from_ascii(ascii: &[u8]) -> OsString {
        let u16_vec: Vec<u16> = ascii.iter().map(|&c| u16::from(c)).collect();
        OsString::from_wide(&u16_vec)
    }

    // Test all the basic cases.
    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abc"));
    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abcdef"));
    assert!(!windows_osstr_starts_with(&from_ascii(b"abcdef"), b"def"));
    assert!(!windows_osstr_starts_with(&from_ascii(b"abc"), b"abcd"));

    // Test the case where the candidate prefix is not valid UTF-8. The helper
    // above widens \xff to the single code unit 0x00FF, which is valid UTF-16,
    // but a standalone \xff byte is not valid UTF-8. Thus although these
    // strings look identical, they do not match.
    assert!(!windows_osstr_starts_with(&from_ascii(b"\xff"), b"\xff"));

    // Test the case where the OsString is not valid UTF-16. It should still be
    // possible to match the valid characters at the front.
    //
    // UTF-16 surrogate characters are only valid in pairs. Including one on
    // the end by itself makes this invalid UTF-16.
    let surrogate_char: u16 = 0xDC00;
    // Never mutated, so no `mut` (the original binding triggered `unused_mut`).
    let invalid_unicode =
        OsString::from_wide(&['a' as u16, 'b' as u16, 'c' as u16, surrogate_char]);
    assert!(
        invalid_unicode.to_str().is_none(),
        "This string is invalid Unicode, and conversion to &str should fail.",
    );
    assert!(windows_osstr_starts_with(&invalid_unicode, b"abc"));
    assert!(!windows_osstr_starts_with(&invalid_unicode, b"abcd"));
}

#[cfg(any(target_os = "windows", target_arch = "wasm32"))]
impl OsStrExt3 for OsStr {
    /// Reinterprets `b` as an `OsStr` in place; the bytes are neither copied
    /// nor checked.
    fn from_bytes(b: &[u8]) -> &Self {
        // NOTE(review): presumably relies on `&OsStr` and `&[u8]` having the
        // same representation on these targets -- confirm before changing.
        unsafe { std::mem::transmute::<&[u8], &Self>(b) }
    }

    /// Returns the UTF-8 bytes of `self`.
    ///
    /// # Panics
    ///
    /// Panics with the `INVALID_UTF8` message when `self` cannot be viewed as
    /// a `&str`.
    fn as_bytes(&self) -> &[u8] {
        self.to_str().expect(INVALID_UTF8).as_bytes()
    }
}

impl OsStrExt2 for OsStr {
    /// Returns `true` if `self` begins with `s`.
    fn starts_with(&self, s: &[u8]) -> bool {
        // On Windows, the as_bytes() method will panic if the OsStr contains
        // invalid Unicode. To avoid this, we use a Windows-specific
        // starts-with function that doesn't rely on as_bytes(). This is
        // necessary for Windows command line applications to handle
        // non-Unicode arguments successfully. This allows common cases like
        // `clap.exe [invalid]` to succeed, though cases that require string
        // splitting will still fail, like `clap.exe --arg=[invalid]`. Note
        // that this entire module is replaced in Clap 3.x, so this workaround
        // is specific to the 2.x branch.
        //
        // Exactly one of the two blocks below survives cfg-stripping and
        // becomes the tail expression, so neither target sees dead code.
        #[cfg(target_os = "windows")]
        {
            windows_osstr_starts_with(self, s)
        }
        #[cfg(not(target_os = "windows"))]
        {
            self.as_bytes().starts_with(s)
        }
    }

    /// Returns `true` if `byte` occurs anywhere in `self`.
    fn contains_byte(&self, byte: u8) -> bool {
        self.as_bytes().contains(&byte)
    }

    /// Splits around the first occurrence of `byte`, dropping that byte.
    ///
    /// When `byte` is absent the result is `(self, empty)`.
    fn split_at_byte(&self, byte: u8) -> (&OsStr, &OsStr) {
        // Fetch the bytes once; on Windows/wasm32 every as_bytes() call
        // re-validates the whole string.
        let bytes = self.as_bytes();
        match bytes.iter().position(|b| *b == byte) {
            Some(i) => (
                OsStr::from_bytes(&bytes[..i]),
                OsStr::from_bytes(&bytes[i + 1..]),
            ),
            None => (self, OsStr::from_bytes(&bytes[bytes.len()..])),
        }
    }

    /// Strips every leading occurrence of `byte`.
    ///
    /// A string made up solely of `byte` (or an empty one) yields an empty
    /// `OsStr`.
    fn trim_left_matches(&self, byte: u8) -> &OsStr {
        let bytes = self.as_bytes();
        // Index of the first byte to keep, or the end if nothing is kept.
        let start = bytes
            .iter()
            .position(|b| *b != byte)
            .unwrap_or(bytes.len());
        OsStr::from_bytes(&bytes[start..])
    }

    /// Splits at byte index `i` into the bytes before `i` and the bytes from
    /// `i` onwards.
    ///
    /// # Panics
    ///
    /// Panics if `i` is greater than the length in bytes.
    fn split_at(&self, i: usize) -> (&OsStr, &OsStr) {
        let bytes = self.as_bytes();
        (OsStr::from_bytes(&bytes[..i]), OsStr::from_bytes(&bytes[i..]))
    }

    /// Returns an iterator over the pieces of `self` separated by `b`,
    /// starting at the first byte.
    fn split(&self, b: u8) -> OsSplit {
        OsSplit {
            sep: b,
            val: self.as_bytes(),
            pos: 0,
        }
    }
}

/// Iterator state for walking the bytes of an `OsStr` one separator-delimited
/// piece at a time.
#[doc(hidden)]
#[derive(Clone, Debug)]
pub struct OsSplit<'a> {
    // Byte that ends one piece and starts the next.
    sep: u8,
    // The bytes being split.
    val: &'a [u8],
    // Index into `val` at which the next piece begins.
    pos: usize,
}

impl<'a> Iterator for OsSplit<'a> {
    type Item = &'a OsStr;

    /// Yields the next piece of `val` up to (but not including) the next
    /// `sep`, or `None` once `pos` has reached the end of `val`.
    ///
    /// A separator that is the very last byte ends iteration without yielding
    /// a trailing empty piece.
    fn next(&mut self) -> Option<&'a OsStr> {
        debugln!("OsSplit::next: self={:?}", self);
        let bytes: &'a [u8] = self.val;
        let sep = self.sep;
        let start = self.pos;
        if start == bytes.len() {
            return None;
        }
        let rest = &bytes[start..];
        match rest.iter().position(|b| *b == sep) {
            Some(offset) => {
                // Resume just past the separator on the next call.
                self.pos = start + offset + 1;
                Some(OsStr::from_bytes(&rest[..offset]))
            }
            None => {
                // No separator left: hand out the remainder and finish.
                self.pos = bytes.len();
                Some(OsStr::from_bytes(rest))
            }
        }
    }
}