//! The Windows command line is just a string
//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
//!
//! This module implements the parsing necessary to turn that string into a list of arguments.
5 
6 #[cfg(test)]
7 mod tests;
8 
9 use crate::ffi::{OsStr, OsString};
10 use crate::fmt;
11 use crate::io;
12 use crate::num::NonZeroU16;
13 use crate::os::windows::prelude::*;
14 use crate::path::{Path, PathBuf};
15 use crate::sys::path::get_long_path;
16 use crate::sys::process::ensure_no_nuls;
17 use crate::sys::windows::os::current_exe;
18 use crate::sys::{c, to_u16s};
19 use crate::sys_common::wstr::WStrUnits;
20 use crate::sys_common::AsInner;
21 use crate::vec;
22 
23 use crate::iter;
24 
/// Converts `n` into a [`NonZeroU16`] in a `const` context.
///
/// This is the const equivalent to `NonZeroU16::new(n).unwrap()`.
///
/// FIXME: This can be removed once `Option::unwrap` is stably const.
/// See the `const_option` feature (#67441).
///
/// # Panics
///
/// Panics if `n` is zero. When evaluated while initialising a `const` item
/// the panic surfaces as a compile-time error instead.
const fn non_zero_u16(n: u16) -> NonZeroU16 {
    match NonZeroU16::new(n) {
        Some(n) => n,
        None => panic!("called `unwrap` on a `None` value"),
    }
}
35 
args() -> Args36 pub fn args() -> Args {
37     // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
38     // string so it's safe for `WStrUnits` to use.
39     unsafe {
40         let lp_cmd_line = c::GetCommandLineW();
41         let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
42             current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
43         });
44 
45         Args { parsed_args_list: parsed_args_list.into_iter() }
46     }
47 }
48 
49 /// Implements the Windows command-line argument parsing algorithm.
50 ///
51 /// Microsoft's documentation for the Windows CLI argument format can be found at
52 /// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
53 ///
54 /// A more in-depth explanation is here:
55 /// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
56 ///
57 /// Windows includes a function to do command line parsing in shell32.dll.
58 /// However, this is not used for two reasons:
59 ///
60 /// 1. Linking with that DLL causes the process to be registered as a GUI application.
61 /// GUI applications add a bunch of overhead, even if no windows are drawn. See
62 /// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
63 ///
64 /// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
65 ///
66 /// This function was tested for equivalence to the C/C++ parsing rules using an
67 /// extensive test suite available at
68 /// <https://github.com/ChrisDenton/winarg/tree/std>.
parse_lp_cmd_line<'a, F: Fn() -> OsString>( lp_cmd_line: Option<WStrUnits<'a>>, exe_name: F, ) -> Vec<OsString>69 fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
70     lp_cmd_line: Option<WStrUnits<'a>>,
71     exe_name: F,
72 ) -> Vec<OsString> {
73     const BACKSLASH: NonZeroU16 = non_zero_u16(b'\\' as u16);
74     const QUOTE: NonZeroU16 = non_zero_u16(b'"' as u16);
75     const TAB: NonZeroU16 = non_zero_u16(b'\t' as u16);
76     const SPACE: NonZeroU16 = non_zero_u16(b' ' as u16);
77 
78     let mut ret_val = Vec::new();
79     // If the cmd line pointer is null or it points to an empty string then
80     // return the name of the executable as argv[0].
81     if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
82         ret_val.push(exe_name());
83         return ret_val;
84     }
85     let mut code_units = lp_cmd_line.unwrap();
86 
87     // The executable name at the beginning is special.
88     let mut in_quotes = false;
89     let mut cur = Vec::new();
90     for w in &mut code_units {
91         match w {
92             // A quote mark always toggles `in_quotes` no matter what because
93             // there are no escape characters when parsing the executable name.
94             QUOTE => in_quotes = !in_quotes,
95             // If not `in_quotes` then whitespace ends argv[0].
96             SPACE | TAB if !in_quotes => break,
97             // In all other cases the code unit is taken literally.
98             _ => cur.push(w.get()),
99         }
100     }
101     // Skip whitespace.
102     code_units.advance_while(|w| w == SPACE || w == TAB);
103     ret_val.push(OsString::from_wide(&cur));
104 
105     // Parse the arguments according to these rules:
106     // * All code units are taken literally except space, tab, quote and backslash.
107     // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
108     // treated as a single separator.
109     // * A space or tab `in_quotes` is taken literally.
110     // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
111     // * A quote can be escaped if preceded by an odd number of backslashes.
112     // * If any number of backslashes is immediately followed by a quote then the number of
113     // backslashes is halved (rounding down).
114     // * Backslashes not followed by a quote are all taken literally.
115     // * If `in_quotes` then a quote can also be escaped using another quote
116     // (i.e. two consecutive quotes become one literal quote).
117     let mut cur = Vec::new();
118     let mut in_quotes = false;
119     while let Some(w) = code_units.next() {
120         match w {
121             // If not `in_quotes`, a space or tab ends the argument.
122             SPACE | TAB if !in_quotes => {
123                 ret_val.push(OsString::from_wide(&cur[..]));
124                 cur.truncate(0);
125 
126                 // Skip whitespace.
127                 code_units.advance_while(|w| w == SPACE || w == TAB);
128             }
129             // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
130             BACKSLASH => {
131                 let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
132                 if code_units.peek() == Some(QUOTE) {
133                     cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
134                     // The quote is escaped if there are an odd number of backslashes.
135                     if backslash_count % 2 == 1 {
136                         code_units.next();
137                         cur.push(QUOTE.get());
138                     }
139                 } else {
140                     // If there is no quote on the end then there is no escaping.
141                     cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
142                 }
143             }
144             // If `in_quotes` and not backslash escaped (see above) then a quote either
145             // unsets `in_quote` or is escaped by another quote.
146             QUOTE if in_quotes => match code_units.peek() {
147                 // Two consecutive quotes when `in_quotes` produces one literal quote.
148                 Some(QUOTE) => {
149                     cur.push(QUOTE.get());
150                     code_units.next();
151                 }
152                 // Otherwise set `in_quotes`.
153                 Some(_) => in_quotes = false,
154                 // The end of the command line.
155                 // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
156                 None => break,
157             },
158             // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
159             QUOTE => in_quotes = true,
160             // Everything else is always taken literally.
161             _ => cur.push(w.get()),
162         }
163     }
164     // Push the final argument, if any.
165     if !cur.is_empty() || in_quotes {
166         ret_val.push(OsString::from_wide(&cur[..]));
167     }
168     ret_val
169 }
170 
/// Iterator over already-parsed command-line arguments, as produced by `args`.
pub struct Args {
    parsed_args_list: vec::IntoIter<OsString>,
}

impl fmt::Debug for Args {
    /// Formats the arguments that have not been yielded yet as a list.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(self.parsed_args_list.as_slice(), f)
    }
}

impl Iterator for Args {
    type Item = OsString;

    fn next(&mut self) -> Option<OsString> {
        self.parsed_args_list.next()
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.parsed_args_list.size_hint()
    }
}

impl DoubleEndedIterator for Args {
    fn next_back(&mut self) -> Option<OsString> {
        self.parsed_args_list.next_back()
    }
}

impl ExactSizeIterator for Args {
    fn len(&self) -> usize {
        self.parsed_args_list.len()
    }
}
202 
/// A single argument to append to a command line, tagged with how it should be quoted.
#[derive(Debug)]
pub(crate) enum Arg {
    /// Add quotes (if needed)
    Regular(OsString),
    /// Append raw string without quoting
    Raw(OsString),
}
210 
/// Quoting policy applied when an argument is appended to a command line.
enum Quote {
    /// Every arg is quoted
    Always,
    /// Whitespace and empty args are quoted
    Auto,
    /// Arg appended without any changes (#29494)
    Never,
}
219 
append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()>220 pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()> {
221     let (arg, quote) = match arg {
222         Arg::Regular(arg) => (arg, if force_quotes { Quote::Always } else { Quote::Auto }),
223         Arg::Raw(arg) => (arg, Quote::Never),
224     };
225 
226     // If an argument has 0 characters then we need to quote it to ensure
227     // that it actually gets passed through on the command line or otherwise
228     // it will be dropped entirely when parsed on the other end.
229     ensure_no_nuls(arg)?;
230     let arg_bytes = arg.as_os_str_bytes();
231     let (quote, escape) = match quote {
232         Quote::Always => (true, true),
233         Quote::Auto => {
234             (arg_bytes.iter().any(|c| *c == b' ' || *c == b'\t') || arg_bytes.is_empty(), true)
235         }
236         Quote::Never => (false, false),
237     };
238     if quote {
239         cmd.push('"' as u16);
240     }
241 
242     let mut backslashes: usize = 0;
243     for x in arg.encode_wide() {
244         if escape {
245             if x == '\\' as u16 {
246                 backslashes += 1;
247             } else {
248                 if x == '"' as u16 {
249                     // Add n+1 backslashes to total 2n+1 before internal '"'.
250                     cmd.extend((0..=backslashes).map(|_| '\\' as u16));
251                 }
252                 backslashes = 0;
253             }
254         }
255         cmd.push(x);
256     }
257 
258     if quote {
259         // Add n backslashes to total 2n before ending '"'.
260         cmd.extend((0..backslashes).map(|_| '\\' as u16));
261         cmd.push('"' as u16);
262     }
263     Ok(())
264 }
265 
append_bat_arg(cmd: &mut Vec<u16>, arg: &OsStr, mut quote: bool) -> io::Result<()>266 fn append_bat_arg(cmd: &mut Vec<u16>, arg: &OsStr, mut quote: bool) -> io::Result<()> {
267     ensure_no_nuls(arg)?;
268     // If an argument has 0 characters then we need to quote it to ensure
269     // that it actually gets passed through on the command line or otherwise
270     // it will be dropped entirely when parsed on the other end.
271     //
272     // We also need to quote the argument if it ends with `\` to guard against
273     // bat usage such as `"%~2"` (i.e. force quote arguments) otherwise a
274     // trailing slash will escape the closing quote.
275     if arg.is_empty() || arg.as_encoded_bytes().last() == Some(&b'\\') {
276         quote = true;
277     }
278     for cp in arg.as_inner().inner.code_points() {
279         if let Some(cp) = cp.to_char() {
280             // Rather than trying to find every ascii symbol that must be quoted,
281             // we assume that all ascii symbols must be quoted unless they're known to be good.
282             // We also quote Unicode control blocks for good measure.
283             // Note an unquoted `\` is fine so long as the argument isn't otherwise quoted.
284             static UNQUOTED: &str = r"#$*+-./:?@\_";
285             let ascii_needs_quotes =
286                 cp.is_ascii() && !(cp.is_ascii_alphanumeric() || UNQUOTED.contains(cp));
287             if ascii_needs_quotes || cp.is_control() {
288                 quote = true;
289             }
290         }
291     }
292 
293     if quote {
294         cmd.push('"' as u16);
295     }
296     // Loop through the string, escaping `\` only if followed by `"`.
297     // And escaping `"` by doubling them.
298     let mut backslashes: usize = 0;
299     for x in arg.encode_wide() {
300         if x == '\\' as u16 {
301             backslashes += 1;
302         } else {
303             if x == '"' as u16 {
304                 // Add n backslashes to total 2n before internal `"`.
305                 cmd.extend((0..backslashes).map(|_| '\\' as u16));
306                 // Appending an additional double-quote acts as an escape.
307                 cmd.push(b'"' as u16)
308             } else if x == '%' as u16 || x == '\r' as u16 {
309                 // yt-dlp hack: replaces `%` with `%%cd:~,%` to stop %VAR% being expanded as an environment variable.
310                 //
311                 // # Explanation
312                 //
313                 // cmd supports extracting a substring from a variable using the following syntax:
314                 //     %variable:~start_index,end_index%
315                 //
316                 // In the above command `cd` is used as the variable and the start_index and end_index are left blank.
317                 // `cd` is a built-in variable that dynamically expands to the current directory so it's always available.
318                 // Explicitly omitting both the start and end index creates a zero-length substring.
319                 //
320                 // Therefore it all resolves to nothing. However, by doing this no-op we distract cmd.exe
321                 // from potentially expanding %variables% in the argument.
322                 cmd.extend_from_slice(&[
323                     '%' as u16, '%' as u16, 'c' as u16, 'd' as u16, ':' as u16, '~' as u16,
324                     ',' as u16,
325                 ]);
326             }
327             backslashes = 0;
328         }
329         cmd.push(x);
330     }
331     if quote {
332         // Add n backslashes to total 2n before ending `"`.
333         cmd.extend((0..backslashes).map(|_| '\\' as u16));
334         cmd.push('"' as u16);
335     }
336     Ok(())
337 }
338 
make_bat_command_line( script: &[u16], args: &[Arg], force_quotes: bool, ) -> io::Result<Vec<u16>>339 pub(crate) fn make_bat_command_line(
340     script: &[u16],
341     args: &[Arg],
342     force_quotes: bool,
343 ) -> io::Result<Vec<u16>> {
344     const INVALID_ARGUMENT_ERROR: io::Error =
345         io::const_io_error!(io::ErrorKind::InvalidInput, r#"batch file arguments are invalid"#);
346     // Set the start of the command line to `cmd.exe /c "`
347     // It is necessary to surround the command in an extra pair of quotes,
348     // hence the trailing quote here. It will be closed after all arguments
349     // have been added.
350     // Using /e:ON enables "command extensions" which is essential for the `%` hack to work.
351     let mut cmd: Vec<u16> = "cmd.exe /e:ON /v:OFF /d /c \"".encode_utf16().collect();
352 
353     // Push the script name surrounded by its quote pair.
354     cmd.push(b'"' as u16);
355     // Windows file names cannot contain a `"` character or end with `\\`.
356     // If the script name does then return an error.
357     if script.contains(&(b'"' as u16)) || script.last() == Some(&(b'\\' as u16)) {
358         return Err(io::const_io_error!(
359             io::ErrorKind::InvalidInput,
360             "Windows file names may not contain `\"` or end with `\\`"
361         ));
362     }
363     cmd.extend_from_slice(script.strip_suffix(&[0]).unwrap_or(script));
364     cmd.push(b'"' as u16);
365 
366     // Append the arguments.
367     // FIXME: This needs tests to ensure that the arguments are properly
368     // reconstructed by the batch script by default.
369     for arg in args {
370         cmd.push(' ' as u16);
371         match arg {
372             Arg::Regular(arg_os) => {
373                 let arg_bytes = arg_os.as_encoded_bytes();
374                 // Disallow \r and \n as they may truncate the arguments.
375                 const DISALLOWED: &[u8] = b"\r\n";
376                 if arg_bytes.iter().any(|c| DISALLOWED.contains(c)) {
377                     return Err(INVALID_ARGUMENT_ERROR);
378                 }
379                 append_bat_arg(&mut cmd, arg_os, force_quotes)?;
380             }
381             _ => {
382                 // Raw arguments are passed on as-is.
383                 // It's the user's responsibility to properly handle arguments in this case.
384                 append_arg(&mut cmd, arg, force_quotes)?;
385             }
386         };
387     }
388 
389     // Close the quote we left opened earlier.
390     cmd.push(b'"' as u16);
391 
392     Ok(cmd)
393 }
394 
395 /// Takes a path and tries to return a non-verbatim path.
396 ///
397 /// This is necessary because cmd.exe does not support verbatim paths.
to_user_path(path: &Path) -> io::Result<Vec<u16>>398 pub(crate) fn to_user_path(path: &Path) -> io::Result<Vec<u16>> {
399     from_wide_to_user_path(to_u16s(path)?)
400 }
from_wide_to_user_path(mut path: Vec<u16>) -> io::Result<Vec<u16>>401 pub(crate) fn from_wide_to_user_path(mut path: Vec<u16>) -> io::Result<Vec<u16>> {
402     use crate::ptr;
403     use crate::sys::windows::fill_utf16_buf;
404 
405     // UTF-16 encoded code points, used in parsing and building UTF-16 paths.
406     // All of these are in the ASCII range so they can be cast directly to `u16`.
407     const SEP: u16 = b'\\' as _;
408     const QUERY: u16 = b'?' as _;
409     const COLON: u16 = b':' as _;
410     const U: u16 = b'U' as _;
411     const N: u16 = b'N' as _;
412     const C: u16 = b'C' as _;
413 
414     // Early return if the path is too long to remove the verbatim prefix.
415     const LEGACY_MAX_PATH: usize = 260;
416     if path.len() > LEGACY_MAX_PATH {
417         return Ok(path);
418     }
419 
420     match &path[..] {
421         // `\\?\C:\...` => `C:\...`
422         [SEP, SEP, QUERY, SEP, _, COLON, SEP, ..] => unsafe {
423             let lpfilename = path[4..].as_ptr();
424             fill_utf16_buf(
425                 |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
426                 |full_path: &[u16]| {
427                     if full_path == &path[4..path.len() - 1] {
428                         let mut path: Vec<u16> = full_path.into();
429                         path.push(0);
430                         path
431                     } else {
432                         path
433                     }
434                 },
435             )
436         },
437         // `\\?\UNC\...` => `\\...`
438         [SEP, SEP, QUERY, SEP, U, N, C, SEP, ..] => unsafe {
439             // Change the `C` in `UNC\` to `\` so we can get a slice that starts with `\\`.
440             path[6] = b'\\' as u16;
441             let lpfilename = path[6..].as_ptr();
442             fill_utf16_buf(
443                 |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
444                 |full_path: &[u16]| {
445                     if full_path == &path[6..path.len() - 1] {
446                         let mut path: Vec<u16> = full_path.into();
447                         path.push(0);
448                         path
449                     } else {
450                         // Restore the 'C' in "UNC".
451                         path[6] = b'C' as u16;
452                         path
453                     }
454                 },
455             )
456         },
457         // For everything else, leave the path unchanged.
458         _ => get_long_path(path, false),
459     }
460 }
461