• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use super::{char_encoding_generic::*, JvmError};
2 use std::{
3     borrow::Cow,
4     convert::TryInto,
5     ffi::{c_int, c_uint, CStr},
6     io,
7     mem::MaybeUninit,
8     ptr,
9 };
10 use windows_sys::Win32::Globalization as winnls;
11 
// Integer type in which `WideCharToMultiByte` expresses string lengths.
type WSize = c_int;

// Integer type of a Windows code page identifier.
type WCodepage = c_uint;

// Upper bound, in UTF-8 bytes, on the length of a string that will be accepted for transcoding.
//
// This limit exists to prevent overflow: `WideCharToMultiByte` behaves rather badly when a string
// is long enough to overflow its counters (see https://github.com/jni-rs/jni-rs/pull/414 for
// discussion).
//
// A string of any length could be transcoded by splitting it into smaller substrings, but that
// much code complexity isn't worthwhile just for transcoding JVM options. It would also make
// `test_overflow` take a very long time to run, which was deemed unacceptable
// (see https://github.com/jni-rs/jni-rs/pull/414#issuecomment-1419130483). This arbitrary limit
// of one mebibyte is used instead.
const MAX_INPUT_LEN: usize = 1024 * 1024;
30 
31 /// Converts `s` into a `Cow<CStr>` encoded in the specified Windows code page.
str_to_cstr_win32<'a>( s: Cow<'a, str>, needed_codepage: WCodepage, ) -> Result<Cow<'static, CStr>, JvmError>32 pub(super) fn str_to_cstr_win32<'a>(
33     s: Cow<'a, str>,
34     needed_codepage: WCodepage,
35 ) -> Result<Cow<'static, CStr>, JvmError> {
36     // First, check if the input string (UTF-8) is too long to transcode. Bail early if so.
37     if s.len() > MAX_INPUT_LEN {
38         return Err(JvmError::OptStringTooLong {
39             opt_string: s.into_owned(),
40         });
41     }
42 
43     // This function will generate an error if `WideCharToMultiByte` fails.
44     fn convert_error(s: Cow<str>) -> JvmError {
45         JvmError::OptStringTranscodeFailure {
46             opt_string: s.into_owned(),
47             error: io::Error::last_os_error(),
48         }
49     }
50 
51     // Convert the string to UTF-16 first.
52     let s_utf16: Vec<u16> = s.encode_utf16().collect();
53 
54     // Determine how long the string is, in UTF-16 units, in the integer type that Win32 expects.
55     // Overflow should be impossible; panic if it happens.
56     let s_utf16_len: WSize = s_utf16
57         .len()
58         .try_into()
59         .expect("UTF-16 form of input string is too long");
60 
61     // Decide which flags we're going to use.
62     let conversion_flags = match needed_codepage {
63         // No flags may be given for the following code pages.
64         // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
65         42
66         | 50220
67         | 50221
68         | 50222
69         | 50225
70         | 50227
71         | 50229
72         | 54936
73         | 57002..=57011
74         | 65000
75         | 65001 => 0,
76 
77         _ => winnls::WC_COMPOSITECHECK | winnls::WC_NO_BEST_FIT_CHARS,
78     };
79 
80     // Find out how much buffer space will be needed for the output and whether the string is
81     // fully representable.
82     let mut is_non_representable: Option<MaybeUninit<_>> = match needed_codepage {
83         // All characters are representable in UTF-7 and UTF-8, and moreover
84         // `WideCharToMultiByte` will fail if the target encoding is UTF-7 or UTF-8 and this is not
85         // `None`.
86         winnls::CP_UTF7 | winnls::CP_UTF8 => None,
87         _ => Some(MaybeUninit::uninit()),
88     };
89 
90     // Safety: `s_utf16.as_ptr()` is a valid pointer to a UTF-16 string, and `s_utf16_len` is its
91     // length. `lpDefaultChar` is null. `lpUsedDefaultChar` is either null or valid. `cbMultiByte`
92     // is zero.
93     let required_buffer_space = unsafe {
94         winnls::WideCharToMultiByte(
95             needed_codepage,
96             conversion_flags,
97             s_utf16.as_ptr(),
98             s_utf16_len,
99             ptr::null_mut(),
100             0,
101             ptr::null(),
102             match &mut is_non_representable {
103                 Some(x) => x.as_mut_ptr(),
104                 None => ptr::null_mut(),
105             },
106         )
107     };
108 
109     // Bail on error.
110     if required_buffer_space == 0 {
111         drop(s_utf16);
112 
113         return Err(convert_error(s));
114     }
115 
116     // Check if the string is not fully representable.
117     if let Some(is_non_representable) = is_non_representable {
118         // Safety: `is_non_representable` has been initialized by `WideCharToMultiByte`.
119         let is_non_representable = unsafe { is_non_representable.assume_init() };
120 
121         if is_non_representable != 0 {
122             drop(s_utf16);
123 
124             return Err(JvmError::OptStringNotRepresentable {
125                 opt_string: s.into_owned(),
126             });
127         }
128     }
129 
130     // Convert the required buffer space to `usize`, and increment it by one for the null
131     // terminator.
132     //
133     // This shouldn't overflow (see the comment on `MAX_INPUT_LEN` above), so we won't check for
134     // overflow here.
135     let required_buffer_space_usize: usize = required_buffer_space as _;
136     let required_buffer_space_usize_with_nul: usize = required_buffer_space_usize + 1;
137 
138     // Allocate enough buffer space, including one byte for the null terminator.
139     let mut output = Vec::<u8>::with_capacity(required_buffer_space_usize_with_nul);
140 
141     // Perform the actual conversion.
142     //
143     // Safety: `chunk.as_ptr()` is a valid pointer, and `chunk_len_i32` is its length.
144     // `chunk_output_ptr` is a valid pointer, and `required_buffer_space` is its length.
145     // All other raw pointers are null.
146     let used_buffer_space = unsafe {
147         winnls::WideCharToMultiByte(
148             needed_codepage,
149             conversion_flags,
150             s_utf16.as_ptr(),
151             s_utf16_len,
152             output.as_mut_ptr(),
153             required_buffer_space,
154             ptr::null(),
155             ptr::null_mut(),
156         )
157     };
158 
159     drop(s_utf16);
160 
161     // Bail on error.
162     if used_buffer_space == 0 {
163         drop(output);
164 
165         return Err(convert_error(s));
166     }
167 
168     let used_buffer_space_usize: usize = used_buffer_space as usize;
169 
170     // Set the new length of the output buffer. Don't use `required_buffer_space`, just in case
171     // `WideCharToMultiByte` changes its mind about how much buffer space it's actually going to
172     // use.
173     //
174     // Safety: `used_buffer_space_usize` is the number of bytes that `WideCharToMultiByte` has
175     // just initialized.
176     unsafe {
177         output.set_len(used_buffer_space_usize);
178     }
179 
180     // That's it, it's converted. Now turn it into a `CString`. This will add a null terminator if
181     // there isn't one already and check for null bytes in the middle.
182     unsafe { bytes_to_cstr(Cow::Owned(output), Some(s.into())) }
183 }
184 
185 /// Converts `s` into the Windows default character encoding.
str_to_cstr_win32_default_codepage<'a>( s: Cow<'a, str>, ) -> Result<Cow<'a, CStr>, JvmError>186 pub(super) fn str_to_cstr_win32_default_codepage<'a>(
187     s: Cow<'a, str>,
188 ) -> Result<Cow<'a, CStr>, JvmError> {
189     // Get the code page. There is a remote possibility that it is UTF-8. If so, pass the
190     // string through unchanged (other than adding a null terminator). If not, we need to have
191     // Windows convert the string to the expected code page first.
192 
193     // Safety: This function isn't actually unsafe.
194     let needed_codepage = unsafe { winnls::GetACP() };
195 
196     if needed_codepage == winnls::CP_UTF8 {
197         // The code page is UTF-8! Lucky us.
198         return utf8_to_cstr(s);
199     }
200 
201     // The code page is not UTF-8, so do the transcoding.
202     str_to_cstr_win32(s, needed_codepage)
203 }
204 
/// Decodes bytes in an arbitrary Windows code page into a Rust `String` by way of
/// `MultiByteToWideChar`. Only built for tests, where it is used to check round-tripping.
///
/// `max_expected_utf16_len` is the size, in UTF-16 units, of the scratch buffer handed to
/// `MultiByteToWideChar`.
///
/// Returns the last OS error if `MultiByteToWideChar` returns zero. Panics if the input length or
/// `max_expected_utf16_len` cannot be converted to the integer types required, or if the decoded
/// data is not valid UTF-16.
#[cfg(test)]
fn codepage_to_string_win32(
    codepage_string: impl AsRef<[u8]>,
    codepage: WCodepage,
    max_expected_utf16_len: WSize,
) -> io::Result<String> {
    let input_bytes = codepage_string.as_ref();

    // Win32 wants the input length as a `WSize`.
    let input_len: WSize = input_bytes
        .len()
        .try_into()
        .expect("`codepage_string`'s length is too large to transcode with Win32");

    // Reserve (but do not initialize) room for the decoded UTF-16 units.
    let utf16_capacity: usize = max_expected_utf16_len
        .try_into()
        .expect("expected_utf16_len is negative or exceeds address space");
    let mut utf16_buf: Vec<u16> = Vec::with_capacity(utf16_capacity);

    // Safety: All of these pointers and lengths are valid and checked for overflow.
    let units_written = unsafe {
        winnls::MultiByteToWideChar(
            codepage,
            0,
            input_bytes.as_ptr() as *const _,
            input_len,
            utf16_buf.as_mut_ptr(),
            max_expected_utf16_len,
        )
    };

    // A zero return is reported to the caller as the last OS error.
    if units_written == 0 {
        return Err(io::Error::last_os_error());
    }

    // Safety: `MultiByteToWideChar` claims to have initialized this many UTF-16 units.
    unsafe {
        utf16_buf.set_len(units_written as usize);
    }

    // The input is no longer needed once decoding has finished.
    drop(codepage_string);

    Ok(String::from_utf16(&utf16_buf).expect("`MultiByteToWideChar` generated invalid UTF-16"))
}
254 
/// Exercises `str_to_cstr_win32` with short strings: UTF-8 output (with and without a trailing
/// null in the input), a character that Windows-1252 cannot represent, and one that it can.
///
/// The emoji is written as the escape `\u{1F60E}` (UTF-8 bytes `f0 9f 98 8e`) rather than as a
/// literal character so that it cannot be damaged by tools that mishandle non-BMP text.
#[test]
fn test() {
    use assert_matches::assert_matches;

    {
        // UTF-8 target: the bytes come back unchanged, plus a null terminator.
        let result = str_to_cstr_win32("Hello, world \u{1F60E}".into(), winnls::CP_UTF8).unwrap();
        assert_eq!(
            result.to_bytes_with_nul(),
            b"Hello, world \xf0\x9f\x98\x8e\0"
        );
        assert_matches!(result, Cow::Owned(_));
    }

    {
        // A null terminator already present in the input must not be doubled.
        let result =
            str_to_cstr_win32("Hello, world \u{1F60E}\0".into(), winnls::CP_UTF8).unwrap();
        assert_eq!(
            result.to_bytes_with_nul(),
            b"Hello, world \xf0\x9f\x98\x8e\0"
        );
    }

    {
        // The emoji has no Windows-1252 representation, so transcoding must fail and hand the
        // original string back inside the error.
        let result = str_to_cstr_win32("Hello, world \u{1F60E}".into(), 1252).unwrap_err();
        let error_string = assert_matches!(result, JvmError::OptStringNotRepresentable { opt_string } => opt_string);
        assert_eq!(error_string, "Hello, world \u{1F60E}");
    }

    {
        // U+2122 TRADE MARK SIGN is byte 0x99 in Windows-1252.
        let result = str_to_cstr_win32("Hello, world™".into(), 1252).unwrap();
        assert_eq!(result.to_bytes_with_nul(), b"Hello, world\x99\0");
        assert_matches!(result, Cow::Owned(_));
    }
}
288 
/// Checks the behavior of `str_to_cstr_win32` at and just beyond `MAX_INPUT_LEN`, for UTF-8,
/// UTF-7 and Windows-1252 targets.
#[test]
fn test_overflow() {
    use assert_matches::assert_matches;

    // Note: Plain `assert_eq!`-style checks on the strings are avoided here, because a failure
    // would dump millions of characters to the console. The helpers below report failures without
    // doing that.

    /// If `error` carries an option string, checks that it equals `expected_opt_string` and then
    /// replaces it with an empty string so that the error can be printed briefly.
    #[track_caller]
    fn check_and_clear_error_opt_string(expected_opt_string: &str, error: &mut JvmError) {
        if let Some(carried) = error.opt_string_mut() {
            assert!(
                carried == expected_opt_string,
                "opt_string was mangled in moving it to an error"
            );

            *carried = String::new();
        }
    }

    /// Unwraps a successful transcoding result, panicking with a short message otherwise.
    #[track_caller]
    fn expect_success(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) -> Cow<'static, CStr> {
        let mut error = match result {
            Ok(transcoded) => return transcoded,
            Err(error) => error,
        };

        check_and_clear_error_opt_string(expected_opt_string, &mut error);
        panic!("unexpected transcoding failure: {}", error)
    }

    /// Like `expect_success`, but also requires the output bytes to equal the input bytes.
    #[track_caller]
    fn expect_successful_roundtrip(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) -> Cow<'static, CStr> {
        let transcoded = expect_success(expected_opt_string, result);
        assert!(
            expected_opt_string.as_bytes() == transcoded.to_bytes(),
            "opt_string was transcoded successfully but mangled"
        );
        transcoded
    }

    /// Requires `result` to be an `OptStringTooLong` error carrying the original string.
    #[track_caller]
    fn expect_opt_string_too_long(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) {
        let mut error = match result {
            Ok(transcoded) => {
                assert!(
                    expected_opt_string.as_bytes() == transcoded.to_bytes(),
                    "transcoding unexpectedly succeeded and resulted in mangled output"
                );
                panic!("transcoding unexpectedly succeeded")
            }
            Err(error) => error,
        };

        check_and_clear_error_opt_string(expected_opt_string, &mut error);

        assert_matches!(error, JvmError::OptStringTooLong { .. });
    }

    // Builds `MAX_INPUT_LEN` bytes of storage by repeating one two-byte UTF-8 sequence, given as
    // a big-endian `u16`. Filling whole `u16`s is much faster than the naive
    // character-by-character approach (at least unless some future Rust compiler performs this
    // optimization on its own, but 1.66 doesn't).
    let repeated_pair = |utf8_pair: u16| vec![u16::from_be(utf8_pair); MAX_INPUT_LEN / 2];

    {
        // Plain ASCII: one byte more than the maximum allowed length.
        let mut ascii = "H".repeat(MAX_INPUT_LEN.checked_add(1).unwrap());

        // One character too long to transcode, so this must be rejected.
        expect_opt_string_too_long(
            &ascii,
            str_to_cstr_win32(ascii.as_str().into(), winnls::CP_UTF8),
        );

        // But if we remove one character…
        assert_eq!(ascii.pop(), Some('H'));

        // …then it should transcode fine.
        expect_successful_roundtrip(
            &ascii,
            str_to_cstr_win32(ascii.as_str().into(), winnls::CP_UTF8),
        );
    }

    {
        // Non-ASCII. U+07FF is the highest code point that can be represented in UTF-8 with only
        // two bytes, so that is what gets used. Its UTF-8 encoding is `df bf`.
        let storage = repeated_pair(0xdfbf);
        let text: &str = std::str::from_utf8(bytemuck::cast_slice(storage.as_slice())).unwrap();

        // Again, the string should transcode to UTF-8 without overflow.
        expect_successful_roundtrip(text, str_to_cstr_win32(text.into(), winnls::CP_UTF8));

        // This should work even with UTF-7. This is the real reason for using U+07FF: the highest
        // code point that fits under the limit must not overflow even with the worst-case code
        // page.
        {
            let utf7 = expect_success(text, str_to_cstr_win32(text.into(), winnls::CP_UTF7));

            // *And* it should roundtrip back to UTF-8.
            let utf16_units: WSize = (text.len() / 2).try_into().unwrap();
            let decoded: String =
                codepage_to_string_win32(utf7.to_bytes(), winnls::CP_UTF7, utf16_units).unwrap();

            assert!(decoded == text, "didn't roundtrip via UTF-7");
        }
    }

    {
        // Windows-1252. This is the slowest part of the test (`WideCharToMultiByte` is very slow
        // at this, for some reason), so it's done last. The repeated UTF-8 sequence is `c2 ae`.
        let storage = repeated_pair(0xc2ae);
        let text: &str = std::str::from_utf8(bytemuck::cast_slice(storage.as_slice())).unwrap();

        let transcoded = expect_success(text, str_to_cstr_win32(text.into(), 1252));

        // Every two-byte input sequence must have become the single byte `ae`.
        assert!(
            transcoded.to_bytes().iter().all(|&byte| byte == 0xae),
            "string didn't transcode to Windows-1252 properly"
        );
    }
}
444