1 use super::{char_encoding_generic::*, JvmError};
2 use std::{
3 borrow::Cow,
4 convert::TryInto,
5 ffi::{c_int, c_uint, CStr},
6 io,
7 mem::MaybeUninit,
8 ptr,
9 };
10 use windows_sys::Win32::Globalization as winnls;
11
// The integer type that the Win32 transcoding functions use for string lengths. In this file it
// is passed to (and returned by) both `WideCharToMultiByte` and `MultiByteToWideChar`.
type WSize = c_int;
14
// The type of Windows code page numbers, i.e. the type returned by `GetACP` and accepted as the
// first argument of `WideCharToMultiByte` and `MultiByteToWideChar`.
type WCodepage = c_uint;
17
// The maximum length, in UTF-8 bytes, of strings that will be accepted for transcoding
// (1048576 bytes, i.e. 1 MiB).
//
// The purpose of this limit is to prevent overflow. `WideCharToMultiByte` behaves rather badly
// (see https://github.com/jni-rs/jni-rs/pull/414 for discussion) if the string is long enough to
// overflow its counters.
//
// Although it is possible to transcode a string of any length by splitting it into smaller
// substrings, the code complexity needed to do so isn't worthwhile just for transcoding JVM
// options. Also, `test_overflow` would take a very long time to run, which was deemed unacceptable
// (see https://github.com/jni-rs/jni-rs/pull/414#issuecomment-1419130483). We set this arbitrary
// limit instead.
const MAX_INPUT_LEN: usize = 1048576;
30
31 /// Converts `s` into a `Cow<CStr>` encoded in the specified Windows code page.
str_to_cstr_win32<'a>( s: Cow<'a, str>, needed_codepage: WCodepage, ) -> Result<Cow<'static, CStr>, JvmError>32 pub(super) fn str_to_cstr_win32<'a>(
33 s: Cow<'a, str>,
34 needed_codepage: WCodepage,
35 ) -> Result<Cow<'static, CStr>, JvmError> {
36 // First, check if the input string (UTF-8) is too long to transcode. Bail early if so.
37 if s.len() > MAX_INPUT_LEN {
38 return Err(JvmError::OptStringTooLong {
39 opt_string: s.into_owned(),
40 });
41 }
42
43 // This function will generate an error if `WideCharToMultiByte` fails.
44 fn convert_error(s: Cow<str>) -> JvmError {
45 JvmError::OptStringTranscodeFailure {
46 opt_string: s.into_owned(),
47 error: io::Error::last_os_error(),
48 }
49 }
50
51 // Convert the string to UTF-16 first.
52 let s_utf16: Vec<u16> = s.encode_utf16().collect();
53
54 // Determine how long the string is, in UTF-16 units, in the integer type that Win32 expects.
55 // Overflow should be impossible; panic if it happens.
56 let s_utf16_len: WSize = s_utf16
57 .len()
58 .try_into()
59 .expect("UTF-16 form of input string is too long");
60
61 // Decide which flags we're going to use.
62 let conversion_flags = match needed_codepage {
63 // No flags may be given for the following code pages.
64 // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
65 42
66 | 50220
67 | 50221
68 | 50222
69 | 50225
70 | 50227
71 | 50229
72 | 54936
73 | 57002..=57011
74 | 65000
75 | 65001 => 0,
76
77 _ => winnls::WC_COMPOSITECHECK | winnls::WC_NO_BEST_FIT_CHARS,
78 };
79
80 // Find out how much buffer space will be needed for the output and whether the string is
81 // fully representable.
82 let mut is_non_representable: Option<MaybeUninit<_>> = match needed_codepage {
83 // All characters are representable in UTF-7 and UTF-8, and moreover
84 // `WideCharToMultiByte` will fail if the target encoding is UTF-7 or UTF-8 and this is not
85 // `None`.
86 winnls::CP_UTF7 | winnls::CP_UTF8 => None,
87 _ => Some(MaybeUninit::uninit()),
88 };
89
90 // Safety: `s_utf16.as_ptr()` is a valid pointer to a UTF-16 string, and `s_utf16_len` is its
91 // length. `lpDefaultChar` is null. `lpUsedDefaultChar` is either null or valid. `cbMultiByte`
92 // is zero.
93 let required_buffer_space = unsafe {
94 winnls::WideCharToMultiByte(
95 needed_codepage,
96 conversion_flags,
97 s_utf16.as_ptr(),
98 s_utf16_len,
99 ptr::null_mut(),
100 0,
101 ptr::null(),
102 match &mut is_non_representable {
103 Some(x) => x.as_mut_ptr(),
104 None => ptr::null_mut(),
105 },
106 )
107 };
108
109 // Bail on error.
110 if required_buffer_space == 0 {
111 drop(s_utf16);
112
113 return Err(convert_error(s));
114 }
115
116 // Check if the string is not fully representable.
117 if let Some(is_non_representable) = is_non_representable {
118 // Safety: `is_non_representable` has been initialized by `WideCharToMultiByte`.
119 let is_non_representable = unsafe { is_non_representable.assume_init() };
120
121 if is_non_representable != 0 {
122 drop(s_utf16);
123
124 return Err(JvmError::OptStringNotRepresentable {
125 opt_string: s.into_owned(),
126 });
127 }
128 }
129
130 // Convert the required buffer space to `usize`, and increment it by one for the null
131 // terminator.
132 //
133 // This shouldn't overflow (see the comment on `MAX_INPUT_LEN` above), so we won't check for
134 // overflow here.
135 let required_buffer_space_usize: usize = required_buffer_space as _;
136 let required_buffer_space_usize_with_nul: usize = required_buffer_space_usize + 1;
137
138 // Allocate enough buffer space, including one byte for the null terminator.
139 let mut output = Vec::<u8>::with_capacity(required_buffer_space_usize_with_nul);
140
141 // Perform the actual conversion.
142 //
143 // Safety: `chunk.as_ptr()` is a valid pointer, and `chunk_len_i32` is its length.
144 // `chunk_output_ptr` is a valid pointer, and `required_buffer_space` is its length.
145 // All other raw pointers are null.
146 let used_buffer_space = unsafe {
147 winnls::WideCharToMultiByte(
148 needed_codepage,
149 conversion_flags,
150 s_utf16.as_ptr(),
151 s_utf16_len,
152 output.as_mut_ptr(),
153 required_buffer_space,
154 ptr::null(),
155 ptr::null_mut(),
156 )
157 };
158
159 drop(s_utf16);
160
161 // Bail on error.
162 if used_buffer_space == 0 {
163 drop(output);
164
165 return Err(convert_error(s));
166 }
167
168 let used_buffer_space_usize: usize = used_buffer_space as usize;
169
170 // Set the new length of the output buffer. Don't use `required_buffer_space`, just in case
171 // `WideCharToMultiByte` changes its mind about how much buffer space it's actually going to
172 // use.
173 //
174 // Safety: `used_buffer_space_usize` is the number of bytes that `WideCharToMultiByte` has
175 // just initialized.
176 unsafe {
177 output.set_len(used_buffer_space_usize);
178 }
179
180 // That's it, it's converted. Now turn it into a `CString`. This will add a null terminator if
181 // there isn't one already and check for null bytes in the middle.
182 unsafe { bytes_to_cstr(Cow::Owned(output), Some(s.into())) }
183 }
184
185 /// Converts `s` into the Windows default character encoding.
str_to_cstr_win32_default_codepage<'a>( s: Cow<'a, str>, ) -> Result<Cow<'a, CStr>, JvmError>186 pub(super) fn str_to_cstr_win32_default_codepage<'a>(
187 s: Cow<'a, str>,
188 ) -> Result<Cow<'a, CStr>, JvmError> {
189 // Get the code page. There is a remote possibility that it is UTF-8. If so, pass the
190 // string through unchanged (other than adding a null terminator). If not, we need to have
191 // Windows convert the string to the expected code page first.
192
193 // Safety: This function isn't actually unsafe.
194 let needed_codepage = unsafe { winnls::GetACP() };
195
196 if needed_codepage == winnls::CP_UTF8 {
197 // The code page is UTF-8! Lucky us.
198 return utf8_to_cstr(s);
199 }
200
201 // The code page is not UTF-8, so do the transcoding.
202 str_to_cstr_win32(s, needed_codepage)
203 }
204
/// Transcodes text in an arbitrary Windows codepage into a Rust `String`. Used to test
/// round-tripping.
///
/// `max_expected_utf16_len` is the size, in UTF-16 units, of the buffer offered to
/// `MultiByteToWideChar`.
///
/// An empty `codepage_string` yields an empty `String` without calling into Win32, because
/// `MultiByteToWideChar` fails when it is given zero input bytes.
///
/// # Errors
///
/// Returns the operating system's last error if `MultiByteToWideChar` fails, or an
/// `InvalidInput` error if `max_expected_utf16_len` is zero while the input is non-empty.
///
/// # Panics
///
/// Panics if either length does not fit in the integer type it is converted to, if
/// `MultiByteToWideChar` reports a length that is negative or larger than the buffer, or if it
/// produces invalid UTF-16.
#[cfg(test)]
fn codepage_to_string_win32(
    codepage_string: impl AsRef<[u8]>,
    codepage: WCodepage,
    max_expected_utf16_len: WSize,
) -> io::Result<String> {
    let codepage_string_slice = codepage_string.as_ref();

    // Zero input bytes make `MultiByteToWideChar` fail, but the answer is trivially known.
    if codepage_string_slice.is_empty() {
        return Ok(String::new());
    }

    let codepage_string_slice_len: WSize = codepage_string_slice
        .len()
        .try_into()
        .expect("`codepage_string`'s length is too large to transcode with Win32");

    let buf_capacity: usize = max_expected_utf16_len
        .try_into()
        .expect("expected_utf16_len is negative or exceeds address space");

    // With an output size of zero, `MultiByteToWideChar` only reports the size it would need
    // and writes nothing. Treating that return value as "units written" would expose
    // uninitialized memory, so refuse up front.
    if buf_capacity == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "`max_expected_utf16_len` must be greater than zero for non-empty input",
        ));
    }

    let mut buf = Vec::<u16>::with_capacity(buf_capacity);

    // Safety: `codepage_string_slice.as_ptr()` is valid for `codepage_string_slice_len` bytes,
    // and `buf.as_mut_ptr()` is valid for `max_expected_utf16_len` UTF-16 units. Both lengths
    // have been checked for overflow.
    let utf16_units_transcoded = unsafe {
        winnls::MultiByteToWideChar(
            codepage,
            0,
            codepage_string_slice.as_ptr() as *const _,
            codepage_string_slice_len,
            buf.as_mut_ptr(),
            max_expected_utf16_len,
        )
    };

    if utf16_units_transcoded == 0 {
        return Err(io::Error::last_os_error());
    }

    let utf16_units_transcoded: usize = utf16_units_transcoded
        .try_into()
        .expect("`MultiByteToWideChar` returned a negative length");

    // `set_len` below is only sound if the reported length fits in the buffer.
    assert!(
        utf16_units_transcoded <= buf_capacity,
        "`MultiByteToWideChar` reported writing more UTF-16 units than the buffer holds"
    );

    // Safety: `MultiByteToWideChar` claims to have initialized this many UTF-16 units, and that
    // count does not exceed the buffer's capacity (checked above).
    unsafe {
        buf.set_len(utf16_units_transcoded);
    }

    // The input is no longer needed; release it before building the output string.
    drop(codepage_string);

    let string =
        String::from_utf16(buf.as_slice()).expect("`MultiByteToWideChar` generated invalid UTF-16");

    Ok(string)
}
254
/// Checks basic transcoding behavior of `str_to_cstr_win32` for UTF-8 and Windows-1252.
///
/// The non-BMP character U+1F60E (UTF-8 `f0 9f 98 8e`) is written as a `\u{…}` escape so that
/// the inputs always agree with the expected byte strings, regardless of how this file's text is
/// stored or displayed.
#[test]
fn test() {
    use assert_matches::assert_matches;

    // UTF-8 target: the bytes pass through unchanged, a null terminator is appended, and the
    // result is an owned buffer.
    {
        let result = str_to_cstr_win32("Hello, world \u{1F60E}".into(), winnls::CP_UTF8).unwrap();
        assert_eq!(
            result.to_bytes_with_nul(),
            b"Hello, world \xf0\x9f\x98\x8e\0"
        );
        assert_matches!(result, Cow::Owned(_));
    }

    // A trailing null character in the input does not produce a second terminator.
    {
        let result =
            str_to_cstr_win32("Hello, world \u{1F60E}\0".into(), winnls::CP_UTF8).unwrap();
        assert_eq!(
            result.to_bytes_with_nul(),
            b"Hello, world \xf0\x9f\x98\x8e\0"
        );
    }

    // U+1F60E has no Windows-1252 representation, so transcoding must fail and hand the
    // original string back inside the error.
    {
        let result = str_to_cstr_win32("Hello, world \u{1F60E}".into(), 1252).unwrap_err();
        let error_string = assert_matches!(
            result,
            JvmError::OptStringNotRepresentable { opt_string } => opt_string
        );
        assert_eq!(error_string, "Hello, world \u{1F60E}");
    }

    // U+2122 (trade mark sign) is byte 0x99 in Windows-1252.
    {
        let result = str_to_cstr_win32("Hello, world™".into(), 1252).unwrap();
        assert_eq!(result.to_bytes_with_nul(), b"Hello, world\x99\0");
        assert_matches!(result, Cow::Owned(_));
    }
}
288
/// Exercises `str_to_cstr_win32` at the `MAX_INPUT_LEN` boundary: an input one byte over the
/// limit must be rejected with `JvmError::OptStringTooLong`, and inputs exactly at the limit must
/// transcode correctly to UTF-8, UTF-7 and Windows-1252.
#[test]
fn test_overflow() {
    use assert_matches::assert_matches;

    // Note: We avoid naïvely using `assert` here, because assertion failure will dump millions of
    // characters to the console. Instead, here are some functions for handling errors without
    // doing that.

    // If `error` carries an option string, checks that it equals `expected_opt_string` and then
    // replaces it with an empty string so that formatting the error stays short.
    #[track_caller]
    fn check_and_clear_error_opt_string(expected_opt_string: &str, error: &mut JvmError) {
        if let Some(actual_opt_string) = error.opt_string_mut() {
            if actual_opt_string != expected_opt_string {
                panic!("opt_string was mangled in moving it to an error");
            }

            *actual_opt_string = String::new();
        }
    }

    // Unwraps a successful transcoding result, or panics with the error (minus its option
    // string).
    #[track_caller]
    fn expect_success(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) -> Cow<'static, CStr> {
        match result {
            Ok(ok) => ok,
            Err(mut error) => {
                check_and_clear_error_opt_string(expected_opt_string, &mut error);
                panic!("unexpected transcoding failure: {}", error)
            }
        }
    }

    // Like `expect_success`, but additionally requires the output bytes to be identical to the
    // input bytes.
    #[track_caller]
    fn expect_successful_roundtrip(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) -> Cow<'static, CStr> {
        let string = expect_success(expected_opt_string, result);
        assert!(
            expected_opt_string.as_bytes() == string.to_bytes(),
            "opt_string was transcoded successfully but mangled"
        );
        string
    }

    // Requires `result` to be an `OptStringTooLong` error whose option string (if any) is the
    // unmodified input.
    #[track_caller]
    fn expect_opt_string_too_long(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) {
        let mut error = match result {
            Err(err) => err,
            Ok(ok) => {
                assert!(
                    expected_opt_string.as_bytes() == ok.to_bytes(),
                    "transcoding unexpectedly succeeded and resulted in mangled output"
                );
                panic!("transcoding unexpectedly succeeded")
            }
        };

        check_and_clear_error_opt_string(expected_opt_string, &mut error);

        assert_matches!(error, JvmError::OptStringTooLong { .. });
    }

    {
        // Try transcoding a plain ASCII string.

        // First, allocate enough space to completely fill the maximum allowed length, plus one
        // more.
        //eprintln!("Allocating & filling ASCII");
        let string = vec![b'H'; MAX_INPUT_LEN.checked_add(1).unwrap()];

        //eprintln!("Checking UTF-8 correctness");
        let mut string = String::from_utf8(string).unwrap();

        // This string is currently one character too long to transcode, so there should be an
        // overflow error.
        //eprintln!("Transcoding ASCII string that's too long");
        expect_opt_string_too_long(
            &string,
            str_to_cstr_win32(string.as_str().into(), winnls::CP_UTF8),
        );

        // But if we remove one character…
        assert_eq!(string.pop(), Some('H'));

        // …then it should transcode fine.
        //eprintln!("Transcoding ASCII string that's not too long");
        expect_successful_roundtrip(
            &string,
            str_to_cstr_win32(string.as_str().into(), winnls::CP_UTF8),
        );
    }

    {
        // Try transcoding a non-ASCII string.

        // U+07FF is the highest code point that can be represented in UTF-8 with only two bytes,
        // so we'll use that. The UTF-8 encoding is `df bf`. We fill it this way because it's much
        // faster than the naïve character-by-character approach (at least unless some future Rust
        // compiler performs this optimization on its own, but 1.66 doesn't).
        //
        // `u16::from_be` makes the in-memory byte order `df bf` on both little- and big-endian
        // targets.
        //eprintln!("Allocating & filling non-ASCII for UTF-8 and UTF-7");
        let string_byte_pairs = vec![u16::from_be(0xdfbf); MAX_INPUT_LEN / 2];

        //eprintln!("Checking UTF-8 correctness");
        let string: &str =
            std::str::from_utf8(bytemuck::cast_slice(string_byte_pairs.as_slice())).unwrap();

        // Again, the string should transcode without overflow.
        //eprintln!("Transcoding non-ASCII to UTF-8");
        expect_successful_roundtrip(string, str_to_cstr_win32(string.into(), winnls::CP_UTF8));

        // This should work even with UTF-7. This is the real reason we're using U+07FF: we need
        // to check that the highest code point that fits under the limit will not overflow even
        // with the worst-case code page.
        {
            //eprintln!("Transcoding non-ASCII to UTF-7");
            let result = expect_success(string, str_to_cstr_win32(string.into(), winnls::CP_UTF7));

            // *And* it should roundtrip back to UTF-8. Every character in `string` is two UTF-8
            // bytes and one UTF-16 unit, hence the `/ 2`.
            //eprintln!("Transcoding UTF-7 back to UTF-8");
            let result: String = codepage_to_string_win32(
                result.to_bytes(),
                winnls::CP_UTF7,
                (string.len() / 2).try_into().unwrap(),
            )
            .unwrap();

            assert!(result == string, "didn't roundtrip via UTF-7");
        }
    }

    {
        // Try transcoding to Windows-1252. This is the slowest part of the test
        // (`WideCharToMultiByte` is very slow at this, for some reason), so it's done last.
        //
        // The fill pattern is the UTF-8 encoding `c2 ae` of U+00AE, which the assertion below
        // expects to become the single byte `ae`.
        //eprintln!("Allocating & filling non-ASCII for Windows-1252");
        let string_byte_pairs = vec![u16::from_be(0xc2ae); MAX_INPUT_LEN / 2];

        //eprintln!("Checking UTF-8 correctness");
        let string: &str =
            std::str::from_utf8(bytemuck::cast_slice(string_byte_pairs.as_slice())).unwrap();

        //eprintln!("Transcoding non-ASCII to Windows-1252");
        let result = expect_success(string, str_to_cstr_win32(string.into(), 1252));

        //eprintln!("Checking Windows-1252 for correctness");
        assert!(
            result.to_bytes().iter().all(|byte| *byte == 0xae),
            "string didn't transcode to Windows-1252 properly"
        );
    }
}
444