1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <gtest/gtest.h>
18
19 #include <iconv.h>
20
// The value iconv_open() hands back on failure, i.e. (iconv_t)-1.
// Parenthesised so the expansion is always a single primary expression.
#define INVALID_ICONV_T (reinterpret_cast<iconv_t>(-1))
22
// iconv_open() must fail with EINVAL when either (or both) of the encoding
// names is unknown.
TEST(iconv, iconv_open_EINVAL) {
  // Each row is {to-encoding, from-encoding}.
  static const char* const kBadPairs[][2] = {
      {"silly", "silly"},
      {"silly", "UTF-8"},
      {"UTF-8", "silly"},
  };
  for (const auto& pair : kBadPairs) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open(pair[0], pair[1]));
    ASSERT_EQ(EINVAL, errno);
  }
}
34
// Checks that iconv_open() applies loose charset-alias matching to the
// source-encoding name: equivalent spellings of "UTF-8" are accepted and
// near-misses are rejected with EINVAL.
TEST(iconv, iconv_open_comparator) {
  // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
  // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..."
  static const char* const kMatching[] = {"utf8", "u.t.f-008"};
  for (const char* alias : kMatching) {
    iconv_t c = iconv_open("UTF-8", alias);
    ASSERT_NE(INVALID_ICONV_T, c) << alias;
    ASSERT_EQ(0, iconv_close(c)) << alias;
  }

  // "...but not "utf-80" or "ut8"."
  // "ut80" is not one of the TR22 examples; it is kept because this test has
  // always covered it, alongside the "ut8" example the quoted text names.
  static const char* const kNotMatching[] = {"utf-80", "ut8", "ut80"};
  for (const char* alias : kNotMatching) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", alias)) << alias;
    ASSERT_EQ(EINVAL, errno) << alias;
  }
}
52
// Basic end-to-end conversion: three UTF-8 characters (1, 2 and 3 bytes long)
// are converted to UTF-32LE and compared against wide-character literals.
// The comparison relies on wchar_t having the same layout as a UTF-32LE code
// unit on the host.
TEST(iconv, iconv_smoke) {
  // U+0061 a  0x61
  // U+0666 ٦  0xd9 0xa6
  // U+1100 ᄀ 0xe1 0x84 0x80
  const char* utf8 = "a٦ᄀ";

  // Converting straight into a wchar_t array (rather than casting a char
  // buffer to wchar_t* afterwards) keeps the reads below correctly aligned
  // and typed.
  wchar_t utf32[BUFSIZ / sizeof(wchar_t)] = {};

  iconv_t c = iconv_open("UTF-32LE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = const_cast<char*>(utf8);
  size_t in_bytes = strlen(in);

  char* out = reinterpret_cast<char*>(utf32);
  size_t out_bytes = sizeof(utf32);

  EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes));

  EXPECT_EQ(L'a', utf32[0]);
  EXPECT_EQ(L'٦', utf32[1]);
  EXPECT_EQ(L'ᄀ', utf32[2]);
  EXPECT_EQ(L'\0', utf32[3]);  // Untouched: the buffer was zero-initialized.
  EXPECT_EQ(0U, in_bytes);
  EXPECT_EQ(sizeof(utf32) - (3 /* chars */ * 4 /* bytes each */), out_bytes);

  ASSERT_EQ(0, iconv_close(c));
}
78
// UTF-8 -> ASCII with "//TRANSLIT": unrepresentable characters become '?' and
// the return value is the number of such replacements.
TEST(iconv, iconv_lossy_TRANSLIT) {
  // U+0666 ٦  0xd9 0xa6
  // U+1100 ᄀ 0xe1 0x84 0x80
  const char* const source = "a٦ᄀz";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = sizeof(converted);

  // Two of the input characters (5 input bytes) aren't representable as ASCII.
  // With "//TRANSLIT", we use a replacement character, and report the number
  // of replacements.
  const size_t replacements = iconv(cd, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(2U, replacements);

  // The buffer started out zeroed, so a string compare also checks the
  // terminating NUL at index 4.
  EXPECT_STREQ("a??z", converted);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(converted) - 4, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
107
// UTF-8 -> ASCII with "//IGNORE": unrepresentable characters are dropped from
// the output, all input is consumed, but the call still reports EILSEQ.
TEST(iconv, iconv_lossy_IGNORE) {
  // U+0666 ٦  0xd9 0xa6
  // U+1100 ᄀ 0xe1 0x84 0x80
  const char* const source = "a٦ᄀz";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII//IGNORE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = sizeof(converted);

  // Two of the input characters (5 input bytes) aren't representable as ASCII.
  // With "//IGNORE", we just skip them (but return failure).
  const size_t kFailure = static_cast<size_t>(-1);
  errno = 0;
  EXPECT_EQ(kFailure, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);

  // Zero-initialized buffer: the string compare covers the NUL at index 2 too.
  EXPECT_STREQ("az", converted);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(converted) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
135
// UTF-8 -> plain ASCII (no suffix): conversion stops with EILSEQ at the first
// character that cannot be represented, leaving the rest of the input unread.
TEST(iconv, iconv_lossy) {
  // U+0666 ٦  0xd9 0xa6
  // U+1100 ᄀ 0xe1 0x84 0x80
  const char* const source = "a٦ᄀz";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = sizeof(converted);

  // The second input character isn't representable as ASCII, so we stop there.
  const size_t kFailure = static_cast<size_t>(-1);
  errno = 0;
  EXPECT_EQ(kFailure, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);

  // Only the leading 'a' was written; the rest of the buffer is still zero.
  EXPECT_STREQ("a", converted);
  // Two bytes for ٦, three bytes for ᄀ, and one byte for z.
  EXPECT_EQ(6U, src_left);
  EXPECT_EQ(sizeof(converted) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
161
// A lead byte followed by a non-continuation byte is reported as EILSEQ with
// the input pointer left on the bad byte; after the caller skips that byte,
// conversion can be resumed on the same descriptor.
TEST(iconv, iconv_malformed_sequence_EILSEQ) {
  // 0xd9 is the first byte of the two-byte U+0666 ٦.
  const char* const source = "a\xd9z";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = sizeof(converted);

  // The second input byte is a malformed character, so we stop there.
  const size_t kFailure = static_cast<size_t>(-1);
  errno = 0;
  EXPECT_EQ(kFailure, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);
  // The input pointer is left at the start of the invalid sequence.
  EXPECT_EQ('\xd9', *src);

  // Step over the offending byte by hand and carry on with the remainder.
  src += 1;
  src_left -= 1;
  errno = 0;
  EXPECT_EQ(0U, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(0, errno);

  // Zero-initialized buffer: the string compare covers the NUL at index 2 too.
  EXPECT_STREQ("az", converted);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(converted) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
194
// Input that ends in the middle of a multi-byte character is reported as
// EINVAL (not EILSEQ), with the input pointer left on the partial character.
TEST(iconv, iconv_incomplete_sequence_EINVAL) {
  // 0xd9 is the first byte of the two-byte U+0666 ٦.
  const char* const source = "a\xd9";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = sizeof(converted);

  // The second input byte is just the start of a character, and we don't have
  // any more bytes.
  const size_t kFailure = static_cast<size_t>(-1);
  errno = 0;
  EXPECT_EQ(kFailure, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EINVAL, errno);
  // The input pointer is left at the start of the incomplete sequence.
  EXPECT_EQ('\xd9', *src);

  // Only the leading 'a' was written; the rest of the buffer is still zero.
  EXPECT_STREQ("a", converted);
  EXPECT_EQ(1U, src_left);
  EXPECT_EQ(sizeof(converted) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
221
// Feeds "abc" through a UTF-8 -> UTF-8 converter while offering at most one
// byte of output space per call, checking the E2BIG reporting and how much
// input each call consumes.
TEST(iconv, iconv_E2BIG) {
  const char* const source = "abc";
  char converted[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(source);
  size_t src_left = strlen(src);
  char* dst = converted;
  size_t dst_left = 1;

  struct Step {
    size_t room;      // Output space offered for this call.
    size_t result;    // Expected iconv() return value.
    int error;        // Expected errno afterwards.
    size_t src_left;  // Expected unread input afterwards.
  };
  static const Step kSteps[] = {
      // We need three bytes, so one isn't enough (but we will make progress).
      {1, static_cast<size_t>(-1), E2BIG, 2},
      // Two bytes left, so zero isn't enough (and we can't even make progress).
      {0, static_cast<size_t>(-1), E2BIG, 2},
      // Two bytes left, so one isn't enough (but we will make progress).
      {1, static_cast<size_t>(-1), E2BIG, 1},
      // One byte left, so one byte is now enough.
      {1, 0, 0, 0},
  };

  for (const Step& step : kSteps) {
    dst_left = step.room;
    errno = 0;
    EXPECT_EQ(step.result, iconv(cd, &src, &src_left, &dst, &dst_left));
    EXPECT_EQ(step.error, errno);
    EXPECT_EQ(step.src_left, src_left);
    // Whatever room was offered is always used up completely.
    EXPECT_EQ(0U, dst_left);
  }

  // Zero-initialized buffer: the string compare covers the NUL at index 3 too.
  EXPECT_STREQ("abc", converted);

  ASSERT_EQ(0, iconv_close(cd));
}
274
// iconv() on the invalid descriptor must fail with EBADF.
TEST(iconv, iconv_invalid_converter_EBADF) {
  size_t src_left = 0;
  size_t dst_left = 0;
  char* src = nullptr;
  char* dst = nullptr;

  const size_t kFailure = static_cast<size_t>(-1);
  errno = 0;
  ASSERT_EQ(kFailure, iconv(INVALID_ICONV_T, &src, &src_left, &dst, &dst_left));
  ASSERT_EQ(EBADF, errno);
}
284
// iconv_close() on the invalid descriptor must return -1 and set EBADF.
TEST(iconv, iconv_close_invalid_converter_EBADF) {
  errno = 0;
  const int rc = iconv_close(INVALID_ICONV_T);
  const int saved_errno = errno;  // Snapshot before any assertion machinery runs.
  ASSERT_EQ(-1, rc);
  ASSERT_EQ(EBADF, saved_errno);
}
290
RoundTrip(const char * dst_enc,const char * expected_bytes,size_t n)291 static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) {
292 // Examples from https://en.wikipedia.org/wiki/UTF-16.
293 const char* utf8 = "$€"; // U+0024, U+20AC, U+10437.
294
295 iconv_t c = iconv_open(dst_enc, "UTF-8");
296 ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
297
298 char* in = const_cast<char*>(utf8);
299 size_t in_bytes = strlen(utf8);
300 char buf[BUFSIZ] = {};
301 char* out = buf;
302 size_t out_bytes = sizeof(buf);
303 size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes);
304
305 // Check we got the bytes we were expecting.
306 for (size_t i = 0; i < n; ++i) {
307 EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc;
308 }
309
310 ASSERT_EQ(0, iconv_close(c));
311
312 // We can't round-trip if there were replacements.
313 if (strstr(dst_enc, "ascii")) {
314 GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n";
315 return;
316 }
317 ASSERT_EQ(0U, replacement_count);
318
319 c = iconv_open("UTF-8", dst_enc);
320 ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
321
322 in = buf;
323 in_bytes = n;
324 char buf2[BUFSIZ] = {};
325 out = buf2;
326 out_bytes = sizeof(buf2);
327 iconv(c, &in, &in_bytes, &out, &out_bytes);
328
329 ASSERT_STREQ(utf8, buf2) << dst_enc;
330
331 ASSERT_EQ(0, iconv_close(c));
332 }
333
// ASCII with transliteration: only '$' survives, the other two characters are
// expected to come out as '?'.
TEST(iconv, iconv_round_trip_ascii) {
  static const char kExpected[] = "$??";
  RoundTrip("ascii//TRANSLIT", kExpected, sizeof(kExpected) - 1);
}
337
// UTF-8: one-, three- and four-byte sequences, 8 bytes in total.
TEST(iconv, iconv_round_trip_utf8) {
  static const char kExpected[] = "\x24" "\xe2\x82\xac" "\xf0\x90\x90\xb7";
  RoundTrip("utf8", kExpected, sizeof(kExpected) - 1);
}
341
// UTF-16BE: two single code units followed by a surrogate pair, 8 bytes.
TEST(iconv, iconv_round_trip_utf16be) {
  // sizeof() is used for the length because the data starts with a NUL byte.
  static const char kExpected[] = "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37";
  RoundTrip("utf16be", kExpected, sizeof(kExpected) - 1);
}
345
// UTF-16LE: two single code units followed by a surrogate pair, 8 bytes.
TEST(iconv, iconv_round_trip_utf16le) {
  // sizeof() is used for the length because the data contains NUL bytes.
  static const char kExpected[] = "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc";
  RoundTrip("utf16le", kExpected, sizeof(kExpected) - 1);
}
349
// UTF-32BE: three 4-byte code units, 12 bytes.
TEST(iconv, iconv_round_trip_utf32be) {
  // sizeof() is used for the length because the data contains NUL bytes.
  static const char kExpected[] =
      "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37";
  RoundTrip("utf32be", kExpected, sizeof(kExpected) - 1);
}
353
// UTF-32LE: three 4-byte code units, 12 bytes.
TEST(iconv, iconv_round_trip_utf32le) {
  // sizeof() is used for the length because the data contains NUL bytes.
  static const char kExpected[] =
      "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("utf32le", kExpected, sizeof(kExpected) - 1);
}
357
// "wchar_t": the expected bytes are the same 12 bytes as in the utf32le case.
TEST(iconv, iconv_round_trip_wchar_t) {
  // sizeof() is used for the length because the data contains NUL bytes.
  static const char kExpected[] =
      "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("wchar_t", kExpected, sizeof(kExpected) - 1);
}
361
Check(int expected_errno,const char * src_enc,const char * src,size_t n)362 static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) {
363 iconv_t c = iconv_open("wchar_t", src_enc);
364 char* in = const_cast<char*>(src);
365 size_t in_bytes = n;
366 wchar_t out_buf[16];
367 size_t out_bytes = sizeof(out_buf);
368 char* out = reinterpret_cast<char*>(out_buf);
369 errno = 0;
370 ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
371 EXPECT_EQ(expected_errno, errno);
372 EXPECT_EQ(0, iconv_close(c));
373 }
374
// A byte above 0x7f is not ASCII.
TEST(iconv, iconv_EILSEQ_ascii) {
  static const char kInput[] = "\xac";
  Check(EILSEQ, "ASCII", kInput, sizeof(kInput) - 1);
}
378
// 0x82 is not valid as the first byte of a UTF-8 sequence.
TEST(iconv, iconv_EILSEQ_utf8_initial) {
  static const char kInput[] = "\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
382
// The second byte of the three-byte input is invalid.
TEST(iconv, iconv_EILSEQ_utf8_non_initial) {
  static const char kInput[] = "\xe2\xe2\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
386
// UTF-16BE surrogate pair in the wrong order: 0xdc37 precedes 0xd801.
TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) {
  static const char kInput[] = "\xdc\x37" "\xd8\x01";
  Check(EILSEQ, "utf16be", kInput, sizeof(kInput) - 1);
}
390
// UTF-16LE surrogate pair in the wrong order: 0xdc37 precedes 0xd801.
TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) {
  static const char kInput[] = "\x37\xdc" "\x01\xd8";
  Check(EILSEQ, "utf16le", kInput, sizeof(kInput) - 1);
}
394
// The final byte of a three-byte UTF-8 sequence is missing.
TEST(iconv, iconv_EINVAL_utf8_short) {
  static const char kInput[] = "\xe2\x82";
  Check(EINVAL, "utf8", kInput, sizeof(kInput) - 1);
}
398
// Only the first byte of a UTF-16BE code unit is present.
TEST(iconv, iconv_EINVAL_utf16be_short) {
  // The single input byte is a NUL, hence sizeof() for the length.
  static const char kInput[] = "\x00";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
402
// UTF-16BE input ends right after the high surrogate 0xd801.
TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) {
  static const char kInput[] = "\xd8\x01";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
406
// UTF-16BE input has the high surrogate but only one byte of the low one.
TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) {
  static const char kInput[] = "\xd8\x01\xdc";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
410
// Only the first byte of a UTF-16LE code unit is present.
TEST(iconv, iconv_EINVAL_utf16le_short) {
  static const char kInput[] = "\x24";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
414
// UTF-16LE input ends right after the high surrogate 0xd801.
TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) {
  static const char kInput[] = "\x01\xd8";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
418
// UTF-16LE input has the high surrogate but only one byte of the low one.
TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) {
  static const char kInput[] = "\x01\xd8\x37";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
422
// The final byte of a UTF-32BE code unit is missing.
TEST(iconv, iconv_EINVAL_utf32be_short) {
  // All three input bytes are NUL, hence sizeof() for the length.
  static const char kInput[] = "\x00\x00\x00";
  Check(EINVAL, "utf32be", kInput, sizeof(kInput) - 1);
}
426
// The final byte of a UTF-32LE code unit is missing.
TEST(iconv, iconv_EINVAL_utf32le_short) {
  // The input contains NUL bytes, hence sizeof() for the length.
  static const char kInput[] = "\x24\x00\x00";
  Check(EINVAL, "utf32le", kInput, sizeof(kInput) - 1);
}
430
// Exercises the three ways of asking iconv() to reset a descriptor to its
// initial shift state. Each call must return 0, leave errno at 0, and write
// nothing to the output buffer.
TEST(iconv, iconv_initial_shift_state) {
  // POSIX: "For state-dependent encodings, the conversion descriptor
  // cd is placed into its initial shift state by a call for which inbuf
  // is a null pointer, or for which inbuf points to a null pointer."
  iconv_t c = iconv_open("utf8", "utf8");
  // Stop here if the descriptor is unusable; otherwise every check below
  // would fail for a reason unrelated to what is being tested.
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = nullptr;
  size_t in_bytes = 0;
  wchar_t out_buf[16];
  size_t out_bytes = sizeof(out_buf);
  char* out = reinterpret_cast<char*>(out_buf);

  // The iconv() checks are non-fatal so that the descriptor is always closed
  // at the end of the test.

  // Points to a null pointer...
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer...
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer and so is in_bytes. This isn't specified by POSIX, but
  // glibc and macOS both allow that, where Android historically didn't.
  // https://issuetracker.google.com/180598400
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(0), iconv(c, nullptr, nullptr, &out, &out_bytes));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  EXPECT_EQ(0, iconv_close(c));
}
464