1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
13 * distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <iconv.h>
30
31 #include <ctype.h>
32 #include <endian.h>
33 #include <errno.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <uchar.h>
37
38 #include "private/bionic_mbstate.h"
39
// Sentinel descriptor value: iconv_open() returns it on failure, and iconv()/iconv_close()
// reject it with EBADF.
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
41
// Ideally we'd use icu4c, but the API mismatch seems too great. Instead we offer something
// equivalent to <uchar.h>, but slightly easier to use for runs of text. If you're here to add
// more encodings, consider working on finishing the icu4c NDK wrappers instead.
//
// The encodings that __parse_encoding() can select as a conversion source or destination.
enum Encoding {
  US_ASCII,   // One byte per character; values above 0x7f are rejected.
  UTF_8,      // Decoded with mbrtoc32() and encoded with c32rtomb().
  UTF_16_LE,  // 16-bit units, least significant byte first; surrogate pairs above the BMP.
  UTF_16_BE,  // 16-bit units, most significant byte first; surrogate pairs above the BMP.
  UTF_32_LE,  // One 32-bit unit per character, least significant byte first.
  UTF_32_BE,  // One 32-bit unit per character, most significant byte first.
  WCHAR_T,    // Handled exactly like UTF_32_LE by the converter.
};
54
// What the converter does with a character that is invalid in the source encoding or cannot be
// represented in the destination encoding.
enum Mode {
  ERROR,     // Default: stop and fail with EILSEQ.
  IGNORE,    // "//IGNORE" suffix: drop the character and keep going.
  TRANSLIT,  // "//TRANSLIT" suffix: substitute '?' and count the replacement.
};
60
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
//
// Returns true if the user-supplied encoding name `lhs` matches the canonical name `rhs`.
// `rhs` must already be normalized: lowercase alphanumerics only, no leading zeros.
// In `lhs`:
//   * every non-alphanumeric character is ignored, so "UTF-8", "UTF_8", "UTF8" and "UTF 8"
//     are all equivalent;
//   * case is ignored;
//   * a '0' is dropped only when it is a leading zero of a number, i.e. it is not preceded by
//     a digit and is followed by one. So "UTF-08" matches "utf8", but "UTF-106LE" does not
//     match "utf16le";
//   * a GNU-style "//..." suffix (e.g. "//IGNORE") counts as end of string.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // Whether the most recently matched character of lhs was a digit.
  bool after_digit = false;

  while (*lhs && *rhs) {
    // Advance lhs to the next significant character (or to the terminator).
    for (; *lhs; ++lhs) {
      // <ctype.h> functions require a value representable as unsigned char.
      unsigned char c = (unsigned char)*lhs;
      if (!isalnum(c)) continue;
      bool leading_zero = (c == '0') && !after_digit && isdigit((unsigned char)lhs[1]);
      if (!leading_zero) break;
    }

    // Case doesn't matter either. If lhs ran out, this compares '\0' against a non-NUL rhs
    // character and so always breaks.
    if (tolower((unsigned char)*lhs) != tolower((unsigned char)*rhs)) break;

    after_digit = isdigit((unsigned char)*lhs) != 0;
    ++lhs;
    ++rhs;
  }

  // As a special case we treat the GNU "//" extensions as end of string.
  bool lhs_done = (*lhs == '\0') || (strncmp(lhs, "//", 2) == 0);
  return lhs_done && *rhs == '\0';
}
79
__parse_encoding(const char * s,Encoding * encoding,Mode * mode)80 static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
81 const char* suffix = strstr(s, "//");
82 if (suffix) {
83 if (!mode) return false;
84 if (strcmp(suffix, "//IGNORE") == 0) {
85 *mode = IGNORE;
86 } else if (strcmp(suffix, "//TRANSLIT") == 0) {
87 *mode = TRANSLIT;
88 } else {
89 return false;
90 }
91 }
92 if (__match_encoding(s, "utf8")) {
93 *encoding = UTF_8;
94 } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
95 *encoding = US_ASCII;
96 } else if (__match_encoding(s, "utf16le")) {
97 *encoding = UTF_16_LE;
98 } else if (__match_encoding(s, "utf16be")) {
99 *encoding = UTF_16_BE;
100 } else if (__match_encoding(s, "utf32le")) {
101 *encoding = UTF_32_LE;
102 } else if (__match_encoding(s, "utf32be")) {
103 *encoding = UTF_32_BE;
104 } else if (__match_encoding(s, "wchart")) {
105 *encoding = WCHAR_T;
106 } else {
107 return false;
108 }
109 return true;
110 }
111
112 struct __iconv_t {
113 Encoding src_encoding;
114 Encoding dst_encoding;
115 Mode mode;
116
__iconv_t__iconv_t117 __iconv_t() : mode(ERROR) {
118 }
119
Convert__iconv_t120 int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
121 // Reset state.
122 wc = 0;
123 memset(&ps, 0, sizeof(ps));
124 replacement_count = 0;
125 ignored = false;
126 src_buf = src_buf0;
127 src_bytes_left = src_bytes_left0;
128 dst_buf = dst_buf0;
129 dst_bytes_left = dst_bytes_left0;
130
131 while (*src_bytes_left > 0) {
132 if (!GetNext() || !Convert()) return -1;
133 }
134 return Done();
135 }
136
137 private:
138 char32_t wc;
139 char buf[16];
140 size_t src_bytes_used;
141 size_t dst_bytes_used;
142 mbstate_t ps;
143
144 size_t replacement_count;
145 bool ignored;
146
147 char** src_buf;
148 size_t* src_bytes_left;
149 char** dst_buf;
150 size_t* dst_bytes_left;
151
GetNext__iconv_t152 bool GetNext() {
153 errno = 0;
154 switch (src_encoding) {
155 case US_ASCII:
156 wc = **src_buf;
157 src_bytes_used = 1;
158 if (wc > 0x7f) errno = EILSEQ;
159 break;
160
161 case UTF_8:
162 src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
163 if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
164 break; // EILSEQ already set.
165 } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
166 errno = EINVAL;
167 return false;
168 }
169 break;
170
171 case UTF_16_BE:
172 case UTF_16_LE: {
173 if (*src_bytes_left < 2) {
174 errno = EINVAL;
175 return false;
176 }
177 bool swap = (src_encoding == UTF_16_BE);
178 wc = In16(*src_buf, swap);
179 // 0xd800-0xdbff: high surrogates
180 // 0xdc00-0xdfff: low surrogates
181 if (wc >= 0xd800 && wc <= 0xdfff) {
182 if (wc >= 0xdc00) { // Low surrogate before high surrogate.
183 errno = EILSEQ;
184 return false;
185 }
186 if (*src_bytes_left < 4) {
187 errno = EINVAL;
188 return false;
189 }
190 uint16_t hi = wc;
191 uint16_t lo = In16(*src_buf + 2, swap);
192 wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
193 src_bytes_used = 4;
194 }
195 break;
196 }
197
198 case UTF_32_BE:
199 case UTF_32_LE:
200 case WCHAR_T:
201 if (*src_bytes_left < 4) {
202 errno = EINVAL;
203 return false;
204 }
205 wc = In32(*src_buf, (src_encoding == UTF_32_BE));
206 break;
207 }
208
209 if (errno == EILSEQ) {
210 switch (mode) {
211 case ERROR:
212 return false;
213 case IGNORE:
214 *src_buf += src_bytes_used;
215 *src_bytes_left -= src_bytes_used;
216 ignored = true;
217 return GetNext();
218 case TRANSLIT:
219 wc = '?';
220 ++replacement_count;
221 return true;
222 }
223 }
224 return true;
225 }
226
Convert__iconv_t227 bool Convert() {
228 errno = 0;
229 switch (dst_encoding) {
230 case US_ASCII:
231 buf[0] = wc;
232 dst_bytes_used = 1;
233 if (wc > 0x7f) errno = EILSEQ;
234 break;
235
236 case UTF_8:
237 dst_bytes_used = c32rtomb(buf, wc, &ps);
238 if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
239 break; // EILSEQ already set.
240 } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
241 errno = EINVAL;
242 return false;
243 }
244 break;
245
246 case UTF_16_BE:
247 case UTF_16_LE: {
248 bool swap = (dst_encoding == UTF_16_BE);
249 if (wc < 0x10000) { // BMP.
250 Out16(buf, wc, swap);
251 } else { // Supplementary plane; output surrogate pair.
252 wc -= 0x10000;
253 char16_t hi = 0xd800 | (wc >> 10);
254 char16_t lo = 0xdc00 | (wc & 0x3ff);
255 Out16(buf + 0, hi, swap);
256 Out16(buf + 2, lo, swap);
257 dst_bytes_used = 4;
258 }
259 } break;
260
261 case UTF_32_BE:
262 case UTF_32_LE:
263 case WCHAR_T:
264 Out32(wc, (dst_encoding == UTF_32_BE));
265 break;
266 }
267
268 if (errno == EILSEQ) {
269 if (mode == IGNORE) {
270 *src_buf += src_bytes_used;
271 *src_bytes_left -= src_bytes_used;
272 ignored = true;
273 return true;
274 } else if (mode == TRANSLIT) {
275 wc = '?';
276 ++replacement_count;
277 return Convert();
278 }
279 return false;
280 }
281
282 return Emit();
283 }
284
In16__iconv_t285 uint16_t In16(const char* buf, bool swap) {
286 const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
287 uint16_t wc = (src[0]) | (src[1] << 8);
288 if (swap) wc = __swap16(wc);
289 src_bytes_used = 2;
290 return wc;
291 }
292
In32__iconv_t293 uint32_t In32(const char* buf, bool swap) {
294 const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
295 uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
296 if (swap) wc = __swap32(wc);
297 src_bytes_used = 4;
298 return wc;
299 }
300
Out16__iconv_t301 void Out16(char* dst, char16_t ch, bool swap) {
302 if (swap) ch = __swap16(ch);
303 dst[0] = ch;
304 dst[1] = ch >> 8;
305 dst_bytes_used = 2;
306 }
307
Out32__iconv_t308 void Out32(char32_t ch, bool swap) {
309 if (swap) ch = __swap32(ch);
310 buf[0] = ch;
311 buf[1] = ch >> 8;
312 buf[2] = ch >> 16;
313 buf[3] = ch >> 24;
314 dst_bytes_used = 4;
315 }
316
Emit__iconv_t317 bool Emit() {
318 if (dst_bytes_used > *dst_bytes_left) {
319 errno = E2BIG;
320 return false;
321 }
322
323 memcpy(*dst_buf, buf, dst_bytes_used);
324 *src_buf += src_bytes_used;
325 *src_bytes_left -= src_bytes_used;
326 *dst_buf += dst_bytes_used;
327 *dst_bytes_left -= dst_bytes_used;
328 return true;
329 }
330
Done__iconv_t331 int Done() {
332 if (mode == TRANSLIT) return replacement_count;
333 if (ignored) {
334 errno = EILSEQ;
335 return -1;
336 }
337 return 0;
338 }
339 };
340
iconv_open(const char * __dst_encoding,const char * __src_encoding)341 iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
342 iconv_t result = new __iconv_t;
343 if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) ||
344 !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
345 delete result;
346 errno = EINVAL;
347 return INVALID_ICONV_T;
348 }
349 return result;
350 }
351
iconv(iconv_t __converter,char ** __src_buf,size_t * __src_bytes_left,char ** __dst_buf,size_t * __dst_bytes_left)352 size_t iconv(iconv_t __converter,
353 char** __src_buf, size_t* __src_bytes_left,
354 char** __dst_buf, size_t* __dst_bytes_left) {
355 if (__converter == INVALID_ICONV_T) {
356 errno = EBADF;
357 return -1;
358 }
359 return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
360 }
361
iconv_close(iconv_t __converter)362 int iconv_close(iconv_t __converter) {
363 if (__converter == INVALID_ICONV_T) {
364 errno = EBADF;
365 return -1;
366 }
367 delete __converter;
368 return 0;
369 }
370