1 #include <iconv.h>
2 #include <errno.h>
3 #include <wchar.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <limits.h>
7 #include <stdint.h>
8 #include <pthread.h>
9 #include "locale_impl.h"
10 #ifndef __LITEOS__
11 #ifdef FEATURE_ICU_LOCALE
12 #include <info/device_api_version.h>
13 #endif
14 #endif
15
/* Charmap type codes. Each value is the single byte that follows the
 * empty-string-terminated alias list of the matching entry in charmaps[],
 * and it is the value iconv() switches on. For the fixed-endian
 * UTF-16/UCS-2/UTF-32 codes the low bits double as the byte-order selector
 * handed to get_16()/put_16() (bit 0) and get_32()/put_32() (bits 0-1). */
#define UTF_32BE 0300
#define UTF_16LE 0301
#define UTF_16BE 0302
#define UTF_32LE 0303
#define UCS2BE 0304
#define UCS2LE 0305
#define WCHAR_T 0306
#define US_ASCII 0307
#define UTF_8 0310
/* Byte-order-detecting variants: iconv() resolves them to one of the
 * fixed-endian codes above and remembers the result in stateful_cd.state. */
#define UTF_16 0312
#define UTF_32 0313
#define UCS2 0314
#define EUC_JP 0320
#define SHIFT_JIS 0321
#define ISO2022_JP 0322
/* iconv_open() rejects every code >= 0330 as a destination on the
 * non-ICU path; these are decode-only there. */
#define GB18030 0330
#define GBK 0331
#define GB2312 0332
#define BIG5 0340
#define EUC_KR 0350
#ifndef __LITEOS__
#ifdef FEATURE_ICU_LOCALE
/* Status codes used on the ICU path; set_errno() maps them to errno values.
 * NOTE(review): the positive values look like they mirror ICU's UErrorCode
 * numbering -- confirm against the ICU headers. */
#define ICU_ZERO_ERROR 0
#define ICU_SYMBOL_LOAD_ERROR (-1) /* returned by set_iconv_icu_enable() when icuuc_handle_init() fails */
#define ICU_IVALID_CHAR_ERROR 10
#define ICU_TRUNCATED_CHAR_ERROR 11
#define ICU_ILLEGAL_CHAR_ERROR 12
#define ICU_BUFFER_OVERFLOW_ERROR 15
/* Callback "reason" values up to and including this threshold have their
 * error cleared by the *_callback_ignore handlers. */
#define ICU_SKIP_THRESHOLD 2
#define DEVICE_VERSION_THRESHOLD 20 /* not referenced in the visible code */
/* Bit positions inside stateful_cd.sign. */
#define TYPE_FLAG_POS 1 /* set by iconv_open() on ICU-backed descriptors */
#define TO_IGNORE_FLAG_POS 2 /* "//IGNORE" on the destination name */
#define FROM_IGNORE_FLAG_POS 3 /* "//IGNORE" on the source name */
#define TO_TRANSLIT_FLAG_POS 4 /* "//TRANSLIT" on the destination name */
#define FROM_TRANSLIT_FLAG_POS 5 /* "//TRANSLIT" on the source name */
#define ICU_CHUNK_SIZE 1024 /* element count of the pivot buffer in iconv_icu() */
#endif
#endif
/* Definitions of charmaps. Each charmap consists of:
 * 1. Empty-string-terminated list of null-terminated aliases.
 * 2. Special type code or number of elided quads of entries.
 * 3. Character table (size determined by field 2), consisting
 * of 5 bytes for every 4 characters, interpreted as 10-bit
 * indices into the legacy_chars table.
 *
 * Aliases are stored lower-case with punctuation removed, because
 * fuzzycmp() folds case and skips punctuation in the caller's name only.
 * The entries spelled out here carry a type code (field 2 > 0200) and
 * therefore have no character table; find_charmap() skips
 * (64 - field 2) * 5 table bytes for any entry whose field 2 is <= 0200.
 * The first alias ("utf8") is what an empty name resolves to. */

static const unsigned char charmaps[] =
"utf8\0char\0\0\310"
"wchart\0\0\306"
"ucs2be\0\0\304"
"ucs2le\0\0\305"
"utf16be\0\0\302"
"utf16le\0\0\301"
"ucs4be\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\307"
"utf16\0\0\312"
"ucs4\0utf32\0\0\313"
"ucs2\0\0\314"
"eucjp\0\0\320"
"shiftjis\0sjis\0cp932\0\0\321"
"iso2022jp\0\0\322"
"gb18030\0\0\330"
"gbk\0\0\331"
"gb2312\0\0\332"
"big5\0bigfive\0cp950\0big5hkscs\0\0\340"
"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
/* NOTE(review): codepages.h presumably appends the table-driven 8-bit
 * codepage entries in the same format -- confirm against that header. */
#include "codepages.h"
;
84
85 #ifndef __LITEOS__
86 #ifdef FEATURE_ICU_LOCALE
// Name table for the ICU path. Each entry is a list of '\0'-terminated
// aliases (lower-case, punctuation stripped, as matched by fuzzycmp()),
// then an empty string (i.e. "\0\0" in the data), then the '\0'-terminated
// converter name that is handed to ucnv_open(). The table ends at the first
// entry whose alias list is empty. find_icu_map() returns a pointer to the
// converter name of the matching entry; an empty query resolves to the
// first entry ("UTF-8").
static const unsigned char icu_name_maps[] =
"utf8\0char\0\0UTF-8\0"
"utf7\0\0UTF-7\0"
"ucs2\0utf16\0ucs2be\0utf16be\0\0UTF-16BE\0"
"ucs2le\0utf16le\0\0UTF-16LE\0"
"ucs4\0utf32\0ucs4be\0utf32be\0\0UTF-32BE\0"
"wchart\0ucs4le\0utf32le\0\0UTF-32LE\0"
"ascii\0usascii\0""20127\0iso646\0iso646us\0\0US-ASCII\0"
"eucjp\0eucjp2007\0\0euc-jp-2007\0"
"shiftjis\0sjis\0cp932\0ibm943p15a2003\0\0ibm-943_P15A-2003\0"
"gb18030\0\0gb18030\0"
"gbk\0""54936\0windows9362000\0\0windows-936-2000\0"
"gb2312\0""52936\0ibm1383p1101999\0\0ibm-1383_P110-1999\0"
"big5\0""950\0bigfive\0cp950\0windows9502000\0\0windows-950-2000\0"
"big5hk\0big5hkscs\0""951\0ibm1375p1002008\0\0ibm-1375_P100-2008\0"
"euckr\0ibm970p110p1102006u2\0\0ibm-970_P110_P110-2006_U2\0"
"ksc5601\0ksx1001\0cp949\0windows9492000\0\0windows-949-2000\0"
"iso88591\0latin1\0\0ISO-8859-1\0"
"iso88592\0ibm912p1001995\0\0ibm-912_P100-1995\0"
"iso88593\0ibm913p1002000\0\0ibm-913_P100-2000\0"
"iso88594\0ibm914p1001995\0\0ibm-914_P100-1995\0"
"iso88595\0ibm915p1001995\0\0ibm-915_P100-1995\0"
"iso88596\0ibm1089p1001995\0\0ibm-1089_P100-1995\0"
"iso88597\0ibm9005x1102007\0\0ibm-9005_X110-2007\0"
"iso88598\0ibm5012p1001999\0\0ibm-5012_P100-1999\0"
"iso88599\0ibm920p1001995\0\0ibm-920_P100-1995\0"
"iso885910\0iso8859101998\0\0iso-8859_10-1998\0"
"iso885911\0iso8859112001\0\0iso-8859_11-2001\0"
"tis620\0windows8742000\0\0windows-874-2000\0"
"iso885913\0ibm921p1001995\0\0ibm-921_P100-1995\0"
"iso885914\0iso8859141998\0\0iso-8859_14-1998\0"
"iso885915\0latin9\0ibm923p1001998\0\0ibm-923_P100-1998\0"
"cp1250\0windows1250\0ibm5346p1001998\0\0ibm-5346_P100-1998\0"
"cp1251\0windows1251\0ibm5347p1001998\0\0ibm-5347_P100-1998\0"
"cp1252\0windows1252\0ibm5348p1001997\0\0ibm-5348_P100-1997\0"
"cp1253\0windows1253\0ibm5349p1001998\0\0ibm-5349_P100-1998\0"
"cp1254\0windows1254\0ibm5350p1001998\0\0ibm-5350_P100-1998\0"
"cp1255\0windows1255\0ibm9447p1002002\0\0ibm-9447_P100-2002\0"
"cp1256\0windows1256\0ibm9448x1002005\0\0ibm-9448_X100-2005\0"
"cp1257\0windows1257\0ibm9449p1002002\0\0ibm-9449_P100-2002\0"
"cp1258\0windows1258\0ibm5354p1001998\0\0ibm-5354_P100-1998\0"
"koi8r\0ibm878p1001996\0\0ibm-878_P100-1996\0"
"koi8u\0ibm1168p1002002\0\0ibm-1168_P100-2002\0"
"cp437\0ibm437p1001995\0\0ibm-437_P100-1995\0"
"cp850\0ibm850p1001995\0\0ibm-850_P100-1995\0"
"cp866\0ibm866p1001995\0\0ibm-866_P100-1995\0"
"ibm1047\0cp1047\0ibm1047p1001995\0\0ibm-1047_P100-1995\0"
;
136 #endif
137 #endif
138
/* Table of characters that appear in legacy 8-bit codepages,
 * limited to 1024 slots (10 bit indices). The first 256 entries
 * are elided since those characters are obviously all included.
 * legacy_map() therefore indexes it with (10-bit index - 256). */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};

/* JIS X 0208 decode table, indexed [row][column] by the Shift_JIS, EUC-JP
 * and ISO-2022-JP decoders in iconv() and by uni_to_jis(). A zero cell is
 * treated as "no mapping". Only 84 rows are present. */
static const unsigned short jis0208[84][94] = {
#include "jis0208.h"
};

/* Two-byte decode table shared by the GB2312/GBK/GB18030 decoder; indexed
 * [lead - 0x81][adjusted trail]. The four-byte GB18030 decoder also scans
 * the whole table to skip code points that are already assigned here. */
static const unsigned short gb18030[126][190] = {
#include "gb18030.h"
};

/* Big5 decode table for lead bytes 0xa1..0xf9, indexed [lead - 0xa1][trail index]. */
static const unsigned short big5[89][157] = {
#include "big5.h"
};

/* HKSCS extension data used by the BIG5 decoder for the remaining lead
 * bytes: a flat array of low 16 bits per character, plus a bit array
 * starting at element 4867 that supplies bit 17 of the code point. */
static const unsigned short hkscs[] = {
#include "hkscs.h"
};

/* KS X 1001 decode table for EUC-KR, indexed [lead - 0xa1][trail - 0xa1].
 * The extended-Hangul path in iconv() also scans it in full. */
static const unsigned short ksc[93][94] = {
#include "ksc.h"
};

/* Reverse index for JIS X 0208: each element encodes a jis0208 cell as
 * row*256 + column. uni_to_jis() binary-searches it, so the entries are
 * presumably ordered by the Unicode value of the cell they reference --
 * confirm against the generator of revjis.h. */
static const unsigned short rev_jis[] = {
#include "revjis.h"
};
169
/* Loose charset-name comparison.
 *
 * `a` is the caller-supplied name, `b` a canonical table alias (lower-case,
 * no punctuation). Characters of `a` that are neither letters nor digits
 * (by the range tests below) are skipped, and letters are folded to lower
 * case via `| 32` before being compared with `b`.
 *
 * Returns 0 when the names match and nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	while (*a && *b) {
		/* Step over punctuation in the user-supplied name. */
		for (;;) {
			unsigned ch = *a;
			if (!ch) break;
			if ((ch|32U)-'a' <= 26) break;
			if (ch-'0' <= 10U) break;
			a++;
		}
		if ((*a|32U) != *b)
			return 1;
		a++;
		b++;
	}
	/* Equal only if both strings ended at the same time. */
	return *a != *b;
}
178
find_charmap(const void * name)179 static size_t find_charmap(const void *name)
180 {
181 const unsigned char *s;
182 if (!*(char *)name) name=charmaps; /* "utf8" */
183 for (s=charmaps; *s; ) {
184 if (!fuzzycmp(name, s)) {
185 for (; *s; s+=strlen((void *)s)+1);
186 return s+1-charmaps;
187 }
188 s += strlen((void *)s)+1;
189 if (!*s) {
190 if (s[1] > 0200) s+=2;
191 else s+=2+(64U-s[1])*5;
192 }
193 }
194 return -1;
195 }
196
197 #ifndef __LITEOS__
198 #ifdef FEATURE_ICU_LOCALE
find_icu_map(const void * query_name)199 static const unsigned char* find_icu_map(const void *query_name)
200 {
201 if (!*(char *)query_name) {
202 query_name = icu_name_maps;
203 }
204
205 const unsigned char *icu_name = icu_name_maps;
206 while (*icu_name) {
207 if (!fuzzycmp(query_name, icu_name)) {
208 while (*icu_name) {
209 icu_name += strlen((void *)icu_name) + 1; //find nearly \0\0
210 }
211 return icu_name + 1;
212 }
213 icu_name += strlen((void *)icu_name) + 1; // skip \0
214 if (!*icu_name) { // skip \0\0
215 icu_name++;
216 while (*icu_name) {icu_name++;}
217 icu_name++;
218 }
219 }
220 return NULL;
221 }
222 #endif
223 #endif
224
/* Heap-allocated conversion descriptor. iconv_open() returns a pointer to
 * one of these (bit 0 of the pointer value clear) for ICU-backed
 * conversions and for source encodings that need per-descriptor state;
 * all other descriptors are packed integers built by combine_to_from()
 * (bit 0 set). iconv() tells the two apart by testing bit 0. */
struct stateful_cd {
#ifndef __LITEOS__
#ifdef FEATURE_ICU_LOCALE
/* Bit flags at the *_FLAG_POS positions (see set_*_flag/get_*_flag). */
unsigned sign;
/* ICU converter names for destination and source, as returned by
 * find_icu_map(); they point into icu_name_maps[] and are not owned. */
const unsigned char* to;
const unsigned char* from;
#endif
#endif
/* Packed (to, from) descriptor used by the built-in converter. */
iconv_t base_cd;
/* Built-in converter state: the resolved fixed-endian type code for
 * UTF-16/UTF-32/UCS2 input, or the ISO-2022-JP shift state (0..4). */
unsigned state;
};
236
/* Pack the charmaps[] offsets of the destination (t) and source (f) into a
 * stateless descriptor: bit 0 is a tag, t occupies the bits from 1 up and
 * f the bits from 16 up. */
static iconv_t combine_to_from(size_t t, size_t f)
{
	size_t packed = 1;

	packed |= t << 1;
	packed |= f << 16;
	return (void *)packed;
}
241
/* Recover the source charmap offset (everything above bit 15) from a
 * packed descriptor. */
static size_t extract_from(iconv_t cd)
{
	size_t packed = (size_t)cd;

	return packed >> 16;
}
246
/* Recover the destination charmap offset (bits 1..15) from a packed
 * descriptor. */
static size_t extract_to(iconv_t cd)
{
	size_t packed = (size_t)cd;

	packed >>= 1;
	return packed & 0x7fff;
}
251
252 #ifndef __LITEOS__
253 #ifdef FEATURE_ICU_LOCALE
set_type_flag(unsigned * value)254 static void set_type_flag(unsigned* value) {*value = (1 << TYPE_FLAG_POS) | *value;}
set_to_ignore_flag(unsigned * value)255 static void set_to_ignore_flag(unsigned* value) {*value = (1 << TO_IGNORE_FLAG_POS) | *value;}
set_from_ignore_flag(unsigned * value)256 static void set_from_ignore_flag(unsigned* value) {*value = (1 << FROM_IGNORE_FLAG_POS) | *value;}
set_to_translit_flag(unsigned * value)257 static void set_to_translit_flag(unsigned* value) {*value = (1 << TO_TRANSLIT_FLAG_POS) | *value;}
set_from_translit_flag(unsigned * value)258 static void set_from_translit_flag(unsigned* value) {*value = (1 << FROM_TRANSLIT_FLAG_POS) | *value;}
get_type_flag(unsigned value)259 static bool get_type_flag(unsigned value) {return (value >> TYPE_FLAG_POS) & 1;}
get_to_ignore_flag(unsigned value)260 static bool get_to_ignore_flag(unsigned value) {return (value >> TO_IGNORE_FLAG_POS) & 1;}
get_from_ignore_flag(unsigned value)261 static bool get_from_ignore_flag(unsigned value) {return (value >> FROM_IGNORE_FLAG_POS) & 1;}
get_to_translit_flag(unsigned value)262 static bool get_to_translit_flag(unsigned value) {return (value >> TO_TRANSLIT_FLAG_POS) & 1;}
get_from_translit_flag(unsigned value)263 static bool get_from_translit_flag(unsigned value) {return (value >> FROM_TRANSLIT_FLAG_POS) & 1;}
264
/* Split an iconv_open() charset spec into its name and "//" suffix and
 * resolve the name through find_icu_map().
 *
 * ins:     charset spec, e.g. "UTF-8", "GBK//IGNORE" or
 *          "ASCII//TRANSLIT//IGNORE".
 * sign:    flag word; the ignore or translit bit for the chosen direction
 *          is ORed in. As before, "//IGNORE" takes precedence when both
 *          suffixes are present.
 * res:     receives the ICU converter name, or NULL when the name is
 *          unknown.
 * is_from: true when `ins` is the source charset, false for the destination.
 *
 * Returns false only when the temporary copy of the name cannot be
 * allocated; an unknown name is reported through *res == NULL.
 *
 * The name is cut at whichever suffix comes first, so a spec carrying both
 * suffixes resolves regardless of their order. A spec without a suffix is
 * looked up directly and needs no allocation. */
static bool deal_with_tail(const char* ins, unsigned* sign, const unsigned char** res, bool is_from)
{
    const char* ignore_pos = strstr(ins, "//IGNORE");
    const char* translit_pos = strstr(ins, "//TRANSLIT");

    if (!ignore_pos && !translit_pos) {
        *res = find_icu_map(ins);
        return true;
    }

    // The name ends at the earliest suffix.
    const char* cut = ignore_pos;
    if (!cut || (translit_pos && translit_pos < cut)) {
        cut = translit_pos;
    }

    size_t name_len = (size_t)(cut - ins);
    char* name = malloc(name_len + 1);
    if (!name) {return false;}
    memcpy(name, ins, name_len);
    name[name_len] = '\0';

    if (ignore_pos) {
        if (is_from) {
            set_from_ignore_flag(sign);
        } else {
            set_to_ignore_flag(sign);
        }
    } else {
        if (is_from) {
            set_from_translit_flag(sign);
        } else {
            set_to_translit_flag(sign);
        }
    }

    *res = find_icu_map(name);
    free(name);
    return true;
}
293
294 bool icu_locale_enable = false;
295
296 pthread_mutex_t icu_init_mutex = PTHREAD_MUTEX_INITIALIZER;
297
298 /**
299 * @Description: The set_icu_enable function is used to set the internal implementation of iconv to the implementation
300 * of the ICU library.The iconv internal implementation may have been set to the ICU library implementation before the
301 * function was executed. In this case, the function also returns success.
302 * @return:If the function call is successful, the returned value will be zero; otherwise, the returned value will be a
303 * non-zero error code.
304 */
305
set_iconv_icu_enable()306 int set_iconv_icu_enable()
307 {
308 pthread_mutex_lock(&icu_init_mutex);
309 if (!icuuc_handle_init()) {
310 pthread_mutex_unlock(&icu_init_mutex);
311 return ICU_SYMBOL_LOAD_ERROR;
312 }
313
314 icu_locale_enable = true;
315 pthread_mutex_unlock(&icu_init_mutex);
316 return ICU_ZERO_ERROR;
317 }
318
319 #endif
320 #endif
321
iconv_open(const char * to,const char * from)322 iconv_t iconv_open(const char *to, const char *from)
323 {
324 struct stateful_cd *scd;
325
326 #ifndef __LITEOS__
327 #ifdef FEATURE_ICU_LOCALE
328 bool is_basic_open = false;
329
330 for (const char* s = "iso885916\0iso2022jp\0\0"; *s;) { // icu not support
331 if (!fuzzycmp((void*)to, (void*)s) || !fuzzycmp((void*)from, (void*)s)) {
332 is_basic_open = true;
333 }
334 s += strlen(s) + 1;
335 }
336
337 // icu open
338 if (!is_basic_open && icu_locale_enable) {
339 scd = malloc(sizeof *scd);
340 if (!scd) {return (iconv_t)-1;}
341 scd->sign = 0;
342 scd->state = 0;
343
344 if (!deal_with_tail(to, &scd->sign, &scd->to, false)) {return (iconv_t)-1;}
345 if (!deal_with_tail(from, &scd->sign, &scd->from, true)) {return (iconv_t)-1;}
346
347 if (!scd->to || !scd->from) {
348 errno = EINVAL;
349 free(scd);
350 return (iconv_t)-1;
351 }
352
353 set_type_flag(&scd->sign);
354 return (iconv_t)scd;
355 }
356 #endif
357 #endif
358
359 // basic open
360 size_t f, t;
361 if ((t = find_charmap(to))==-1
362 || (f = find_charmap(from))==-1
363 || (charmaps[t] >= 0330)) {
364 errno = EINVAL;
365 return (iconv_t)-1;
366 }
367 iconv_t cd = combine_to_from(t, f);
368
369 switch (charmaps[f]) {
370 case UTF_16:
371 case UTF_32:
372 case UCS2:
373 case ISO2022_JP:
374 scd = malloc(sizeof *scd);
375 if (!scd) return (iconv_t)-1;
376 memset(scd, 0, sizeof(*scd));
377 scd->base_cd = cd;
378 scd->state = 0;
379 cd = (iconv_t)scd;
380 }
381
382 return cd;
383 }
384
/* Read a 16-bit value from s[0..1]. Only bit 0 of `e` is used:
 * 0 reads big-endian, 1 reads little-endian. */
static unsigned get_16(const unsigned char *s, int e)
{
	int hi = e & 1;
	int lo = 1 - hi;

	return s[hi]<<8 | s[lo];
}
390
/* Store the low 16 bits of `c` into s[0..1]. Only bit 0 of `e` is used:
 * 0 writes big-endian, 1 writes little-endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	int hi = e & 1;

	s[hi] = c>>8;
	s[hi^1] = c;
}
397
/* Read a 32-bit value from s[0..3]. The low two bits of `e` are XORed into
 * each byte index, so 0 reads big-endian and 3 reads little-endian. */
static unsigned get_32(const unsigned char *s, int e)
{
	int swizzle = e & 3;
	unsigned value = 0;

	/* Most significant byte first. */
	for (int i = 0; i < 4; i++)
		value = value<<8 | s[swizzle^i];
	return value;
}
403
/* Store the 32-bit value `c` into s[0..3]. The low two bits of `e` are
 * XORed into each byte index, so 0 writes big-endian and 3 writes
 * little-endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	int swizzle = e & 3;

	/* Most significant byte first. */
	for (int i = 0; i < 4; i++)
		s[swizzle^i] = c >> (24 - 8*i);
}
412
/* Adapt as needed */
/* UTF-8 decode/encode primitives used by iconv(). They are the ordinary
 * locale-dependent functions; iconv() installs UTF8_LOCALE as the current
 * locale for the duration of a conversion before calling them. */
#define mbrtowc_utf8 mbrtowc
#define wctomb_utf8 wctomb
416
legacy_map(const unsigned char * map,unsigned c)417 static unsigned legacy_map(const unsigned char *map, unsigned c)
418 {
419 if (c < 4*map[-1]) return c;
420 unsigned x = c - 4*map[-1];
421 x = map[x*5/4]>>2*x%8 | map[x*5/4+1]<<8-2*x%8 & 1023;
422 return x < 256 ? x : legacy_chars[x-256];
423 }
424
uni_to_jis(unsigned c)425 static unsigned uni_to_jis(unsigned c)
426 {
427 unsigned nel = sizeof rev_jis / sizeof *rev_jis;
428 unsigned d, j, i, b = 0;
429 for (;;) {
430 i = nel/2;
431 j = rev_jis[b+i];
432 d = jis0208[j/256][j%256];
433 if (d==c) return j + 0x2121;
434 else if (nel == 1) return 0;
435 else if (c < d)
436 nel /= 2;
437 else {
438 b += i;
439 nel -= nel/2;
440 }
441 }
442 }
443
444 #ifndef __LITEOS__
445 #ifdef FEATURE_ICU_LOCALE
ucnv_from_u_callback_ignore(const void * context,void * fromUArgs,const void * codeUnits,int32_t length,int32_t codePoint,int reason,int * err)446 static void ucnv_from_u_callback_ignore(
447 const void* context,
448 void* fromUArgs,
449 const void* codeUnits,
450 int32_t length,
451 int32_t codePoint,
452 int reason,
453 int* err)
454 {
455 if (reason <= ICU_SKIP_THRESHOLD) {
456 *err = ICU_ZERO_ERROR;
457 }
458 }
459
/* From-Unicode error callback that does nothing, so whatever error the
 * converter reported is left in place. */
static void ucnv_from_u_callback_stop(const void* context, ...)
{
    (void)context;
}
461
ucnv_to_u_callback_ignore(const void * context,void * toUArgs,const void * codeUnits,int32_t length,int reason,int * err)462 static void ucnv_to_u_callback_ignore(
463 const void* context,
464 void* toUArgs,
465 const void* codeUnits,
466 int32_t length,
467 int reason,
468 int* err)
469 {
470 if (reason <= ICU_SKIP_THRESHOLD) {
471 *err = ICU_ZERO_ERROR;
472 }
473 }
474
/* To-Unicode error callback that does nothing, so whatever error the
 * converter reported is left in place. */
static void ucnv_to_u_callback_stop(const void* context, ...)
{
    (void)context;
}
476
set_errno(int errCode)477 static void set_errno(int errCode)
478 {
479 if (errCode == ICU_ZERO_ERROR) {
480 errno = 0;
481 } else if (errCode == ICU_BUFFER_OVERFLOW_ERROR) {
482 errno = E2BIG;
483 } else if (errCode == ICU_IVALID_CHAR_ERROR ||
484 errCode == ICU_TRUNCATED_CHAR_ERROR ||
485 errCode == ICU_ILLEGAL_CHAR_ERROR) {
486 errno = EILSEQ;
487 } else {
488 errno = EINVAL;
489 }
490 }
491
iconv_icu(unsigned sign,const unsigned char * to,const unsigned char * from,char ** restrict in,size_t * restrict inb,char ** restrict out,size_t * restrict outb)492 static size_t iconv_icu(unsigned sign, const unsigned char* to, const unsigned char* from,
493 char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
494 {
495 int errCode = ICU_ZERO_ERROR;
496
497 void* conv_in = g_icu_opt_func.ucnv_open((void*)from, &errCode);
498 if (get_from_ignore_flag(sign)) {
499 g_icu_opt_func.ucnv_setToUCallBack(conv_in, ucnv_to_u_callback_ignore, NULL, NULL, NULL, &errCode);
500 } else if (!get_from_translit_flag(sign)) {
501 g_icu_opt_func.ucnv_setFromUCallBack(conv_in, ucnv_to_u_callback_stop, NULL, NULL, NULL, &errCode);
502 }
503
504 void* conv_out = g_icu_opt_func.ucnv_open((void*)to, &errCode);
505 if (get_to_ignore_flag(sign)) {
506 g_icu_opt_func.ucnv_setFromUCallBack(conv_out, ucnv_from_u_callback_ignore, NULL, NULL, NULL, &errCode);
507 } else if (!get_to_translit_flag(sign)) {
508 g_icu_opt_func.ucnv_setFromUCallBack(conv_out, ucnv_from_u_callback_stop, NULL, NULL, NULL, &errCode);
509 }
510
511 u_char pivot_buffer[ICU_CHUNK_SIZE];
512 u_char *pivot, *pivot2;
513 char *mytarget;
514 const char *source_limit;
515 const char *target_limit;
516 int32_t target_length = 0;
517 source_limit = *in + *inb;
518 pivot = pivot2 = pivot_buffer;
519 mytarget = *out;
520 target_limit = *out + *outb;
521 g_icu_opt_func.ucnv_convertEx(conv_out, conv_in, &mytarget, target_limit, (const char **)in, source_limit,
522 pivot_buffer, &pivot, &pivot2, pivot_buffer + ICU_CHUNK_SIZE, false, true, &errCode);
523 target_length = (int32_t)(mytarget - *out);
524 if (errCode > ICU_ZERO_ERROR) {
525 set_errno(errCode);
526 return (size_t)-1;
527 } else {
528 errCode = ICU_ZERO_ERROR;
529 }
530 g_icu_opt_func.ucnv_close(conv_in);
531 g_icu_opt_func.ucnv_close(conv_out);
532
533 *out += target_length;
534 *outb -= target_length;
535 *in += *inb;
536 *inb -= *inb;
537 set_errno(errCode);
538
539 return (size_t)errCode;
540 }
541 #endif
542 #endif
543
iconv(iconv_t cd,char ** restrict in,size_t * restrict inb,char ** restrict out,size_t * restrict outb)544 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
545 {
546 if (!in || !*in || !*inb) {
547 return 0;
548 }
549
550 size_t x=0;
551 struct stateful_cd *scd=0;
552 if (!((size_t)cd & 1)) {
553 scd = (void *)cd;
554 cd = scd->base_cd;
555 #ifndef __LITEOS__
556 #ifdef FEATURE_ICU_LOCALE
557 if (get_type_flag(scd->sign)) {
558 return iconv_icu(scd->sign, scd->to, scd->from, in, inb, out, outb);
559 }
560 #endif
561 #endif
562 }
563 unsigned to = extract_to(cd);
564 unsigned from = extract_from(cd);
565 const unsigned char *map = charmaps+from+1;
566 const unsigned char *tomap = charmaps+to+1;
567 mbstate_t st = {0};
568 wchar_t wc;
569 unsigned c, d;
570 size_t k, l;
571 int err;
572 unsigned char type = map[-1];
573 unsigned char totype = tomap[-1];
574 locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
575
576 *ploc = UTF8_LOCALE;
577
578 for (; *inb; *in+=l, *inb-=l) {
579 c = *(unsigned char *)*in;
580 l = 1;
581
582 switch (type) {
583 case UTF_8:
584 if (c < 128) break;
585 l = mbrtowc_utf8(&wc, *in, *inb, &st);
586 if (l == (size_t)-1) goto ilseq;
587 if (l == (size_t)-2) goto starved;
588 c = wc;
589 break;
590 case US_ASCII:
591 if (c >= 128) goto ilseq;
592 break;
593 case WCHAR_T:
594 l = sizeof(wchar_t);
595 if (*inb < l) goto starved;
596 c = *(wchar_t *)*in;
597 if (0) {
598 case UTF_32BE:
599 case UTF_32LE:
600 l = 4;
601 if (*inb < 4) goto starved;
602 c = get_32((void *)*in, type);
603 }
604 if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
605 break;
606 case UCS2BE:
607 case UCS2LE:
608 case UTF_16BE:
609 case UTF_16LE:
610 l = 2;
611 if (*inb < 2) goto starved;
612 c = get_16((void *)*in, type);
613 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
614 if ((unsigned)(c-0xd800) < 0x400) {
615 if (type-UCS2BE < 2U) goto ilseq;
616 l = 4;
617 if (*inb < 4) goto starved;
618 d = get_16((void *)(*in + 2), type);
619 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
620 c = ((c-0xd7c0)<<10) + (d-0xdc00);
621 }
622 break;
623 case UCS2:
624 case UTF_16:
625 l = 0;
626 if (!scd->state) {
627 if (*inb < 2) goto starved;
628 c = get_16((void *)*in, 0);
629 scd->state = type==UCS2
630 ? c==0xfffe ? UCS2LE : UCS2BE
631 : c==0xfffe ? UTF_16LE : UTF_16BE;
632 if (c == 0xfffe || c == 0xfeff)
633 l = 2;
634 }
635 type = scd->state;
636 continue;
637 case UTF_32:
638 l = 0;
639 if (!scd->state) {
640 if (*inb < 4) goto starved;
641 c = get_32((void *)*in, 0);
642 scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
643 if (c == 0xfffe0000 || c == 0xfeff)
644 l = 4;
645 }
646 type = scd->state;
647 continue;
648 case SHIFT_JIS:
649 if (c < 128) break;
650 if (c-0xa1 <= 0xdf-0xa1) {
651 c += 0xff61-0xa1;
652 break;
653 }
654 l = 2;
655 if (*inb < 2) goto starved;
656 d = *((unsigned char *)*in + 1);
657 if (c-129 <= 159-129) c -= 129;
658 else if (c-224 <= 239-224) c -= 193;
659 else goto ilseq;
660 c *= 2;
661 if (d-64 <= 158-64) {
662 if (d==127) goto ilseq;
663 if (d>127) d--;
664 d -= 64;
665 } else if (d-159 <= 252-159) {
666 c++;
667 d -= 159;
668 }
669 c = jis0208[c][d];
670 if (!c) goto ilseq;
671 break;
672 case EUC_JP:
673 if (c < 128) break;
674 l = 2;
675 if (*inb < 2) goto starved;
676 d = *((unsigned char *)*in + 1);
677 if (c==0x8e) {
678 c = d;
679 if (c-0xa1 > 0xdf-0xa1) goto ilseq;
680 c += 0xff61 - 0xa1;
681 break;
682 }
683 c -= 0xa1;
684 d -= 0xa1;
685 if (c >= 84 || d >= 94) goto ilseq;
686 c = jis0208[c][d];
687 if (!c) goto ilseq;
688 break;
689 case ISO2022_JP:
690 if (c >= 128) goto ilseq;
691 if (c == '\033') {
692 l = 3;
693 if (*inb < 3) goto starved;
694 c = *((unsigned char *)*in + 1);
695 d = *((unsigned char *)*in + 2);
696 if (c != '(' && c != '$') goto ilseq;
697 switch (128*(c=='$') + d) {
698 case 'B': scd->state=0; continue;
699 case 'J': scd->state=1; continue;
700 case 'I': scd->state=4; continue;
701 case 128+'@': scd->state=2; continue;
702 case 128+'B': scd->state=3; continue;
703 }
704 goto ilseq;
705 }
706 switch (scd->state) {
707 case 1:
708 if (c=='\\') c = 0xa5;
709 if (c=='~') c = 0x203e;
710 break;
711 case 2:
712 case 3:
713 l = 2;
714 if (*inb < 2) goto starved;
715 d = *((unsigned char *)*in + 1);
716 c -= 0x21;
717 d -= 0x21;
718 if (c >= 84 || d >= 94) goto ilseq;
719 c = jis0208[c][d];
720 if (!c) goto ilseq;
721 break;
722 case 4:
723 if (c-0x60 < 0x1f) goto ilseq;
724 if (c-0x21 < 0x5e) c += 0xff61-0x21;
725 break;
726 }
727 break;
728 case GB2312:
729 if (c < 128) break;
730 if (c < 0xa1) goto ilseq;
731 case GBK:
732 case GB18030:
733 if (c < 128) break;
734 c -= 0x81;
735 if (c >= 126) goto ilseq;
736 l = 2;
737 if (*inb < 2) goto starved;
738 d = *((unsigned char *)*in + 1);
739 if (d < 0xa1 && type == GB2312) goto ilseq;
740 if (d-0x40>=191 || d==127) {
741 if (d-'0'>9 || type != GB18030)
742 goto ilseq;
743 l = 4;
744 if (*inb < 4) goto starved;
745 c = (10*c + d-'0') * 1260;
746 d = *((unsigned char *)*in + 2);
747 if (d-0x81>126) goto ilseq;
748 c += 10*(d-0x81);
749 d = *((unsigned char *)*in + 3);
750 if (d-'0'>9) goto ilseq;
751 c += d-'0';
752 c += 128;
753 for (d=0; d<=c; ) {
754 k = 0;
755 for (int i=0; i<126; i++)
756 for (int j=0; j<190; j++)
757 if (gb18030[i][j]-d <= c-d)
758 k++;
759 d = c+1;
760 c += k;
761 }
762 break;
763 }
764 d -= 0x40;
765 if (d>63) d--;
766 c = gb18030[c][d];
767 break;
768 case BIG5:
769 if (c < 128) break;
770 l = 2;
771 if (*inb < 2) goto starved;
772 d = *((unsigned char *)*in + 1);
773 if (d-0x40>=0xff-0x40 || d-0x7f<0xa1-0x7f) goto ilseq;
774 d -= 0x40;
775 if (d > 0x3e) d -= 0x22;
776 if (c-0xa1>=0xfa-0xa1) {
777 if (c-0x87>=0xff-0x87) goto ilseq;
778 if (c < 0xa1) c -= 0x87;
779 else c -= 0x87 + (0xfa-0xa1);
780 c = (hkscs[4867+(c*157+d)/16]>>(c*157+d)%16)%2<<17
781 | hkscs[c*157+d];
782 /* A few HKSCS characters map to pairs of UCS
783 * characters. These are mapped to surrogate
784 * range in the hkscs table then hard-coded
785 * here. Ugly, yes. */
786 if (c/256 == 0xdc) {
787 union {
788 char c[8];
789 wchar_t wc[2];
790 } tmp;
791 char *ptmp = tmp.c;
792 size_t tmpx = iconv(combine_to_from(to, find_charmap("utf8")),
793 &(char *){"\303\212\314\204"
794 "\303\212\314\214"
795 "\303\252\314\204"
796 "\303\252\314\214"
797 +c%256}, &(size_t){4},
798 &ptmp, &(size_t){sizeof tmp});
799 size_t tmplen = ptmp - tmp.c;
800 if (tmplen > *outb) goto toobig;
801 if (tmpx) x++;
802 memcpy(*out, &tmp, tmplen);
803 *out += tmplen;
804 *outb -= tmplen;
805 continue;
806 }
807 if (!c) goto ilseq;
808 break;
809 }
810 c -= 0xa1;
811 c = big5[c][d]|(c==0x27&&(d==0x3a||d==0x3c||d==0x42))<<17;
812 if (!c) goto ilseq;
813 break;
814 case EUC_KR:
815 if (c < 128) break;
816 l = 2;
817 if (*inb < 2) goto starved;
818 d = *((unsigned char *)*in + 1);
819 c -= 0xa1;
820 d -= 0xa1;
821 if (c >= 93 || d >= 94) {
822 c += (0xa1-0x81);
823 d += 0xa1;
824 if (c > 0xc6-0x81 || c==0xc6-0x81 && d>0x52)
825 goto ilseq;
826 if (d-'A'<26) d = d-'A';
827 else if (d-'a'<26) d = d-'a'+26;
828 else if (d-0x81<0xff-0x81) d = d-0x81+52;
829 else goto ilseq;
830 if (c < 0x20) c = 178*c + d;
831 else c = 178*0x20 + 84*(c-0x20) + d;
832 c += 0xac00;
833 for (d=0xac00; d<=c; ) {
834 k = 0;
835 for (int i=0; i<93; i++)
836 for (int j=0; j<94; j++)
837 if (ksc[i][j]-d <= c-d)
838 k++;
839 d = c+1;
840 c += k;
841 }
842 break;
843 }
844 c = ksc[c][d];
845 if (!c) goto ilseq;
846 break;
847 default:
848 if (!c) break;
849 c = legacy_map(map, c);
850 if (!c) goto ilseq;
851 }
852
853 switch (totype) {
854 case WCHAR_T:
855 if (*outb < sizeof(wchar_t)) goto toobig;
856 *(wchar_t *)*out = c;
857 *out += sizeof(wchar_t);
858 *outb -= sizeof(wchar_t);
859 break;
860 case UTF_8:
861 if (*outb < 4) {
862 char tmp[4];
863 k = wctomb_utf8(tmp, c);
864 if (*outb < k) goto toobig;
865 memcpy(*out, tmp, k);
866 } else k = wctomb_utf8(*out, c);
867 /* This failure condition should be unreachable, but
868 * is included to prevent decoder bugs from translating
869 * into advancement outside the output buffer range. */
870 if (k>4) goto ilseq;
871 *out += k;
872 *outb -= k;
873 break;
874 case US_ASCII:
875 if (c > 0x7f) subst: x++, c='*';
876 default:
877 if (*outb < 1) goto toobig;
878 if (c<256 && c==legacy_map(tomap, c)) {
879 revout:
880 if (*outb < 1) goto toobig;
881 *(*out)++ = c;
882 *outb -= 1;
883 break;
884 }
885 d = c;
886 for (c=4*totype; c<256; c++) {
887 if (d == legacy_map(tomap, c)) {
888 goto revout;
889 }
890 }
891 goto subst;
892 case SHIFT_JIS:
893 if (c < 128) goto revout;
894 if (c == 0xa5) {
895 x++;
896 c = '\\';
897 goto revout;
898 }
899 if (c == 0x203e) {
900 x++;
901 c = '~';
902 goto revout;
903 }
904 if (c-0xff61 <= 0xdf-0xa1) {
905 c += 0xa1 - 0xff61;
906 goto revout;
907 }
908 c = uni_to_jis(c);
909 if (!c) goto subst;
910 if (*outb < 2) goto toobig;
911 d = c%256;
912 c = c/256;
913 *(*out)++ = (c+1)/2 + (c<95 ? 112 : 176);
914 *(*out)++ = c%2 ? d + 31 + d/96 : d + 126;
915 *outb -= 2;
916 break;
917 case EUC_JP:
918 if (c < 128) goto revout;
919 if (c-0xff61 <= 0xdf-0xa1) {
920 c += 0x0e00 + 0x21 - 0xff61;
921 } else {
922 c = uni_to_jis(c);
923 }
924 if (!c) goto subst;
925 if (*outb < 2) goto toobig;
926 *(*out)++ = c/256 + 0x80;
927 *(*out)++ = c%256 + 0x80;
928 *outb -= 2;
929 break;
930 case ISO2022_JP:
931 if (c < 128) goto revout;
932 if (c-0xff61 <= 0xdf-0xa1 || c==0xa5 || c==0x203e) {
933 if (*outb < 7) goto toobig;
934 *(*out)++ = '\033';
935 *(*out)++ = '(';
936 if (c==0xa5) {
937 *(*out)++ = 'J';
938 *(*out)++ = '\\';
939 } else if (c==0x203e) {
940 *(*out)++ = 'J';
941 *(*out)++ = '~';
942 } else {
943 *(*out)++ = 'I';
944 *(*out)++ = c-0xff61+0x21;
945 }
946 *(*out)++ = '\033';
947 *(*out)++ = '(';
948 *(*out)++ = 'B';
949 *outb -= 7;
950 break;
951 }
952 c = uni_to_jis(c);
953 if (!c) goto subst;
954 if (*outb < 8) goto toobig;
955 *(*out)++ = '\033';
956 *(*out)++ = '$';
957 *(*out)++ = 'B';
958 *(*out)++ = c/256;
959 *(*out)++ = c%256;
960 *(*out)++ = '\033';
961 *(*out)++ = '(';
962 *(*out)++ = 'B';
963 *outb -= 8;
964 break;
965 case UCS2:
966 totype = UCS2BE;
967 case UCS2BE:
968 case UCS2LE:
969 case UTF_16:
970 case UTF_16BE:
971 case UTF_16LE:
972 if (c < 0x10000 || totype-UCS2BE < 2U) {
973 if (c >= 0x10000) c = 0xFFFD;
974 if (*outb < 2) goto toobig;
975 put_16((void *)*out, c, totype);
976 *out += 2;
977 *outb -= 2;
978 break;
979 }
980 if (*outb < 4) goto toobig;
981 c -= 0x10000;
982 put_16((void *)*out, (c>>10)|0xd800, totype);
983 put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
984 *out += 4;
985 *outb -= 4;
986 break;
987 case UTF_32:
988 totype = UTF_32BE;
989 case UTF_32BE:
990 case UTF_32LE:
991 if (*outb < 4) goto toobig;
992 put_32((void *)*out, c, totype);
993 *out += 4;
994 *outb -= 4;
995 break;
996 }
997 }
998 *ploc = loc;
999 return x;
1000 ilseq:
1001 err = EILSEQ;
1002 x = -1;
1003 goto end;
1004 toobig:
1005 err = E2BIG;
1006 x = -1;
1007 goto end;
1008 starved:
1009 err = EINVAL;
1010 x = -1;
1011 end:
1012 errno = err;
1013 *ploc = loc;
1014 return x;
1015 }
1016