1 #include <iconv.h>
2 #include <errno.h>
3 #include <wchar.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <limits.h>
7 #include <stdint.h>
8 #include <pthread.h>
9 #include "locale_impl.h"
10 #ifndef __LITEOS__
11 #ifdef FEATURE_ICU_LOCALE
12 #include <info/device_api_version.h>
13 #endif
14 #endif
15
/* Special charmap type codes. Each value equals the octal type byte that
 * terminates the matching entry of charmaps[] (e.g. "utf8\0char\0\0\310").
 * The low bits of the UTF-16/UCS-2 codes are the byte-order selector used by
 * get_16/put_16 (bit 0: 0 = big endian, 1 = little endian); the low two bits
 * of the UTF-32 codes play the same role for get_32/put_32 (0 = BE, 3 = LE).
 * iconv_open rejects any destination whose code is >= 0330. */
#define UTF_32BE    0300
#define UTF_16LE    0301
#define UTF_16BE    0302
#define UTF_32LE    0303
#define UCS2BE      0304
#define UCS2LE      0305
#define WCHAR_T     0306
#define US_ASCII    0307
#define UTF_8       0310
#define UTF_16      0312
#define UTF_32      0313
#define UCS2        0314
#define EUC_JP      0320
#define SHIFT_JIS   0321
#define ISO2022_JP  0322
#define GB18030     0330
#define GBK         0331
#define GB2312      0332
#define BIG5        0340
#define EUC_KR      0350
36 #ifndef __LITEOS__
37 #ifdef FEATURE_ICU_LOCALE
/* Error codes compared against the int status written by the ICU entry
 * points; set_errno() maps them to errno values.
 * NOTE(review): the numbers presumably mirror ICU's UErrorCode enum --
 * confirm against the ICU headers. */
#define ICU_IVALID_CHAR_ERROR 10
#define ICU_TRUNCATED_CHAR_ERROR 11
#define ICU_ILLEGAL_CHAR_ERROR 12
#define ICU_BUFFER_OVERFLOW_ERROR 15
/* Callback reasons <= this value are cleared by the *_callback_ignore handlers. */
#define ICU_SKIP_THRESHOLD 2
/* Bit positions inside stateful_cd.sign (see the set_*_flag/get_*_flag helpers). */
#define TYPE_FLAG_POS 1
#define TO_IGNORE_FLAG_POS 2
#define FROM_IGNORE_FLAG_POS 3
#define TO_TRANSLIT_FLAG_POS 4
#define FROM_TRANSLIT_FLAG_POS 5
/* Element count of the pivot buffer handed to ucnv_convertEx in iconv_icu. */
#define ICU_CHUNK_SIZE 1024
49 #endif
50 #endif
/* Definitions of charmaps. Each charmap consists of:
 * 1. An empty-string-terminated list of null-terminated aliases, stored in
 *    the normalized form fuzzycmp() compares against (lower case, no
 *    punctuation).
 * 2. One byte holding either a special type code (values above 0200, see the
 *    type-code macros) or the number of elided quads of entries.
 * 3. For non-special entries only: a character table whose size is
 *    determined by field 2 ((64 - field2) * 5 bytes, as skipped by
 *    find_charmap), consisting of 5 bytes for every 4 characters,
 *    interpreted as 10-bit indices into the legacy_chars table.
 * The tables for the legacy 8-bit codepages come from codepages.h. */

static const unsigned char charmaps[] =
"utf8\0char\0\0\310"
"wchart\0\0\306"
"ucs2be\0\0\304"
"ucs2le\0\0\305"
"utf16be\0\0\302"
"utf16le\0\0\301"
"ucs4be\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\307"
"utf16\0\0\312"
"ucs4\0utf32\0\0\313"
"ucs2\0\0\314"
"eucjp\0\0\320"
"shiftjis\0sjis\0cp932\0\0\321"
"iso2022jp\0\0\322"
"gb18030\0\0\330"
"gbk\0\0\331"
"gb2312\0\0\332"
"big5\0bigfive\0cp950\0big5hkscs\0\0\340"
"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
#include "codepages.h"
;
81
82 #ifndef __LITEOS__
83 #ifdef FEATURE_ICU_LOCALE
/* Alias table for the ICU path, searched by find_icu_map(). Each record is:
 *   alias '\0' alias '\0' ... '\0'  (aliases, each null-terminated)
 *   '\0'                            (empty string ending the alias list)
 *   icu-converter-name '\0'         (the name handed to ucnv_open)
 * Aliases are stored in the normalized form fuzzycmp() compares against.
 * The first record ("utf8") is the default for an empty query name. */
static const unsigned char icu_name_maps[] =
"utf8\0char\0\0UTF-8\0"
"utf7\0\0UTF-7\0"
"ucs2\0utf16\0ucs2be\0utf16be\0\0UTF-16BE\0"
"ucs2le\0utf16le\0\0UTF-16LE\0"
"ucs4\0utf32\0ucs4be\0utf32be\0\0UTF-32BE\0"
"wchart\0ucs4le\0utf32le\0\0UTF-32LE\0"
"ascii\0usascii\0""20127\0iso646\0iso646us\0\0US-ASCII\0"
"eucjp\0eucjp2007\0\0euc-jp-2007\0"
"shiftjis\0sjis\0cp932\0ibm943p15a2003\0\0ibm-943_P15A-2003\0"
"gb18030\0\0gb18030\0"
"gbk\0""54936\0windows9362000\0\0windows-936-2000\0"
"gb2312\0""52936\0ibm1383p1101999\0\0ibm-1383_P110-1999\0"
"big5\0""950\0bigfive\0cp950\0windows9502000\0\0windows-950-2000\0"
"big5hk\0big5hkscs\0""951\0ibm1375p1002008\0\0ibm-1375_P100-2008\0"
"euckr\0ibm970p110p1102006u2\0\0ibm-970_P110_P110-2006_U2\0"
"ksc5601\0ksx1001\0cp949\0windows9492000\0\0windows-949-2000\0"
"iso88591\0latin1\0\0ISO-8859-1\0"
"iso88592\0ibm912p1001995\0\0ibm-912_P100-1995\0"
"iso88593\0ibm913p1002000\0\0ibm-913_P100-2000\0"
"iso88594\0ibm914p1001995\0\0ibm-914_P100-1995\0"
"iso88595\0ibm915p1001995\0\0ibm-915_P100-1995\0"
"iso88596\0ibm1089p1001995\0\0ibm-1089_P100-1995\0"
"iso88597\0ibm9005x1102007\0\0ibm-9005_X110-2007\0"
"iso88598\0ibm5012p1001999\0\0ibm-5012_P100-1999\0"
"iso88599\0ibm920p1001995\0\0ibm-920_P100-1995\0"
"iso885910\0iso8859101998\0\0iso-8859_10-1998\0"
"iso885911\0iso8859112001\0\0iso-8859_11-2001\0"
"tis620\0windows8742000\0\0windows-874-2000\0"
"iso885913\0ibm921p1001995\0\0ibm-921_P100-1995\0"
"iso885914\0iso8859141998\0\0iso-8859_14-1998\0"
"iso885915\0latin9\0ibm923p1001998\0\0ibm-923_P100-1998\0"
"cp1250\0windows1250\0ibm5346p1001998\0\0ibm-5346_P100-1998\0"
"cp1251\0windows1251\0ibm5347p1001998\0\0ibm-5347_P100-1998\0"
"cp1252\0windows1252\0ibm5348p1001997\0\0ibm-5348_P100-1997\0"
"cp1253\0windows1253\0ibm5349p1001998\0\0ibm-5349_P100-1998\0"
"cp1254\0windows1254\0ibm5350p1001998\0\0ibm-5350_P100-1998\0"
"cp1255\0windows1255\0ibm9447p1002002\0\0ibm-9447_P100-2002\0"
"cp1256\0windows1256\0ibm9448x1002005\0\0ibm-9448_X100-2005\0"
"cp1257\0windows1257\0ibm9449p1002002\0\0ibm-9449_P100-2002\0"
"cp1258\0windows1258\0ibm5354p1001998\0\0ibm-5354_P100-1998\0"
"koi8r\0ibm878p1001996\0\0ibm-878_P100-1996\0"
"koi8u\0ibm1168p1002002\0\0ibm-1168_P100-2002\0"
"cp437\0ibm437p1001995\0\0ibm-437_P100-1995\0"
"cp850\0ibm850p1001995\0\0ibm-850_P100-1995\0"
"cp866\0ibm866p1001995\0\0ibm-866_P100-1995\0"
"ibm1047\0cp1047\0ibm1047p1001995\0\0ibm-1047_P100-1995\0"
;
133 #endif
134 #endif
135
/* Table of characters that appear in legacy 8-bit codepages,
 * limited to 1024 slots (10 bit indices). The first 256 entries
 * are elided since those characters are obviously all included;
 * legacy_map() therefore indexes this array with (index - 256). */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};
142
/* JIS X 0208 to Unicode table, indexed [row][column]. The EUC-JP decoder
 * indexes it with (byte - 0xa1) and the ISO-2022-JP decoder with
 * (byte - 0x21); a zero entry is treated as an unmapped cell by the decoders. */
static const unsigned short jis0208[84][94] = {
#include "jis0208.h"
};
146
/* Two-byte GB18030/GBK/GB2312 to Unicode table. Rows are (lead byte - 0x81);
 * columns are (trail byte - 0x40), reduced by one above 63 because trail
 * byte 0x7f is not used. The four-byte decoder also scans the whole table
 * to skip code points that already have a two-byte form. */
static const unsigned short gb18030[126][190] = {
#include "gb18030.h"
};
150
/* Big5 to Unicode table for lead bytes 0xa1..0xf9. Rows are
 * (lead byte - 0xa1); columns are the trail byte minus 0x40, with a further
 * 0x22 subtracted for trail bytes in the upper range. Zero means unmapped. */
static const unsigned short big5[89][157] = {
#include "big5.h"
};
154
/* HKSCS extension table used by the Big5 decoder for lead bytes outside
 * 0xa1..0xf9. Element [row*157 + column] holds the low 16 bits of the code
 * point; starting at element 4867 a packed bitmap (16 cells per element)
 * supplies bit 17 of each cell's code point. */
static const unsigned short hkscs[] = {
#include "hkscs.h"
};
158
/* KS X 1001 (EUC-KR) to Unicode table, indexed
 * [lead byte - 0xa1][trail byte - 0xa1]. Zero means unmapped. The extended
 * (CP949) decoder path also scans the whole table to skip Hangul syllables
 * that already have a standard two-byte code. */
static const unsigned short ksc[93][94] = {
#include "ksc.h"
};
162
/* Reverse index for jis0208: each element encodes a cell as row*256 + column.
 * uni_to_jis() binary-searches this array by the Unicode value stored in the
 * referenced cell, so the elements must be ordered by that value. */
static const unsigned short rev_jis[] = {
#include "revjis.h"
};
166
/* Compare a caller-supplied charset name with a normalized table alias.
 *
 * a: name as given by the caller; ASCII letters are case-folded and every
 *    character that is neither an ASCII letter nor an ASCII digit is skipped
 *    when it occurs before a character that still has to be matched.
 * b: table alias, already lower case and free of punctuation.
 *
 * Returns 0 when the names match and 1 otherwise. Trailing characters left
 * in a after b is exhausted (including punctuation) make the names differ. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Letters are offsets 0..25 from 'a' and digits are offsets 0..9
		 * from '0'; anything outside both ranges is punctuation. The
		 * comparisons are unsigned, so smaller characters wrap around
		 * to large values and are skipped as well. */
		while (*a && (*a|32U)-'a'>=26 && *a-'0'>=10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}
175
find_charmap(const void * name)176 static size_t find_charmap(const void *name)
177 {
178 const unsigned char *s;
179 if (!*(char *)name) name=charmaps; /* "utf8" */
180 for (s=charmaps; *s; ) {
181 if (!fuzzycmp(name, s)) {
182 for (; *s; s+=strlen((void *)s)+1);
183 return s+1-charmaps;
184 }
185 s += strlen((void *)s)+1;
186 if (!*s) {
187 if (s[1] > 0200) s+=2;
188 else s+=2+(64U-s[1])*5;
189 }
190 }
191 return -1;
192 }
193
194 #ifndef __LITEOS__
195 #ifdef FEATURE_ICU_LOCALE
find_icu_map(const void * query_name)196 static const unsigned char* find_icu_map(const void *query_name)
197 {
198 if (!*(char *)query_name) {
199 query_name = icu_name_maps;
200 }
201
202 const unsigned char *icu_name = icu_name_maps;
203 while (*icu_name) {
204 if (!fuzzycmp(query_name, icu_name)) {
205 while (*icu_name) {
206 icu_name += strlen((void *)icu_name) + 1; //find nearly \0\0
207 }
208 return icu_name + 1;
209 }
210 icu_name += strlen((void *)icu_name) + 1; // skip \0
211 if (!*icu_name) { // skip \0\0
212 icu_name++;
213 while (*icu_name) {icu_name++;}
214 icu_name++;
215 }
216 }
217 return NULL;
218 }
219 #endif
220 #endif
221
/* Heap-allocated conversion descriptor. iconv() recognizes it by the low bit
 * of the iconv_t value being clear (combine_to_from always sets that bit on
 * packed descriptors) and then treats the value as a pointer to this struct. */
struct stateful_cd {
#ifndef __LITEOS__
#ifdef FEATURE_ICU_LOCALE
	unsigned sign;              /* flag bits, see the *_FLAG_POS macros */
	const unsigned char* to;    /* destination converter name returned by find_icu_map */
	const unsigned char* from;  /* source converter name returned by find_icu_map */
#endif
#endif
	iconv_t base_cd;            /* packed to/from descriptor for the built-in converter */
	unsigned state;             /* decoder state: detected byte order or ISO-2022-JP mode */
};
233
/* Pack two charmaps offsets into a stateless descriptor: bit 0 is a tag that
 * is always set, t starts at bit 1 and f starts at bit 16. */
static iconv_t combine_to_from(size_t t, size_t f)
{
	size_t packed = 1;

	packed |= t << 1;
	packed |= f << 16;
	return (void *)packed;
}
238
/* Recover the source charmap offset (everything from bit 16 upward) from a
 * packed descriptor. */
static size_t extract_from(iconv_t cd)
{
	const size_t packed = (size_t)cd;

	return packed >> 16;
}
243
/* Recover the destination charmap offset (the 15 bits above the tag bit)
 * from a packed descriptor. */
static size_t extract_to(iconv_t cd)
{
	const size_t packed = (size_t)cd;

	return (packed >> 1) & 0x7fff;
}
248
249 #ifndef __LITEOS__
250 #ifdef FEATURE_ICU_LOCALE
set_type_flag(unsigned * value)251 static void set_type_flag(unsigned* value) {*value = (1 << TYPE_FLAG_POS) | *value;}
set_to_ignore_flag(unsigned * value)252 static void set_to_ignore_flag(unsigned* value) {*value = (1 << TO_IGNORE_FLAG_POS) | *value;}
set_from_ignore_flag(unsigned * value)253 static void set_from_ignore_flag(unsigned* value) {*value = (1 << FROM_IGNORE_FLAG_POS) | *value;}
set_to_translit_flag(unsigned * value)254 static void set_to_translit_flag(unsigned* value) {*value = (1 << TO_TRANSLIT_FLAG_POS) | *value;}
set_from_translit_flag(unsigned * value)255 static void set_from_translit_flag(unsigned* value) {*value = (1 << FROM_TRANSLIT_FLAG_POS) | *value;}
get_type_flag(unsigned value)256 static bool get_type_flag(unsigned value) {return (value >> TYPE_FLAG_POS) & 1;}
get_to_ignore_flag(unsigned value)257 static bool get_to_ignore_flag(unsigned value) {return (value >> TO_IGNORE_FLAG_POS) & 1;}
get_from_ignore_flag(unsigned value)258 static bool get_from_ignore_flag(unsigned value) {return (value >> FROM_IGNORE_FLAG_POS) & 1;}
get_to_translit_flag(unsigned value)259 static bool get_to_translit_flag(unsigned value) {return (value >> TO_TRANSLIT_FLAG_POS) & 1;}
get_from_translit_flag(unsigned value)260 static bool get_from_translit_flag(unsigned value) {return (value >> FROM_TRANSLIT_FLAG_POS) & 1;}
261
/* Split a charset argument of iconv_open into its name and its "//IGNORE" /
 * "//TRANSLIT" suffixes, then resolve the name through find_icu_map().
 *
 * ins:     charset argument as passed by the caller.
 * sign:    flag word; the ignore/translit bit for the selected direction is
 *          set for every suffix that is present.
 * res:     receives the ICU converter name, or NULL when the name is unknown.
 * is_from: true when ins is the source charset, false for the destination.
 *
 * Both suffixes may be combined in either order; the name is cut at the
 * first suffix so that neither of them reaches the lookup.
 *
 * Returns false only when the temporary copy cannot be allocated (in which
 * case *res is left untouched), true otherwise. */
static bool deal_with_tail(const char* ins, unsigned* sign, const unsigned char** res, bool is_from)
{
    char* ins_tmp = strdup(ins);
    if (!ins_tmp) {return false;}

    char* ins_ignore_pos = strstr(ins_tmp, "//IGNORE");
    char* ins_translit_pos = strstr(ins_tmp, "//TRANSLIT");

    if (ins_ignore_pos) {
        if (is_from) {
            set_from_ignore_flag(sign);
        } else {
            set_to_ignore_flag(sign);
        }
    }
    if (ins_translit_pos) {
        if (is_from) {
            set_from_translit_flag(sign);
        } else {
            set_to_translit_flag(sign);
        }
    }

    /* Terminate the name at whichever suffix comes first. */
    char* cut_pos = ins_ignore_pos;
    if (ins_translit_pos && (!cut_pos || ins_translit_pos < cut_pos)) {
        cut_pos = ins_translit_pos;
    }
    if (cut_pos) {
        *cut_pos = '\0';
    }

    *res = find_icu_map((void*)ins_tmp);
    free(ins_tmp);
    return true;
}
290
/* Set to true by set_iconv_icu_enable() once the ICU handle is initialized;
 * iconv_open() consults it to decide whether to take the ICU path. */
bool icu_locale_enable = false;

/* Held by set_iconv_icu_enable() around icuuc_handle_init() and the update
 * of icu_locale_enable. */
pthread_mutex_t icu_init_mutex = PTHREAD_MUTEX_INITIALIZER;
294
/**
 * @Description: The set_iconv_icu_enable function switches the internal implementation of iconv to the ICU library.
 * If iconv has already been switched to the ICU implementation before this function is called,
 * the function still returns success.
 * @return: Zero if the call succeeds; otherwise a non-zero error code.
 */
301
set_iconv_icu_enable()302 int set_iconv_icu_enable()
303 {
304 pthread_mutex_lock(&icu_init_mutex);
305 if (!icuuc_handle_init()) {
306 pthread_mutex_unlock(&icu_init_mutex);
307 return ICU_SYMBOL_LOAD_ERROR;
308 }
309
310 icu_locale_enable = true;
311 pthread_mutex_unlock(&icu_init_mutex);
312 return ICU_ZERO_ERROR;
313 }
314
315 #endif
316 #endif
317
iconv_open(const char * to,const char * from)318 iconv_t iconv_open(const char *to, const char *from)
319 {
320 struct stateful_cd *scd;
321
322 #ifndef __LITEOS__
323 #ifdef FEATURE_ICU_LOCALE
324 bool is_basic_open = false;
325
326 for (const char* s = "iso885916\0iso2022jp\0\0"; *s;) { // icu not support
327 if (!fuzzycmp((void*)to, (void*)s) || !fuzzycmp((void*)from, (void*)s)) {
328 is_basic_open = true;
329 }
330 s += strlen(s) + 1;
331 }
332
333 // icu open
334 if (!is_basic_open && icu_locale_enable) {
335 scd = malloc(sizeof *scd);
336 if (!scd) {return (iconv_t)-1;}
337 scd->sign = 0;
338 scd->state = 0;
339
340 if (!deal_with_tail(to, &scd->sign, &scd->to, false)) {return (iconv_t)-1;}
341 if (!deal_with_tail(from, &scd->sign, &scd->from, true)) {return (iconv_t)-1;}
342
343 if (!scd->to || !scd->from) {
344 errno = EINVAL;
345 free(scd);
346 return (iconv_t)-1;
347 }
348
349 set_type_flag(&scd->sign);
350 return (iconv_t)scd;
351 }
352 #endif
353 #endif
354
355 // basic open
356 size_t f, t;
357 if ((t = find_charmap(to))==-1
358 || (f = find_charmap(from))==-1
359 || (charmaps[t] >= 0330)) {
360 errno = EINVAL;
361 return (iconv_t)-1;
362 }
363 iconv_t cd = combine_to_from(t, f);
364
365 switch (charmaps[f]) {
366 case UTF_16:
367 case UTF_32:
368 case UCS2:
369 case ISO2022_JP:
370 scd = malloc(sizeof *scd);
371 if (!scd) return (iconv_t)-1;
372 memset(scd, 0, sizeof(*scd));
373 scd->base_cd = cd;
374 scd->state = 0;
375 cd = (iconv_t)scd;
376 }
377
378 return cd;
379 }
380
/* Read a 16-bit value from s. Bit 0 of e selects the byte order:
 * 0 = big endian, 1 = little endian. */
static unsigned get_16(const unsigned char *s, int e)
{
	const int hi = e & 1;
	const int lo = 1 - hi;

	return s[hi] << 8 | s[lo];
}
386
/* Store the low 16 bits of c at s. Bit 0 of e selects the byte order:
 * 0 = big endian, 1 = little endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	const int hi = e & 1;
	const int lo = 1 - hi;

	s[lo] = c;
	s[hi] = c >> 8;
}
393
/* Read a 32-bit value from s. The low two bits of e are XORed into the byte
 * index: 0 reads big endian, 3 reads little endian. */
static unsigned get_32(const unsigned char *s, int e)
{
	const int sel = e & 3;
	unsigned v = (unsigned)s[sel] << 24;

	v |= s[sel ^ 1] << 16;
	v |= s[sel ^ 2] << 8;
	v |= s[sel ^ 3];
	return v;
}
399
/* Store c as four bytes at s. The low two bits of e are XORed into the byte
 * index: 0 writes big endian, 3 writes little endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	const int sel = e & 3;

	for (int i = 0; i < 4; i++) {
		/* Byte i counts from the most significant end of c. */
		s[sel ^ i] = c >> (24 - 8 * i);
	}
}
408
409 /* Adapt as needed */
410 #define mbrtowc_utf8 mbrtowc
411 #define wctomb_utf8 wctomb
412
legacy_map(const unsigned char * map,unsigned c)413 static unsigned legacy_map(const unsigned char *map, unsigned c)
414 {
415 if (c < 4*map[-1]) return c;
416 unsigned x = c - 4*map[-1];
417 x = map[x*5/4]>>2*x%8 | map[x*5/4+1]<<8-2*x%8 & 1023;
418 return x < 256 ? x : legacy_chars[x-256];
419 }
420
uni_to_jis(unsigned c)421 static unsigned uni_to_jis(unsigned c)
422 {
423 unsigned nel = sizeof rev_jis / sizeof *rev_jis;
424 unsigned d, j, i, b = 0;
425 for (;;) {
426 i = nel/2;
427 j = rev_jis[b+i];
428 d = jis0208[j/256][j%256];
429 if (d==c) return j + 0x2121;
430 else if (nel == 1) return 0;
431 else if (c < d)
432 nel /= 2;
433 else {
434 b += i;
435 nel -= nel/2;
436 }
437 }
438 }
439
440 #ifndef __LITEOS__
441 #ifdef FEATURE_ICU_LOCALE
ucnv_from_u_callback_ignore(const void * context,void * fromUArgs,const void * codeUnits,int32_t length,int32_t codePoint,int reason,int * err)442 static void ucnv_from_u_callback_ignore(
443 const void* context,
444 void* fromUArgs,
445 const void* codeUnits,
446 int32_t length,
447 int32_t codePoint,
448 int reason,
449 int* err)
450 {
451 if (reason <= ICU_SKIP_THRESHOLD) {
452 *err = ICU_ZERO_ERROR;
453 }
454 }
455
/* From-Unicode error callback that does nothing, so the error status passed
 * to it is left as it was. iconv_icu installs it on the destination converter
 * when neither "//IGNORE" nor "//TRANSLIT" was requested. */
static void ucnv_from_u_callback_stop(const void* context, ...) { }
457
ucnv_to_u_callback_ignore(const void * context,void * toUArgs,const void * codeUnits,int32_t length,int reason,int * err)458 static void ucnv_to_u_callback_ignore(
459 const void* context,
460 void* toUArgs,
461 const void* codeUnits,
462 int32_t length,
463 int reason,
464 int* err)
465 {
466 if (reason <= ICU_SKIP_THRESHOLD) {
467 *err = ICU_ZERO_ERROR;
468 }
469 }
470
/* To-Unicode error callback that does nothing, so the error status passed to
 * it is left as it was. iconv_icu passes it for the source converter when
 * neither "//IGNORE" nor "//TRANSLIT" was requested. */
static void ucnv_to_u_callback_stop(const void* context, ...) { }
472
set_errno(int errCode)473 static void set_errno(int errCode)
474 {
475 if (errCode == ICU_ZERO_ERROR) {
476 errno = 0;
477 } else if (errCode == ICU_BUFFER_OVERFLOW_ERROR) {
478 errno = E2BIG;
479 } else if (errCode == ICU_IVALID_CHAR_ERROR ||
480 errCode == ICU_TRUNCATED_CHAR_ERROR ||
481 errCode == ICU_ILLEGAL_CHAR_ERROR) {
482 errno = EILSEQ;
483 } else {
484 errno = EINVAL;
485 }
486 }
487
iconv_icu(unsigned sign,const unsigned char * to,const unsigned char * from,char ** restrict in,size_t * restrict inb,char ** restrict out,size_t * restrict outb)488 static size_t iconv_icu(unsigned sign, const unsigned char* to, const unsigned char* from,
489 char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
490 {
491 int errCode = ICU_ZERO_ERROR;
492
493 void* conv_in = g_icu_opt_func.ucnv_open((void*)from, &errCode);
494 if (get_from_ignore_flag(sign)) {
495 g_icu_opt_func.ucnv_setToUCallBack(conv_in, ucnv_to_u_callback_ignore, NULL, NULL, NULL, &errCode);
496 } else if (!get_from_translit_flag(sign)) {
497 g_icu_opt_func.ucnv_setFromUCallBack(conv_in, ucnv_to_u_callback_stop, NULL, NULL, NULL, &errCode);
498 }
499
500 void* conv_out = g_icu_opt_func.ucnv_open((void*)to, &errCode);
501 if (get_to_ignore_flag(sign)) {
502 g_icu_opt_func.ucnv_setFromUCallBack(conv_out, ucnv_from_u_callback_ignore, NULL, NULL, NULL, &errCode);
503 } else if (!get_to_translit_flag(sign)) {
504 g_icu_opt_func.ucnv_setFromUCallBack(conv_out, ucnv_from_u_callback_stop, NULL, NULL, NULL, &errCode);
505 }
506
507 u_char pivot_buffer[ICU_CHUNK_SIZE];
508 u_char *pivot, *pivot2;
509 char *mytarget;
510 const char *source_limit;
511 const char *target_limit;
512 int32_t target_length = 0;
513 source_limit = *in + *inb;
514 pivot = pivot2 = pivot_buffer;
515 mytarget = *out;
516 target_limit = *out + *outb;
517 g_icu_opt_func.ucnv_convertEx(conv_out, conv_in, &mytarget, target_limit, (const char **)in, source_limit,
518 pivot_buffer, &pivot, &pivot2, pivot_buffer + ICU_CHUNK_SIZE, false, true, &errCode);
519 target_length = (int32_t)(mytarget - *out);
520 if (errCode > ICU_ZERO_ERROR) {
521 set_errno(errCode);
522 return (size_t)-1;
523 } else {
524 errCode = ICU_ZERO_ERROR;
525 }
526 g_icu_opt_func.ucnv_close(conv_in);
527 g_icu_opt_func.ucnv_close(conv_out);
528
529 *out += target_length;
530 *outb -= target_length;
531 *in += *inb;
532 *inb -= *inb;
533 set_errno(errCode);
534
535 return (size_t)errCode;
536 }
537 #endif
538 #endif
539
iconv(iconv_t cd,char ** restrict in,size_t * restrict inb,char ** restrict out,size_t * restrict outb)540 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
541 {
542 if (!in || !*in || !*inb) {
543 return 0;
544 }
545
546 size_t x=0;
547 struct stateful_cd *scd=0;
548 if (!((size_t)cd & 1)) {
549 scd = (void *)cd;
550 cd = scd->base_cd;
551 #ifndef __LITEOS__
552 #ifdef FEATURE_ICU_LOCALE
553 if (get_type_flag(scd->sign)) {
554 return iconv_icu(scd->sign, scd->to, scd->from, in, inb, out, outb);
555 }
556 #endif
557 #endif
558 }
559 unsigned to = extract_to(cd);
560 unsigned from = extract_from(cd);
561 const unsigned char *map = charmaps+from+1;
562 const unsigned char *tomap = charmaps+to+1;
563 mbstate_t st = {0};
564 wchar_t wc;
565 unsigned c, d;
566 size_t k, l;
567 int err;
568 unsigned char type = map[-1];
569 unsigned char totype = tomap[-1];
570 locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
571
572 *ploc = UTF8_LOCALE;
573
574 for (; *inb; *in+=l, *inb-=l) {
575 c = *(unsigned char *)*in;
576 l = 1;
577
578 switch (type) {
579 case UTF_8:
580 if (c < 128) break;
581 l = mbrtowc_utf8(&wc, *in, *inb, &st);
582 if (l == (size_t)-1) goto ilseq;
583 if (l == (size_t)-2) goto starved;
584 c = wc;
585 break;
586 case US_ASCII:
587 if (c >= 128) goto ilseq;
588 break;
589 case WCHAR_T:
590 l = sizeof(wchar_t);
591 if (*inb < l) goto starved;
592 c = *(wchar_t *)*in;
593 if (0) {
594 case UTF_32BE:
595 case UTF_32LE:
596 l = 4;
597 if (*inb < 4) goto starved;
598 c = get_32((void *)*in, type);
599 }
600 if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
601 break;
602 case UCS2BE:
603 case UCS2LE:
604 case UTF_16BE:
605 case UTF_16LE:
606 l = 2;
607 if (*inb < 2) goto starved;
608 c = get_16((void *)*in, type);
609 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
610 if ((unsigned)(c-0xd800) < 0x400) {
611 if (type-UCS2BE < 2U) goto ilseq;
612 l = 4;
613 if (*inb < 4) goto starved;
614 d = get_16((void *)(*in + 2), type);
615 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
616 c = ((c-0xd7c0)<<10) + (d-0xdc00);
617 }
618 break;
619 case UCS2:
620 case UTF_16:
621 l = 0;
622 if (!scd->state) {
623 if (*inb < 2) goto starved;
624 c = get_16((void *)*in, 0);
625 scd->state = type==UCS2
626 ? c==0xfffe ? UCS2LE : UCS2BE
627 : c==0xfffe ? UTF_16LE : UTF_16BE;
628 if (c == 0xfffe || c == 0xfeff)
629 l = 2;
630 }
631 type = scd->state;
632 continue;
633 case UTF_32:
634 l = 0;
635 if (!scd->state) {
636 if (*inb < 4) goto starved;
637 c = get_32((void *)*in, 0);
638 scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
639 if (c == 0xfffe0000 || c == 0xfeff)
640 l = 4;
641 }
642 type = scd->state;
643 continue;
644 case SHIFT_JIS:
645 if (c < 128) break;
646 if (c-0xa1 <= 0xdf-0xa1) {
647 c += 0xff61-0xa1;
648 break;
649 }
650 l = 2;
651 if (*inb < 2) goto starved;
652 d = *((unsigned char *)*in + 1);
653 if (c-129 <= 159-129) c -= 129;
654 else if (c-224 <= 239-224) c -= 193;
655 else goto ilseq;
656 c *= 2;
657 if (d-64 <= 158-64) {
658 if (d==127) goto ilseq;
659 if (d>127) d--;
660 d -= 64;
661 } else if (d-159 <= 252-159) {
662 c++;
663 d -= 159;
664 }
665 c = jis0208[c][d];
666 if (!c) goto ilseq;
667 break;
668 case EUC_JP:
669 if (c < 128) break;
670 l = 2;
671 if (*inb < 2) goto starved;
672 d = *((unsigned char *)*in + 1);
673 if (c==0x8e) {
674 c = d;
675 if (c-0xa1 > 0xdf-0xa1) goto ilseq;
676 c += 0xff61 - 0xa1;
677 break;
678 }
679 c -= 0xa1;
680 d -= 0xa1;
681 if (c >= 84 || d >= 94) goto ilseq;
682 c = jis0208[c][d];
683 if (!c) goto ilseq;
684 break;
685 case ISO2022_JP:
686 if (c >= 128) goto ilseq;
687 if (c == '\033') {
688 l = 3;
689 if (*inb < 3) goto starved;
690 c = *((unsigned char *)*in + 1);
691 d = *((unsigned char *)*in + 2);
692 if (c != '(' && c != '$') goto ilseq;
693 switch (128*(c=='$') + d) {
694 case 'B': scd->state=0; continue;
695 case 'J': scd->state=1; continue;
696 case 'I': scd->state=4; continue;
697 case 128+'@': scd->state=2; continue;
698 case 128+'B': scd->state=3; continue;
699 }
700 goto ilseq;
701 }
702 switch (scd->state) {
703 case 1:
704 if (c=='\\') c = 0xa5;
705 if (c=='~') c = 0x203e;
706 break;
707 case 2:
708 case 3:
709 l = 2;
710 if (*inb < 2) goto starved;
711 d = *((unsigned char *)*in + 1);
712 c -= 0x21;
713 d -= 0x21;
714 if (c >= 84 || d >= 94) goto ilseq;
715 c = jis0208[c][d];
716 if (!c) goto ilseq;
717 break;
718 case 4:
719 if (c-0x60 < 0x1f) goto ilseq;
720 if (c-0x21 < 0x5e) c += 0xff61-0x21;
721 break;
722 }
723 break;
724 case GB2312:
725 if (c < 128) break;
726 if (c < 0xa1) goto ilseq;
727 case GBK:
728 case GB18030:
729 if (c < 128) break;
730 c -= 0x81;
731 if (c >= 126) goto ilseq;
732 l = 2;
733 if (*inb < 2) goto starved;
734 d = *((unsigned char *)*in + 1);
735 if (d < 0xa1 && type == GB2312) goto ilseq;
736 if (d-0x40>=191 || d==127) {
737 if (d-'0'>9 || type != GB18030)
738 goto ilseq;
739 l = 4;
740 if (*inb < 4) goto starved;
741 c = (10*c + d-'0') * 1260;
742 d = *((unsigned char *)*in + 2);
743 if (d-0x81>126) goto ilseq;
744 c += 10*(d-0x81);
745 d = *((unsigned char *)*in + 3);
746 if (d-'0'>9) goto ilseq;
747 c += d-'0';
748 c += 128;
749 for (d=0; d<=c; ) {
750 k = 0;
751 for (int i=0; i<126; i++)
752 for (int j=0; j<190; j++)
753 if (gb18030[i][j]-d <= c-d)
754 k++;
755 d = c+1;
756 c += k;
757 }
758 break;
759 }
760 d -= 0x40;
761 if (d>63) d--;
762 c = gb18030[c][d];
763 break;
764 case BIG5:
765 if (c < 128) break;
766 l = 2;
767 if (*inb < 2) goto starved;
768 d = *((unsigned char *)*in + 1);
769 if (d-0x40>=0xff-0x40 || d-0x7f<0xa1-0x7f) goto ilseq;
770 d -= 0x40;
771 if (d > 0x3e) d -= 0x22;
772 if (c-0xa1>=0xfa-0xa1) {
773 if (c-0x87>=0xff-0x87) goto ilseq;
774 if (c < 0xa1) c -= 0x87;
775 else c -= 0x87 + (0xfa-0xa1);
776 c = (hkscs[4867+(c*157+d)/16]>>(c*157+d)%16)%2<<17
777 | hkscs[c*157+d];
778 /* A few HKSCS characters map to pairs of UCS
779 * characters. These are mapped to surrogate
780 * range in the hkscs table then hard-coded
781 * here. Ugly, yes. */
782 if (c/256 == 0xdc) {
783 union {
784 char c[8];
785 wchar_t wc[2];
786 } tmp;
787 char *ptmp = tmp.c;
788 size_t tmpx = iconv(combine_to_from(to, find_charmap("utf8")),
789 &(char *){"\303\212\314\204"
790 "\303\212\314\214"
791 "\303\252\314\204"
792 "\303\252\314\214"
793 +c%256}, &(size_t){4},
794 &ptmp, &(size_t){sizeof tmp});
795 size_t tmplen = ptmp - tmp.c;
796 if (tmplen > *outb) goto toobig;
797 if (tmpx) x++;
798 memcpy(*out, &tmp, tmplen);
799 *out += tmplen;
800 *outb -= tmplen;
801 continue;
802 }
803 if (!c) goto ilseq;
804 break;
805 }
806 c -= 0xa1;
807 c = big5[c][d]|(c==0x27&&(d==0x3a||d==0x3c||d==0x42))<<17;
808 if (!c) goto ilseq;
809 break;
810 case EUC_KR:
811 if (c < 128) break;
812 l = 2;
813 if (*inb < 2) goto starved;
814 d = *((unsigned char *)*in + 1);
815 c -= 0xa1;
816 d -= 0xa1;
817 if (c >= 93 || d >= 94) {
818 c += (0xa1-0x81);
819 d += 0xa1;
820 if (c > 0xc6-0x81 || c==0xc6-0x81 && d>0x52)
821 goto ilseq;
822 if (d-'A'<26) d = d-'A';
823 else if (d-'a'<26) d = d-'a'+26;
824 else if (d-0x81<0xff-0x81) d = d-0x81+52;
825 else goto ilseq;
826 if (c < 0x20) c = 178*c + d;
827 else c = 178*0x20 + 84*(c-0x20) + d;
828 c += 0xac00;
829 for (d=0xac00; d<=c; ) {
830 k = 0;
831 for (int i=0; i<93; i++)
832 for (int j=0; j<94; j++)
833 if (ksc[i][j]-d <= c-d)
834 k++;
835 d = c+1;
836 c += k;
837 }
838 break;
839 }
840 c = ksc[c][d];
841 if (!c) goto ilseq;
842 break;
843 default:
844 if (!c) break;
845 c = legacy_map(map, c);
846 if (!c) goto ilseq;
847 }
848
849 switch (totype) {
850 case WCHAR_T:
851 if (*outb < sizeof(wchar_t)) goto toobig;
852 *(wchar_t *)*out = c;
853 *out += sizeof(wchar_t);
854 *outb -= sizeof(wchar_t);
855 break;
856 case UTF_8:
857 if (*outb < 4) {
858 char tmp[4];
859 k = wctomb_utf8(tmp, c);
860 if (*outb < k) goto toobig;
861 memcpy(*out, tmp, k);
862 } else k = wctomb_utf8(*out, c);
863 /* This failure condition should be unreachable, but
864 * is included to prevent decoder bugs from translating
865 * into advancement outside the output buffer range. */
866 if (k>4) goto ilseq;
867 *out += k;
868 *outb -= k;
869 break;
870 case US_ASCII:
871 if (c > 0x7f) subst: x++, c='*';
872 default:
873 if (*outb < 1) goto toobig;
874 if (c<256 && c==legacy_map(tomap, c)) {
875 revout:
876 if (*outb < 1) goto toobig;
877 *(*out)++ = c;
878 *outb -= 1;
879 break;
880 }
881 d = c;
882 for (c=4*totype; c<256; c++) {
883 if (d == legacy_map(tomap, c)) {
884 goto revout;
885 }
886 }
887 goto subst;
888 case SHIFT_JIS:
889 if (c < 128) goto revout;
890 if (c == 0xa5) {
891 x++;
892 c = '\\';
893 goto revout;
894 }
895 if (c == 0x203e) {
896 x++;
897 c = '~';
898 goto revout;
899 }
900 if (c-0xff61 <= 0xdf-0xa1) {
901 c += 0xa1 - 0xff61;
902 goto revout;
903 }
904 c = uni_to_jis(c);
905 if (!c) goto subst;
906 if (*outb < 2) goto toobig;
907 d = c%256;
908 c = c/256;
909 *(*out)++ = (c+1)/2 + (c<95 ? 112 : 176);
910 *(*out)++ = c%2 ? d + 31 + d/96 : d + 126;
911 *outb -= 2;
912 break;
913 case EUC_JP:
914 if (c < 128) goto revout;
915 if (c-0xff61 <= 0xdf-0xa1) {
916 c += 0x0e00 + 0x21 - 0xff61;
917 } else {
918 c = uni_to_jis(c);
919 }
920 if (!c) goto subst;
921 if (*outb < 2) goto toobig;
922 *(*out)++ = c/256 + 0x80;
923 *(*out)++ = c%256 + 0x80;
924 *outb -= 2;
925 break;
926 case ISO2022_JP:
927 if (c < 128) goto revout;
928 if (c-0xff61 <= 0xdf-0xa1 || c==0xa5 || c==0x203e) {
929 if (*outb < 7) goto toobig;
930 *(*out)++ = '\033';
931 *(*out)++ = '(';
932 if (c==0xa5) {
933 *(*out)++ = 'J';
934 *(*out)++ = '\\';
935 } else if (c==0x203e) {
936 *(*out)++ = 'J';
937 *(*out)++ = '~';
938 } else {
939 *(*out)++ = 'I';
940 *(*out)++ = c-0xff61+0x21;
941 }
942 *(*out)++ = '\033';
943 *(*out)++ = '(';
944 *(*out)++ = 'B';
945 *outb -= 7;
946 break;
947 }
948 c = uni_to_jis(c);
949 if (!c) goto subst;
950 if (*outb < 8) goto toobig;
951 *(*out)++ = '\033';
952 *(*out)++ = '$';
953 *(*out)++ = 'B';
954 *(*out)++ = c/256;
955 *(*out)++ = c%256;
956 *(*out)++ = '\033';
957 *(*out)++ = '(';
958 *(*out)++ = 'B';
959 *outb -= 8;
960 break;
961 case UCS2:
962 totype = UCS2BE;
963 case UCS2BE:
964 case UCS2LE:
965 case UTF_16:
966 case UTF_16BE:
967 case UTF_16LE:
968 if (c < 0x10000 || totype-UCS2BE < 2U) {
969 if (c >= 0x10000) c = 0xFFFD;
970 if (*outb < 2) goto toobig;
971 put_16((void *)*out, c, totype);
972 *out += 2;
973 *outb -= 2;
974 break;
975 }
976 if (*outb < 4) goto toobig;
977 c -= 0x10000;
978 put_16((void *)*out, (c>>10)|0xd800, totype);
979 put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
980 *out += 4;
981 *outb -= 4;
982 break;
983 case UTF_32:
984 totype = UTF_32BE;
985 case UTF_32BE:
986 case UTF_32LE:
987 if (*outb < 4) goto toobig;
988 put_32((void *)*out, c, totype);
989 *out += 4;
990 *outb -= 4;
991 break;
992 }
993 }
994 *ploc = loc;
995 return x;
996 ilseq:
997 err = EILSEQ;
998 x = -1;
999 goto end;
1000 toobig:
1001 err = E2BIG;
1002 x = -1;
1003 goto end;
1004 starved:
1005 err = EINVAL;
1006 x = -1;
1007 end:
1008 errno = err;
1009 *ploc = loc;
1010 return x;
1011 }
1012