1 /*
2 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19 /*
20 * See the corresponding header file for a description of the functions
21 * that this file provides.
22 *
23 * This was first written for Ogg Vorbis but could be of general use.
24 *
25 * The only deliberate assumption about data sizes is that a short has
26 * at least 16 bits, but this code has only been tested on systems with
27 * 8-bit char, 16-bit short and 32-bit int.
28 */
29
30 #ifdef HAVE_CONFIG_H
31 # include <config.h>
32 #endif
33
34 #if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */
35
36 #include <stdlib.h>
37
38 #include "share/alloc.h"
39 #include "charset.h"
40
41 #include "charmaps.h"
42
/*
 * Locale-independent replacement for strcasecmp(): only the 26 ASCII
 * letters are case-folded, so the result never depends on the current
 * locale. Locale-dependent functions can be dangerous: we once had a
 * bug involving strcasecmp("iso", "ISO") in a Turkish locale!
 *
 * Returns zero if the strings are equal ignoring ASCII case. Otherwise
 * returns the difference between the first pair of differing bytes,
 * taken as unsigned char after folding both to lower case, which is the
 * ordering POSIX specifies for strcasecmp() in the POSIX locale.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
	unsigned char c1, c2;

	for (;; s1++, s2++) {
		c1 = (unsigned char)*s1;
		c2 = (unsigned char)*s2;
		if ('A' <= c1 && c1 <= 'Z')
			c1 += 'a' - 'A';
		if ('A' <= c2 && c2 <= 'Z')
			c2 += 'a' - 'A';
		/* Stop at the first difference or at the shared terminator. */
		if (c1 != c2 || !c1)
			break;
	}
	/* Compare the folded bytes so the sign matches the folded order. */
	return (int)c1 - (int)c2;
}
74
75 /*
76 * UTF-8 equivalents of the C library's wctomb() and mbtowc().
77 */
78
/*
 * Decode one UTF-8 sequence (up to six bytes long) from the first n
 * bytes of s.
 *
 * Returns the number of bytes consumed, 0 if s is null, n is zero or
 * the character is NUL, and -1 if the sequence is invalid, overlong or
 * truncated. When pwc is non-null it receives the decoded value
 * (including 0 for NUL); it is left untouched on -1.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
	const uint8_t *bytes = (const uint8_t *)s;
	uint8_t lead;
	int len, value, pos;

	if (!s || !n)
		return 0;

	lead = bytes[0];

	/* Plain ASCII needs no further decoding. */
	if (lead < 0x80) {
		if (pwc)
			*pwc = lead;
		return lead != 0;
	}

	/*
	 * 0x80..0xbf are continuation bytes, 0xc0 and 0xc1 could only
	 * start an overlong two-byte form, 0xfe and 0xff are never used.
	 */
	if (lead < 0xc2 || lead >= 0xfe)
		return -1;

	if (lead < 0xe0)
		len = 2;
	else if (lead < 0xf0)
		len = 3;
	else if (lead < 0xf8)
		len = 4;
	else if (lead < 0xfc)
		len = 5;
	else
		len = 6;

	if (n < (size_t)len)
		return -1;

	/* The lead byte contributes its low (7 - len) bits. */
	value = lead & ((1 << (7 - len)) - 1);
	for (pos = 1; pos < len; pos++) {
		if ((bytes[pos] & 0xc0) != 0x80)
			return -1;
		value = (value << 6) | (bytes[pos] & 0x3f);
	}

	/*
	 * Reject overlong forms. Two-byte sequences cannot be overlong
	 * here because their lead byte is at least 0xc2.
	 */
	if (len > 2 && value < (1 << (5 * len - 4)))
		return -1;

	if (pwc)
		*pwc = value;
	return len;
}
129
/*
 * Encode wc1 as UTF-8 (one to six bytes) into s.
 *
 * Returns the number of bytes written, 0 if s is null, and -1 if wc1
 * is negative (that is, does not fit in 31 bits).
 */
int utf8_wctomb(char *s, int wc1)
{
	/* Marker bits of the lead byte, indexed by sequence length. */
	static const uint8_t lead_bits[7] = {
		0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	uint32_t code = wc1;
	int len, pos;

	if (!s)
		return 0;

	if (code < (1u << 7))
		len = 1;
	else if (code < (1u << 11))
		len = 2;
	else if (code < (1u << 16))
		len = 3;
	else if (code < (1u << 21))
		len = 4;
	else if (code < (1u << 26))
		len = 5;
	else if (code < (1u << 31))
		len = 6;
	else
		return -1;

	/* Fill the continuation bytes from the end, six bits at a time. */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = (char)(0x80 | (code & 0x3f));
		code >>= 6;
	}
	/* Whatever is left fits next to the lead marker. */
	s[0] = (char)(lead_bits[len] | code);
	return len;
}
178
179 /*
180 * The charset "object" and methods.
181 */
182
/*
 * A character set: a pair of conversion callbacks plus the private
 * data they operate on.
 */
struct charset {
	/* Upper bound on the bytes one character occupies; sizes output buffers. */
	int max;
	/* Decoder: called with the `map` member as its first argument. */
	int (*mbtowc)(void *map, int *pwc, const char *s, size_t n);
	/* Encoder: called with the `map` member as its first argument. */
	int (*wctomb)(void *map, char *s, int wc);
	/* Callback-private data; null for the table-free charsets. */
	void *map;
};
189
charset_mbtowc(struct charset * charset,int * pwc,const char * s,size_t n)190 int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
191 {
192 return (*charset->mbtowc)(charset->map, pwc, s, n);
193 }
194
charset_wctomb(struct charset * charset,char * s,int wc)195 int charset_wctomb(struct charset *charset, char *s, int wc)
196 {
197 return (*charset->wctomb)(charset->map, s, wc);
198 }
199
charset_max(struct charset * charset)200 int charset_max(struct charset *charset)
201 {
202 return charset->max;
203 }
204
205 /*
206 * Implementation of UTF-8.
207 */
208
/* Decoder callback for UTF-8: forwards to utf8_mbtowc(); map is unused. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
	int consumed = utf8_mbtowc(pwc, s, n);

	(void)map;
	return consumed;
}
214
/* Encoder callback for UTF-8: forwards to utf8_wctomb(); map is unused. */
static int wctomb_utf8(void *map, char *s, int wc)
{
	int written = utf8_wctomb(s, wc);

	(void)map;
	return written;
}
220
221 /*
222 * Implementation of US-ASCII.
223 * Probably on most architectures this compiles to less than 256 bytes
224 * of code, so we can save space by not having a table for this one.
225 */
226
/*
 * Decoder callback for US-ASCII; map is unused.
 *
 * Returns 1 for a non-NUL 7-bit byte, 0 for NUL or when s is null or
 * n is zero, and -1 for a byte with the high bit set. On success the
 * byte value is stored through pwc when pwc is non-null.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
	uint8_t byte;

	(void)map;
	if (!s || !n)
		return 0;

	byte = (uint8_t)*s;
	if (byte > 0x7f)
		return -1;

	if (pwc)
		*pwc = byte;
	return byte != 0;
}
241
/*
 * Encoder callback for US-ASCII; map is unused.
 *
 * Returns 1 after storing wc in *s, 0 if s is null, and -1 if wc lies
 * outside 0..0x7f.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
	(void)map;

	if (!s)
		return 0;
	if (wc < 0 || wc > 0x7f)
		return -1;

	*s = (char)wc;
	return 1;
}
252
253 /*
254 * Implementation of ISO-8859-1.
255 * Probably on most architectures this compiles to less than 256 bytes
256 * of code, so we can save space by not having a table for this one.
257 */
258
/*
 * Decoder callback for ISO-8859-1; map is unused.
 *
 * Every byte value is valid and stands for the code point of the same
 * number, so this decoder never fails. Returns 1 for a non-NUL byte
 * and 0 for NUL or when s is null or n is zero. The byte value is
 * stored through pwc when pwc is non-null.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
	int wc;

	(void)map;
	if (!n || !s)
		return 0;
	/* A uint8_t can never exceed 0xff, so no range check is needed. */
	wc = (uint8_t)*s;
	if (pwc)
		*pwc = wc;
	return wc ? 1 : 0;
}
273
/*
 * Encoder callback for ISO-8859-1; map is unused.
 *
 * Returns 1 after storing wc in *s, 0 if s is null, and -1 if wc lies
 * outside 0..0xff.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
	(void)map;

	if (!s)
		return 0;
	if (wc < 0 || wc > 0xff)
		return -1;

	*s = (char)wc;
	return 1;
}
284
285 /*
286 * Implementation of any 8-bit charset.
287 */
288
/* Private data of a table-driven 8-bit charset. */
struct map {
	/* Indexed by byte value; an entry of 0xffff means "no character". */
	const uint16_t *from;
	/* Reverse lookup table; stays null until the encoder first needs it. */
	struct inverse_map *to;
};
293
mbtowc_8bit(void * map1,int * pwc,const char * s,size_t n)294 static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
295 {
296 struct map *map = map1;
297 uint16_t wc;
298
299 if (!n || !s)
300 return 0;
301 wc = map->from[(uint8_t)*s];
302 if (wc == 0xffff)
303 return -1;
304 if (pwc)
305 *pwc = (int)wc;
306 return wc ? 1 : 0;
307 }
308
309 /*
310 * For the inverse map we use a hash table, which has the advantages
311 * of small constant memory requirement and simple memory allocation,
312 * but the disadvantage of slow conversion in the worst case.
313 * If you need real-time performance while letting a potentially
314 * malicious user define their own map, then the method used in
315 * linux/drivers/char/consolemap.c would be more appropriate.
316 */
317
/*
 * Chained hash table from character to byte value. Table entries are
 * byte values, which double as indices into the forward table.
 */
struct inverse_map {
	/* first[h]: byte value at which the search for hash h starts. */
	uint8_t first[256];
	/* next[b]: byte value to try after b; 0 ends the chain. */
	uint8_t next[256];
};
322
/*
 * Bucket number for a character: simply its low eight bits, which is
 * good enough for this application. The two commented-out variants
 * send every character to a single bucket and exist only for testing.
 */
#define HASH(i) (0xff & (i))
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */
330
make_inverse_map(const uint16_t * from)331 static struct inverse_map *make_inverse_map(const uint16_t *from)
332 {
333 struct inverse_map *to;
334 char used[256];
335 int i, j, k;
336
337 to = malloc(sizeof(struct inverse_map));
338 if (!to)
339 return 0;
340 for (i = 0; i < 256; i++)
341 to->first[i] = to->next[i] = used[i] = 0;
342 for (i = 255; i >= 0; i--)
343 if (from[i] != 0xffff) {
344 k = HASH(from[i]);
345 to->next[i] = to->first[k];
346 to->first[k] = i;
347 used[k] = 1;
348 }
349
350 /* Point the empty buckets at an empty list. */
351 for (i = 0; i < 256; i++)
352 if (!to->next[i])
353 break;
354 if (i < 256)
355 for (j = 0; j < 256; j++)
356 if (!used[j])
357 to->first[j] = i;
358
359 return to;
360 }
361
wctomb_8bit(void * map1,char * s,int wc1)362 static int wctomb_8bit(void *map1, char *s, int wc1)
363 {
364 struct map *map = map1;
365 uint16_t wc = wc1;
366 int i;
367
368 if (!s)
369 return 0;
370
371 if (wc1 & ~0xffff)
372 return -1;
373
374 if (1) /* Change 1 to 0 to test the case where malloc fails. */
375 if (!map->to)
376 map->to = make_inverse_map(map->from);
377
378 if (map->to) {
379 /* Use the inverse map. */
380 i = map->to->first[HASH(wc)];
381 for (;;) {
382 if (map->from[i] == wc) {
383 *s = i;
384 return 1;
385 }
386 if (!(i = map->to->next[i]))
387 break;
388 }
389 }
390 else {
391 /* We don't have an inverse map, so do a linear search. */
392 for (i = 0; i < 256; i++)
393 if (map->from[i] == wc) {
394 *s = i;
395 return 1;
396 }
397 }
398
399 return -1;
400 }
401
402 /*
403 * The "constructor" charset_find().
404 */
405
406 struct charset charset_utf8 = {
407 6,
408 &mbtowc_utf8,
409 &wctomb_utf8,
410 0
411 };
412
413 struct charset charset_iso1 = {
414 1,
415 &mbtowc_iso1,
416 &wctomb_iso1,
417 0
418 };
419
420 struct charset charset_ascii = {
421 1,
422 &mbtowc_ascii,
423 &wctomb_ascii,
424 0
425 };
426
charset_find(const char * code)427 struct charset *charset_find(const char *code)
428 {
429 int i;
430
431 /* Find good (MIME) name. */
432 for (i = 0; names[i].bad; i++)
433 if (!ascii_strcasecmp(code, names[i].bad)) {
434 code = names[i].good;
435 break;
436 }
437
438 /* Recognise some charsets for which we avoid using a table. */
439 if (!ascii_strcasecmp(code, "UTF-8"))
440 return &charset_utf8;
441 if (!ascii_strcasecmp(code, "US-ASCII"))
442 return &charset_ascii;
443 if (!ascii_strcasecmp(code, "ISO-8859-1"))
444 return &charset_iso1;
445
446 /* Look for a mapping for a simple 8-bit encoding. */
447 for (i = 0; maps[i].name; i++)
448 if (!ascii_strcasecmp(code, maps[i].name)) {
449 if (!maps[i].charset) {
450 maps[i].charset = malloc(sizeof(struct charset));
451 if (maps[i].charset) {
452 struct map *map = malloc(sizeof(struct map));
453 if (!map) {
454 free(maps[i].charset);
455 maps[i].charset = 0;
456 }
457 else {
458 maps[i].charset->max = 1;
459 maps[i].charset->mbtowc = &mbtowc_8bit;
460 maps[i].charset->wctomb = &wctomb_8bit;
461 maps[i].charset->map = map;
462 map->from = maps[i].map;
463 map->to = 0; /* inverse mapping is created when required */
464 }
465 }
466 }
467 return maps[i].charset;
468 }
469
470 return 0;
471 }
472
473 /*
474 * Function to convert a buffer from one encoding to another.
475 * Invalid bytes are replaced by '#', and characters that are
476 * not available in the target encoding are replaced by '?'.
477 * Each of TO and TOLEN may be zero, if the result is not needed.
478 * The output buffer is null-terminated, so it is all right to
479 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
480 */
481
charset_convert(const char * fromcode,const char * tocode,const char * from,size_t fromlen,char ** to,size_t * tolen)482 int charset_convert(const char *fromcode, const char *tocode,
483 const char *from, size_t fromlen,
484 char **to, size_t *tolen)
485 {
486 int ret = 0;
487 struct charset *charset1, *charset2;
488 char *tobuf, *p;
489 int i, j, wc;
490
491 charset1 = charset_find(fromcode);
492 charset2 = charset_find(tocode);
493 if (!charset1 || !charset2 )
494 return -1;
495
496 tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1);
497 if (!tobuf)
498 return -2;
499
500 for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
501 i = charset_mbtowc(charset1, &wc, from, fromlen);
502 if (!i)
503 i = 1;
504 else if (i == -1) {
505 i = 1;
506 wc = '#';
507 ret = 2;
508 }
509 j = charset_wctomb(charset2, p, wc);
510 if (j == -1) {
511 if (!ret)
512 ret = 1;
513 j = charset_wctomb(charset2, p, '?');
514 if (j == -1)
515 j = 0;
516 }
517 }
518
519 if (tolen)
520 *tolen = p - tobuf;
521 *p++ = '\0';
522 if (to) {
523 char *tobuf_saved = tobuf;
524 *to = realloc(tobuf, p - tobuf);
525 if (*to == NULL)
526 *to = tobuf_saved;
527 }
528 else
529 free(tobuf);
530
531 return ret;
532 }
533
534 #endif /* USE_CHARSET_ICONV */
535