• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License along
15  * with this program; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17  */
18 
19 /*
20  * See the corresponding header file for a description of the functions
21  * that this file provides.
22  *
23  * This was first written for Ogg Vorbis but could be of general use.
24  *
25  * The only deliberate assumption about data sizes is that a short has
26  * at least 16 bits, but this code has only been tested on systems with
27  * 8-bit char, 16-bit short and 32-bit int.
28  */
29 
30 #ifdef HAVE_CONFIG_H
31 #  include <config.h>
32 #endif
33 
34 #if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */
35 
36 #include <stdlib.h>
37 
38 #include "share/alloc.h"
39 #include "charset.h"
40 
41 #include "charmaps.h"
42 
43 /*
44  * This is like the standard strcasecmp, but it does not depend
45  * on the locale. Locale-dependent functions can be dangerous:
46  * we once had a bug involving strcasecmp("iso", "ISO") in a
47  * Turkish locale!
48  *
49  * (I'm not really sure what the official standard says
50  * about the sign of strcasecmp("Z", "["), but usually
51  * we're only interested in whether it's zero.)
52  */
53 
/*
 * Locale-independent, ASCII-only case-insensitive string comparison.
 *
 * Only the letters 'A'..'Z' are folded (to lower case); every other byte
 * is compared as an unsigned value.
 *
 * Returns 0 when the strings are equal ignoring ASCII case, otherwise the
 * difference between the first pair of differing *folded* bytes, so the
 * sign agrees with case-insensitive ordering (e.g. "a" sorts before "B").
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
  unsigned char c1, c2;

  for (;; s1++, s2++) {
    c1 = (unsigned char)*s1;
    c2 = (unsigned char)*s2;
    if ('A' <= c1 && c1 <= 'Z')
      c1 += 'a' - 'A';
    if ('A' <= c2 && c2 <= 'Z')
      c2 += 'a' - 'A';
    /* Stop at the first difference, or when both strings end together. */
    if (c1 != c2 || !c1)
      return (int)c1 - (int)c2;
  }
}
74 
75 /*
76  * UTF-8 equivalents of the C library's wctomb() and mbtowc().
77  */
78 
/*
 * Decode one UTF-8 sequence (up to 6 bytes) from the first n bytes of s.
 *
 * On success the code point is stored through pwc (when pwc is non-null)
 * and the number of bytes consumed is returned.  Returns 0 when s is null,
 * n is zero, or the first byte is NUL; returns -1 for a malformed,
 * truncated or overlong sequence.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
  uint8_t lead;
  int len, value, idx;

  if (!s || !n)
    return 0;

  lead = *s;

  /* Plain ASCII. */
  if (lead < 0x80) {
    if (pwc)
      *pwc = lead;
    return lead != 0;
  }

  /* Stray continuation byte, or 0xc0/0xc1 which could only start an
     overlong two-byte form. */
  if (lead < 0xc2)
    return -1;

  /* Two-byte sequences are handled directly. */
  if (lead < 0xe0) {
    if (n < 2 || (s[1] & 0xc0) != 0x80)
      return -1;
    if (pwc)
      *pwc = ((lead & 0x1f) << 6) | (s[1] & 0x3f);
    return 2;
  }

  /* 0xfe and 0xff never occur in UTF-8. */
  if (lead >= 0xfe)
    return -1;

  if (lead < 0xf0)
    len = 3;
  else if (lead < 0xf8)
    len = 4;
  else if (lead < 0xfc)
    len = 5;
  else
    len = 6;

  if (n < (size_t)len)
    return -1;

  /* The lead byte contributes its low (7 - len) bits. */
  value = lead & ((1 << (7 - len)) - 1);
  for (idx = 1; idx < len; idx++) {
    if ((s[idx] & 0xc0) != 0x80)
      return -1;
    value = (value << 6) | (s[idx] & 0x3f);
  }

  /* Reject overlong encodings: the value must need this many bytes. */
  if (value < (1 << (5 * len - 4)))
    return -1;

  if (pwc)
    *pwc = value;
  return len;
}
129 
/*
 * Encode the code point wc1 as UTF-8 (1 to 6 bytes) into s.
 *
 * Returns the number of bytes written, 0 when s is null, or -1 when the
 * value does not fit in 31 bits (which includes negative arguments).
 * No terminating NUL is written.
 */
int utf8_wctomb(char *s, int wc1)
{
  /* Marker bits of the lead byte, indexed by sequence length. */
  static const uint8_t lead_prefix[7] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  };
  uint32_t code = wc1;
  int len, pos;

  if (!s)
    return 0;

  if (code < (1u << 7))
    len = 1;
  else if (code < (1u << 11))
    len = 2;
  else if (code < (1u << 16))
    len = 3;
  else if (code < (1u << 21))
    len = 4;
  else if (code < (1u << 26))
    len = 5;
  else if (code < (1u << 31))
    len = 6;
  else
    return -1;

  /* Fill the continuation bytes from the end, six bits at a time. */
  for (pos = len - 1; pos > 0; pos--) {
    s[pos] = 0x80 | (code & 0x3f);
    code >>= 6;
  }
  /* Whatever is left belongs in the lead byte. */
  s[0] = lead_prefix[len] | code;

  return len;
}
178 
179 /*
180  * The charset "object" and methods.
181  */
182 
struct charset {
  /* Upper bound on the bytes one character occupies in this encoding
     (6 for UTF-8, 1 for the single-byte sets defined in this file). */
  int max;

  /* Decode one character from the first `len` bytes of `in` into *pwc.
     The implementations in this file return the number of bytes used,
     0 for a NUL byte or empty input, and -1 for invalid input. */
  int (*mbtowc)(void *data, int *pwc, const char *in, size_t len);

  /* Encode `wc` into `out`.  The implementations in this file return the
     number of bytes written, 0 when `out` is null, and -1 when `wc`
     cannot be represented. */
  int (*wctomb)(void *data, char *out, int wc);

  /* Opaque per-charset data handed to both callbacks as their first
     argument; null for the table-free encodings. */
  void *map;
};
189 
charset_mbtowc(struct charset * charset,int * pwc,const char * s,size_t n)190 int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
191 {
192   return (*charset->mbtowc)(charset->map, pwc, s, n);
193 }
194 
charset_wctomb(struct charset * charset,char * s,int wc)195 int charset_wctomb(struct charset *charset, char *s, int wc)
196 {
197   return (*charset->wctomb)(charset->map, s, wc);
198 }
199 
charset_max(struct charset * charset)200 int charset_max(struct charset *charset)
201 {
202   return charset->max;
203 }
204 
205 /*
206  * Implementation of UTF-8.
207  */
208 
/*
 * Charset decode callback for UTF-8.  UTF-8 needs no table, so `map` is
 * ignored; it exists only so the function fits the callback signature.
 */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
  int consumed = utf8_mbtowc(pwc, s, n);

  (void)map;
  return consumed;
}
214 
/*
 * Charset encode callback for UTF-8.  UTF-8 needs no table, so `map` is
 * ignored; it exists only so the function fits the callback signature.
 */
static int wctomb_utf8(void *map, char *s, int wc)
{
  int written = utf8_wctomb(s, wc);

  (void)map;
  return written;
}
220 
221 /*
222  * Implementation of US-ASCII.
223  * Probably on most architectures this compiles to less than 256 bytes
224  * of code, so we can save space by not having a table for this one.
225  */
226 
/*
 * Charset decode callback for US-ASCII (`map` is unused).
 *
 * Returns 1 and stores the byte through pwc (when non-null) for a
 * non-NUL 7-bit byte, 0 for a NUL byte (still stored), null s or n == 0,
 * and -1 for any byte with the high bit set.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
  uint8_t byte;

  (void)map;
  if (!s || !n)
    return 0;

  byte = (uint8_t)*s;
  if (byte > 0x7f)
    return -1;

  if (pwc)
    *pwc = byte;
  return byte != 0;
}
241 
/*
 * Charset encode callback for US-ASCII (`map` is unused).
 *
 * Writes wc as a single byte and returns 1 when it lies in 0..0x7f;
 * returns -1 for anything else and 0 when s is null.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
  (void)map;

  if (!s)
    return 0;
  if (wc < 0 || wc > 0x7f)
    return -1;

  *s = (char)wc;
  return 1;
}
252 
253 /*
254  * Implementation of ISO-8859-1.
255  * Probably on most architectures this compiles to less than 256 bytes
256  * of code, so we can save space by not having a table for this one.
257  */
258 
/*
 * Charset decode callback for ISO-8859-1 (`map` is unused).
 *
 * Every byte value is a valid ISO-8859-1 character whose code point
 * equals the byte, so this never fails: it stores the byte through pwc
 * (when non-null) and returns 1, or 0 for a NUL byte, null s or n == 0.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
  uint8_t byte;

  (void)map;
  if (!n || !s)
    return 0;

  /* A value read through uint8_t is always within 0..0xff, so no range
     check is needed here. */
  byte = (uint8_t)*s;
  if (pwc)
    *pwc = byte;
  return byte ? 1 : 0;
}
273 
/*
 * Charset encode callback for ISO-8859-1 (`map` is unused).
 *
 * Writes wc as a single byte and returns 1 when it lies in 0..0xff;
 * returns -1 for anything else and 0 when s is null.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
  (void)map;

  if (!s)
    return 0;
  if (wc < 0 || wc > 0xff)
    return -1;

  *s = (char)wc;
  return 1;
}
284 
285 /*
286  * Implementation of any 8-bit charset.
287  */
288 
struct map {
  /* Forward table indexed by byte value: the character each byte stands
     for, with 0xffff marking a byte that has no mapping. */
  const uint16_t *from;

  /* Reverse lookup table; starts out null and is built on first use by
     wctomb_8bit(). */
  struct inverse_map *to;
};
293 
mbtowc_8bit(void * map1,int * pwc,const char * s,size_t n)294 static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
295 {
296   struct map *map = map1;
297   uint16_t wc;
298 
299   if (!n || !s)
300     return 0;
301   wc = map->from[(uint8_t)*s];
302   if (wc == 0xffff)
303     return -1;
304   if (pwc)
305     *pwc = (int)wc;
306   return wc ? 1 : 0;
307 }
308 
309 /*
310  * For the inverse map we use a hash table, which has the advantages
311  * of small constant memory requirement and simple memory allocation,
312  * but the disadvantage of slow conversion in the worst case.
313  * If you need real-time performance while letting a potentially
314  * malicious user define their own map, then the method used in
315  * linux/drivers/char/consolemap.c would be more appropriate.
316  */
317 
struct inverse_map {
  /* Per hash bucket: the byte value at the head of that bucket's chain. */
  uint8_t first[256];

  /* Per byte value: the following byte in the same chain; 0 ends it. */
  uint8_t next[256];
};
322 
323 /*
324  * The simple hash is good enough for this application.
325  * Use the alternative trivial hashes for testing.
326  */
/* Bucket index for a character value: its low eight bits. */
#define HASH(i) ((i) & 0xff)
/* Degenerate variants that force every value into a single bucket: */
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */
330 
make_inverse_map(const uint16_t * from)331 static struct inverse_map *make_inverse_map(const uint16_t *from)
332 {
333   struct inverse_map *to;
334   char used[256];
335   int i, j, k;
336 
337   to = malloc(sizeof(struct inverse_map));
338   if (!to)
339     return 0;
340   for (i = 0; i < 256; i++)
341     to->first[i] = to->next[i] = used[i] = 0;
342   for (i = 255; i >= 0; i--)
343     if (from[i] != 0xffff) {
344       k = HASH(from[i]);
345       to->next[i] = to->first[k];
346       to->first[k] = i;
347       used[k] = 1;
348     }
349 
350   /* Point the empty buckets at an empty list. */
351   for (i = 0; i < 256; i++)
352     if (!to->next[i])
353       break;
354   if (i < 256)
355     for (j = 0; j < 256; j++)
356       if (!used[j])
357 	to->first[j] = i;
358 
359   return to;
360 }
361 
wctomb_8bit(void * map1,char * s,int wc1)362 static int wctomb_8bit(void *map1, char *s, int wc1)
363 {
364   struct map *map = map1;
365   uint16_t wc = wc1;
366   int i;
367 
368   if (!s)
369     return 0;
370 
371   if (wc1 & ~0xffff)
372     return -1;
373 
374   if (1) /* Change 1 to 0 to test the case where malloc fails. */
375     if (!map->to)
376       map->to = make_inverse_map(map->from);
377 
378   if (map->to) {
379     /* Use the inverse map. */
380     i = map->to->first[HASH(wc)];
381     for (;;) {
382       if (map->from[i] == wc) {
383 	*s = i;
384 	return 1;
385       }
386       if (!(i = map->to->next[i]))
387 	break;
388     }
389   }
390   else {
391     /* We don't have an inverse map, so do a linear search. */
392     for (i = 0; i < 256; i++)
393       if (map->from[i] == wc) {
394 	*s = i;
395 	return 1;
396       }
397   }
398 
399   return -1;
400 }
401 
402 /*
403  * The "constructor" charset_find().
404  */
405 
406 struct charset charset_utf8 = {
407   6,
408   &mbtowc_utf8,
409   &wctomb_utf8,
410   0
411 };
412 
413 struct charset charset_iso1 = {
414   1,
415   &mbtowc_iso1,
416   &wctomb_iso1,
417   0
418 };
419 
420 struct charset charset_ascii = {
421   1,
422   &mbtowc_ascii,
423   &wctomb_ascii,
424   0
425 };
426 
charset_find(const char * code)427 struct charset *charset_find(const char *code)
428 {
429   int i;
430 
431   /* Find good (MIME) name. */
432   for (i = 0; names[i].bad; i++)
433     if (!ascii_strcasecmp(code, names[i].bad)) {
434       code = names[i].good;
435       break;
436     }
437 
438   /* Recognise some charsets for which we avoid using a table. */
439   if (!ascii_strcasecmp(code, "UTF-8"))
440     return &charset_utf8;
441   if (!ascii_strcasecmp(code, "US-ASCII"))
442     return &charset_ascii;
443   if (!ascii_strcasecmp(code, "ISO-8859-1"))
444     return &charset_iso1;
445 
446   /* Look for a mapping for a simple 8-bit encoding. */
447   for (i = 0; maps[i].name; i++)
448     if (!ascii_strcasecmp(code, maps[i].name)) {
449       if (!maps[i].charset) {
450 	maps[i].charset = malloc(sizeof(struct charset));
451 	if (maps[i].charset) {
452 	  struct map *map = malloc(sizeof(struct map));
453 	  if (!map) {
454 	    free(maps[i].charset);
455 	    maps[i].charset = 0;
456 	  }
457 	  else {
458 	    maps[i].charset->max = 1;
459 	    maps[i].charset->mbtowc = &mbtowc_8bit;
460 	    maps[i].charset->wctomb = &wctomb_8bit;
461 	    maps[i].charset->map = map;
462 	    map->from = maps[i].map;
463 	    map->to = 0; /* inverse mapping is created when required */
464 	  }
465 	}
466       }
467       return maps[i].charset;
468     }
469 
470   return 0;
471 }
472 
473 /*
474  * Function to convert a buffer from one encoding to another.
475  * Invalid bytes are replaced by '#', and characters that are
476  * not available in the target encoding are replaced by '?'.
477  * Each of TO and TOLEN may be zero, if the result is not needed.
478  * The output buffer is null-terminated, so it is all right to
479  * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
480  */
481 
charset_convert(const char * fromcode,const char * tocode,const char * from,size_t fromlen,char ** to,size_t * tolen)482 int charset_convert(const char *fromcode, const char *tocode,
483 		    const char *from, size_t fromlen,
484 		    char **to, size_t *tolen)
485 {
486   int ret = 0;
487   struct charset *charset1, *charset2;
488   char *tobuf, *p;
489   int i, j, wc;
490 
491   charset1 = charset_find(fromcode);
492   charset2 = charset_find(tocode);
493   if (!charset1 || !charset2 )
494     return -1;
495 
496   tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1);
497   if (!tobuf)
498     return -2;
499 
500   for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
501     i = charset_mbtowc(charset1, &wc, from, fromlen);
502     if (!i)
503       i = 1;
504     else if (i == -1) {
505       i  = 1;
506       wc = '#';
507       ret = 2;
508     }
509     j = charset_wctomb(charset2, p, wc);
510     if (j == -1) {
511       if (!ret)
512 	ret = 1;
513       j = charset_wctomb(charset2, p, '?');
514       if (j == -1)
515 	j = 0;
516     }
517   }
518 
519   if (tolen)
520     *tolen = p - tobuf;
521   *p++ = '\0';
522   if (to) {
523     char *tobuf_saved = tobuf;
524     *to = realloc(tobuf, p - tobuf);
525     if (*to == NULL)
526       *to = tobuf_saved;
527   }
528   else
529     free(tobuf);
530 
531   return ret;
532 }
533 
534 #endif /* USE_CHARSET_ICONV */
535