1 /* Charset handling while reading PO files.
2 Copyright (C) 2001-2007, 2010, 2019-2020 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 #include <alloca.h>
23
24 /* Specification. */
25 #include "po-charset.h"
26
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "xmalloca.h"
31 #include "xvasprintf.h"
32 #include "po-xerror.h"
33 #if !IN_LIBGETTEXTPO
34 # include "basename-lgpl.h"
35 # include "progname.h"
36 #endif
37 #include "c-strstr.h"
38 #include "c-strcase.h"
39 #include "gettext.h"
40
/* Shorthand for looking up the translation of a message.  */
#define _(str) gettext (str)

/* Number of elements of the array A.  Only meaningful for true arrays,
   not for pointers.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))

/* Storage for the canonical ASCII name.  The exported pointer below and the
   table in po_charset_canonicalize both refer to this one array.  */
static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* Storage for the canonical UTF-8 name.  The exported pointer below and the
   table in po_charset_canonicalize both refer to this one array.  */
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
54
55 /* Canonicalize an encoding name. */
56 const char *
po_charset_canonicalize(const char * charset)57 po_charset_canonicalize (const char *charset)
58 {
59 /* The list of charsets supported by glibc's iconv() and by the portable
60 iconv() across platforms. Taken from intl/localcharset.h. */
61 static const char *standard_charsets[] =
62 {
63 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
64 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
65 "ISO-8859-2", "ISO_8859-2",
66 "ISO-8859-3", "ISO_8859-3",
67 "ISO-8859-4", "ISO_8859-4",
68 "ISO-8859-5", "ISO_8859-5",
69 "ISO-8859-6", "ISO_8859-6",
70 "ISO-8859-7", "ISO_8859-7",
71 "ISO-8859-8", "ISO_8859-8",
72 "ISO-8859-9", "ISO_8859-9",
73 "ISO-8859-13", "ISO_8859-13",
74 "ISO-8859-14", "ISO_8859-14",
75 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
76 "KOI8-R",
77 "KOI8-U",
78 "KOI8-T",
79 "CP850",
80 "CP866",
81 "CP874",
82 "CP932",
83 "CP949",
84 "CP950",
85 "CP1250",
86 "CP1251",
87 "CP1252",
88 "CP1253",
89 "CP1254",
90 "CP1255",
91 "CP1256",
92 "CP1257",
93 "GB2312",
94 "EUC-JP",
95 "EUC-KR",
96 "EUC-TW",
97 "BIG5",
98 "BIG5-HKSCS",
99 "GBK",
100 "GB18030",
101 "SHIFT_JIS",
102 "JOHAB",
103 "TIS-620",
104 "VISCII",
105 "GEORGIAN-PS",
106 utf8
107 };
108 size_t i;
109
110 for (i = 0; i < SIZEOF (standard_charsets); i++)
111 if (c_strcasecmp (charset, standard_charsets[i]) == 0)
112 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
113 return NULL;
114 }
115
/* Test for ASCII compatibility.
   CANON_CHARSET is compared with strcmp(), so it must use the canonical
   spelling.  Returns false for SHIFT_JIS, JOHAB and VISCII, true for every
   other name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  return strcmp (canon_charset, "SHIFT_JIS") != 0
         && strcmp (canon_charset, "JOHAB") != 0
         && strcmp (canon_charset, "VISCII") != 0;
}
128
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is compared with strcmp(), so it must use the canonical
   spelling.  */
bool po_is_charset_weird (const char *canon_charset)
{
  /* NULL-terminated list of the weird encodings.  */
  static const char *const weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB",
    NULL
  };
  const char *const *entry;

  for (entry = weird_charsets; *entry != NULL; entry++)
    {
      if (strcmp (canon_charset, *entry) == 0)
        return true;
    }
  return false;
}
149
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is compared with strcmp(), so it must use the canonical
   spelling.  */
bool po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *const weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  size_t remaining = sizeof weird_cjk_charsets / sizeof weird_cjk_charsets[0];

  /* The names are distinct, so the scan direction does not matter.  */
  while (remaining > 0)
    {
      remaining--;
      if (strcmp (canon_charset, weird_cjk_charsets[remaining]) == 0)
        return true;
    }
  return false;
}
172
173 /* Hardcoded iterator functions for all kinds of encodings.
174 We could also implement a general iterator function with iconv(),
175 but we need a fast one. */
176
/* Character iterator for 8-bit encodings.
   Every character occupies exactly one byte, so the result is always 1.  */
static size_t
char_iterator (const char *s)
{
  /* The bytes at S are never inspected.  */
  (void) s;
  return 1;
}
183
/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
/* Returns 2 when both the byte at S and the following byte lie in
   0xA1..0xFE, and 1 otherwise.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] < 0xa1 || p[0] > 0xfe)
    return 1;

  /* The second byte is examined only after the first one qualified.  */
  return (p[1] >= 0xa1 && p[1] <= 0xfe) ? 2 : 1;
}
198
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized forms (anything else counts as a single byte):
     0xA1..0xFE 0xA1..0xFE               2 bytes
     0x8E       0xA1..0xDF               2 bytes
     0x8F       0xA1..0xFE 0xA1..0xFE    3 bytes  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  switch (p[0])
    {
    case 0x8e:
      return (p[1] >= 0xa1 && p[1] <= 0xdf) ? 2 : 1;

    case 0x8f:
      /* The third byte is read only if the second one is valid.  */
      return (p[1] >= 0xa1 && p[1] <= 0xfe
              && p[2] >= 0xa1 && p[2] <= 0xfe) ? 3 : 1;

    default:
      if (p[0] >= 0xa1 && p[0] <= 0xfe
          && p[1] >= 0xa1 && p[1] <= 0xfe)
        return 2;
      return 1;
    }
}
228
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized forms (anything else counts as a single byte):
     0xA1..0xFE 0xA1..0xFE                          2 bytes
     0x8E       0xA1..0xB0 0xA1..0xFE 0xA1..0xFE    4 bytes  */
static size_t
euc_tw_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] == 0x8e)
    {
      /* Each further byte is read only if its predecessor is valid.  */
      if (p[1] >= 0xa1 && p[1] <= 0xb0
          && p[2] >= 0xa1 && p[2] <= 0xfe
          && p[3] >= 0xa1 && p[3] <= 0xfe)
        return 4;
      return 1;
    }

  if (p[0] >= 0xa1 && p[0] <= 0xfe
      && p[1] >= 0xa1 && p[1] <= 0xfe)
    return 2;

  return 1;
}
256
/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 for a lead byte in 0xA1..0xFE followed by a byte in 0x40..0x7E
   or 0xA1..0xFE, and 1 otherwise.  */
static size_t
big5_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char trail;

  if (p[0] < 0xa1 || p[0] > 0xfe)
    return 1;

  trail = p[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
270
/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 for a lead byte in 0x88..0xFE followed by a byte in 0x40..0x7E
   or 0xA1..0xFE, and 1 otherwise.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char trail;

  if (p[0] < 0x88 || p[0] > 0xfe)
    return 1;

  trail = p[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
284
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 for a lead byte in 0x81..0xFE followed by a byte in 0x40..0x7E
   or 0x80..0xFE, and 1 otherwise.  */
static size_t
gbk_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char trail;

  if (p[0] < 0x81 || p[0] > 0xfe)
    return 1;

  trail = p[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0x80 && trail <= 0xfe)
    return 2;
  return 1;
}
299
/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized forms (anything else counts as a single byte):
     0x81..0xFE  0x40..0x7E or 0x80..0xFE                 2 bytes
     L           0x30..0x39  0x81..0xFE  0x30..0x39       4 bytes
   where the lead byte L is in 0x81..0x84 (four-byte codes for characters
   of the Basic Multilingual Plane) or in 0x90..0xE3 (four-byte codes for
   U+10000..U+10FFFF).  The two forms cannot be confused because their
   second bytes come from disjoint ranges.  */
static size_t
gb18030_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  /* Bytes outside 0x81..0xFE never start a multibyte character.  */
  if (lead < 0x81 || lead > 0xfe)
    return 1;

  /* Two-byte form.  */
  if ((p[1] >= 0x40 && p[1] <= 0x7e) || (p[1] >= 0x80 && p[1] <= 0xfe))
    return 2;

  /* Four-byte form.  Each further byte is read only if its predecessor is
     valid, so the scan never runs past a terminating NUL.  */
  if (((lead >= 0x81 && lead <= 0x84) || (lead >= 0x90 && lead <= 0xe3))
      && p[1] >= 0x30 && p[1] <= 0x39
      && p[2] >= 0x81 && p[2] <= 0xfe
      && p[3] >= 0x30 && p[3] <= 0x39)
    return 4;

  return 1;
}
327
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 for a lead byte in 0x81..0x9F or 0xE0..0xF9 followed by a byte
   in 0x40..0x7E or 0x80..0xFC, and 1 otherwise.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];
  unsigned char trail;

  if (!((lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xf9)))
    return 1;

  trail = p[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0x80 && trail <= 0xfc)
    return 2;
  return 1;
}
341
/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized forms (anything else counts as a single byte):
     0x84..0xD3  0x41..0x7E or 0x81..0xFE    2 bytes
     0xD9..0xF9  0x31..0x7E or 0x91..0xFE    2 bytes  */
static size_t
johab_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];
  unsigned char low_start;
  unsigned char high_start;
  unsigned char trail;

  /* Pick the lower bounds of the two trail-byte ranges; the upper bounds
     (0x7E and 0xFE) are the same for both lead-byte ranges.  */
  if (lead >= 0x84 && lead <= 0xd3)
    {
      low_start = 0x41;
      high_start = 0x81;
    }
  else if (lead >= 0xd9 && lead <= 0xf9)
    {
      low_start = 0x31;
      high_start = 0x91;
    }
  else
    return 1;

  trail = p[1];
  if ((trail >= low_start && trail <= 0x7e)
      || (trail >= high_start && trail <= 0xfe))
    return 2;
  return 1;
}
362
/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The lead byte selects the expected length (0xC2..0xDF: 2, 0xE0..0xEF: 3,
   0xF0..0xF7: 4).  That length is returned only if every following byte of
   the sequence is in 0x80..0xBF; in all other cases the result is 1.  */
static size_t
utf8_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];
  size_t expected;
  size_t k;

  if (lead < 0xc2 || lead >= 0xf8)
    return 1;

  if (lead < 0xe0)
    expected = 2;
  else if (lead < 0xf0)
    expected = 3;
  else
    expected = 4;

  /* Stop at the first byte that is not a continuation byte, so that nothing
     beyond a terminating NUL is ever read.  */
  for (k = 1; k < expected; k++)
    if ((p[k] & 0xc0) != 0x80)
      return 1;

  return expected;
}
403
404 /* Returns a character iterator for a given encoding.
405 Given a pointer into a string, it returns the number occupied by the next
406 single character. If the piece of string is not valid or if the *s == '\0',
407 it returns 1. */
408 character_iterator_t
po_charset_character_iterator(const char * canon_charset)409 po_charset_character_iterator (const char *canon_charset)
410 {
411 if (canon_charset == utf8)
412 return utf8_character_iterator;
413 if (strcmp (canon_charset, "GB2312") == 0
414 || strcmp (canon_charset, "EUC-KR") == 0)
415 return euc_character_iterator;
416 if (strcmp (canon_charset, "EUC-JP") == 0)
417 return euc_jp_character_iterator;
418 if (strcmp (canon_charset, "EUC-TW") == 0)
419 return euc_tw_character_iterator;
420 if (strcmp (canon_charset, "BIG5") == 0)
421 return big5_character_iterator;
422 if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
423 return big5hkscs_character_iterator;
424 if (strcmp (canon_charset, "GBK") == 0)
425 return gbk_character_iterator;
426 if (strcmp (canon_charset, "GB18030") == 0)
427 return gb18030_character_iterator;
428 if (strcmp (canon_charset, "SHIFT_JIS") == 0)
429 return shift_jis_character_iterator;
430 if (strcmp (canon_charset, "JOHAB") == 0)
431 return johab_character_iterator;
432 return char_iterator;
433 }
434
435
/* The PO file's encoding, as specified in the header entry.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  */
bool po_lex_weird_cjk;

/* Puts the lexer's charset state into its initial condition: no charset,
   no converter (when built with iconv), and no CJK structure assumed.
   An already open converter is not closed here; po_lex_charset_close does
   that.  */
void
po_lex_charset_init (void)
{
  po_lex_charset = NULL;
#if HAVE_ICONV
  po_lex_iconv = (iconv_t)(-1);
#endif
  po_lex_weird_cjk = false;
}
456
457 void
po_lex_charset_set(const char * header_entry,const char * filename)458 po_lex_charset_set (const char *header_entry, const char *filename)
459 {
460 /* Verify the validity of CHARSET. It is necessary
461 1. for the correct treatment of multibyte characters containing
462 0x5C bytes in the PO lexer,
463 2. so that at run time, gettext() can call iconv() to convert
464 msgstr. */
465 const char *charsetstr = c_strstr (header_entry, "charset=");
466
467 if (charsetstr != NULL)
468 {
469 size_t len;
470 char *charset;
471 const char *canon_charset;
472
473 charsetstr += strlen ("charset=");
474 len = strcspn (charsetstr, " \t\n");
475 charset = (char *) xmalloca (len + 1);
476 memcpy (charset, charsetstr, len);
477 charset[len] = '\0';
478
479 canon_charset = po_charset_canonicalize (charset);
480 if (canon_charset == NULL)
481 {
482 /* Don't warn for POT files, because POT files usually contain
483 only ASCII msgids. */
484 size_t filenamelen = strlen (filename);
485
486 if (!(filenamelen >= 4
487 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
488 && strcmp (charset, "CHARSET") == 0))
489 {
490 char *warning_message =
491 xasprintf (_("\
492 Charset \"%s\" is not a portable encoding name.\n\
493 Message conversion to user's charset might not work.\n"),
494 charset);
495 po_xerror (PO_SEVERITY_WARNING, NULL,
496 filename, (size_t)(-1), (size_t)(-1), true,
497 warning_message);
498 free (warning_message);
499 }
500 }
501 else
502 {
503 const char *envval;
504
505 po_lex_charset = canon_charset;
506 #if HAVE_ICONV
507 if (po_lex_iconv != (iconv_t)(-1))
508 iconv_close (po_lex_iconv);
509 #endif
510
511 /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
512 don't know about multibyte encodings, and require a spurious
513 backslash after every multibyte character whose last byte is
514 0x5C. Some programs, like vim, distribute PO files in this
515 broken format. GNU msgfmt must continue to support this old
516 PO file format when the Makefile requests it. */
517 envval = getenv ("OLD_PO_FILE_INPUT");
518 if (envval != NULL && *envval != '\0')
519 {
520 /* Assume the PO file is in old format, with extraneous
521 backslashes. */
522 #if HAVE_ICONV
523 po_lex_iconv = (iconv_t)(-1);
524 #endif
525 po_lex_weird_cjk = false;
526 }
527 else
528 {
529 /* Use iconv() to parse multibyte characters. */
530 #if HAVE_ICONV
531 /* Avoid glibc-2.1 bug with EUC-KR. */
532 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
533 && !defined _LIBICONV_VERSION
534 if (strcmp (po_lex_charset, "EUC-KR") == 0)
535 po_lex_iconv = (iconv_t)(-1);
536 else
537 # endif
538 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
539 GBK, GB18030. */
540 # if defined __sun && !defined _LIBICONV_VERSION
541 if ( strcmp (po_lex_charset, "GB2312") == 0
542 || strcmp (po_lex_charset, "EUC-TW") == 0
543 || strcmp (po_lex_charset, "BIG5") == 0
544 || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
545 || strcmp (po_lex_charset, "GBK") == 0
546 || strcmp (po_lex_charset, "GB18030") == 0)
547 po_lex_iconv = (iconv_t)(-1);
548 else
549 # endif
550 po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
551 if (po_lex_iconv == (iconv_t)(-1))
552 {
553 const char *progname;
554 char *warning_message;
555 const char *recommendation;
556 const char *note;
557 char *whole_message;
558
559 # if IN_LIBGETTEXTPO
560 progname = "libgettextpo";
561 # else
562 progname = last_component (program_name);
563 # endif
564
565 warning_message =
566 xasprintf (_("\
567 Charset \"%s\" is not supported. %s relies on iconv(),\n\
568 and iconv() does not support \"%s\".\n"),
569 po_lex_charset, progname, po_lex_charset);
570
571 # if !defined _LIBICONV_VERSION
572 recommendation = _("\
573 Installing GNU libiconv and then reinstalling GNU gettext\n\
574 would fix this problem.\n");
575 # else
576 recommendation = "";
577 # endif
578
579 /* Test for a charset which has double-byte characters
580 ending in 0x5C. For these encodings, the string parser
581 is likely to be confused if it can't see the character
582 boundaries. */
583 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
584 if (po_is_charset_weird (po_lex_charset)
585 && !po_lex_weird_cjk)
586 note = _("Continuing anyway, expect parse errors.");
587 else
588 note = _("Continuing anyway.");
589
590 whole_message =
591 xasprintf ("%s%s%s\n",
592 warning_message, recommendation, note);
593
594 po_xerror (PO_SEVERITY_WARNING, NULL,
595 filename, (size_t)(-1), (size_t)(-1), true,
596 whole_message);
597
598 free (whole_message);
599 free (warning_message);
600 }
601 #else
602 /* Test for a charset which has double-byte characters
603 ending in 0x5C. For these encodings, the string parser
604 is likely to be confused if it can't see the character
605 boundaries. */
606 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
607 if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
608 {
609 const char *progname;
610 char *warning_message;
611 const char *recommendation;
612 const char *note;
613 char *whole_message;
614
615 # if IN_LIBGETTEXTPO
616 progname = "libgettextpo";
617 # else
618 progname = last_component (program_name);
619 # endif
620
621 warning_message =
622 xasprintf (_("\
623 Charset \"%s\" is not supported. %s relies on iconv().\n\
624 This version was built without iconv().\n"),
625 po_lex_charset, progname);
626
627 recommendation = _("\
628 Installing GNU libiconv and then reinstalling GNU gettext\n\
629 would fix this problem.\n");
630
631 note = _("Continuing anyway, expect parse errors.");
632
633 whole_message =
634 xasprintf ("%s%s%s\n",
635 warning_message, recommendation, note);
636
637 po_xerror (PO_SEVERITY_WARNING, NULL,
638 filename, (size_t)(-1), (size_t)(-1), true,
639 whole_message);
640
641 free (whole_message);
642 free (warning_message);
643 }
644 #endif
645 }
646 }
647 freea (charset);
648 }
649 else
650 {
651 /* Don't warn for POT files, because POT files usually contain
652 only ASCII msgids. */
653 size_t filenamelen = strlen (filename);
654
655 if (!(filenamelen >= 4
656 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
657 po_xerror (PO_SEVERITY_WARNING,
658 NULL, filename, (size_t)(-1), (size_t)(-1), true,
659 _("\
660 Charset missing in header.\n\
661 Message conversion to user's charset will not work.\n"));
662 }
663 }
664
665 void
po_lex_charset_close()666 po_lex_charset_close ()
667 {
668 po_lex_charset = NULL;
669 #if HAVE_ICONV
670 if (po_lex_iconv != (iconv_t)(-1))
671 {
672 iconv_close (po_lex_iconv);
673 po_lex_iconv = (iconv_t)(-1);
674 }
675 #endif
676 po_lex_weird_cjk = false;
677 }
678