1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Determine a canonical name for the current locale's character encoding.
4
5 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20
21 /* Written by Bruno Haible <bruno@clisp.org>. */
22
23 #include <config.h>
24
25 /* Specification. */
26 #include "localcharset.h"
27
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdlib.h>
32
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
35 #endif
36
37 #if defined _WIN32 || defined __WIN32__
38 # define WIN32_NATIVE
39 #endif
40
41 #if defined __EMX__
42 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
43 # ifndef OS2
44 # define OS2
45 # endif
46 #endif
47
48 #if !defined WIN32_NATIVE
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
51 # else
52 # if 0 /* see comment below */
53 # include <locale.h>
54 # endif
55 # endif
56 # ifdef __CYGWIN__
57 # define WIN32_LEAN_AND_MEAN
58 # include <windows.h>
59 # endif
60 #elif defined WIN32_NATIVE
61 # define WIN32_LEAN_AND_MEAN
62 # include <windows.h>
63 #endif
64 #if defined OS2
65 # define INCL_DOS
66 # include <os2.h>
67 #endif
68
69 #if ENABLE_RELOCATABLE
70 # include "relocatable.h"
71 #else
72 # define relocate(pathname) (pathname)
73 #endif
74
75 /* Get LIBDIR. */
76 #ifndef LIBDIR
77 # include "configmake.h"
78 #endif
79
80 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
81 /* Win32, Cygwin, OS/2, DOS */
82 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
83 #endif
84
85 #ifndef DIRECTORY_SEPARATOR
86 # define DIRECTORY_SEPARATOR '/'
87 #endif
88
89 #ifndef ISSLASH
90 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
91 #endif
92
93 #if HAVE_DECL_GETC_UNLOCKED
94 # undef getc
95 # define getc getc_unlocked
96 #endif
97
/* Cache for the alias table built by get_charset_aliases.

   The variable is qualified 'volatile' as a precaution against a possible
   multithread problem: if two threads run get_charset_aliases at the same
   time, each computes an equivalent table and stores a pointer to it here.
   That is harmless as long as each pointer store is atomic; what happens if
   the two stores interleave is not known.

   When the compiler does not report __STDC__ as 1, the 'volatile' keyword is
   defined away for the rest of this file (presumably because such compilers
   may not support it).  */
#if __STDC__ != 1
# define volatile /* empty */
#endif
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  Its format is:
     ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases = NULL;
111
112 /* Return a pointer to the contents of the charset.alias file. */
113 static const char *
get_charset_aliases(void)114 get_charset_aliases (void)
115 {
116 const char *cp;
117
118 cp = charset_aliases;
119 if (cp == NULL)
120 {
121 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
122 FILE *fp;
123 const char *dir;
124 const char *base = "charset.alias";
125 char *file_name;
126
127 /* Make it possible to override the charset.alias location. This is
128 necessary for running the testsuite before "make install". */
129 dir = getenv ("CHARSETALIASDIR");
130 if (dir == NULL || dir[0] == '\0')
131 dir = relocate (LIBDIR);
132
133 /* Concatenate dir and base into freshly allocated file_name. */
134 {
135 size_t dir_len = strlen (dir);
136 size_t base_len = strlen (base);
137 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
138 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
139 if (file_name != NULL)
140 {
141 memcpy (file_name, dir, dir_len);
142 if (add_slash)
143 file_name[dir_len] = DIRECTORY_SEPARATOR;
144 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
145 }
146 }
147
148 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
149 /* Out of memory or file not found, treat it as empty. */
150 cp = "";
151 else
152 {
153 /* Parse the file's contents. */
154 char *res_ptr = NULL;
155 size_t res_size = 0;
156
157 for (;;)
158 {
159 int c;
160 char buf1[50+1];
161 char buf2[50+1];
162 size_t l1, l2;
163 char *old_res_ptr;
164
165 c = getc (fp);
166 if (c == EOF)
167 break;
168 if (c == '\n' || c == ' ' || c == '\t')
169 continue;
170 if (c == '#')
171 {
172 /* Skip comment, to end of line. */
173 do
174 c = getc (fp);
175 while (!(c == EOF || c == '\n'));
176 if (c == EOF)
177 break;
178 continue;
179 }
180 ungetc (c, fp);
181 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
182 break;
183 l1 = strlen (buf1);
184 l2 = strlen (buf2);
185 old_res_ptr = res_ptr;
186 if (res_size == 0)
187 {
188 res_size = l1 + 1 + l2 + 1;
189 res_ptr = (char *) malloc (res_size + 1);
190 }
191 else
192 {
193 res_size += l1 + 1 + l2 + 1;
194 res_ptr = (char *) realloc (res_ptr, res_size + 1);
195 }
196 if (res_ptr == NULL)
197 {
198 /* Out of memory. */
199 res_size = 0;
200 if (old_res_ptr != NULL)
201 free (old_res_ptr);
202 break;
203 }
204 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
205 strcpy (res_ptr + res_size - (l2 + 1), buf2);
206 }
207 fclose (fp);
208 if (res_size == 0)
209 cp = "";
210 else
211 {
212 *(res_ptr + res_size) = '\0';
213 cp = res_ptr;
214 }
215 }
216
217 if (file_name != NULL)
218 free (file_name);
219
220 #else
221
222 # if defined DARWIN7
223 /* To avoid the trouble of installing a file that is shared by many
224 GNU packages -- many packaging systems have problems with this --,
225 simply inline the aliases here. */
226 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
227 "ISO8859-2" "\0" "ISO-8859-2" "\0"
228 "ISO8859-4" "\0" "ISO-8859-4" "\0"
229 "ISO8859-5" "\0" "ISO-8859-5" "\0"
230 "ISO8859-7" "\0" "ISO-8859-7" "\0"
231 "ISO8859-9" "\0" "ISO-8859-9" "\0"
232 "ISO8859-13" "\0" "ISO-8859-13" "\0"
233 "ISO8859-15" "\0" "ISO-8859-15" "\0"
234 "KOI8-R" "\0" "KOI8-R" "\0"
235 "KOI8-U" "\0" "KOI8-U" "\0"
236 "CP866" "\0" "CP866" "\0"
237 "CP949" "\0" "CP949" "\0"
238 "CP1131" "\0" "CP1131" "\0"
239 "CP1251" "\0" "CP1251" "\0"
240 "eucCN" "\0" "GB2312" "\0"
241 "GB2312" "\0" "GB2312" "\0"
242 "eucJP" "\0" "EUC-JP" "\0"
243 "eucKR" "\0" "EUC-KR" "\0"
244 "Big5" "\0" "BIG5" "\0"
245 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
246 "GBK" "\0" "GBK" "\0"
247 "GB18030" "\0" "GB18030" "\0"
248 "SJIS" "\0" "SHIFT_JIS" "\0"
249 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
250 "PT154" "\0" "PT154" "\0"
251 /*"ISCII-DEV" "\0" "?" "\0"*/
252 "*" "\0" "UTF-8" "\0";
253 # endif
254
255 # if defined VMS
256 /* To avoid the troubles of an extra file charset.alias_vms in the
257 sources of many GNU packages, simply inline the aliases here. */
258 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
259 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
260 section 10.7 "Handling Different Character Sets". */
261 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
262 "ISO8859-2" "\0" "ISO-8859-2" "\0"
263 "ISO8859-5" "\0" "ISO-8859-5" "\0"
264 "ISO8859-7" "\0" "ISO-8859-7" "\0"
265 "ISO8859-8" "\0" "ISO-8859-8" "\0"
266 "ISO8859-9" "\0" "ISO-8859-9" "\0"
267 /* Japanese */
268 "eucJP" "\0" "EUC-JP" "\0"
269 "SJIS" "\0" "SHIFT_JIS" "\0"
270 "DECKANJI" "\0" "DEC-KANJI" "\0"
271 "SDECKANJI" "\0" "EUC-JP" "\0"
272 /* Chinese */
273 "eucTW" "\0" "EUC-TW" "\0"
274 "DECHANYU" "\0" "DEC-HANYU" "\0"
275 "DECHANZI" "\0" "GB2312" "\0"
276 /* Korean */
277 "DECKOREAN" "\0" "EUC-KR" "\0";
278 # endif
279
280 # if defined WIN32_NATIVE || defined __CYGWIN__
281 /* To avoid the troubles of installing a separate file in the same
282 directory as the DLL and of retrieving the DLL's directory at
283 runtime, simply inline the aliases here. */
284
285 cp = "CP936" "\0" "GBK" "\0"
286 "CP1361" "\0" "JOHAB" "\0"
287 "CP20127" "\0" "ASCII" "\0"
288 "CP20866" "\0" "KOI8-R" "\0"
289 "CP20936" "\0" "GB2312" "\0"
290 "CP21866" "\0" "KOI8-RU" "\0"
291 "CP28591" "\0" "ISO-8859-1" "\0"
292 "CP28592" "\0" "ISO-8859-2" "\0"
293 "CP28593" "\0" "ISO-8859-3" "\0"
294 "CP28594" "\0" "ISO-8859-4" "\0"
295 "CP28595" "\0" "ISO-8859-5" "\0"
296 "CP28596" "\0" "ISO-8859-6" "\0"
297 "CP28597" "\0" "ISO-8859-7" "\0"
298 "CP28598" "\0" "ISO-8859-8" "\0"
299 "CP28599" "\0" "ISO-8859-9" "\0"
300 "CP28605" "\0" "ISO-8859-15" "\0"
301 "CP38598" "\0" "ISO-8859-8" "\0"
302 "CP51932" "\0" "EUC-JP" "\0"
303 "CP51936" "\0" "GB2312" "\0"
304 "CP51949" "\0" "EUC-KR" "\0"
305 "CP51950" "\0" "EUC-TW" "\0"
306 "CP54936" "\0" "GB18030" "\0"
307 "CP65001" "\0" "UTF-8" "\0";
308 # endif
309 #endif
310
311 charset_aliases = cp;
312 }
313
314 return cp;
315 }
316
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The result must not be freed by the caller.  Depending on the platform
   and the environment it points to a string literal, to the alias table,
   to a static buffer inside this function (which a later call may
   overwrite), to the value returned by nl_langinfo, or into the value of
   one of the environment variables LC_ALL, LC_CTYPE, LANG.

   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never the empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.
                 An empty encoding part (as in "xx_YY." or "xx_YY.@mod") is
                 not returned, because this function must never return an
                 empty string; such a name is handled like one without a
                 dot.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  if (dot[0] != '\0')
                    return dot;
                }
              else if (modifier > dot
                       && (size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf[modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.
             An empty encoding part (as in "xx_YY." or "xx_YY.@mod") is not
             returned, because this function must never return an empty
             string; such a name is resolved through the alias table
             instead.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier > dot
                   && (size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf[modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* "%u" requires an 'unsigned int' argument, whereas ULONG is
             presumably 'unsigned long'; convert explicitly.  This also
             keeps the output within the 10 digits BUF has room for.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of NUL-terminated
     ALIAS, CANONICAL pairs ended by an empty string, so each step skips
     two strings.  An alias of "*" matches every codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
502