• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Determine a canonical name for the current locale's character encoding.
4 
5    Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
6 
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License along
18    with this program; if not, write to the Free Software Foundation,
19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20 
21 /* Written by Bruno Haible <bruno@clisp.org>.  */
22 
23 #include <config.h>
24 
25 /* Specification.  */
26 #include "localcharset.h"
27 
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdlib.h>
32 
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
35 #endif
36 
37 #if defined _WIN32 || defined __WIN32__
38 # define WIN32_NATIVE
39 #endif
40 
41 #if defined __EMX__
42 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
43 # ifndef OS2
44 #  define OS2
45 # endif
46 #endif
47 
48 #if !defined WIN32_NATIVE
49 # if HAVE_LANGINFO_CODESET
50 #  include <langinfo.h>
51 # else
52 #  if 0 /* see comment below */
53 #   include <locale.h>
54 #  endif
55 # endif
56 # ifdef __CYGWIN__
57 #  define WIN32_LEAN_AND_MEAN
58 #  include <windows.h>
59 # endif
60 #elif defined WIN32_NATIVE
61 # define WIN32_LEAN_AND_MEAN
62 # include <windows.h>
63 #endif
64 #if defined OS2
65 # define INCL_DOS
66 # include <os2.h>
67 #endif
68 
69 #if ENABLE_RELOCATABLE
70 # include "relocatable.h"
71 #else
72 # define relocate(pathname) (pathname)
73 #endif
74 
75 /* Get LIBDIR.  */
76 #ifndef LIBDIR
77 # include "configmake.h"
78 #endif
79 
80 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
81   /* Win32, Cygwin, OS/2, DOS */
82 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
83 #endif
84 
85 #ifndef DIRECTORY_SEPARATOR
86 # define DIRECTORY_SEPARATOR '/'
87 #endif
88 
89 #ifndef ISSLASH
90 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
91 #endif
92 
93 #if HAVE_DECL_GETC_UNLOCKED
94 # undef getc
95 # define getc getc_unlocked
96 #endif
97 
98 /* The following static variable is declared 'volatile' to avoid a
99    possible multithread problem in the function get_charset_aliases. If we
100    are running in a threaded environment, and if two threads initialize
101    'charset_aliases' simultaneously, both will produce the same value,
102    and everything will be ok if the two assignments to 'charset_aliases'
103    are atomic. But I don't know what will happen if the two assignments mix.  */
104 #if __STDC__ != 1
105 # define volatile /* empty */
106 #endif
107 /* Pointer to the contents of the charset.alias file, if it has already been
108    read, else NULL.  Its format is:
109    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
110 static const char * volatile charset_aliases;
111 
112 /* Return a pointer to the contents of the charset.alias file.  */
113 static const char *
get_charset_aliases(void)114 get_charset_aliases (void)
115 {
116   const char *cp;
117 
118   cp = charset_aliases;
119   if (cp == NULL)
120     {
121 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
122       FILE *fp;
123       const char *dir;
124       const char *base = "charset.alias";
125       char *file_name;
126 
127       /* Make it possible to override the charset.alias location.  This is
128 	 necessary for running the testsuite before "make install".  */
129       dir = getenv ("CHARSETALIASDIR");
130       if (dir == NULL || dir[0] == '\0')
131 	dir = relocate (LIBDIR);
132 
133       /* Concatenate dir and base into freshly allocated file_name.  */
134       {
135 	size_t dir_len = strlen (dir);
136 	size_t base_len = strlen (base);
137 	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
138 	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
139 	if (file_name != NULL)
140 	  {
141 	    memcpy (file_name, dir, dir_len);
142 	    if (add_slash)
143 	      file_name[dir_len] = DIRECTORY_SEPARATOR;
144 	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
145 	  }
146       }
147 
148       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
149 	/* Out of memory or file not found, treat it as empty.  */
150 	cp = "";
151       else
152 	{
153 	  /* Parse the file's contents.  */
154 	  char *res_ptr = NULL;
155 	  size_t res_size = 0;
156 
157 	  for (;;)
158 	    {
159 	      int c;
160 	      char buf1[50+1];
161 	      char buf2[50+1];
162 	      size_t l1, l2;
163 	      char *old_res_ptr;
164 
165 	      c = getc (fp);
166 	      if (c == EOF)
167 		break;
168 	      if (c == '\n' || c == ' ' || c == '\t')
169 		continue;
170 	      if (c == '#')
171 		{
172 		  /* Skip comment, to end of line.  */
173 		  do
174 		    c = getc (fp);
175 		  while (!(c == EOF || c == '\n'));
176 		  if (c == EOF)
177 		    break;
178 		  continue;
179 		}
180 	      ungetc (c, fp);
181 	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
182 		break;
183 	      l1 = strlen (buf1);
184 	      l2 = strlen (buf2);
185 	      old_res_ptr = res_ptr;
186 	      if (res_size == 0)
187 		{
188 		  res_size = l1 + 1 + l2 + 1;
189 		  res_ptr = (char *) malloc (res_size + 1);
190 		}
191 	      else
192 		{
193 		  res_size += l1 + 1 + l2 + 1;
194 		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
195 		}
196 	      if (res_ptr == NULL)
197 		{
198 		  /* Out of memory. */
199 		  res_size = 0;
200 		  if (old_res_ptr != NULL)
201 		    free (old_res_ptr);
202 		  break;
203 		}
204 	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
205 	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
206 	    }
207 	  fclose (fp);
208 	  if (res_size == 0)
209 	    cp = "";
210 	  else
211 	    {
212 	      *(res_ptr + res_size) = '\0';
213 	      cp = res_ptr;
214 	    }
215 	}
216 
217       if (file_name != NULL)
218 	free (file_name);
219 
220 #else
221 
222 # if defined DARWIN7
223       /* To avoid the trouble of installing a file that is shared by many
224 	 GNU packages -- many packaging systems have problems with this --,
225 	 simply inline the aliases here.  */
226       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
227 	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
228 	   "ISO8859-4" "\0" "ISO-8859-4" "\0"
229 	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
230 	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
231 	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
232 	   "ISO8859-13" "\0" "ISO-8859-13" "\0"
233 	   "ISO8859-15" "\0" "ISO-8859-15" "\0"
234 	   "KOI8-R" "\0" "KOI8-R" "\0"
235 	   "KOI8-U" "\0" "KOI8-U" "\0"
236 	   "CP866" "\0" "CP866" "\0"
237 	   "CP949" "\0" "CP949" "\0"
238 	   "CP1131" "\0" "CP1131" "\0"
239 	   "CP1251" "\0" "CP1251" "\0"
240 	   "eucCN" "\0" "GB2312" "\0"
241 	   "GB2312" "\0" "GB2312" "\0"
242 	   "eucJP" "\0" "EUC-JP" "\0"
243 	   "eucKR" "\0" "EUC-KR" "\0"
244 	   "Big5" "\0" "BIG5" "\0"
245 	   "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
246 	   "GBK" "\0" "GBK" "\0"
247 	   "GB18030" "\0" "GB18030" "\0"
248 	   "SJIS" "\0" "SHIFT_JIS" "\0"
249 	   "ARMSCII-8" "\0" "ARMSCII-8" "\0"
250 	   "PT154" "\0" "PT154" "\0"
251 	 /*"ISCII-DEV" "\0" "?" "\0"*/
252 	   "*" "\0" "UTF-8" "\0";
253 # endif
254 
255 # if defined VMS
256       /* To avoid the troubles of an extra file charset.alias_vms in the
257 	 sources of many GNU packages, simply inline the aliases here.  */
258       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
259 	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
260 	 section 10.7 "Handling Different Character Sets".  */
261       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
262 	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
263 	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
264 	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
265 	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
266 	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
267 	   /* Japanese */
268 	   "eucJP" "\0" "EUC-JP" "\0"
269 	   "SJIS" "\0" "SHIFT_JIS" "\0"
270 	   "DECKANJI" "\0" "DEC-KANJI" "\0"
271 	   "SDECKANJI" "\0" "EUC-JP" "\0"
272 	   /* Chinese */
273 	   "eucTW" "\0" "EUC-TW" "\0"
274 	   "DECHANYU" "\0" "DEC-HANYU" "\0"
275 	   "DECHANZI" "\0" "GB2312" "\0"
276 	   /* Korean */
277 	   "DECKOREAN" "\0" "EUC-KR" "\0";
278 # endif
279 
280 # if defined WIN32_NATIVE || defined __CYGWIN__
281       /* To avoid the troubles of installing a separate file in the same
282 	 directory as the DLL and of retrieving the DLL's directory at
283 	 runtime, simply inline the aliases here.  */
284 
285       cp = "CP936" "\0" "GBK" "\0"
286 	   "CP1361" "\0" "JOHAB" "\0"
287 	   "CP20127" "\0" "ASCII" "\0"
288 	   "CP20866" "\0" "KOI8-R" "\0"
289 	   "CP20936" "\0" "GB2312" "\0"
290 	   "CP21866" "\0" "KOI8-RU" "\0"
291 	   "CP28591" "\0" "ISO-8859-1" "\0"
292 	   "CP28592" "\0" "ISO-8859-2" "\0"
293 	   "CP28593" "\0" "ISO-8859-3" "\0"
294 	   "CP28594" "\0" "ISO-8859-4" "\0"
295 	   "CP28595" "\0" "ISO-8859-5" "\0"
296 	   "CP28596" "\0" "ISO-8859-6" "\0"
297 	   "CP28597" "\0" "ISO-8859-7" "\0"
298 	   "CP28598" "\0" "ISO-8859-8" "\0"
299 	   "CP28599" "\0" "ISO-8859-9" "\0"
300 	   "CP28605" "\0" "ISO-8859-15" "\0"
301 	   "CP38598" "\0" "ISO-8859-8" "\0"
302 	   "CP51932" "\0" "EUC-JP" "\0"
303 	   "CP51936" "\0" "GB2312" "\0"
304 	   "CP51949" "\0" "EUC-KR" "\0"
305 	   "CP51950" "\0" "EUC-TW" "\0"
306 	   "CP54936" "\0" "GB18030" "\0"
307 	   "CP65001" "\0" "UTF-8" "\0";
308 # endif
309 #endif
310 
311       charset_aliases = cp;
312     }
313 
314   return cp;
315 }
316 
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   The raw encoding name is obtained in a platform dependent way (see the
   branches below) and then looked up in the table returned by
   get_charset_aliases ().  An alias entry named "*" matches any name.
   The result is never NULL and never the empty string; "ASCII" is returned
   instead of an empty name.

   On Cygwin and OS/2, an encoding found after the '.' in LC_ALL, LC_CTYPE
   or LANG is returned directly, without going through the alias table.
   Where a name has to be composed or copied, it is stored in a
   function-local static buffer that later calls overwrite.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier was found at or after dot, so the difference is
                 never negative.  Copy only if it fits, NUL included.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier was found at or after dot, so the difference is
             never negative.  Copy only if it fits, NUL included.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.
         A nonzero return value is treated as failure.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* Print through unsigned long so that the conversion
             specification matches the argument type exactly.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table consists of (alias, canonical name) pairs;
     each step of the loop skips both strings of one pair.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
502