• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* gcharset.c - Charset information
2  *
3  * Copyright (C) 2011 Red Hat, Inc.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2.1 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "config.h"
20 
21 #include "gcharset.h"
22 #include "gcharsetprivate.h"
23 
24 #include "garray.h"
25 #include "genviron.h"
26 #include "ghash.h"
27 #include "gmessages.h"
28 #include "gstrfuncs.h"
29 #include "gthread.h"
30 #include "gthreadprivate.h"
31 #ifdef G_OS_WIN32
32 #include "gwin32.h"
33 #endif
34 
35 #include "libcharset/libcharset.h"
36 
37 #include <string.h>
38 #include <stdio.h>
39 #ifdef G_OS_WIN32
40 #define WIN32_LEAN_AND_MEAN
41 #include <windows.h>
42 #endif
43 
44 G_LOCK_DEFINE_STATIC (aliases);
45 
46 static GHashTable *
get_alias_hash(void)47 get_alias_hash (void)
48 {
49   static GHashTable *alias_hash = NULL;
50   const char *aliases;
51 
52   G_LOCK (aliases);
53 
54   if (!alias_hash)
55     {
56       alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
57 
58       aliases = _g_locale_get_charset_aliases ();
59       while (*aliases != '\0')
60         {
61           const char *canonical;
62           const char *alias;
63           const char **alias_array;
64           int count = 0;
65 
66           alias = aliases;
67           aliases += strlen (aliases) + 1;
68           canonical = aliases;
69           aliases += strlen (aliases) + 1;
70 
71           alias_array = g_hash_table_lookup (alias_hash, canonical);
72           if (alias_array)
73             {
74               while (alias_array[count])
75                 count++;
76             }
77 
78           alias_array = g_renew (const char *, alias_array, count + 2);
79           alias_array[count] = alias;
80           alias_array[count + 1] = NULL;
81 
82           g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
83         }
84     }
85 
86   G_UNLOCK (aliases);
87 
88   return alias_hash;
89 }
90 
/* Reverse use of the alias table: given a canonical charset name,
 * return the NULL-terminated array of names that alias it, or NULL if
 * the table has no entry for @canonical_name. The array is owned by
 * the table built in get_alias_hash().
 */
const char **
_g_charset_get_aliases (const char *canonical_name)
{
  return g_hash_table_lookup (get_alias_hash (), canonical_name);
}
101 
102 static gboolean
g_utf8_get_charset_internal(const char * raw_data,const char ** a)103 g_utf8_get_charset_internal (const char  *raw_data,
104                              const char **a)
105 {
106   const char *charset = g_getenv ("CHARSET");
107 
108   if (charset && *charset)
109     {
110       *a = charset;
111 
112       if (charset && strstr (charset, "UTF-8"))
113         return TRUE;
114       else
115         return FALSE;
116     }
117 
118   /* The libcharset code tries to be thread-safe without
119    * a lock, but has a memory leak and a missing memory
120    * barrier, so we lock for it
121    */
122   G_LOCK (aliases);
123   charset = _g_locale_charset_unalias (raw_data);
124   G_UNLOCK (aliases);
125 
126   if (charset && *charset)
127     {
128       *a = charset;
129 
130       if (charset && strstr (charset, "UTF-8"))
131         return TRUE;
132       else
133         return FALSE;
134     }
135 
136   /* Assume this for compatibility at present.  */
137   *a = "US-ASCII";
138 
139   return FALSE;
140 }
141 
142 typedef struct _GCharsetCache GCharsetCache;
143 
144 struct _GCharsetCache {
145   gboolean is_utf8;
146   gchar *raw;
147   gchar *charset;
148 };
149 
150 static void
charset_cache_free(gpointer data)151 charset_cache_free (gpointer data)
152 {
153   GCharsetCache *cache = data;
154   g_free (cache->raw);
155   g_free (cache->charset);
156   g_free (cache);
157 }
158 
159 /**
160  * g_get_charset:
161  * @charset: (out) (optional) (transfer none): return location for character set
162  *   name, or %NULL.
163  *
164  * Obtains the character set for the [current locale][setlocale]; you
165  * might use this character set as an argument to g_convert(), to convert
166  * from the current locale's encoding to some other encoding. (Frequently
167  * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
168  *
169  * On Windows the character set returned by this function is the
170  * so-called system default ANSI code-page. That is the character set
171  * used by the "narrow" versions of C library and Win32 functions that
172  * handle file names. It might be different from the character set
173  * used by the C library's current locale.
174  *
175  * On Linux, the character set is found by consulting nl_langinfo() if
176  * available. If not, the environment variables `LC_ALL`, `LC_CTYPE`, `LANG`
177  * and `CHARSET` are queried in order.
178  *
179  * The return value is %TRUE if the locale's encoding is UTF-8, in that
180  * case you can perhaps avoid calling g_convert().
181  *
182  * The string returned in @charset is not allocated, and should not be
183  * freed.
184  *
185  * Returns: %TRUE if the returned charset is UTF-8
186  */
187 gboolean
g_get_charset(const char ** charset)188 g_get_charset (const char **charset)
189 {
190   static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
191   GCharsetCache *cache = g_private_get (&cache_private);
192   const gchar *raw;
193 
194   if (!cache)
195     cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
196 
197   G_LOCK (aliases);
198   raw = _g_locale_charset_raw ();
199   G_UNLOCK (aliases);
200 
201   if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
202     {
203       const gchar *new_charset;
204 
205       g_free (cache->raw);
206       g_free (cache->charset);
207       cache->raw = g_strdup (raw);
208       cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
209       cache->charset = g_strdup (new_charset);
210     }
211 
212   if (charset)
213     *charset = cache->charset;
214 
215   return cache->is_utf8;
216 }
217 
218 /**
219  * g_get_codeset:
220  *
221  * Gets the character set for the current locale.
222  *
223  * Returns: a newly allocated string containing the name
224  *     of the character set. This string must be freed with g_free().
225  */
226 gchar *
g_get_codeset(void)227 g_get_codeset (void)
228 {
229   const gchar *charset;
230 
231   g_get_charset (&charset);
232 
233   return g_strdup (charset);
234 }
235 
236 /**
237  * g_get_console_charset:
238  * @charset: (out) (optional) (transfer none): return location for character set
239  *   name, or %NULL.
240  *
241  * Obtains the character set used by the console attached to the process,
242  * which is suitable for printing output to the terminal.
243  *
244  * Usually this matches the result returned by g_get_charset(), but in
245  * environments where the locale's character set does not match the encoding
246  * of the console this function tries to guess a more suitable value instead.
247  *
248  * On Windows the character set returned by this function is the
249  * output code page used by the console associated with the calling process.
250  * If the codepage can't be determined (for example because there is no
251  * console attached) UTF-8 is assumed.
252  *
253  * The return value is %TRUE if the locale's encoding is UTF-8, in that
254  * case you can perhaps avoid calling g_convert().
255  *
256  * The string returned in @charset is not allocated, and should not be
257  * freed.
258  *
259  * Returns: %TRUE if the returned charset is UTF-8
260  *
261  * Since: 2.62
262  */
263 gboolean
g_get_console_charset(const char ** charset)264 g_get_console_charset (const char **charset)
265 {
266 #ifdef G_OS_WIN32
267   static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
268   GCharsetCache *cache = g_private_get (&cache_private);
269   const gchar *locale;
270   unsigned int cp;
271   char buf[2 + 20 + 1]; /* "CP" + G_MAXUINT64 (to be safe) in decimal form (20 bytes) + "\0" */
272   const gchar *raw = NULL;
273 
274   if (!cache)
275     cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
276 
277   /* first try to query $LANG (works for Cygwin/MSYS/MSYS2 and others using mintty) */
278   locale = g_getenv ("LANG");
279   if (locale != NULL && locale[0] != '\0')
280     {
281       /* If the locale name contains an encoding after the dot, return it.  */
282       const char *dot = strchr (locale, '.');
283 
284       if (dot != NULL)
285         {
286           const char *modifier;
287 
288           dot++;
289           /* Look for the possible @... trailer and remove it, if any.  */
290           modifier = strchr (dot, '@');
291           if (modifier == NULL)
292             raw = dot;
293           else if (modifier - dot < sizeof (buf))
294             {
295               memcpy (buf, dot, modifier - dot);
296               buf[modifier - dot] = '\0';
297               raw = buf;
298             }
299         }
300     }
301   /* next try querying console codepage using native win32 API */
302   if (raw == NULL)
303     {
304       cp = GetConsoleOutputCP ();
305       if (cp)
306         {
307           sprintf (buf, "CP%u", cp);
308           raw = buf;
309         }
310       else if (GetLastError () != ERROR_INVALID_HANDLE)
311         {
312           gchar *emsg = g_win32_error_message (GetLastError ());
313           g_warning ("Failed to determine console output code page: %s. "
314                      "Falling back to UTF-8", emsg);
315           g_free (emsg);
316         }
317     }
318   /* fall-back to UTF-8 if the rest failed (it's a universal default) */
319   if (raw == NULL)
320     raw = "UTF-8";
321 
322   if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
323     {
324       const gchar *new_charset;
325 
326       g_free (cache->raw);
327       g_free (cache->charset);
328       cache->raw = g_strdup (raw);
329       cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
330       cache->charset = g_strdup (new_charset);
331     }
332 
333   if (charset)
334     *charset = cache->charset;
335 
336   return cache->is_utf8;
337 #else
338   /* assume the locale settings match the console encoding on non-Windows OSs */
339   return g_get_charset (charset);
340 #endif
341 }
342 
343 #ifndef G_OS_WIN32
344 
345 /* read an alias file for the locales */
346 static void
read_aliases(const gchar * file,GHashTable * alias_table)347 read_aliases (const gchar *file,
348               GHashTable  *alias_table)
349 {
350   FILE *fp;
351   char buf[256];
352 
353   fp = fopen (file,"r");
354   if (!fp)
355     return;
356   while (fgets (buf, 256, fp))
357     {
358       char *p, *q;
359 
360       g_strstrip (buf);
361 
362       /* Line is a comment */
363       if ((buf[0] == '#') || (buf[0] == '\0'))
364         continue;
365 
366       /* Reads first column */
367       for (p = buf, q = NULL; *p; p++) {
368         if ((*p == '\t') || (*p == ' ') || (*p == ':')) {
369           *p = '\0';
370           q = p+1;
371           while ((*q == '\t') || (*q == ' ')) {
372             q++;
373           }
374           break;
375         }
376       }
377       /* The line only had one column */
378       if (!q || *q == '\0')
379         continue;
380 
381       /* Read second column */
382       for (p = q; *p; p++) {
383         if ((*p == '\t') || (*p == ' ')) {
384           *p = '\0';
385           break;
386         }
387       }
388 
389       /* Add to alias table if necessary */
390       if (!g_hash_table_lookup (alias_table, buf)) {
391         g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (q));
392       }
393     }
394   fclose (fp);
395 }
396 
397 #endif
398 
399 static char *
unalias_lang(char * lang)400 unalias_lang (char *lang)
401 {
402 #ifndef G_OS_WIN32
403   static GHashTable *alias_table = NULL;
404   char *p;
405   int i;
406 
407   if (g_once_init_enter (&alias_table))
408     {
409       GHashTable *table = g_hash_table_new (g_str_hash, g_str_equal);
410       read_aliases ("/usr/share/locale/locale.alias", table);
411       g_once_init_leave (&alias_table, table);
412     }
413 
414   i = 0;
415   while ((p = g_hash_table_lookup (alias_table, lang)) && (strcmp (p, lang) != 0))
416     {
417       lang = p;
418       if (i++ == 30)
419         {
420           static gboolean said_before = FALSE;
421           if (!said_before)
422             g_warning ("Too many alias levels for a locale, "
423                        "may indicate a loop");
424           said_before = TRUE;
425           return lang;
426         }
427     }
428 #endif
429   return lang;
430 }
431 
/* Bit flags naming the optional parts of a locale specification.
 * The bits are ordered from least significant to most significant;
 * append_locale_variants() relies on that ordering when it enumerates
 * the subsets of a mask.
 */
enum
{
  COMPONENT_CODESET   = 0x1, /* ".codeset" part is present */
  COMPONENT_TERRITORY = 0x2, /* "_territory" part is present */
  COMPONENT_MODIFIER  = 0x4  /* "@modifier" part is present */
};
441 
442 /* Break an X/Open style locale specification into components
443  */
444 static guint
explode_locale(const gchar * locale,gchar ** language,gchar ** territory,gchar ** codeset,gchar ** modifier)445 explode_locale (const gchar *locale,
446                 gchar      **language,
447                 gchar      **territory,
448                 gchar      **codeset,
449                 gchar      **modifier)
450 {
451   const gchar *uscore_pos;
452   const gchar *at_pos;
453   const gchar *dot_pos;
454 
455   guint mask = 0;
456 
457   uscore_pos = strchr (locale, '_');
458   dot_pos = strchr (uscore_pos ? uscore_pos : locale, '.');
459   at_pos = strchr (dot_pos ? dot_pos : (uscore_pos ? uscore_pos : locale), '@');
460 
461   if (at_pos)
462     {
463       mask |= COMPONENT_MODIFIER;
464       *modifier = g_strdup (at_pos);
465     }
466   else
467     at_pos = locale + strlen (locale);
468 
469   if (dot_pos)
470     {
471       mask |= COMPONENT_CODESET;
472       *codeset = g_strndup (dot_pos, at_pos - dot_pos);
473     }
474   else
475     dot_pos = at_pos;
476 
477   if (uscore_pos)
478     {
479       mask |= COMPONENT_TERRITORY;
480       *territory = g_strndup (uscore_pos, dot_pos - uscore_pos);
481     }
482   else
483     uscore_pos = dot_pos;
484 
485   *language = g_strndup (locale, uscore_pos - locale);
486 
487   return mask;
488 }
489 
490 /*
491  * Compute all interesting variants for a given locale name -
492  * by stripping off different components of the value.
493  *
494  * For simplicity, we assume that the locale is in
495  * X/Open format: language[_territory][.codeset][@modifier]
496  *
497  * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
498  *       as well. We could just copy the code from glibc wholesale
499  *       but it is big, ugly, and complicated, so I'm reluctant
500  *       to do so when this should handle 99% of the time...
501  */
502 static void
append_locale_variants(GPtrArray * array,const gchar * locale)503 append_locale_variants (GPtrArray *array,
504                         const gchar *locale)
505 {
506   gchar *language = NULL;
507   gchar *territory = NULL;
508   gchar *codeset = NULL;
509   gchar *modifier = NULL;
510 
511   guint mask;
512   guint i, j;
513 
514   g_return_if_fail (locale != NULL);
515 
516   mask = explode_locale (locale, &language, &territory, &codeset, &modifier);
517 
518   /* Iterate through all possible combinations, from least attractive
519    * to most attractive.
520    */
521   for (j = 0; j <= mask; ++j)
522     {
523       i = mask - j;
524 
525       if ((i & ~mask) == 0)
526         {
527           gchar *val = g_strconcat (language,
528                                     (i & COMPONENT_TERRITORY) ? territory : "",
529                                     (i & COMPONENT_CODESET) ? codeset : "",
530                                     (i & COMPONENT_MODIFIER) ? modifier : "",
531                                     NULL);
532           g_ptr_array_add (array, val);
533         }
534     }
535 
536   g_free (language);
537   if (mask & COMPONENT_CODESET)
538     g_free (codeset);
539   if (mask & COMPONENT_TERRITORY)
540     g_free (territory);
541   if (mask & COMPONENT_MODIFIER)
542     g_free (modifier);
543 }
544 
545 /**
546  * g_get_locale_variants:
547  * @locale: a locale identifier
548  *
549  * Returns a list of derived variants of @locale, which can be used to
550  * e.g. construct locale-dependent filenames or search paths. The returned
551  * list is sorted from most desirable to least desirable.
552  * This function handles territory, charset and extra locale modifiers. See
553  * [`setlocale(3)`](man:setlocale) for information about locales and their format.
554  *
555  * @locale itself is guaranteed to be returned in the output.
556  *
557  * For example, if @locale is `fr_BE`, then the returned list
558  * is `fr_BE`, `fr`. If @locale is `en_GB.UTF-8@euro`, then the returned list
559  * is `en_GB.UTF-8@euro`, `en_GB.UTF-8`, `en_GB@euro`, `en_GB`, `en.UTF-8@euro`,
560  * `en.UTF-8`, `en@euro`, `en`.
561  *
562  * If you need the list of variants for the current locale,
563  * use g_get_language_names().
564  *
565  * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
566  *   allocated array of newly allocated strings with the locale variants. Free with
567  *   g_strfreev().
568  *
569  * Since: 2.28
570  */
571 gchar **
g_get_locale_variants(const gchar * locale)572 g_get_locale_variants (const gchar *locale)
573 {
574   GPtrArray *array;
575 
576   g_return_val_if_fail (locale != NULL, NULL);
577 
578   array = g_ptr_array_sized_new (8);
579   append_locale_variants (array, locale);
580   g_ptr_array_add (array, NULL);
581 
582   return (gchar **) g_ptr_array_free (array, FALSE);
583 }
584 
585 /* The following is (partly) taken from the gettext package.
586    Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc.  */
587 
588 static const gchar *
guess_category_value(const gchar * category_name)589 guess_category_value (const gchar *category_name)
590 {
591   const gchar *retval;
592 
593   /* The highest priority value is the 'LANGUAGE' environment
594      variable.  This is a GNU extension.  */
595   retval = g_getenv ("LANGUAGE");
596   if ((retval != NULL) && (retval[0] != '\0'))
597     return retval;
598 
599   /* 'LANGUAGE' is not set.  So we have to proceed with the POSIX
600      methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'.  On some
601      systems this can be done by the 'setlocale' function itself.  */
602 
603   /* Setting of LC_ALL overwrites all other.  */
604   retval = g_getenv ("LC_ALL");
605   if ((retval != NULL) && (retval[0] != '\0'))
606     return retval;
607 
608   /* Next comes the name of the desired category.  */
609   retval = g_getenv (category_name);
610   if ((retval != NULL) && (retval[0] != '\0'))
611     return retval;
612 
613   /* Last possibility is the LANG environment variable.  */
614   retval = g_getenv ("LANG");
615   if ((retval != NULL) && (retval[0] != '\0'))
616     return retval;
617 
618 #ifdef G_PLATFORM_WIN32
619   /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
620    * LANG, which we already did above. Oh well. The main point of
621    * calling g_win32_getlocale() is to get the thread's locale as used
622    * by Windows and the Microsoft C runtime (in the "English_United
623    * States" format) translated into the Unixish format.
624    */
625   {
626     char *locale = g_win32_getlocale ();
627     retval = g_intern_string (locale);
628     g_free (locale);
629     return retval;
630   }
631 #endif
632 
633   return NULL;
634 }
635 
636 typedef struct _GLanguageNamesCache GLanguageNamesCache;
637 
638 struct _GLanguageNamesCache {
639   gchar *languages;
640   gchar **language_names;
641 };
642 
643 static void
language_names_cache_free(gpointer data)644 language_names_cache_free (gpointer data)
645 {
646   GLanguageNamesCache *cache = data;
647   g_free (cache->languages);
648   g_strfreev (cache->language_names);
649   g_free (cache);
650 }
651 
652 /**
653  * g_get_language_names:
654  *
655  * Computes a list of applicable locale names, which can be used to
656  * e.g. construct locale-dependent filenames or search paths. The returned
657  * list is sorted from most desirable to least desirable and always contains
658  * the default locale "C".
659  *
660  * For example, if LANGUAGE=de:en_US, then the returned list is
661  * "de", "en_US", "en", "C".
662  *
663  * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
664  * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
665  * user.
666  *
667  * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
668  *    that must not be modified or freed.
669  *
670  * Since: 2.6
671  */
672 const gchar * const *
g_get_language_names(void)673 g_get_language_names (void)
674 {
675   return g_get_language_names_with_category ("LC_MESSAGES");
676 }
677 
678 /**
679  * g_get_language_names_with_category:
680  * @category_name: a locale category name
681  *
682  * Computes a list of applicable locale names with a locale category name,
683  * which can be used to construct the fallback locale-dependent filenames
684  * or search paths. The returned list is sorted from most desirable to
685  * least desirable and always contains the default locale "C".
686  *
687  * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
688  * @category_name, and `LANG` to find the list of locales specified by the
689  * user.
690  *
691  * g_get_language_names() returns g_get_language_names_with_category("LC_MESSAGES").
692  *
693  * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by
694  *    the thread g_get_language_names_with_category was called from.
695  *    It must not be modified or freed. It must be copied if planned to be used in another thread.
696  *
697  * Since: 2.58
698  */
699 const gchar * const *
g_get_language_names_with_category(const gchar * category_name)700 g_get_language_names_with_category (const gchar *category_name)
701 {
702   static GPrivate cache_private = G_PRIVATE_INIT ((void (*)(gpointer)) g_hash_table_unref);
703   GHashTable *cache = g_private_get (&cache_private);
704   const gchar *languages;
705   GLanguageNamesCache *name_cache;
706 
707   g_return_val_if_fail (category_name != NULL, NULL);
708 
709   if (!cache)
710     {
711       cache = g_hash_table_new_full (g_str_hash, g_str_equal,
712                                      g_free, language_names_cache_free);
713       g_private_set (&cache_private, cache);
714     }
715 
716   languages = guess_category_value (category_name);
717   if (!languages)
718     languages = "C";
719 
720   name_cache = (GLanguageNamesCache *) g_hash_table_lookup (cache, category_name);
721   if (!(name_cache && name_cache->languages &&
722         strcmp (name_cache->languages, languages) == 0))
723     {
724       GPtrArray *array;
725       gchar **alist, **a;
726 
727       g_hash_table_remove (cache, category_name);
728 
729       array = g_ptr_array_sized_new (8);
730 
731       alist = g_strsplit (languages, ":", 0);
732       for (a = alist; *a; a++)
733         append_locale_variants (array, unalias_lang (*a));
734       g_strfreev (alist);
735       g_ptr_array_add (array, g_strdup ("C"));
736       g_ptr_array_add (array, NULL);
737 
738       name_cache = g_new0 (GLanguageNamesCache, 1);
739       name_cache->languages = g_strdup (languages);
740       name_cache->language_names = (gchar **) g_ptr_array_free (array, FALSE);
741       g_hash_table_insert (cache, g_strdup (category_name), name_cache);
742     }
743 
744   return (const gchar * const *) name_cache->language_names;
745 }
746