1 /* gcharset.c - Charset information
2 *
3 * Copyright (C) 2011 Red Hat, Inc.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "config.h"
20
21 #include "gcharset.h"
22 #include "gcharsetprivate.h"
23
24 #include "garray.h"
25 #include "genviron.h"
26 #include "ghash.h"
27 #include "gmessages.h"
28 #include "gstrfuncs.h"
29 #include "gthread.h"
30 #include "gthreadprivate.h"
31 #ifdef G_OS_WIN32
32 #include "gwin32.h"
33 #endif
34
35 #include "libcharset/libcharset.h"
36
37 #include <string.h>
38 #include <stdio.h>
39 #ifdef G_OS_WIN32
40 #define WIN32_LEAN_AND_MEAN
41 #include <windows.h>
42 #endif
43
44 G_LOCK_DEFINE_STATIC (aliases);
45
46 static GHashTable *
get_alias_hash(void)47 get_alias_hash (void)
48 {
49 static GHashTable *alias_hash = NULL;
50 const char *aliases;
51
52 G_LOCK (aliases);
53
54 if (!alias_hash)
55 {
56 alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
57
58 aliases = _g_locale_get_charset_aliases ();
59 while (*aliases != '\0')
60 {
61 const char *canonical;
62 const char *alias;
63 const char **alias_array;
64 int count = 0;
65
66 alias = aliases;
67 aliases += strlen (aliases) + 1;
68 canonical = aliases;
69 aliases += strlen (aliases) + 1;
70
71 alias_array = g_hash_table_lookup (alias_hash, canonical);
72 if (alias_array)
73 {
74 while (alias_array[count])
75 count++;
76 }
77
78 alias_array = g_renew (const char *, alias_array, count + 2);
79 alias_array[count] = alias;
80 alias_array[count + 1] = NULL;
81
82 g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
83 }
84 }
85
86 G_UNLOCK (aliases);
87
88 return alias_hash;
89 }
90
/* Reuses the alias table to answer the reverse question: which charset
 * names are aliases of @canonical_name?  The result is whatever the table
 * stores under that key, i.e. a NULL-terminated array of names, or NULL
 * when the table has no entry for it.
 */
const char **
_g_charset_get_aliases (const char *canonical_name)
{
  return g_hash_table_lookup (get_alias_hash (), canonical_name);
}
101
102 static gboolean
g_utf8_get_charset_internal(const char * raw_data,const char ** a)103 g_utf8_get_charset_internal (const char *raw_data,
104 const char **a)
105 {
106 const char *charset = g_getenv ("CHARSET");
107
108 if (charset && *charset)
109 {
110 *a = charset;
111
112 if (charset && strstr (charset, "UTF-8"))
113 return TRUE;
114 else
115 return FALSE;
116 }
117
118 /* The libcharset code tries to be thread-safe without
119 * a lock, but has a memory leak and a missing memory
120 * barrier, so we lock for it
121 */
122 G_LOCK (aliases);
123 charset = _g_locale_charset_unalias (raw_data);
124 G_UNLOCK (aliases);
125
126 if (charset && *charset)
127 {
128 *a = charset;
129
130 if (charset && strstr (charset, "UTF-8"))
131 return TRUE;
132 else
133 return FALSE;
134 }
135
136 /* Assume this for compatibility at present. */
137 *a = "US-ASCII";
138
139 return FALSE;
140 }
141
142 typedef struct _GCharsetCache GCharsetCache;
143
144 struct _GCharsetCache {
145 gboolean is_utf8;
146 gchar *raw;
147 gchar *charset;
148 };
149
150 static void
charset_cache_free(gpointer data)151 charset_cache_free (gpointer data)
152 {
153 GCharsetCache *cache = data;
154 g_free (cache->raw);
155 g_free (cache->charset);
156 g_free (cache);
157 }
158
159 /**
160 * g_get_charset:
161 * @charset: (out) (optional) (transfer none): return location for character set
162 * name, or %NULL.
163 *
164 * Obtains the character set for the [current locale][setlocale]; you
165 * might use this character set as an argument to g_convert(), to convert
166 * from the current locale's encoding to some other encoding. (Frequently
167 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
168 *
169 * On Windows the character set returned by this function is the
170 * so-called system default ANSI code-page. That is the character set
171 * used by the "narrow" versions of C library and Win32 functions that
172 * handle file names. It might be different from the character set
173 * used by the C library's current locale.
174 *
175 * On Linux, the character set is found by consulting nl_langinfo() if
176 * available. If not, the environment variables `LC_ALL`, `LC_CTYPE`, `LANG`
177 * and `CHARSET` are queried in order.
178 *
179 * The return value is %TRUE if the locale's encoding is UTF-8, in that
180 * case you can perhaps avoid calling g_convert().
181 *
182 * The string returned in @charset is not allocated, and should not be
183 * freed.
184 *
185 * Returns: %TRUE if the returned charset is UTF-8
186 */
187 gboolean
g_get_charset(const char ** charset)188 g_get_charset (const char **charset)
189 {
190 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
191 GCharsetCache *cache = g_private_get (&cache_private);
192 const gchar *raw;
193
194 if (!cache)
195 cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
196
197 G_LOCK (aliases);
198 raw = _g_locale_charset_raw ();
199 G_UNLOCK (aliases);
200
201 if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
202 {
203 const gchar *new_charset;
204
205 g_free (cache->raw);
206 g_free (cache->charset);
207 cache->raw = g_strdup (raw);
208 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
209 cache->charset = g_strdup (new_charset);
210 }
211
212 if (charset)
213 *charset = cache->charset;
214
215 return cache->is_utf8;
216 }
217
218 /**
219 * g_get_codeset:
220 *
221 * Gets the character set for the current locale.
222 *
223 * Returns: a newly allocated string containing the name
224 * of the character set. This string must be freed with g_free().
225 */
226 gchar *
g_get_codeset(void)227 g_get_codeset (void)
228 {
229 const gchar *charset;
230
231 g_get_charset (&charset);
232
233 return g_strdup (charset);
234 }
235
236 /**
237 * g_get_console_charset:
238 * @charset: (out) (optional) (transfer none): return location for character set
239 * name, or %NULL.
240 *
241 * Obtains the character set used by the console attached to the process,
242 * which is suitable for printing output to the terminal.
243 *
244 * Usually this matches the result returned by g_get_charset(), but in
245 * environments where the locale's character set does not match the encoding
246 * of the console this function tries to guess a more suitable value instead.
247 *
248 * On Windows the character set returned by this function is the
249 * output code page used by the console associated with the calling process.
250 * If the codepage can't be determined (for example because there is no
251 * console attached) UTF-8 is assumed.
252 *
253 * The return value is %TRUE if the locale's encoding is UTF-8, in that
254 * case you can perhaps avoid calling g_convert().
255 *
256 * The string returned in @charset is not allocated, and should not be
257 * freed.
258 *
259 * Returns: %TRUE if the returned charset is UTF-8
260 *
261 * Since: 2.62
262 */
263 gboolean
g_get_console_charset(const char ** charset)264 g_get_console_charset (const char **charset)
265 {
266 #ifdef G_OS_WIN32
267 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
268 GCharsetCache *cache = g_private_get (&cache_private);
269 const gchar *locale;
270 unsigned int cp;
271 char buf[2 + 20 + 1]; /* "CP" + G_MAXUINT64 (to be safe) in decimal form (20 bytes) + "\0" */
272 const gchar *raw = NULL;
273
274 if (!cache)
275 cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
276
277 /* first try to query $LANG (works for Cygwin/MSYS/MSYS2 and others using mintty) */
278 locale = g_getenv ("LANG");
279 if (locale != NULL && locale[0] != '\0')
280 {
281 /* If the locale name contains an encoding after the dot, return it. */
282 const char *dot = strchr (locale, '.');
283
284 if (dot != NULL)
285 {
286 const char *modifier;
287
288 dot++;
289 /* Look for the possible @... trailer and remove it, if any. */
290 modifier = strchr (dot, '@');
291 if (modifier == NULL)
292 raw = dot;
293 else if (modifier - dot < sizeof (buf))
294 {
295 memcpy (buf, dot, modifier - dot);
296 buf[modifier - dot] = '\0';
297 raw = buf;
298 }
299 }
300 }
301 /* next try querying console codepage using native win32 API */
302 if (raw == NULL)
303 {
304 cp = GetConsoleOutputCP ();
305 if (cp)
306 {
307 sprintf (buf, "CP%u", cp);
308 raw = buf;
309 }
310 else if (GetLastError () != ERROR_INVALID_HANDLE)
311 {
312 gchar *emsg = g_win32_error_message (GetLastError ());
313 g_warning ("Failed to determine console output code page: %s. "
314 "Falling back to UTF-8", emsg);
315 g_free (emsg);
316 }
317 }
318 /* fall-back to UTF-8 if the rest failed (it's a universal default) */
319 if (raw == NULL)
320 raw = "UTF-8";
321
322 if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
323 {
324 const gchar *new_charset;
325
326 g_free (cache->raw);
327 g_free (cache->charset);
328 cache->raw = g_strdup (raw);
329 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
330 cache->charset = g_strdup (new_charset);
331 }
332
333 if (charset)
334 *charset = cache->charset;
335
336 return cache->is_utf8;
337 #else
338 /* assume the locale settings match the console encoding on non-Windows OSs */
339 return g_get_charset (charset);
340 #endif
341 }
342
343 #ifndef G_OS_WIN32
344
345 /* read an alias file for the locales */
346 static void
read_aliases(const gchar * file,GHashTable * alias_table)347 read_aliases (const gchar *file,
348 GHashTable *alias_table)
349 {
350 FILE *fp;
351 char buf[256];
352
353 fp = fopen (file,"r");
354 if (!fp)
355 return;
356 while (fgets (buf, 256, fp))
357 {
358 char *p, *q;
359
360 g_strstrip (buf);
361
362 /* Line is a comment */
363 if ((buf[0] == '#') || (buf[0] == '\0'))
364 continue;
365
366 /* Reads first column */
367 for (p = buf, q = NULL; *p; p++) {
368 if ((*p == '\t') || (*p == ' ') || (*p == ':')) {
369 *p = '\0';
370 q = p+1;
371 while ((*q == '\t') || (*q == ' ')) {
372 q++;
373 }
374 break;
375 }
376 }
377 /* The line only had one column */
378 if (!q || *q == '\0')
379 continue;
380
381 /* Read second column */
382 for (p = q; *p; p++) {
383 if ((*p == '\t') || (*p == ' ')) {
384 *p = '\0';
385 break;
386 }
387 }
388
389 /* Add to alias table if necessary */
390 if (!g_hash_table_lookup (alias_table, buf)) {
391 g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (q));
392 }
393 }
394 fclose (fp);
395 }
396
397 #endif
398
399 static char *
unalias_lang(char * lang)400 unalias_lang (char *lang)
401 {
402 #ifndef G_OS_WIN32
403 static GHashTable *alias_table = NULL;
404 char *p;
405 int i;
406
407 if (g_once_init_enter (&alias_table))
408 {
409 GHashTable *table = g_hash_table_new (g_str_hash, g_str_equal);
410 read_aliases ("/usr/share/locale/locale.alias", table);
411 g_once_init_leave (&alias_table, table);
412 }
413
414 i = 0;
415 while ((p = g_hash_table_lookup (alias_table, lang)) && (strcmp (p, lang) != 0))
416 {
417 lang = p;
418 if (i++ == 30)
419 {
420 static gboolean said_before = FALSE;
421 if (!said_before)
422 g_warning ("Too many alias levels for a locale, "
423 "may indicate a loop");
424 said_before = TRUE;
425 return lang;
426 }
427 }
428 #endif
429 return lang;
430 }
431
/* Bit flags for the optional components of a locale specification,
 * ordered from least significant to most significant.
 */
enum
{
  COMPONENT_CODESET   = 0x1,
  COMPONENT_TERRITORY = 0x2,
  COMPONENT_MODIFIER  = 0x4
};
441
442 /* Break an X/Open style locale specification into components
443 */
444 static guint
explode_locale(const gchar * locale,gchar ** language,gchar ** territory,gchar ** codeset,gchar ** modifier)445 explode_locale (const gchar *locale,
446 gchar **language,
447 gchar **territory,
448 gchar **codeset,
449 gchar **modifier)
450 {
451 const gchar *uscore_pos;
452 const gchar *at_pos;
453 const gchar *dot_pos;
454
455 guint mask = 0;
456
457 uscore_pos = strchr (locale, '_');
458 dot_pos = strchr (uscore_pos ? uscore_pos : locale, '.');
459 at_pos = strchr (dot_pos ? dot_pos : (uscore_pos ? uscore_pos : locale), '@');
460
461 if (at_pos)
462 {
463 mask |= COMPONENT_MODIFIER;
464 *modifier = g_strdup (at_pos);
465 }
466 else
467 at_pos = locale + strlen (locale);
468
469 if (dot_pos)
470 {
471 mask |= COMPONENT_CODESET;
472 *codeset = g_strndup (dot_pos, at_pos - dot_pos);
473 }
474 else
475 dot_pos = at_pos;
476
477 if (uscore_pos)
478 {
479 mask |= COMPONENT_TERRITORY;
480 *territory = g_strndup (uscore_pos, dot_pos - uscore_pos);
481 }
482 else
483 uscore_pos = dot_pos;
484
485 *language = g_strndup (locale, uscore_pos - locale);
486
487 return mask;
488 }
489
490 /*
491 * Compute all interesting variants for a given locale name -
492 * by stripping off different components of the value.
493 *
494 * For simplicity, we assume that the locale is in
495 * X/Open format: language[_territory][.codeset][@modifier]
496 *
497 * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
498 * as well. We could just copy the code from glibc wholesale
499 * but it is big, ugly, and complicated, so I'm reluctant
500 * to do so when this should handle 99% of the time...
501 */
502 static void
append_locale_variants(GPtrArray * array,const gchar * locale)503 append_locale_variants (GPtrArray *array,
504 const gchar *locale)
505 {
506 gchar *language = NULL;
507 gchar *territory = NULL;
508 gchar *codeset = NULL;
509 gchar *modifier = NULL;
510
511 guint mask;
512 guint i, j;
513
514 g_return_if_fail (locale != NULL);
515
516 mask = explode_locale (locale, &language, &territory, &codeset, &modifier);
517
518 /* Iterate through all possible combinations, from least attractive
519 * to most attractive.
520 */
521 for (j = 0; j <= mask; ++j)
522 {
523 i = mask - j;
524
525 if ((i & ~mask) == 0)
526 {
527 gchar *val = g_strconcat (language,
528 (i & COMPONENT_TERRITORY) ? territory : "",
529 (i & COMPONENT_CODESET) ? codeset : "",
530 (i & COMPONENT_MODIFIER) ? modifier : "",
531 NULL);
532 g_ptr_array_add (array, val);
533 }
534 }
535
536 g_free (language);
537 if (mask & COMPONENT_CODESET)
538 g_free (codeset);
539 if (mask & COMPONENT_TERRITORY)
540 g_free (territory);
541 if (mask & COMPONENT_MODIFIER)
542 g_free (modifier);
543 }
544
545 /**
546 * g_get_locale_variants:
547 * @locale: a locale identifier
548 *
549 * Returns a list of derived variants of @locale, which can be used to
550 * e.g. construct locale-dependent filenames or search paths. The returned
551 * list is sorted from most desirable to least desirable.
552 * This function handles territory, charset and extra locale modifiers. See
553 * [`setlocale(3)`](man:setlocale) for information about locales and their format.
554 *
555 * @locale itself is guaranteed to be returned in the output.
556 *
557 * For example, if @locale is `fr_BE`, then the returned list
558 * is `fr_BE`, `fr`. If @locale is `en_GB.UTF-8@euro`, then the returned list
559 * is `en_GB.UTF-8@euro`, `en_GB.UTF-8`, `en_GB@euro`, `en_GB`, `en.UTF-8@euro`,
560 * `en.UTF-8`, `en@euro`, `en`.
561 *
562 * If you need the list of variants for the current locale,
563 * use g_get_language_names().
564 *
565 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
566 * allocated array of newly allocated strings with the locale variants. Free with
567 * g_strfreev().
568 *
569 * Since: 2.28
570 */
571 gchar **
g_get_locale_variants(const gchar * locale)572 g_get_locale_variants (const gchar *locale)
573 {
574 GPtrArray *array;
575
576 g_return_val_if_fail (locale != NULL, NULL);
577
578 array = g_ptr_array_sized_new (8);
579 append_locale_variants (array, locale);
580 g_ptr_array_add (array, NULL);
581
582 return (gchar **) g_ptr_array_free (array, FALSE);
583 }
584
585 /* The following is (partly) taken from the gettext package.
586 Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
587
588 static const gchar *
guess_category_value(const gchar * category_name)589 guess_category_value (const gchar *category_name)
590 {
591 const gchar *retval;
592
593 /* The highest priority value is the 'LANGUAGE' environment
594 variable. This is a GNU extension. */
595 retval = g_getenv ("LANGUAGE");
596 if ((retval != NULL) && (retval[0] != '\0'))
597 return retval;
598
599 /* 'LANGUAGE' is not set. So we have to proceed with the POSIX
600 methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'. On some
601 systems this can be done by the 'setlocale' function itself. */
602
603 /* Setting of LC_ALL overwrites all other. */
604 retval = g_getenv ("LC_ALL");
605 if ((retval != NULL) && (retval[0] != '\0'))
606 return retval;
607
608 /* Next comes the name of the desired category. */
609 retval = g_getenv (category_name);
610 if ((retval != NULL) && (retval[0] != '\0'))
611 return retval;
612
613 /* Last possibility is the LANG environment variable. */
614 retval = g_getenv ("LANG");
615 if ((retval != NULL) && (retval[0] != '\0'))
616 return retval;
617
618 #ifdef G_PLATFORM_WIN32
619 /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
620 * LANG, which we already did above. Oh well. The main point of
621 * calling g_win32_getlocale() is to get the thread's locale as used
622 * by Windows and the Microsoft C runtime (in the "English_United
623 * States" format) translated into the Unixish format.
624 */
625 {
626 char *locale = g_win32_getlocale ();
627 retval = g_intern_string (locale);
628 g_free (locale);
629 return retval;
630 }
631 #endif
632
633 return NULL;
634 }
635
636 typedef struct _GLanguageNamesCache GLanguageNamesCache;
637
638 struct _GLanguageNamesCache {
639 gchar *languages;
640 gchar **language_names;
641 };
642
643 static void
language_names_cache_free(gpointer data)644 language_names_cache_free (gpointer data)
645 {
646 GLanguageNamesCache *cache = data;
647 g_free (cache->languages);
648 g_strfreev (cache->language_names);
649 g_free (cache);
650 }
651
652 /**
653 * g_get_language_names:
654 *
655 * Computes a list of applicable locale names, which can be used to
656 * e.g. construct locale-dependent filenames or search paths. The returned
657 * list is sorted from most desirable to least desirable and always contains
658 * the default locale "C".
659 *
660 * For example, if LANGUAGE=de:en_US, then the returned list is
661 * "de", "en_US", "en", "C".
662 *
663 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
664 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
665 * user.
666 *
667 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
668 * that must not be modified or freed.
669 *
670 * Since: 2.6
671 */
672 const gchar * const *
g_get_language_names(void)673 g_get_language_names (void)
674 {
675 return g_get_language_names_with_category ("LC_MESSAGES");
676 }
677
678 /**
679 * g_get_language_names_with_category:
680 * @category_name: a locale category name
681 *
682 * Computes a list of applicable locale names with a locale category name,
683 * which can be used to construct the fallback locale-dependent filenames
684 * or search paths. The returned list is sorted from most desirable to
685 * least desirable and always contains the default locale "C".
686 *
687 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
688 * @category_name, and `LANG` to find the list of locales specified by the
689 * user.
690 *
691 * g_get_language_names() returns g_get_language_names_with_category("LC_MESSAGES").
692 *
693 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by
694 * the thread g_get_language_names_with_category was called from.
695 * It must not be modified or freed. It must be copied if planned to be used in another thread.
696 *
697 * Since: 2.58
698 */
699 const gchar * const *
g_get_language_names_with_category(const gchar * category_name)700 g_get_language_names_with_category (const gchar *category_name)
701 {
702 static GPrivate cache_private = G_PRIVATE_INIT ((void (*)(gpointer)) g_hash_table_unref);
703 GHashTable *cache = g_private_get (&cache_private);
704 const gchar *languages;
705 GLanguageNamesCache *name_cache;
706
707 g_return_val_if_fail (category_name != NULL, NULL);
708
709 if (!cache)
710 {
711 cache = g_hash_table_new_full (g_str_hash, g_str_equal,
712 g_free, language_names_cache_free);
713 g_private_set (&cache_private, cache);
714 }
715
716 languages = guess_category_value (category_name);
717 if (!languages)
718 languages = "C";
719
720 name_cache = (GLanguageNamesCache *) g_hash_table_lookup (cache, category_name);
721 if (!(name_cache && name_cache->languages &&
722 strcmp (name_cache->languages, languages) == 0))
723 {
724 GPtrArray *array;
725 gchar **alist, **a;
726
727 g_hash_table_remove (cache, category_name);
728
729 array = g_ptr_array_sized_new (8);
730
731 alist = g_strsplit (languages, ":", 0);
732 for (a = alist; *a; a++)
733 append_locale_variants (array, unalias_lang (*a));
734 g_strfreev (alist);
735 g_ptr_array_add (array, g_strdup ("C"));
736 g_ptr_array_add (array, NULL);
737
738 name_cache = g_new0 (GLanguageNamesCache, 1);
739 name_cache->languages = g_strdup (languages);
740 name_cache->language_names = (gchar **) g_ptr_array_free (array, FALSE);
741 g_hash_table_insert (cache, g_strdup (category_name), name_cache);
742 }
743
744 return (const gchar * const *) name_cache->language_names;
745 }
746