• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2014 Canonical Limited
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
16  *
17  * Author: Ryan Lortie <desrt@desrt.ca>
18  */
19 
20 #include <config.h>
21 
22 #include "gstrfuncs.h"
23 
24 #include <glib.h>
25 #include <locale.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 struct mapping_entry
30 {
31   guint16 src;
32   guint16 ascii;
33 };
34 
35 struct mapping_range
36 {
37   guint16 start;
38   guint16 length;
39 };
40 
41 struct locale_entry
42 {
43   guint8 name_offset;
44   guint8 item_id;
45 };
46 
47 #include "gtranslit-data.h"
48 
/* Decoders for the 16-bit "encoded" values stored in struct mapping_entry.
 *
 * If bit 15 (0x8000) is clear, the value is stored inline: it is itself the
 * single character, and its length is 1.
 *
 * If bit 15 is set, the value is a reference into a side table: the low 12
 * bits (0x0fff) are the offset into that table and bits 12-14 (0x7000) are
 * the number of elements stored there.
 *
 * Every macro parameter is parenthesized so that passing an expression
 * (for example `a | b`) cannot be re-associated by operator precedence.
 * Note that the arguments are still evaluated more than once, so they must
 * not have side effects.
 */
#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
#define get_length(encoded)                 (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

/* Returns a pointer to the ASCII replacement bytes.  For an inline value this
 * is the address of the low-order byte of @encoded itself (which must
 * therefore be a 16-bit lvalue); that byte is the second one on big-endian
 * machines and the first one otherwise.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded)      (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
57 
58 static const gchar * lookup_in_item (guint           item_id,
59                                      const gunichar *key,
60                                      gint           *result_len,
61                                      gint           *key_consumed);
62 
63 static gint
compare_mapping_entry(gconstpointer user_data,gconstpointer data)64 compare_mapping_entry (gconstpointer user_data,
65                        gconstpointer data)
66 {
67   const struct mapping_entry *entry = data;
68   const gunichar *key = user_data;
69   gunichar src_0;
70 
71   G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
72 
73   src_0 = get_src_char (src_table, entry->src, 0);
74 
75   if (key[0] > src_0)
76     return 1;
77   else if (key[0] < src_0)
78     return -1;
79 
80   if (get_length (entry->src) > 1)
81     {
82       gunichar src_1;
83 
84       src_1 = get_src_char (src_table, entry->src, 1);
85 
86       if (key[1] > src_1)
87         return 1;
88       else if (key[1] < src_1)
89         return -1;
90     }
91   else if (key[1])
92     return 1;
93 
94   return 0;
95 }
96 
97 static const gchar *
lookup_in_mapping(const struct mapping_entry * mapping,gint mapping_size,const gunichar * key,gint * result_len,gint * key_consumed)98 lookup_in_mapping (const struct mapping_entry *mapping,
99                    gint                        mapping_size,
100                    const gunichar             *key,
101                    gint                       *result_len,
102                    gint                       *key_consumed)
103 {
104   const struct mapping_entry *hit;
105 
106   hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry);
107 
108   if (hit == NULL)
109     return NULL;
110 
111   *key_consumed = get_length (hit->src);
112   *result_len = get_length (hit->ascii);
113 
114   return get_ascii_item(ascii_table, hit->ascii);
115 }
116 
117 static const gchar *
lookup_in_chain(const guint8 * chain,const gunichar * key,gint * result_len,gint * key_consumed)118 lookup_in_chain (const guint8   *chain,
119                  const gunichar *key,
120                  gint           *result_len,
121                  gint           *key_consumed)
122 {
123   const gchar *result;
124 
125   while (*chain != 0xff)
126     {
127       result = lookup_in_item (*chain, key, result_len, key_consumed);
128 
129       if (result)
130         return result;
131 
132       chain++;
133     }
134 
135   return NULL;
136 }
137 
138 static const gchar *
lookup_in_item(guint item_id,const gunichar * key,gint * result_len,gint * key_consumed)139 lookup_in_item (guint           item_id,
140                 const gunichar *key,
141                 gint           *result_len,
142                 gint           *key_consumed)
143 {
144   if (item_id & 0x80)
145     {
146       const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
147 
148       return lookup_in_chain (chain, key, result_len, key_consumed);
149     }
150   else
151     {
152       const struct mapping_range *range = &mapping_ranges[item_id];
153 
154       return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed);
155     }
156 }
157 
158 static gint
compare_locale_entry(gconstpointer user_data,gconstpointer data)159 compare_locale_entry (gconstpointer user_data,
160                       gconstpointer data)
161 {
162   const struct locale_entry *entry = data;
163   const gchar *key = user_data;
164 
165   return strcmp (key, &locale_names[entry->name_offset]);
166 }
167 
168 static gboolean
lookup_item_id_for_one_locale(const gchar * key,guint * item_id)169 lookup_item_id_for_one_locale (const gchar *key,
170                                guint       *item_id)
171 {
172   const struct locale_entry *hit;
173 
174   hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry);
175 
176   if (hit == NULL)
177     return FALSE;
178 
179   *item_id = hit->item_id;
180   return TRUE;
181 }
182 
183 static guint
lookup_item_id_for_locale(const gchar * locale)184 lookup_item_id_for_locale (const gchar *locale)
185 {
186   gchar key[MAX_LOCALE_NAME + 1];
187   const gchar *language;
188   guint language_len;
189   const gchar *territory = NULL;
190   guint territory_len = 0;
191   const gchar *modifier = NULL;
192   guint modifier_len = 0;
193   const gchar *next_char;
194   guint id;
195 
196   /* As per POSIX, a valid locale looks like:
197    *
198    *   language[_territory][.codeset][@modifier]
199    */
200   language = locale;
201   language_len = strcspn (language, "_.@");
202   next_char = language + language_len;
203 
204   if (*next_char == '_')
205     {
206       territory = next_char;
207       territory_len = strcspn (territory + 1, "_.@") + 1;
208       next_char = territory + territory_len;
209     }
210 
211   if (*next_char == '.')
212     {
213       const gchar *codeset;
214       guint codeset_len;
215 
216       codeset = next_char;
217       codeset_len = strcspn (codeset + 1, "_.@") + 1;
218       next_char = codeset + codeset_len;
219     }
220 
221   if (*next_char == '@')
222     {
223       modifier = next_char;
224       modifier_len = strcspn (modifier + 1, "_.@") + 1;
225       next_char = modifier + modifier_len;
226     }
227 
228   /* What madness is this? */
229   if (language_len == 0 || *next_char)
230     return default_item_id;
231 
232   /* We are not interested in codeset.
233    *
234    * For this locale:
235    *
236    *  aa_BB@cc
237    *
238    * try in this order:
239    *
240    * Note: we have no locales of the form aa_BB@cc in the database.
241    *
242    *  1. aa@cc
243    *  2. aa_BB
244    *  3. aa
245    */
246 
247   /* 1. */
248   if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
249     {
250       memcpy (key, language, language_len);
251       memcpy (key + language_len, modifier, modifier_len);
252       key[language_len + modifier_len] = '\0';
253 
254       if (lookup_item_id_for_one_locale (key, &id))
255         return id;
256     }
257 
258   /* 2. */
259   if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
260     {
261       memcpy (key, language, language_len);
262       memcpy (key + language_len, territory, territory_len);
263       key[language_len + territory_len] = '\0';
264 
265       if (lookup_item_id_for_one_locale (key, &id))
266         return id;
267     }
268 
269   /* 3. */
270   if (language_len <= MAX_LOCALE_NAME)
271     {
272       memcpy (key, language, language_len);
273       key[language_len] = '\0';
274 
275       if (lookup_item_id_for_one_locale (key, &id))
276         return id;
277     }
278 
279   return default_item_id;
280 }
281 
282 static guint
get_default_item_id(void)283 get_default_item_id (void)
284 {
285   static guint item_id;
286   static gboolean done;
287 
288   /* Doesn't need to be locked -- no harm in doing it twice. */
289   if (!done)
290     {
291       const gchar *locale;
292 
293       locale = setlocale (LC_CTYPE, NULL);
294       item_id = lookup_item_id_for_locale (locale);
295       done = TRUE;
296     }
297 
298   return item_id;
299 }
300 
301 /**
302  * g_str_to_ascii:
303  * @str: a string, in UTF-8
304  * @from_locale: (nullable): the source locale, if known
305  *
306  * Transliterate @str to plain ASCII.
307  *
308  * For best results, @str should be in composed normalised form.
309  *
310  * This function performs a reasonably good set of character
311  * replacements.  The particular set of replacements that is done may
312  * change by version or even by runtime environment.
313  *
314  * If the source language of @str is known, it can used to improve the
315  * accuracy of the translation by passing it as @from_locale.  It should
316  * be a valid POSIX locale string (of the form
317  * `language[_territory][.codeset][@modifier]`).
318  *
319  * If @from_locale is %NULL then the current locale is used.
320  *
321  * If you want to do translation for no specific locale, and you want it
322  * to be done independently of the currently locale, specify `"C"` for
323  * @from_locale.
324  *
325  * Returns: a string in plain ASCII
326  *
327  * Since: 2.40
328  **/
329 gchar *
g_str_to_ascii(const gchar * str,const gchar * from_locale)330 g_str_to_ascii (const gchar *str,
331                 const gchar *from_locale)
332 {
333   GString *result;
334   guint item_id;
335 
336   g_return_val_if_fail (str != NULL, NULL);
337 
338   if (g_str_is_ascii (str))
339     return g_strdup (str);
340 
341   if (from_locale)
342     item_id = lookup_item_id_for_locale (from_locale);
343   else
344     item_id = get_default_item_id ();
345 
346   result = g_string_sized_new (strlen (str));
347 
348   while (*str)
349     {
350       /* We only need to transliterate non-ASCII values... */
351       if (*str & 0x80)
352         {
353           gunichar key[MAX_KEY_SIZE];
354           const gchar *r;
355           gint consumed;
356           gint r_len;
357           gunichar c;
358 
359           G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
360 
361           c = g_utf8_get_char (str);
362 
363           /* This is where it gets evil...
364            *
365            * We know that MAX_KEY_SIZE is 2.  We also know that we
366            * only want to try another character if it's non-ascii.
367            */
368           str = g_utf8_next_char (str);
369 
370           key[0] = c;
371           if (*str & 0x80)
372             key[1] = g_utf8_get_char (str);
373           else
374             key[1] = 0;
375 
376           r = lookup_in_item (item_id, key, &r_len, &consumed);
377 
378           /* If we failed to map two characters, try again with one.
379            *
380            * gconv behaviour is a bit weird here -- it seems to
381            * depend in the randomness of the binary search and the
382            * size of the input buffer as to what result we get here.
383            *
384            * Doing it this way is more work, but should be
385            * more-correct.
386            */
387           if (r == NULL && key[1])
388             {
389               key[1] = 0;
390               r = lookup_in_item (item_id, key, &r_len, &consumed);
391             }
392 
393           if (r != NULL)
394             {
395               g_string_append_len (result, r, r_len);
396               if (consumed == 2)
397                 /* If it took both then skip again */
398                 str = g_utf8_next_char (str);
399             }
400           else /* no match found */
401             g_string_append_c (result, '?');
402         }
403       else /* ASCII case */
404         g_string_append_c (result, *str++);
405     }
406 
407   return g_string_free (result, FALSE);
408 }
409