• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // This file extends lang_enc.cc with additional languages and extended routines
6 // It is current with Unicode 5.1 (beta Jan 2008)
7 //
8 
9 #include <stdlib.h>
10 #include <stdio.h>
11 #include <string.h>
12 
13 #include "encodings/compact_lang_det/ext_lang_enc.h"
14 #include "encodings/compact_lang_det/win/cld_macros.h"
15 #include "encodings/compact_lang_det/win/cld_strtoint.h"
16 
// Names of the extended pseudo-languages, i.e. the ones numbered above
// NUM_LANGUAGES.  The table is indexed by (lang - EXT_LANGUAGE_BASE).
// The same spellings are also the C enum declared names.
static const char* const kExtLanguageName[] = {
  /*  0 */ "X_BORK_BORK_BORK",
  /*  1 */ "X_PIG_LATIN",
  /*  2 */ "X_HACKER",
  /*  3 */ "X_KLINGON",
  /*  4 */ "X_ELMER_FUDD",

  // Pseudo-languages for Unicode scripts that express a single language
  /*  5 */ "X_OGHAM",
  /*  6 */ "X_RUNIC",
  /*  7 */ "X_YI",
  /*  8 */ "X_OLD_ITALIC",
  /*  9 */ "X_GOTHIC",
  /* 10 */ "X_DESERET",
  /* 11 */ "X_HANUNOO",
  /* 12 */ "X_BUHID",
  /* 13 */ "X_TAGBANWA",
  /* 14 */ "X_TAI_LE",
  /* 15 */ "X_LINEAR_B",
  /* 16 */ "X_UGARITIC",
  /* 17 */ "X_SHAVIAN",
  /* 18 */ "X_OSMANYA",
  /* 19 */ "X_CYPRIOT",
  /* 20 */ "X_BUGINESE",
  /* 21 */ "X_COPTIC",
  /* 22 */ "X_NEW_TAI_LUE",
  /* 23 */ "X_GLAGOLITIC",
  /* 24 */ "X_TIFINAGH",
  /* 25 */ "X_SYLOTI_NAGRI",
  /* 26 */ "X_OLD_PERSIAN",
  /* 27 */ "X_KHAROSHTHI",
  /* 28 */ "X_BALINESE",
  /* 29 */ "X_CUNEIFORM",
  /* 30 */ "X_PHOENICIAN",
  /* 31 */ "X_PHAGS_PA",
  /* 32 */ "X_NKO",

  // Unicode 5.1
  /* 33 */ "X_SUDANESE",     // spelling kept as-is; its code is "xx-Sund"
  /* 34 */ "X_LEPCHA",
  /* 35 */ "X_OL_CHIKI",
  /* 36 */ "X_VAI",
  /* 37 */ "X_SAURASHTRA",
  /* 38 */ "X_KAYAH_LI",
  /* 39 */ "X_REJANG",
  /* 40 */ "X_LYCIAN",
  /* 41 */ "X_CARIAN",
  /* 42 */ "X_LYDIAN",
  /* 43 */ "X_CHAM",
};
35 
36 
// C enum spellings of the base languages, for programs that generate C code.
// Indexed directly by the Language value; the number in front of each entry
// is that index.
static const char* const kExtLangDeclaredName[] = {
  /*   0 */ "ENGLISH",
  /*   1 */ "DANISH",
  /*   2 */ "DUTCH",
  /*   3 */ "FINNISH",
  /*   4 */ "FRENCH",
  /*   5 */ "GERMAN",
  /*   6 */ "HEBREW",
  /*   7 */ "ITALIAN",
  /*   8 */ "JAPANESE",
  /*   9 */ "KOREAN",
  /*  10 */ "NORWEGIAN",
  /*  11 */ "POLISH",
  /*  12 */ "PORTUGUESE",
  /*  13 */ "RUSSIAN",
  /*  14 */ "SPANISH",
  /*  15 */ "SWEDISH",
  /*  16 */ "CHINESE",
  /*  17 */ "CZECH",
  /*  18 */ "GREEK",
  /*  19 */ "ICELANDIC",
  /*  20 */ "LATVIAN",
  /*  21 */ "LITHUANIAN",
  /*  22 */ "ROMANIAN",
  /*  23 */ "HUNGARIAN",
  /*  24 */ "ESTONIAN",
  /*  25 */ "TG_UNKNOWN_LANGUAGE",
  /*  26 */ "UNKNOWN_LANGUAGE",
  /*  27 */ "BULGARIAN",
  /*  28 */ "CROATIAN",
  /*  29 */ "SERBIAN",
  /*  30 */ "IRISH",
  /*  31 */ "GALICIAN",
  /*  32 */ "TAGALOG",
  /*  33 */ "TURKISH",
  /*  34 */ "UKRAINIAN",
  /*  35 */ "HINDI",
  /*  36 */ "MACEDONIAN",
  /*  37 */ "BENGALI",
  /*  38 */ "INDONESIAN",
  /*  39 */ "LATIN",
  /*  40 */ "MALAY",
  /*  41 */ "MALAYALAM",
  /*  42 */ "WELSH",
  /*  43 */ "NEPALI",
  /*  44 */ "TELUGU",
  /*  45 */ "ALBANIAN",
  /*  46 */ "TAMIL",
  /*  47 */ "BELARUSIAN",
  /*  48 */ "JAVANESE",
  /*  49 */ "OCCITAN",
  /*  50 */ "URDU",
  /*  51 */ "BIHARI",
  /*  52 */ "GUJARATI",
  /*  53 */ "THAI",
  /*  54 */ "ARABIC",
  /*  55 */ "CATALAN",
  /*  56 */ "ESPERANTO",
  /*  57 */ "BASQUE",
  /*  58 */ "INTERLINGUA",
  /*  59 */ "KANNADA",
  /*  60 */ "PUNJABI",
  /*  61 */ "SCOTS_GAELIC",
  /*  62 */ "SWAHILI",
  /*  63 */ "SLOVENIAN",
  /*  64 */ "MARATHI",
  /*  65 */ "MALTESE",
  /*  66 */ "VIETNAMESE",
  /*  67 */ "FRISIAN",
  /*  68 */ "SLOVAK",
  /*  69 */ "CHINESE_T",
  /*  70 */ "FAROESE",
  /*  71 */ "SUNDANESE",
  /*  72 */ "UZBEK",
  /*  73 */ "AMHARIC",
  /*  74 */ "AZERBAIJANI",
  /*  75 */ "GEORGIAN",
  /*  76 */ "TIGRINYA",
  /*  77 */ "PERSIAN",
  /*  78 */ "BOSNIAN",
  /*  79 */ "SINHALESE",
  /*  80 */ "NORWEGIAN_N",
  /*  81 */ "PORTUGUESE_P",
  /*  82 */ "PORTUGUESE_B",
  /*  83 */ "XHOSA",
  /*  84 */ "ZULU",
  /*  85 */ "GUARANI",
  /*  86 */ "SESOTHO",
  /*  87 */ "TURKMEN",
  /*  88 */ "KYRGYZ",
  /*  89 */ "BRETON",
  /*  90 */ "TWI",
  /*  91 */ "YIDDISH",
  /*  92 */ "SERBO_CROATIAN",
  /*  93 */ "SOMALI",
  /*  94 */ "UIGHUR",
  /*  95 */ "KURDISH",
  /*  96 */ "MONGOLIAN",
  /*  97 */ "ARMENIAN",
  /*  98 */ "LAOTHIAN",
  /*  99 */ "SINDHI",
  /* 100 */ "RHAETO_ROMANCE",
  /* 101 */ "AFRIKAANS",
  /* 102 */ "LUXEMBOURGISH",
  /* 103 */ "BURMESE",
  /* 104 */ "KHMER",
  /* 105 */ "TIBETAN",
  /* 106 */ "DHIVEHI",       // sometimes spelled Divehi; lang of Maldives
  /* 107 */ "CHEROKEE",
  /* 108 */ "SYRIAC",
  /* 109 */ "LIMBU",
  /* 110 */ "ORIYA",
  /* 111 */ "ASSAMESE",
  /* 112 */ "CORSICAN",
  /* 113 */ "INTERLINGUE",
  /* 114 */ "KAZAKH",
  /* 115 */ "LINGALA",
  /* 116 */ "MOLDAVIAN",
  /* 117 */ "PASHTO",
  /* 118 */ "QUECHUA",
  /* 119 */ "SHONA",
  /* 120 */ "TAJIK",
  /* 121 */ "TATAR",
  /* 122 */ "TONGA",
  /* 123 */ "YORUBA",
  /* 124 */ "CREOLES_AND_PIDGINS_ENGLISH_BASED",
  /* 125 */ "CREOLES_AND_PIDGINS_FRENCH_BASED",
  /* 126 */ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED",
  /* 127 */ "CREOLES_AND_PIDGINS_OTHER",
  /* 128 */ "MAORI",
  /* 129 */ "WOLOF",
  /* 130 */ "ABKHAZIAN",
  /* 131 */ "AFAR",
  /* 132 */ "AYMARA",
  /* 133 */ "BASHKIR",
  /* 134 */ "BISLAMA",
  /* 135 */ "DZONGKHA",
  /* 136 */ "FIJIAN",
  /* 137 */ "GREENLANDIC",
  /* 138 */ "HAUSA",
  /* 139 */ "HAITIAN_CREOLE",
  /* 140 */ "INUPIAK",
  /* 141 */ "INUKTITUT",
  /* 142 */ "KASHMIRI",
  /* 143 */ "KINYARWANDA",
  /* 144 */ "MALAGASY",
  /* 145 */ "NAURU",
  /* 146 */ "OROMO",
  /* 147 */ "RUNDI",
  /* 148 */ "SAMOAN",
  /* 149 */ "SANGO",
  /* 150 */ "SANSKRIT",
  /* 151 */ "SISWANT",
  /* 152 */ "TSONGA",
  /* 153 */ "TSWANA",
  /* 154 */ "VOLAPUK",
  /* 155 */ "ZHUANG",
  /* 156 */ "KHASI",
  /* 157 */ "SCOTS",
  /* 158 */ "GANDA",
  /* 159 */ "MANX",
  /* 160 */ "MONTENEGRIN",
  // Add new language declared names just before here
};
202 
203 COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
204        kExtLangDeclaredName_has_incorrect_length);
205 
206 
// Language codes for the extended pseudo-languages above NUM_LANGUAGES,
// indexed by (lang - EXT_LANGUAGE_BASE) and listed in the same order as
// kExtLanguageName.
// All of these are invented, except Klingon ("tlh"), which comes from
// ISO-639-2 (dsites).
// NOTE: zza is a standard name
static const char* const kExtLanguageCode[] = {
  // Joke languages, all Latin script
  "zzb",        // X_BORK_BORK_BORK
  "zzp",        // X_PIG_LATIN
  "zzh",        // X_HACKER
  "tlh",        // X_KLINGON
  "zze",        // X_ELMER_FUDD

  // Pseudo-languages for Unicode scripts that express a single language
  "xx-Ogam",    // X_OGHAM
  "xx-Runr",    // X_RUNIC
  "xx-Yiii",    // X_YI
  "xx-Ital",    // X_OLD_ITALIC
  "xx-Goth",    // X_GOTHIC
  "xx-Dsrt",    // X_DESERET
  "xx-Hano",    // X_HANUNOO
  "xx-Buhd",    // X_BUHID
  "xx-Tagb",    // X_TAGBANWA
  "xx-Tale",    // X_TAI_LE
  "xx-Linb",    // X_LINEAR_B
  "xx-Ugar",    // X_UGARITIC
  "xx-Shaw",    // X_SHAVIAN
  "xx-Osma",    // X_OSMANYA
  "xx-Cprt",    // X_CYPRIOT
  "xx-Bugi",    // X_BUGINESE
  "xx-Copt",    // X_COPTIC
  "xx-Talu",    // X_NEW_TAI_LUE
  "xx-Glag",    // X_GLAGOLITIC
  "xx-Tfng",    // X_TIFINAGH
  "xx-Sylo",    // X_SYLOTI_NAGRI
  "xx-Xpeo",    // X_OLD_PERSIAN
  "xx-Khar",    // X_KHAROSHTHI
  "xx-Bali",    // X_BALINESE
  "xx-Xsux",    // X_CUNEIFORM
  "xx-Phnx",    // X_PHOENICIAN
  "xx-Phag",    // X_PHAGS_PA
  "xx-Nkoo",    // X_NKO

  // Unicode 5.1
  "xx-Sund",    // X_SUDANESE
  "xx-Lepc",    // X_LEPCHA
  "xx-Olck",    // X_OL_CHIKI
  "xx-Vaii",    // X_VAI
  "xx-Saur",    // X_SAURASHTRA
  "xx-Kali",    // X_KAYAH_LI
  "xx-Rjng",    // X_REJANG
  "xx-Lyci",    // X_LYCIAN
  "xx-Cari",    // X_CARIAN
  "xx-Lydi",    // X_LYDIAN
  "xx-Cham",    // X_CHAM
};
228 
229 
230 // Given the Language, returns its string name used as the output by
231 // the lang/enc identifier, e.g. "Korean"
232 // "invalid_language" if the input is invalid.
233 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
234 // used to subtract out HTML, link farms, DNA strings, and alittle English porn
ExtLanguageName(const Language lang)235 const char* ExtLanguageName(const Language lang) {
236   if (lang < 0) {
237     // No-text-at-all result from a Tote
238     return "";
239   }
240   // CompactLanguageDetect extension
241   if (lang == TG_UNKNOWN_LANGUAGE) {
242     return "Ignore";
243   }
244   if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
245     return LanguageName(lang);
246   }
247   if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
248     return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
249   }
250   return invalid_language_name();
251 }
252 
253 
254 // Given the Language, returns its Language enum spelling, for use by
255 // programs that create C declarations, e.g. "KOREAN"
256 // "UNKNOWN_LANGUAGE" if the input is invalid.
ExtLanguageDeclaredName(const Language lang)257 const char* ExtLanguageDeclaredName(const Language lang) {
258   if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
259     return kExtLangDeclaredName[lang];
260   }
261   if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
262     return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
263   }
264   return "UNKNOWN_LANGUAGE";
265 }
266 
267 // Given the Language, return the language code, e.g. "ko"
ExtLanguageCode(const Language lang)268 const char* ExtLanguageCode(const Language lang) {
269   // Hack for ignore/porn pseudo-language
270   if (lang == TG_UNKNOWN_LANGUAGE) {
271     return "xxx";
272   }
273   if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
274     return LanguageCode(lang);
275   }
276   if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
277     return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
278   }
279   return "??";
280 }
281 
282 
283 // Convert "en-Latn-GB" to ENGLISH
284 // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285 // Consider for later: NORWEGIAN, NORWEGIAN_N
286 // Consider for later: SCOTS, SCOTS_GAELIC
287 // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
288 //
GetLanguageFromNumberOrName(const char * src)289 Language GetLanguageFromNumberOrName(const char* src) {
290   if (strspn(src, "0123456789") == strlen(src)) {
291     // All digits
292     return static_cast<Language>(strto32(src, NULL, 10));
293   }
294 
295   Language retlang = UNKNOWN_LANGUAGE;
296   size_t len = strlen(src);
297 
298   if (true /*FLAGS_mergepairs*/) {
299     // Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr
300     if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
301     if (memcmp(src, "en-", 3) == 0) {return ENGLISH;}
302     if (memcmp(src, "fr-", 3) == 0) {return FRENCH;}
303     // Use NormalizeLanguage instead
304     if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;}
305     if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;}
306     if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
307     if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
308     if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
309     if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
310   }
311 
312   // Extensions
313   if (len >= 3) {
314     // Standin for ignore/porn "language"
315     if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
316 
317     if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
318     if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
319     if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
320     if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
321     if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
322   }
323 
324   // We have a name like en-Latn-GB or pt-BR
325   // First, get rid of some special cases
326   if (len <= 3) {
327     LanguageFromCode(src, &retlang);
328   } else if (len == 7) {
329     // More Extensions
330     if (memcmp(src, "xx-", 3) == 0) {
331       if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
332       if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
333       if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
334       if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
335       if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
336       if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
337       if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
338       if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
339       if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
340       if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
341       if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
342       if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
343       if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
344       if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
345       if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
346       if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
347       if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
348       if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
349       if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
350       if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
351       if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
352       if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
353       if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
354       if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
355       if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
356       if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
357       if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
358       if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
359 
360       // Unicode 5.1
361       if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
362       if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
363       if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
364       if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
365       if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
366       if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
367       if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
368       if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
369       if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
370       if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
371       if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
372     }
373   }
374   // Some other weird ones
375   // Could be Latn or Limb; all our current training data is Latn
376   if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
377   if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
378 
379   // Multi-country langauges
380   if (memcmp(src, "zh", 2) == 0) {
381     if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
382     if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
383     return CHINESE;
384   }
385   if (memcmp(src, "pt", 2) == 0) {
386     if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
387     return PORTUGUESE;
388   }
389   if (memcmp(src, "fr", 2) == 0) {
390     if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
391     return FRENCH;
392   }
393 
394   // None of the special cases matched
395   if (src[2] == '-') {
396     char temp[4];
397     memcpy(temp, src, 4);
398     temp[2] = '\0';
399     LanguageFromCode(temp, &retlang);
400   }
401   if (src[3] == '-') {
402     char temp[4];
403     memcpy(temp, src, 4);
404     temp[3] = '\0';
405     LanguageFromCode(temp, &retlang);
406   }
407   if (retlang != UNKNOWN_LANGUAGE) {
408     return retlang;
409   }
410 
411   return retlang;
412 }
413 
414 typedef struct {
415   const char* name;
416   UnicodeLScript lscript;
417 } NameScriptPair;
418 
419 // In alphabetic order for binary search
420 static const NameScriptPair kNameScriptPair[] = {
421   // Unicode 5.1 additional scripts
422   {"Arab", ULScript_Arabic},
423   {"Armn", ULScript_Armenian},
424   {"Bali", ULScript_Balinese},
425   {"Beng", ULScript_Bengali},
426   {"Bugi", ULScript_Buginese},
427   {"Buhd", ULScript_Buhid},
428   {"Cans", ULScript_Canadian_Aboriginal},
429   {"Cari", ULScript_Carian},      // Unicode 5.1
430   {"Cham", ULScript_Cham},        // Unicode 5.1
431   {"Cher", ULScript_Cherokee},
432   {"Copt", ULScript_Coptic},
433   {"Cprt", ULScript_Cypriot},
434   {"Cyrl", ULScript_Cyrillic},
435   {"Deva", ULScript_Devanagari},
436   {"Dsrt", ULScript_Deseret},
437   {"Ethi", ULScript_Ethiopic},
438   {"Geor", ULScript_Georgian},
439   {"Glag", ULScript_Glagolitic},
440   {"Goth", ULScript_Gothic},
441   {"Grek", ULScript_Greek},
442   {"Gujr", ULScript_Gujarati},
443   {"Guru", ULScript_Gurmukhi},
444   {"Hani", ULScript_HanCJK},
445   {"Hano", ULScript_Hanunoo},
446   {"Hebr", ULScript_Hebrew},
447   {"Ital", ULScript_Old_Italic},
448   {"Kali", ULScript_Kayah_Li},    // Unicode 5.1
449   {"Khar", ULScript_Kharoshthi},
450   {"Khmr", ULScript_Khmer},
451   {"Knda", ULScript_Kannada},
452   {"Laoo", ULScript_Lao},
453   {"Latn", ULScript_Latin},
454   {"Lepc", ULScript_Lepcha},      // Unicode 5.1
455   {"Limb", ULScript_Limbu},
456   {"Linb", ULScript_Linear_B},
457   {"Lyci", ULScript_Lycian},      // Unicode 5.1
458   {"Lydi", ULScript_Lydian},      // Unicode 5.1
459   {"Mlym", ULScript_Malayalam},
460   {"Mong", ULScript_Mongolian},
461   {"Mymr", ULScript_Myanmar},
462   {"Nkoo", ULScript_Nko},
463   {"Ogam", ULScript_Ogham},
464   {"Olck", ULScript_Ol_Chiki},    // Unicode 5.1
465   {"Orya", ULScript_Oriya},
466   {"Osma", ULScript_Osmanya},
467   {"Phag", ULScript_Phags_Pa},
468   {"Phnx", ULScript_Phoenician},
469   {"Rjng", ULScript_Rejang},      // Unicode 5.1
470   {"Runr", ULScript_Runic},
471   {"Saur", ULScript_Saurashtra},  // Unicode 5.1
472   {"Shaw", ULScript_Shavian},
473   {"Sinh", ULScript_Sinhala},
474   {"Sund", ULScript_Sundanese},   // Unicode 5.1
475   {"Sylo", ULScript_Syloti_Nagri},
476   {"Syrc", ULScript_Syriac},
477   {"Tagb", ULScript_Tagbanwa},
478   {"Tale", ULScript_Tai_Le},
479   {"Talu", ULScript_New_Tai_Lue},
480   {"Taml", ULScript_Tamil},
481   {"Telu", ULScript_Telugu},
482   {"Tfng", ULScript_Tifinagh},
483   {"Tglg", ULScript_Tagalog},
484   {"Thaa", ULScript_Thaana},
485   {"Thai", ULScript_Thai},
486   {"Tibt", ULScript_Tibetan},
487   {"Ugar", ULScript_Ugaritic},
488   {"Vaii", ULScript_Vai},         // Unicode 5.1 // NOTE: apparently 'Vai '
489   {"Xpeo", ULScript_Old_Persian},
490   {"Xsux", ULScript_Cuneiform},
491   {"Yiii", ULScript_Yi},
492   {"Zyyy", ULScript_Common},
493   {"Zzzz", ULScript_Inherited},
494 };
495 
496 // Convert "en-Latn-GB" to ULScript_Latin
GetLScriptFromNumberOrName(const char * src)497 UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
498   if (strspn(src, "0123456789") == strlen(src)) {
499     // All digits
500     return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
501   }
502 
503   if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
504   if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
505   if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
506   if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
507   // Could be Latn or Limb; all our current training data is Latn
508   if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
509 
510   // Isolate just the script field
511   char temp[5];
512   const char* src2 = strchr(src, '-');
513   if (src2 == NULL) {return ULScript_Latin;}
514   src2 += 1;      // over the -
515   memcpy(temp, src2, 4);
516   temp[4] = '\0';
517 
518   int lo = 0;
519   int hi = ULScript_NUM_SCRIPTS;
520   while (lo < hi) {
521     int mid = (lo + hi) >> 1;
522     if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
523       hi = mid;
524     } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
525       lo = mid + 1;
526     } else {
527       return kNameScriptPair[mid].lscript;
528     }
529   }
530   return ULScript_Latin;
531 }
532 
533 
534 // Merge together some languages, such as bo/hr/sr
535 // Croatian Latin and Serbian Cyrillic now.
NormalizeLanguage(Language lang)536 Language NormalizeLanguage(Language lang) {
537   if (lang == BOSNIAN) {return CROATIAN;}
538   if (lang == SERBO_CROATIAN) {return SERBIAN;}
539 
540   if (lang == PORTUGUESE_P) {return PORTUGUESE;}
541   if (lang == PORTUGUESE_B) {return PORTUGUESE;}
542 
543   return lang;
544 }
545 
546