1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // This file extends lang_enc.cc with additional languages and extended routines
6 // It is current with Unicode 5.1 (beta Jan 2008)
7 //
8
9 #include <stdlib.h>
10 #include <stdio.h>
11 #include <string.h>
12
13 #include "encodings/compact_lang_det/ext_lang_enc.h"
14 #include "encodings/compact_lang_det/win/cld_macros.h"
15 #include "encodings/compact_lang_det/win/cld_strtoint.h"
16
// Names of the extended (pseudo-)languages, i.e. those above NUM_LANGUAGES.
// The same spellings are the C enum declared names.
// The trailing comment on each entry is the matching string in
// kExtLanguageCode, which is kept in the same order.
static const char* const kExtLanguageName[] = {
  "X_BORK_BORK_BORK",   // zzb
  "X_PIG_LATIN",        // zzp
  "X_HACKER",           // zzh
  "X_KLINGON",          // tlh
  "X_ELMER_FUDD",       // zze

  // Pseudo-languages for Unicode scripts that express a single language
  "X_OGHAM",            // xx-Ogam
  "X_RUNIC",            // xx-Runr
  "X_YI",               // xx-Yiii
  "X_OLD_ITALIC",       // xx-Ital
  "X_GOTHIC",           // xx-Goth
  "X_DESERET",          // xx-Dsrt
  "X_HANUNOO",          // xx-Hano
  "X_BUHID",            // xx-Buhd
  "X_TAGBANWA",         // xx-Tagb
  "X_TAI_LE",           // xx-Tale
  "X_LINEAR_B",         // xx-Linb
  "X_UGARITIC",         // xx-Ugar
  "X_SHAVIAN",          // xx-Shaw
  "X_OSMANYA",          // xx-Osma
  "X_CYPRIOT",          // xx-Cprt
  "X_BUGINESE",         // xx-Bugi
  "X_COPTIC",           // xx-Copt
  "X_NEW_TAI_LUE",      // xx-Talu
  "X_GLAGOLITIC",       // xx-Glag
  "X_TIFINAGH",         // xx-Tfng
  "X_SYLOTI_NAGRI",     // xx-Sylo
  "X_OLD_PERSIAN",      // xx-Xpeo
  "X_KHAROSHTHI",       // xx-Khar
  "X_BALINESE",         // xx-Bali
  "X_CUNEIFORM",        // xx-Xsux
  "X_PHOENICIAN",       // xx-Phnx
  "X_PHAGS_PA",         // xx-Phag
  "X_NKO",              // xx-Nkoo

  // Unicode 5.1
  "X_SUDANESE",         // xx-Sund
  "X_LEPCHA",           // xx-Lepc
  "X_OL_CHIKI",         // xx-Olck
  "X_VAI",              // xx-Vaii
  "X_SAURASHTRA",       // xx-Saur
  "X_KAYAH_LI",         // xx-Kali
  "X_REJANG",           // xx-Rjng
  "X_LYCIAN",           // xx-Lyci
  "X_CARIAN",           // xx-Cari
  "X_LYDIAN",           // xx-Lydi
  "X_CHAM",             // xx-Cham
};
35
36
// C enum declared names of the base languages, for programs creating C code.
// The table is indexed directly by Language value; the number at the start
// of each row is the index of that row's first entry.
static const char* const kExtLangDeclaredName[] = {
  /*   0 */ "ENGLISH", "DANISH", "DUTCH", "FINNISH", "FRENCH",
  /*   5 */ "GERMAN", "HEBREW", "ITALIAN", "JAPANESE", "KOREAN",
  /*  10 */ "NORWEGIAN", "POLISH", "PORTUGUESE", "RUSSIAN", "SPANISH",
  /*  15 */ "SWEDISH", "CHINESE", "CZECH", "GREEK", "ICELANDIC",
  /*  20 */ "LATVIAN", "LITHUANIAN", "ROMANIAN", "HUNGARIAN", "ESTONIAN",
  /*  25 */ "TG_UNKNOWN_LANGUAGE", "UNKNOWN_LANGUAGE", "BULGARIAN",
            "CROATIAN", "SERBIAN",
  /*  30 */ "IRISH", "GALICIAN", "TAGALOG", "TURKISH", "UKRAINIAN",
  /*  35 */ "HINDI", "MACEDONIAN", "BENGALI", "INDONESIAN", "LATIN",
  /*  40 */ "MALAY", "MALAYALAM", "WELSH", "NEPALI", "TELUGU",
  /*  45 */ "ALBANIAN", "TAMIL", "BELARUSIAN", "JAVANESE", "OCCITAN",
  /*  50 */ "URDU", "BIHARI", "GUJARATI", "THAI", "ARABIC",
  /*  55 */ "CATALAN", "ESPERANTO", "BASQUE", "INTERLINGUA", "KANNADA",
  /*  60 */ "PUNJABI", "SCOTS_GAELIC", "SWAHILI", "SLOVENIAN", "MARATHI",
  /*  65 */ "MALTESE", "VIETNAMESE", "FRISIAN", "SLOVAK", "CHINESE_T",
  /*  70 */ "FAROESE", "SUNDANESE", "UZBEK", "AMHARIC", "AZERBAIJANI",
  /*  75 */ "GEORGIAN", "TIGRINYA", "PERSIAN", "BOSNIAN", "SINHALESE",
  /*  80 */ "NORWEGIAN_N", "PORTUGUESE_P", "PORTUGUESE_B", "XHOSA", "ZULU",
  /*  85 */ "GUARANI", "SESOTHO", "TURKMEN", "KYRGYZ", "BRETON",
  /*  90 */ "TWI", "YIDDISH", "SERBO_CROATIAN", "SOMALI", "UIGHUR",
  /*  95 */ "KURDISH", "MONGOLIAN", "ARMENIAN", "LAOTHIAN", "SINDHI",
  /* 100 */ "RHAETO_ROMANCE", "AFRIKAANS", "LUXEMBOURGISH", "BURMESE",
            "KHMER",
  // DHIVEHI (106) is sometimes spelled Divehi; lang of Maldives
  /* 105 */ "TIBETAN", "DHIVEHI", "CHEROKEE", "SYRIAC", "LIMBU",
  /* 110 */ "ORIYA", "ASSAMESE", "CORSICAN", "INTERLINGUE", "KAZAKH",
  /* 115 */ "LINGALA", "MOLDAVIAN", "PASHTO", "QUECHUA", "SHONA",
  /* 120 */ "TAJIK", "TATAR", "TONGA", "YORUBA",
  /* 124 */ "CREOLES_AND_PIDGINS_ENGLISH_BASED",
  /* 125 */ "CREOLES_AND_PIDGINS_FRENCH_BASED",
  /* 126 */ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED",
  /* 127 */ "CREOLES_AND_PIDGINS_OTHER",
  /* 128 */ "MAORI", "WOLOF",
  /* 130 */ "ABKHAZIAN", "AFAR", "AYMARA", "BASHKIR", "BISLAMA",
  /* 135 */ "DZONGKHA", "FIJIAN", "GREENLANDIC", "HAUSA", "HAITIAN_CREOLE",
  /* 140 */ "INUPIAK", "INUKTITUT", "KASHMIRI", "KINYARWANDA", "MALAGASY",
  /* 145 */ "NAURU", "OROMO", "RUNDI", "SAMOAN", "SANGO",
  /* 150 */ "SANSKRIT", "SISWANT", "TSONGA", "TSWANA", "VOLAPUK",
  /* 155 */ "ZHUANG", "KHASI", "SCOTS", "GANDA", "MANX",
  /* 160 */ "MONTENEGRIN",
  // Add new language declared names just before here
};
202
203 COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
204 kExtLangDeclaredName_has_incorrect_length);
205
206
// Language codes for the extended (pseudo-)languages above NUM_LANGUAGES.
// All of these are made up, except Klingon ("tlh"), which comes from
// ISO-639-2 (dsites).
// NOTE: zza is a standard name
// The trailing comment on each entry is the pseudo-language it stands for.
static const char* const kExtLanguageCode[] = {
  // All Latin script
  "zzb",       // X_BORK_BORK_BORK
  "zzp",       // X_PIG_LATIN
  "zzh",       // X_HACKER
  "tlh",       // X_KLINGON
  "zze",       // X_ELMER_FUDD

  // Pseudo-languages for Unicode scripts that express a single language
  "xx-Ogam",   // X_OGHAM
  "xx-Runr",   // X_RUNIC
  "xx-Yiii",   // X_YI
  "xx-Ital",   // X_OLD_ITALIC
  "xx-Goth",   // X_GOTHIC
  "xx-Dsrt",   // X_DESERET
  "xx-Hano",   // X_HANUNOO
  "xx-Buhd",   // X_BUHID
  "xx-Tagb",   // X_TAGBANWA
  "xx-Tale",   // X_TAI_LE
  "xx-Linb",   // X_LINEAR_B
  "xx-Ugar",   // X_UGARITIC
  "xx-Shaw",   // X_SHAVIAN
  "xx-Osma",   // X_OSMANYA
  "xx-Cprt",   // X_CYPRIOT
  "xx-Bugi",   // X_BUGINESE
  "xx-Copt",   // X_COPTIC
  "xx-Talu",   // X_NEW_TAI_LUE
  "xx-Glag",   // X_GLAGOLITIC
  "xx-Tfng",   // X_TIFINAGH
  "xx-Sylo",   // X_SYLOTI_NAGRI
  "xx-Xpeo",   // X_OLD_PERSIAN
  "xx-Khar",   // X_KHAROSHTHI
  "xx-Bali",   // X_BALINESE
  "xx-Xsux",   // X_CUNEIFORM
  "xx-Phnx",   // X_PHOENICIAN
  "xx-Phag",   // X_PHAGS_PA
  "xx-Nkoo",   // X_NKO

  // Unicode 5.1
  "xx-Sund",   // X_SUDANESE
  "xx-Lepc",   // X_LEPCHA
  "xx-Olck",   // X_OL_CHIKI
  "xx-Vaii",   // X_VAI
  "xx-Saur",   // X_SAURASHTRA
  "xx-Kali",   // X_KAYAH_LI
  "xx-Rjng",   // X_REJANG
  "xx-Lyci",   // X_LYCIAN
  "xx-Cari",   // X_CARIAN
  "xx-Lydi",   // X_LYDIAN
  "xx-Cham",   // X_CHAM
};
228
229
230 // Given the Language, returns its string name used as the output by
231 // the lang/enc identifier, e.g. "Korean"
232 // "invalid_language" if the input is invalid.
233 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
234 // used to subtract out HTML, link farms, DNA strings, and alittle English porn
ExtLanguageName(const Language lang)235 const char* ExtLanguageName(const Language lang) {
236 if (lang < 0) {
237 // No-text-at-all result from a Tote
238 return "";
239 }
240 // CompactLanguageDetect extension
241 if (lang == TG_UNKNOWN_LANGUAGE) {
242 return "Ignore";
243 }
244 if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
245 return LanguageName(lang);
246 }
247 if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
248 return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
249 }
250 return invalid_language_name();
251 }
252
253
254 // Given the Language, returns its Language enum spelling, for use by
255 // programs that create C declarations, e.g. "KOREAN"
256 // "UNKNOWN_LANGUAGE" if the input is invalid.
ExtLanguageDeclaredName(const Language lang)257 const char* ExtLanguageDeclaredName(const Language lang) {
258 if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
259 return kExtLangDeclaredName[lang];
260 }
261 if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
262 return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
263 }
264 return "UNKNOWN_LANGUAGE";
265 }
266
267 // Given the Language, return the language code, e.g. "ko"
ExtLanguageCode(const Language lang)268 const char* ExtLanguageCode(const Language lang) {
269 // Hack for ignore/porn pseudo-language
270 if (lang == TG_UNKNOWN_LANGUAGE) {
271 return "xxx";
272 }
273 if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
274 return LanguageCode(lang);
275 }
276 if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
277 return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
278 }
279 return "??";
280 }
281
282
283 // Convert "en-Latn-GB" to ENGLISH
284 // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285 // Consider for later: NORWEGIAN, NORWEGIAN_N
286 // Consider for later: SCOTS, SCOTS_GAELIC
287 // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
288 //
GetLanguageFromNumberOrName(const char * src)289 Language GetLanguageFromNumberOrName(const char* src) {
290 if (strspn(src, "0123456789") == strlen(src)) {
291 // All digits
292 return static_cast<Language>(strto32(src, NULL, 10));
293 }
294
295 Language retlang = UNKNOWN_LANGUAGE;
296 size_t len = strlen(src);
297
298 if (true /*FLAGS_mergepairs*/) {
299 // Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr
300 if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
301 if (memcmp(src, "en-", 3) == 0) {return ENGLISH;}
302 if (memcmp(src, "fr-", 3) == 0) {return FRENCH;}
303 // Use NormalizeLanguage instead
304 if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;}
305 if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;}
306 if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
307 if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
308 if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
309 if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
310 }
311
312 // Extensions
313 if (len >= 3) {
314 // Standin for ignore/porn "language"
315 if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
316
317 if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
318 if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
319 if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
320 if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
321 if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
322 }
323
324 // We have a name like en-Latn-GB or pt-BR
325 // First, get rid of some special cases
326 if (len <= 3) {
327 LanguageFromCode(src, &retlang);
328 } else if (len == 7) {
329 // More Extensions
330 if (memcmp(src, "xx-", 3) == 0) {
331 if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
332 if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
333 if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
334 if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
335 if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
336 if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
337 if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
338 if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
339 if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
340 if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
341 if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
342 if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
343 if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
344 if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
345 if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
346 if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
347 if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
348 if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
349 if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
350 if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
351 if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
352 if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
353 if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
354 if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
355 if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
356 if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
357 if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
358 if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
359
360 // Unicode 5.1
361 if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
362 if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
363 if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
364 if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
365 if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
366 if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
367 if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
368 if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
369 if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
370 if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
371 if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
372 }
373 }
374 // Some other weird ones
375 // Could be Latn or Limb; all our current training data is Latn
376 if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
377 if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
378
379 // Multi-country langauges
380 if (memcmp(src, "zh", 2) == 0) {
381 if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
382 if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
383 return CHINESE;
384 }
385 if (memcmp(src, "pt", 2) == 0) {
386 if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
387 return PORTUGUESE;
388 }
389 if (memcmp(src, "fr", 2) == 0) {
390 if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
391 return FRENCH;
392 }
393
394 // None of the special cases matched
395 if (src[2] == '-') {
396 char temp[4];
397 memcpy(temp, src, 4);
398 temp[2] = '\0';
399 LanguageFromCode(temp, &retlang);
400 }
401 if (src[3] == '-') {
402 char temp[4];
403 memcpy(temp, src, 4);
404 temp[3] = '\0';
405 LanguageFromCode(temp, &retlang);
406 }
407 if (retlang != UNKNOWN_LANGUAGE) {
408 return retlang;
409 }
410
411 return retlang;
412 }
413
414 typedef struct {
415 const char* name;
416 UnicodeLScript lscript;
417 } NameScriptPair;
418
419 // In alphabetic order for binary search
420 static const NameScriptPair kNameScriptPair[] = {
421 // Unicode 5.1 additional scripts
422 {"Arab", ULScript_Arabic},
423 {"Armn", ULScript_Armenian},
424 {"Bali", ULScript_Balinese},
425 {"Beng", ULScript_Bengali},
426 {"Bugi", ULScript_Buginese},
427 {"Buhd", ULScript_Buhid},
428 {"Cans", ULScript_Canadian_Aboriginal},
429 {"Cari", ULScript_Carian}, // Unicode 5.1
430 {"Cham", ULScript_Cham}, // Unicode 5.1
431 {"Cher", ULScript_Cherokee},
432 {"Copt", ULScript_Coptic},
433 {"Cprt", ULScript_Cypriot},
434 {"Cyrl", ULScript_Cyrillic},
435 {"Deva", ULScript_Devanagari},
436 {"Dsrt", ULScript_Deseret},
437 {"Ethi", ULScript_Ethiopic},
438 {"Geor", ULScript_Georgian},
439 {"Glag", ULScript_Glagolitic},
440 {"Goth", ULScript_Gothic},
441 {"Grek", ULScript_Greek},
442 {"Gujr", ULScript_Gujarati},
443 {"Guru", ULScript_Gurmukhi},
444 {"Hani", ULScript_HanCJK},
445 {"Hano", ULScript_Hanunoo},
446 {"Hebr", ULScript_Hebrew},
447 {"Ital", ULScript_Old_Italic},
448 {"Kali", ULScript_Kayah_Li}, // Unicode 5.1
449 {"Khar", ULScript_Kharoshthi},
450 {"Khmr", ULScript_Khmer},
451 {"Knda", ULScript_Kannada},
452 {"Laoo", ULScript_Lao},
453 {"Latn", ULScript_Latin},
454 {"Lepc", ULScript_Lepcha}, // Unicode 5.1
455 {"Limb", ULScript_Limbu},
456 {"Linb", ULScript_Linear_B},
457 {"Lyci", ULScript_Lycian}, // Unicode 5.1
458 {"Lydi", ULScript_Lydian}, // Unicode 5.1
459 {"Mlym", ULScript_Malayalam},
460 {"Mong", ULScript_Mongolian},
461 {"Mymr", ULScript_Myanmar},
462 {"Nkoo", ULScript_Nko},
463 {"Ogam", ULScript_Ogham},
464 {"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
465 {"Orya", ULScript_Oriya},
466 {"Osma", ULScript_Osmanya},
467 {"Phag", ULScript_Phags_Pa},
468 {"Phnx", ULScript_Phoenician},
469 {"Rjng", ULScript_Rejang}, // Unicode 5.1
470 {"Runr", ULScript_Runic},
471 {"Saur", ULScript_Saurashtra}, // Unicode 5.1
472 {"Shaw", ULScript_Shavian},
473 {"Sinh", ULScript_Sinhala},
474 {"Sund", ULScript_Sundanese}, // Unicode 5.1
475 {"Sylo", ULScript_Syloti_Nagri},
476 {"Syrc", ULScript_Syriac},
477 {"Tagb", ULScript_Tagbanwa},
478 {"Tale", ULScript_Tai_Le},
479 {"Talu", ULScript_New_Tai_Lue},
480 {"Taml", ULScript_Tamil},
481 {"Telu", ULScript_Telugu},
482 {"Tfng", ULScript_Tifinagh},
483 {"Tglg", ULScript_Tagalog},
484 {"Thaa", ULScript_Thaana},
485 {"Thai", ULScript_Thai},
486 {"Tibt", ULScript_Tibetan},
487 {"Ugar", ULScript_Ugaritic},
488 {"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
489 {"Xpeo", ULScript_Old_Persian},
490 {"Xsux", ULScript_Cuneiform},
491 {"Yiii", ULScript_Yi},
492 {"Zyyy", ULScript_Common},
493 {"Zzzz", ULScript_Inherited},
494 };
495
496 // Convert "en-Latn-GB" to ULScript_Latin
GetLScriptFromNumberOrName(const char * src)497 UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
498 if (strspn(src, "0123456789") == strlen(src)) {
499 // All digits
500 return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
501 }
502
503 if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
504 if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
505 if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
506 if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
507 // Could be Latn or Limb; all our current training data is Latn
508 if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
509
510 // Isolate just the script field
511 char temp[5];
512 const char* src2 = strchr(src, '-');
513 if (src2 == NULL) {return ULScript_Latin;}
514 src2 += 1; // over the -
515 memcpy(temp, src2, 4);
516 temp[4] = '\0';
517
518 int lo = 0;
519 int hi = ULScript_NUM_SCRIPTS;
520 while (lo < hi) {
521 int mid = (lo + hi) >> 1;
522 if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
523 hi = mid;
524 } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
525 lo = mid + 1;
526 } else {
527 return kNameScriptPair[mid].lscript;
528 }
529 }
530 return ULScript_Latin;
531 }
532
533
534 // Merge together some languages, such as bo/hr/sr
535 // Croatian Latin and Serbian Cyrillic now.
NormalizeLanguage(Language lang)536 Language NormalizeLanguage(Language lang) {
537 if (lang == BOSNIAN) {return CROATIAN;}
538 if (lang == SERBO_CROATIAN) {return SERBIAN;}
539
540 if (lang == PORTUGUESE_P) {return PORTUGUESE;}
541 if (lang == PORTUGUESE_B) {return PORTUGUESE;}
542
543 return lang;
544 }
545
546