// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
#define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_

#include <string>
#include "encodings/compact_lang_det/ext_lang_enc.h"
#include "encodings/compact_lang_det/tote.h"
#include "encodings/compact_lang_det/win/cld_basictypes.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"

namespace cld {

// Hash bucket for four-way associative lookup with < 64K buckets
// 32 bytes per bucket, 8-byte entries
// (four 4-byte keys followed by four 4-byte values)
typedef struct {
  uint32 key[4];    // hashed word to look up
  uint32 value[4];  // packed three lang numbers and probability subscript
} SmallWordProbBucket4;

// Hash bucket for four-way associative lookup with >= 64K buckets
// 24 bytes per bucket, 6-byte entries
// (four 2-byte keys followed by four 4-byte values)
typedef struct {
  uint16 key[4];    // Half of hashed word to look up; other
                    // half is used to pick the bucket
  uint32 value[4];  // packed three lang numbers and probability subscript
} LargeQuadProbBucket4;

// Hash bucket for four-way associative lookup, indirect probabilities
// 16 bytes per bucket, 4-byte entries
// (key bits and indirect subscript share one 32-bit word per entry)
typedef struct {
  uint32 keyvalue[4];  // Upper part of word is hash, lower is indirect prob
} IndirectProbBucket4;


// This describes a complete CLD table, consisting of
// a main lookup table, an indirect language/probability table, and
// four constants (bucket count, indirect entry count, key mask, build date).
// The main table key is a quadgram, bigram, or longword hash, with
// part of the key used to select a bucket modulo kCLDTableSize,
// and the rest matched against the key portion of four entries in a bucket,
// defined by kCLDTableKeyMask. The remaining bits of an entry, defined
// by ~kCLDTableKeyMask, are usually a subscript in the indirect table.
//
// By using part of the key to select a bucket, those key bits do not need
// to be stored in the main table entries, saving space (typically 2 bytes).
//
// By using an indirect table for lang/prob triples, only the subscript needs
// to be stored in the main table entries, saving space (typically 2 bytes).
//
// Each entry in the indirect table has three languages and three
// corresponding probabilities, packed into four bytes.
//
// The build date constant is included just for version tracking and is not
// otherwise used.
//
// Different-size tables can be linked in for different production
// environments. By going indirect through this struct, the runtime code is
// insensitive to the actual sizes.
//
// An empty placeholder table can be described by a table size of 1
// bucket, a keymask of 0xffffffff, a degenerate bucket of four no-match
// entries, and a degenerate indirect table of one no-languages entry.
//
//
struct CLDTableSummary {
  const IndirectProbBucket4* kCLDTable;
                                  // Each bucket has four entries, part
                                  // key and part indirect subscript
  const uint32* kCLDTableInd;     // Each entry is three packed lang/prob
  // Bucket count. QuadHashV3Lookup4 hands this to QuadFPJustHash, which picks
  // the bucket with "& (bucketcount - 1)", so the lookup treats it as a
  // power-of-two size.
  const int kCLDTableSize;        // Bucket count
  const int kCLDTableIndSize;     // Entries count
  // QuadHashV3Lookup4 copies this field into a uint32 before masking.
  // NOTE(review): a mask with the top bit set (e.g. the 0xffffffff placeholder
  // mask described above) exceeds INT_MAX -- confirm how the table
  // definitions initialise this int field.
  const int kCLDTableKeyMask;     // Mask hash key
  const int kCLDTableBuildDate;   // yyyymmdd
};


// Keeps per-character 0-12 language probabilities for CTJKVZ-- in that order.
//   Chinese ChineseT Japanese Korean Vietnamese Zhuang
//   (2 bytes unused, for alignment padding and future)
typedef struct {
  uint8 probs[8];
} UnigramProbArray;

// Map 8-bit subscript to CTJKVZ probabilities
// Target runtime probabilities for CTJK + VZ
// Hand-generated to cover a reasonable range of choices
//
// NOTE(review): the array is declared with 242 slots but the initializer
// below appears to list 239 rows, so the last three subscripts would be
// implicitly zero-filled -- confirm that this is intended.
static const int kTargetCTJKVZProbsSize = 242;
static const UnigramProbArray kTargetCTJKVZProbs[kTargetCTJKVZProbsSize] = {
  {{0,0,0,0,0,0,0,0}},
  {{0,0,0,0,0,12,0,0}},
  {{0,0,0,0,12,0,0,0}},
  {{0,0,0,12,0,0,0,0}},
  {{0,0,12,0,0,0,0,0}},
  {{0,12,0,0,0,0,0,0}},
  {{12,0,0,0,0,0,0,0}},

  {{8,0,0,0,4,0,0,0}},
  {{8,0,0,4,0,0,0,0}},
  {{8,0,4,0,0,0,0,0}},
  {{8,4,0,0,0,0,0,0}},
  {{8,2,0,2,0,0,0,0}},
  {{0,0,0,0,0,8,0,0}},
  {{0,4,8,0,0,0,0,0}},
  {{4,0,0,0,0,8,0,0}},
  {{0,0,8,0,0,0,0,0}},
  {{8,2,2,0,0,0,0,0}},
  {{0,8,4,0,0,0,0,0}},
  {{8,0,0,0,0,4,0,0}},
  {{0,8,2,0,0,0,0,0}},
  {{4,8,0,0,0,0,0,0}},
  {{2,8,0,2,0,0,0,0}},
  {{2,2,8,0,0,0,0,0}},
  {{0,8,0,0,0,0,0,0}},
  {{0,2,8,0,0,0,0,0}},
  {{2,8,2,0,0,0,0,0}},
  {{8,0,0,0,0,0,0,0}},
  {{2,8,0,0,0,0,0,0}},
  {{8,2,0,0,0,0,0,0}},

  {{0,6,2,0,2,0,0,0}},
  {{2,0,0,0,6,0,0,0}},
  {{4,0,0,0,6,0,0,0}},
  {{4,6,0,0,4,0,0,0}},
  {{4,6,2,0,2,0,0,0}},
  {{4,6,4,0,2,0,0,0}},
  {{5,4,6,0,0,0,0,0}},
  {{6,0,0,0,4,0,0,0}},
  {{6,0,2,0,4,0,0,0}},
  {{6,0,4,0,4,0,0,0}},
  {{6,2,0,0,4,0,0,0}},
  {{6,2,2,0,4,0,0,0}},
  {{6,2,4,0,2,0,0,0}},
  {{6,4,0,0,2,0,0,0}},
  {{6,4,2,0,2,0,0,0}},
  {{0,0,6,2,0,0,0,0}},
  {{0,6,2,0,0,2,0,0}},
  {{2,2,2,0,0,6,0,0}},
  {{2,2,6,4,0,0,0,0}},
  {{2,4,0,0,0,6,0,0}},
  {{2,6,0,4,0,0,0,0}},
  {{2,6,2,4,0,0,0,0}},
  {{2,6,4,4,0,0,0,0}},
  {{4,0,2,0,0,6,0,0}},
  {{4,2,6,2,0,0,0,0}},
  {{4,4,2,0,0,6,0,0}},
  {{4,6,4,0,0,2,0,0}},
  {{6,0,2,0,0,2,0,0}},
  {{6,2,0,0,0,2,0,0}},
  {{6,2,2,0,0,4,0,0}},
  {{6,2,4,0,0,2,0,0}},
  {{4,6,2,0,0,4,0,0}},
  {{6,4,2,0,0,4,0,0}},
  {{2,0,0,0,0,6,0,0}},
  {{6,2,0,2,0,0,0,0}},
  {{2,2,0,0,0,6,0,0}},
  {{6,2,6,0,0,0,0,0}},
  {{6,4,2,0,0,2,0,0}},
  {{6,4,2,2,0,0,0,0}},
  {{4,6,4,2,0,0,0,0}},
  {{6,0,2,0,0,4,0,0}},
  {{6,0,4,0,0,2,0,0}},
  {{6,0,6,0,0,0,0,0}},
  {{6,2,2,0,0,0,0,0}},
  {{6,4,0,0,0,2,0,0}},
  {{6,4,5,0,0,0,0,0}},
  {{0,6,0,2,0,0,0,0}},
  {{0,6,2,2,0,0,0,0}},
  {{2,6,0,2,0,0,0,0}},
  {{2,6,2,2,0,0,0,0}},
  {{4,2,0,0,0,6,0,0}},
  {{6,4,0,0,0,4,0,0}},
  {{6,4,0,2,0,0,0,0}},
  {{6,6,0,2,0,0,0,0}},
  {{6,0,4,0,0,4,0,0}},
  {{6,2,0,0,0,4,0,0}},
  {{6,6,2,2,0,0,0,0}},
  {{4,6,0,0,0,2,0,0}},
  {{2,6,6,0,0,0,0,0}},
  {{4,5,6,0,0,0,0,0}},
  {{4,6,0,2,0,0,0,0}},
  {{6,2,0,0,0,6,0,0}},
  {{0,6,4,2,0,0,0,0}},
  {{4,0,6,0,0,0,0,0}},
  {{2,6,4,2,0,0,0,0}},
  {{4,6,0,0,0,4,0,0}},
  {{6,2,2,0,0,0,0,0}},
  {{4,6,2,2,0,0,0,0}},
  {{4,6,5,0,0,0,0,0}},
  {{6,0,2,0,0,0,0,0}},
  {{6,4,4,0,0,0,0,0}},
  {{4,2,6,0,0,0,0,0}},
  {{2,0,6,0,0,0,0,0}},
  {{4,4,0,0,0,6,0,0}},
  {{4,4,6,0,0,0,0,0}},
  {{4,6,2,0,0,2,0,0}},
  {{2,2,6,0,0,0,0,0}},
  {{2,4,6,0,0,0,0,0}},
  {{0,6,6,0,0,0,0,0}},
  {{6,2,4,0,0,0,0,0}},
  {{0,4,6,0,0,0,0,0}},
  {{4,0,0,0,0,6,0,0}},
  {{4,6,4,0,0,0,0,0}},
  {{6,0,0,0,0,6,0,0}},
  {{6,0,0,0,0,2,0,0}},
  {{6,0,4,0,0,0,0,0}},
  {{6,5,4,0,0,0,0,0}},
  {{0,2,6,0,0,0,0,0}},
  {{0,0,6,0,0,0,0,0}},
  {{6,6,2,0,0,0,0,0}},
  {{2,6,4,0,0,0,0,0}},
  {{6,4,2,0,0,0,0,0}},
  {{2,6,2,0,0,0,0,0}},
  {{2,6,0,0,0,0,0,0}},
  {{6,0,0,0,0,4,0,0}},
  {{6,4,0,0,0,0,0,0}},
  {{6,6,0,0,0,0,0,0}},
  {{5,6,4,0,0,0,0,0}},
  {{0,6,0,0,0,0,0,0}},
  {{6,2,0,0,0,0,0,0}},
  {{0,6,2,0,0,0,0,0}},
  {{4,6,2,0,0,0,0,0}},
  {{0,6,4,0,0,0,0,0}},
  {{4,6,0,0,0,0,0,0}},
  {{6,0,0,0,0,0,0,0}},
  {{6,6,5,0,0,0,0,0}},
  {{6,5,6,0,0,0,0,0}},
  {{5,6,6,0,0,0,0,0}},
  {{5,5,6,0,0,0,0,0}},
  {{5,6,5,0,0,0,0,0}},
  {{6,5,5,0,0,0,0,0}},
  {{6,6,6,0,0,0,0,0}},
  {{6,5,0,0,0,0,0,0}},
  {{6,0,5,0,0,0,0,0}},
  {{0,6,5,0,0,0,0,0}},
  {{5,6,0,0,0,0,0,0}},
  {{5,0,6,0,0,0,0,0}},
  {{0,5,6,0,0,0,0,0}},

  {{0,0,0,0,4,0,0,0}},
  {{0,0,0,4,0,0,0,0}},
  {{2,2,0,0,4,0,0,0}},
  {{2,2,2,0,4,0,0,0}},
  {{2,4,0,0,2,0,0,0}},
  {{2,4,2,0,2,0,0,0}},
  {{2,4,4,0,2,0,0,0}},
  {{4,0,2,0,4,0,0,0}},
  {{4,0,4,0,2,0,0,0}},
  {{4,2,0,0,2,0,0,0}},
  {{4,2,2,0,2,0,0,0}},
  {{4,4,0,0,2,0,0,0}},
  {{4,4,2,0,2,0,0,0}},
  {{4,4,4,0,2,0,0,0}},
  {{0,2,2,4,0,0,0,0}},
  {{2,2,4,2,0,0,0,0}},
  {{2,4,4,0,0,2,0,0}},
  {{2,4,4,2,0,0,0,0}},
  {{4,0,4,0,0,2,0,0}},
  {{4,0,4,0,0,4,0,0}},
  {{4,2,2,4,0,0,0,0}},
  {{4,4,0,2,0,0,0,0}},
  {{2,2,0,4,0,0,0,0}},
  {{2,4,2,2,0,0,0,0}},
  {{4,4,2,2,0,0,0,0}},
  {{4,0,4,0,0,0,0,0}},
  {{4,4,4,0,0,4,0,0}},
  {{0,4,0,2,0,0,0,0}},
  {{0,4,2,2,0,0,0,0}},
  {{4,0,2,0,0,2,0,0}},
  {{4,2,0,0,0,4,0,0}},
  {{2,2,2,0,0,4,0,0}},
  {{4,0,0,2,0,0,0,0}},
  {{4,4,4,0,0,2,0,0}},
  {{4,0,0,0,0,4,0,0}},
  {{4,0,2,0,0,4,0,0}},
  {{4,2,0,0,0,2,0,0}},
  {{4,2,2,0,0,2,0,0}},
  {{2,4,0,2,0,0,0,0}},
  {{2,2,0,0,0,4,0,0}},
  {{2,4,0,0,0,4,0,0}},
  {{2,4,2,0,0,4,0,0}},
  {{4,2,4,0,0,0,0,0}},
  {{2,0,4,0,0,0,0,0}},
  {{4,0,2,0,0,0,0,0}},
  {{4,4,0,0,0,4,0,0}},
  {{4,4,2,0,0,4,0,0}},
  {{0,4,4,0,0,0,0,0}},
  {{4,4,0,0,0,2,0,0}},
  {{2,4,0,0,0,2,0,0}},
  {{2,2,4,0,0,0,0,0}},
  {{0,2,4,0,0,0,0,0}},
  {{4,2,2,0,0,0,0,0}},
  {{2,4,2,0,0,2,0,0}},
  {{4,4,4,0,0,0,0,0}},
  {{2,4,4,0,0,0,0,0}},
  {{0,0,4,0,0,0,0,0}},
  {{0,4,2,0,0,0,0,0}},
  {{4,4,2,0,0,2,0,0}},
  {{2,4,2,0,0,0,0,0}},
  {{4,2,0,0,0,0,0,0}},
  {{4,4,0,0,0,0,0,0}},
  {{4,4,2,0,0,0,0,0}},
  {{2,4,0,0,0,0,0,0}},
  {{0,4,0,0,0,0,0,0}},
  {{4,0,0,0,0,0,0,0}},
  {{0,0,0,4,4,0,0,0}},
  {{0,0,4,0,4,0,0,0}},
  {{0,0,4,4,0,0,0,0}},
  {{0,4,0,0,4,0,0,0}},
  {{0,4,0,4,0,0,0,0}},
  {{4,0,0,0,4,0,0,0}},
  {{4,0,0,4,0,0,0,0}},

  {{2,0,0,0,0,0,0,0}},
  {{0,2,0,0,0,0,0,0}},
  {{0,2,0,2,2,0,0,0}},
  {{0,2,2,0,2,0,0,0}},
  {{2,0,0,2,2,0,0,0}},
  {{2,0,2,0,2,0,0,0}},
  {{2,0,2,2,0,0,0,0}},
  {{2,2,0,0,2,0,0,0}},
  {{2,2,2,2,0,0,0,0}},
  {{2,2,0,2,0,0,0,0}},
  {{2,2,0,0,0,0,0,0}},
  {{0,0,2,0,0,0,0,0}},
  {{0,2,2,0,0,0,0,0}},
  {{2,2,2,0,0,0,0,0}},
  {{0,0,0,2,0,0,0,0}},
  {{2,0,2,0,0,0,0,0}},
  {{0,2,0,2,0,0,0,0}},
  {{0,0,2,2,0,0,0,0}},
  {{0,2,2,2,0,0,0,0}},
};




// 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
// Subscripted by byte value; each row below covers 32 consecutive byte values.
static const uint8 kSkipSpaceVowelContinue[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
  0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,

  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
// Subscripted by byte value; each row below covers 32 consecutive byte values.
static const uint8 kSkipSpaceContinue[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};


// If != UNKNOWN, use nilgrams to determine language of
// this script
// One entry per script, in ULScript order; the COMPILE_ASSERT after the table
// pins its length to ULScript_NUM_SCRIPTS.
static const Language kOnlyLanguagePerLScript[] = {
  ENGLISH,  // ULScript_Common, [no words should be in this script]
  UNKNOWN_LANGUAGE,  // ULScript_Latin,
  //UNKNOWN_LANGUAGE,  // ULScript_Greek, Jan 2009: change so we can score quads
  GREEK,  // ULScript_Greek, Mar 2009: change back; do gibberish separately
  UNKNOWN_LANGUAGE,  // ULScript_Cyrillic,
  ARMENIAN,  // ULScript_Armenian,
  UNKNOWN_LANGUAGE,  // ULScript_Hebrew,
  UNKNOWN_LANGUAGE,  // ULScript_Arabic,
  SYRIAC,  // ULScript_Syriac,
  DHIVEHI,  // ULScript_Thaana,
  UNKNOWN_LANGUAGE,  // ULScript_Devanagari,
  UNKNOWN_LANGUAGE,  // ULScript_Bengali,
  PUNJABI,  // ULScript_Gurmukhi,
  GUJARATI,  // ULScript_Gujarati,
  ORIYA,  // ULScript_Oriya,
  TAMIL,  // ULScript_Tamil,
  TELUGU,  // ULScript_Telugu,
  KANNADA,  // ULScript_Kannada,
  MALAYALAM,  // ULScript_Malayalam,
  SINHALESE,  // ULScript_Sinhala,
  THAI,  // ULScript_Thai,
  LAOTHIAN,  // ULScript_Lao,
  UNKNOWN_LANGUAGE,  // ULScript_Tibetan,
  BURMESE,  // ULScript_Myanmar,
  GEORGIAN,  // ULScript_Georgian,
  UNKNOWN_LANGUAGE,  // ULScript_HanCJK,
  UNKNOWN_LANGUAGE,  // ULScript_Ethiopic,
  CHEROKEE,  // ULScript_Cherokee,
  INUKTITUT,  // ULScript_Canadian_Aboriginal,
  X_OGHAM,  // ULScript_Ogham,
  X_RUNIC,  // ULScript_Runic,
  KHMER,  // ULScript_Khmer,
  MONGOLIAN,  // ULScript_Mongolian,
  X_YI,  // ULScript_Yi,
  X_OLD_ITALIC,  // ULScript_Old_Italic,
  X_GOTHIC,  // ULScript_Gothic,
  X_DESERET,  // ULScript_Deseret,
  ENGLISH,  // ULScript_Inherited, [no words should be in this script]
  TAGALOG,  // ULScript_Tagalog,
  X_HANUNOO,  // ULScript_Hanunoo,
  X_BUHID,  // ULScript_Buhid,
  X_TAGBANWA,  // ULScript_Tagbanwa,
  LIMBU,  // ULScript_Limbu,
  X_TAI_LE,  // ULScript_Tai_Le,
  X_LINEAR_B,  // ULScript_Linear_B,
  X_UGARITIC,  // ULScript_Ugaritic,
  X_SHAVIAN,  // ULScript_Shavian,
  X_OSMANYA,  // ULScript_Osmanya,
  X_CYPRIOT,  // ULScript_Cypriot,
  X_BUGINESE,  // ULScript_Buginese,
  X_COPTIC,  // ULScript_Coptic,
  X_NEW_TAI_LUE,  // ULScript_New_Tai_Lue,
  X_GLAGOLITIC,  // ULScript_Glagolitic,
  X_TIFINAGH,  // ULScript_Tifinagh,
  X_SYLOTI_NAGRI,  // ULScript_Syloti_Nagri,
  X_OLD_PERSIAN,  // ULScript_Old_Persian,
  X_KHAROSHTHI,  // ULScript_Kharoshthi,
  X_BALINESE,  // ULScript_Balinese,
  X_CUNEIFORM,  // ULScript_Cuneiform,
  X_PHOENICIAN,  // ULScript_Phoenician,
  X_PHAGS_PA,  // ULScript_Phags_Pa,
  X_NKO,  // ULScript_Nko,

  // Unicode 5.1
  X_SUDANESE,  // ULScript_Sundanese,
  X_LEPCHA,  // ULScript_Lepcha,
  X_OL_CHIKI,  // ULScript_Ol_Chiki,
  X_VAI,  // ULScript_Vai,
  X_SAURASHTRA,  // ULScript_Saurashtra,
  X_KAYAH_LI,  // ULScript_Kayah_Li,
  X_REJANG,  // ULScript_Rejang,
  X_LYCIAN,  // ULScript_Lycian,
  X_CARIAN,  // ULScript_Carian,
  X_LYDIAN,  // ULScript_Lydian,
  X_CHAM,  // ULScript_Cham,
};

COMPILE_ASSERT(arraysize(kOnlyLanguagePerLScript) == ULScript_NUM_SCRIPTS,
               kOnlyLanguagePerLScript_has_incorrect_length);


// This is, in a sense, the complement of the table above
// If != UNKNOWN, determines a default language of this script
// One entry per script, in ULScript order; the COMPILE_ASSERT after the table
// pins its length to ULScript_NUM_SCRIPTS.
static const Language kDefaultLanguagePerLScript[] = {
  UNKNOWN_LANGUAGE,  // ULScript_Common, [no words should be in this script]
  ENGLISH,  // ULScript_Latin,
  UNKNOWN_LANGUAGE,  // ULScript_Greek,
  RUSSIAN,  // ULScript_Cyrillic,
  UNKNOWN_LANGUAGE,  // ULScript_Armenian,
  HEBREW,  // ULScript_Hebrew,
  ARABIC,  // ULScript_Arabic,
  UNKNOWN_LANGUAGE,  // ULScript_Syriac,
  UNKNOWN_LANGUAGE,  // ULScript_Thaana,
  HINDI,  // ULScript_Devanagari,
  BENGALI,  // ULScript_Bengali,
  UNKNOWN_LANGUAGE,  // ULScript_Gurmukhi,
  UNKNOWN_LANGUAGE,  // ULScript_Gujarati,
  UNKNOWN_LANGUAGE,  // ULScript_Oriya,
  UNKNOWN_LANGUAGE,  // ULScript_Tamil,
  UNKNOWN_LANGUAGE,  // ULScript_Telugu,
  UNKNOWN_LANGUAGE,  // ULScript_Kannada,
  UNKNOWN_LANGUAGE,  // ULScript_Malayalam,
  UNKNOWN_LANGUAGE,  // ULScript_Sinhala,
  UNKNOWN_LANGUAGE,  // ULScript_Thai,
  UNKNOWN_LANGUAGE,  // ULScript_Lao,
  TIBETAN,  // ULScript_Tibetan,
  UNKNOWN_LANGUAGE,  // ULScript_Myanmar,
  UNKNOWN_LANGUAGE,  // ULScript_Georgian,
  CHINESE,  // ULScript_HanCJK,
  AMHARIC,  // ULScript_Ethiopic,
  UNKNOWN_LANGUAGE,  // ULScript_Cherokee,
  UNKNOWN_LANGUAGE,  // ULScript_Canadian_Aboriginal,
  UNKNOWN_LANGUAGE,  // ULScript_Ogham,
  UNKNOWN_LANGUAGE,  // ULScript_Runic,
  UNKNOWN_LANGUAGE,  // ULScript_Khmer,
  UNKNOWN_LANGUAGE,  // ULScript_Mongolian,
  UNKNOWN_LANGUAGE,  // ULScript_Yi,
  UNKNOWN_LANGUAGE,  // ULScript_Old_Italic,
  UNKNOWN_LANGUAGE,  // ULScript_Gothic,
  UNKNOWN_LANGUAGE,  // ULScript_Deseret,
  UNKNOWN_LANGUAGE,  // ULScript_Inherited, [no words should be in this script]
  UNKNOWN_LANGUAGE,  // ULScript_Tagalog,
  UNKNOWN_LANGUAGE,  // ULScript_Hanunoo,
  UNKNOWN_LANGUAGE,  // ULScript_Buhid,
  UNKNOWN_LANGUAGE,  // ULScript_Tagbanwa,
  UNKNOWN_LANGUAGE,  // ULScript_Limbu,
  UNKNOWN_LANGUAGE,  // ULScript_Tai_Le,
  UNKNOWN_LANGUAGE,  // ULScript_Linear_B,
  UNKNOWN_LANGUAGE,  // ULScript_Ugaritic,
  UNKNOWN_LANGUAGE,  // ULScript_Shavian,
  UNKNOWN_LANGUAGE,  // ULScript_Osmanya,
  UNKNOWN_LANGUAGE,  // ULScript_Cypriot,
  UNKNOWN_LANGUAGE,  // ULScript_Buginese,
  UNKNOWN_LANGUAGE,  // ULScript_Coptic,
  UNKNOWN_LANGUAGE,  // ULScript_New_Tai_Lue,
  UNKNOWN_LANGUAGE,  // ULScript_Glagolitic,
  UNKNOWN_LANGUAGE,  // ULScript_Tifinagh,
  UNKNOWN_LANGUAGE,  // ULScript_Syloti_Nagri,
  UNKNOWN_LANGUAGE,  // ULScript_Old_Persian,
  UNKNOWN_LANGUAGE,  // ULScript_Kharoshthi,
  UNKNOWN_LANGUAGE,  // ULScript_Balinese,
  UNKNOWN_LANGUAGE,  // ULScript_Cuneiform,
  UNKNOWN_LANGUAGE,  // ULScript_Phoenician,
  UNKNOWN_LANGUAGE,  // ULScript_Phags_Pa,
  UNKNOWN_LANGUAGE,  // ULScript_Nko,

  // Unicode 5.1
  UNKNOWN_LANGUAGE,  // ULScript_Sundanese,
  UNKNOWN_LANGUAGE,  // ULScript_Lepcha,
  UNKNOWN_LANGUAGE,  // ULScript_Ol_Chiki,
  UNKNOWN_LANGUAGE,  // ULScript_Vai,
  UNKNOWN_LANGUAGE,  // ULScript_Saurashtra,
  UNKNOWN_LANGUAGE,  // ULScript_Kayah_Li,
  UNKNOWN_LANGUAGE,  // ULScript_Rejang,
  UNKNOWN_LANGUAGE,  // ULScript_Lycian,
  UNKNOWN_LANGUAGE,  // ULScript_Carian,
  UNKNOWN_LANGUAGE,  // ULScript_Lydian,
  UNKNOWN_LANGUAGE,  // ULScript_Cham,
};

COMPILE_ASSERT(arraysize(kDefaultLanguagePerLScript) == ULScript_NUM_SCRIPTS,
               kDefaultLanguagePerLScript_has_incorrect_length);


// True for standalone languages (only lang in a script)
// Subscripted by packed language number
// If 1, we will use nilgrams to determine language
// Layout: one leading entry for subscript 0, then rows of twenty flags.
static const uint8 kIsStandaloneLang[EXT_NUM_LANGUAGES + 1] = {
  0,
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,0,  // GREEK
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
  0,1,0,0,1, 0,1,0,0,0, 0,0,1,1,0, 0,0,0,0,1,  // MALAYALAM..KANNADA
  1,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,1,  // PUNJABI..SINHALESE
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,1,1,0,  // ARMENIAN..LAOTHIAN

  0,0,0,0,1, 0,1,1,1,0, 1,0,0,0,0, 0,0,0,0,0,  // KHMER..ORIYA
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
  0,1,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  // INUKTITUT

  0,0,0,0,0,  // [160..164]
  // Add new language standalone bit just before here
  0,0,0,0,0, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
  1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,

  1,1,1,1,
};

// True for ULScript_HanCJK
// (Vietnamese and Zhuang also have Latin script quadgrams)
// Subscripted by packed language number
// Layout: one leading entry for subscript 0, then rows of twenty flags.
static const uint8 kIsUnigramLang[EXT_NUM_LANGUAGES + 1] = {
  0,
  0,0,0,0,0, 0,0,0,1,1, 0,0,0,0,0, 0,1,0,0,0,  // JAPANESE KOREAN CHINESE
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //
  0,0,0,0,0, 0,1,0,0,1, 0,0,0,0,0, 0,0,0,0,0,  // VIETNAMESE CHINESE_T
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //

  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,0,  // ZHUANG

  0,0,0,0,0,  // [160..164]
  // Add new language unigram bit just before here

  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,  //

  0,0,0,0,
};


// True for ULScript_HanCJK
// Subscripted by lscript number
// The single 1 sits at subscript 24, the HanCJK position in the per-script
// tables above.
static const uint8 kScoreUniPerLScript[] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,
};

COMPILE_ASSERT(arraysize(kScoreUniPerLScript) == ULScript_NUM_SCRIPTS,
               kScoreUniPerLScript_has_incorrect_length);


// Defines Top40 packed languages

// Tier 0/1 Language enum list (16)
//   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
//   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
//   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
//   ARABIC,
//
// Tier 2 Language enum list (22)
//   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
//   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
//   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
//   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
//   UKRAINIAN, HINDI,
//
// use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
//
// Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40

// NOTE: packed, i.e.
// Language enum + 1
// Layout: one leading entry for subscript 0, then rows of twenty flags.
static const uint8 kIsPackedTop40[EXT_NUM_LANGUAGES + 1] = {
  0,
  1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,0,
  1,1,1,1,0, 1,0,1,0,0, 0,0,1,1,1, 1,0,0,1,0,
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,1, 1,0,0,0,0,
  0,0,0,1,0, 0,1,0,1,1, 0,0,0,0,0, 0,0,0,0,0,
  0,0,0,0,0, 0,0,0,0,0, 0,0,1,0,0, 0,0,0,0,0,

  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,

  0,0,0,0,0,  // [160..164]
  // Add new language top40 bit just before here

  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
  0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,

  0,0,0,0,
};



// Table has 234 eight-byte entries. Each entry has a five-byte array and
// a three-byte array of log base 2 probabilities in the range 0..11.
// The intended use is to express five or three probabilities in a single-byte
// subscript, then decode via this table. These probabilities are
// intended to go with an array of five or three language numbers.
//
// The corresponding language numbers will have to be sorted by descending
// probability, then the actual probability subscript chosen to match the
// closest available entry in this table.
//
// Pattern of probability values:
//   hi 3/4 1/2 1/4 lo    hi mid lo
// where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 and
// mid is one of 3/4 1/2 or 1/4.
// There are three groups of 78 (=12*13/2) entries, with hi running 0..11 and
// lo running 0..hi. Only the first group is used for five-entry lookups.
// The mid value in the first group is 1/2, the second group 3/4, and the
// third group 1/4. For three-entry lookups, this allows the mid entry to be
// somewhat higher or lower than the midpoint, to allow a better match to the
// original probabilities.
//
// NOTE(review): the bytes actually stored in kLgProbV2Tbl run 1..12, i.e. one
// higher than the 0..11 range described above -- confirm whether the values
// are deliberately stored with an offset of one.
static const int kLgProbV2TblSize = 234;
// Eight bytes per entry: five probabilities, then three probabilities.
// The bracketed numbers mark the entry index at which each new "hi" value
// starts within the first group.
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
  // First group of 78 entries: entries [0..77]
  1,1,1,1,1, 1,1,1,  // [0]
  2,2,2,1,1, 2,2,1,  // [1]
  2,2,2,2,2, 2,2,2,
  3,3,2,2,1, 3,2,1,  // [3]
  3,3,3,2,2, 3,3,2,
  3,3,3,3,3, 3,3,3,
  4,3,3,2,1, 4,3,1,  // [6]
  4,4,3,3,2, 4,3,2,
  4,4,4,3,3, 4,4,3,
  4,4,4,4,4, 4,4,4,
  5,4,3,2,1, 5,3,1,  // [10]
  5,4,4,3,2, 5,4,2,
  5,5,4,4,3, 5,4,3,
  5,5,5,4,4, 5,5,4,
  5,5,5,5,5, 5,5,5,
  6,5,4,2,1, 6,4,1,  // [15]
  6,5,4,3,2, 6,4,2,
  6,5,5,4,3, 6,5,3,
  6,6,5,5,4, 6,5,4,
  6,6,6,5,5, 6,6,5,
  6,6,6,6,6, 6,6,6,
  7,6,4,3,1, 7,4,1,  // [21]
  7,6,5,3,2, 7,5,2,
  7,6,5,4,3, 7,5,3,
  7,6,6,5,4, 7,6,4,
  7,7,6,6,5, 7,6,5,
  7,7,7,6,6, 7,7,6,
  7,7,7,7,7, 7,7,7,
  8,6,5,3,1, 8,5,1,  // [28]
  8,7,5,4,2, 8,5,2,
  8,7,6,4,3, 8,6,3,
  8,7,6,5,4, 8,6,4,
  8,7,7,6,5, 8,7,5,
  8,8,7,7,6, 8,7,6,
  8,8,8,7,7, 8,8,7,
  8,8,8,8,8, 8,8,8,
  9,7,5,3,1, 9,5,1,  // [36]
  9,7,6,4,2, 9,6,2,
  9,8,6,5,3, 9,6,3,
  9,8,7,5,4, 9,7,4,
  9,8,7,6,5, 9,7,5,
  9,8,8,7,6, 9,8,6,
  9,9,8,8,7, 9,8,7,
  9,9,9,8,8, 9,9,8,
  9,9,9,9,9, 9,9,9,
  10,8,6,3,1, 10,6,1,  // [45]
  10,8,6,4,2, 10,6,2,
  10,8,7,5,3, 10,7,3,
  10,9,7,6,4, 10,7,4,
  10,9,8,6,5, 10,8,5,
  10,9,8,7,6, 10,8,6,
  10,9,9,8,7, 10,9,7,
  10,10,9,9,8, 10,9,8,
  10,10,10,9,9, 10,10,9,
  10,10,10,10,10, 10,10,10,
  11,9,6,4,1, 11,6,1,  // [55]
  11,9,7,4,2, 11,7,2,
  11,9,7,5,3, 11,7,3,
  11,9,8,6,4, 11,8,4,
  11,10,8,7,5, 11,8,5,
  11,10,9,7,6, 11,9,6,
  11,10,9,8,7, 11,9,7,
  11,10,10,9,8, 11,10,8,
  11,11,10,10,9, 11,10,9,
  11,11,11,10,10, 11,11,10,
  11,11,11,11,11, 11,11,11,
  12,9,7,4,1, 12,7,1,  // [66]
  12,10,7,5,2, 12,7,2,
  12,10,8,5,3, 12,8,3,
  12,10,8,6,4, 12,8,4,
  12,10,9,7,5, 12,9,5,
  12,11,9,8,6, 12,9,6,
  12,11,10,8,7, 12,10,7,
  12,11,10,9,8, 12,10,8,
  12,11,11,10,9, 12,11,9,
  12,12,11,11,10, 12,11,10,
  12,12,12,11,11, 12,12,11,
  12,12,12,12,12, 12,12,12,

  // Second group of 78 entries: entries [78..155]
  1,1,1,1,1, 1,1,1,
  2,2,2,1,1, 2,2,1,
  2,2,2,2,2, 2,2,2,
  3,3,2,2,1, 3,3,1,
  3,3,3,2,2, 3,3,2,
  3,3,3,3,3, 3,3,3,
  4,3,3,2,1, 4,3,1,
  4,4,3,3,2, 4,4,2,
  4,4,4,3,3, 4,4,3,
  4,4,4,4,4, 4,4,4,
  5,4,3,2,1, 5,4,1,
  5,4,4,3,2, 5,4,2,
  5,5,4,4,3, 5,5,3,
  5,5,5,4,4, 5,5,4,
  5,5,5,5,5, 5,5,5,
  6,5,4,2,1, 6,5,1,
  6,5,4,3,2, 6,5,2,
  6,5,5,4,3, 6,5,3,
  6,6,5,5,4, 6,6,4,
  6,6,6,5,5, 6,6,5,
  6,6,6,6,6, 6,6,6,
  7,6,4,3,1, 7,6,1,
  7,6,5,3,2, 7,6,2,
  7,6,5,4,3, 7,6,3,
  7,6,6,5,4, 7,6,4,
  7,7,6,6,5, 7,7,5,
  7,7,7,6,6, 7,7,6,
  7,7,7,7,7, 7,7,7,
  8,6,5,3,1, 8,6,1,
  8,7,5,4,2, 8,7,2,
  8,7,6,4,3, 8,7,3,
  8,7,6,5,4, 8,7,4,
  8,7,7,6,5, 8,7,5,
  8,8,7,7,6, 8,8,6,
  8,8,8,7,7, 8,8,7,
  8,8,8,8,8, 8,8,8,
  9,7,5,3,1, 9,7,1,
  9,7,6,4,2, 9,7,2,
  9,8,6,5,3, 9,8,3,
  9,8,7,5,4, 9,8,4,
  9,8,7,6,5, 9,8,5,
  9,8,8,7,6, 9,8,6,
  9,9,8,8,7, 9,9,7,
  9,9,9,8,8, 9,9,8,
  9,9,9,9,9, 9,9,9,
  10,8,6,3,1, 10,8,1,
  10,8,6,4,2, 10,8,2,
  10,8,7,5,3, 10,8,3,
  10,9,7,6,4, 10,9,4,
  10,9,8,6,5, 10,9,5,
  10,9,8,7,6, 10,9,6,
  10,9,9,8,7, 10,9,7,
  10,10,9,9,8, 10,10,8,
  10,10,10,9,9, 10,10,9,
  10,10,10,10,10, 10,10,10,
  11,9,6,4,1, 11,9,1,
  11,9,7,4,2, 11,9,2,
  11,9,7,5,3, 11,9,3,
  11,9,8,6,4, 11,9,4,
  11,10,8,7,5, 11,10,5,
  11,10,9,7,6, 11,10,6,
  11,10,9,8,7, 11,10,7,
  11,10,10,9,8, 11,10,8,
  11,11,10,10,9, 11,11,9,
  11,11,11,10,10, 11,11,10,
  11,11,11,11,11, 11,11,11,
  12,9,7,4,1, 12,9,1,
  12,10,7,5,2, 12,10,2,
  12,10,8,5,3, 12,10,3,
  12,10,8,6,4, 12,10,4,
  12,10,9,7,5, 12,10,5,
  12,11,9,8,6, 12,11,6,
  12,11,10,8,7, 12,11,7,
  12,11,10,9,8, 12,11,8,
  12,11,11,10,9, 12,11,9,
  12,12,11,11,10, 12,12,10,
  12,12,12,11,11, 12,12,11,
  12,12,12,12,12, 12,12,12,

  // Third group of 78 entries: entries [156..233]
  1,1,1,1,1, 1,1,1,
  2,2,2,1,1, 2,1,1,
  2,2,2,2,2, 2,2,2,
  3,3,2,2,1, 3,2,1,
  3,3,3,2,2, 3,2,2,
  3,3,3,3,3, 3,3,3,
  4,3,3,2,1, 4,2,1,
  4,4,3,3,2, 4,3,2,
  4,4,4,3,3, 4,3,3,
  4,4,4,4,4, 4,4,4,
  5,4,3,2,1, 5,2,1,
  5,4,4,3,2, 5,3,2,
  5,5,4,4,3, 5,4,3,
  5,5,5,4,4, 5,4,4,
  5,5,5,5,5, 5,5,5,
  6,5,4,2,1, 6,2,1,
  6,5,4,3,2, 6,3,2,
  6,5,5,4,3, 6,4,3,
  6,6,5,5,4, 6,5,4,
  6,6,6,5,5, 6,5,5,
  6,6,6,6,6, 6,6,6,
  7,6,4,3,1, 7,3,1,
  7,6,5,3,2, 7,3,2,
  7,6,5,4,3, 7,4,3,
  7,6,6,5,4, 7,5,4,
  7,7,6,6,5, 7,6,5,
  7,7,7,6,6, 7,6,6,
  7,7,7,7,7, 7,7,7,
  8,6,5,3,1, 8,3,1,
  8,7,5,4,2, 8,4,2,
  8,7,6,4,3, 8,4,3,
  8,7,6,5,4, 8,5,4,
  8,7,7,6,5, 8,6,5,
  8,8,7,7,6, 8,7,6,
  8,8,8,7,7, 8,7,7,
  8,8,8,8,8, 8,8,8,
  9,7,5,3,1, 9,3,1,
  9,7,6,4,2, 9,4,2,
  9,8,6,5,3, 9,5,3,
  9,8,7,5,4, 9,5,4,
  9,8,7,6,5, 9,6,5,
  9,8,8,7,6, 9,7,6,
  9,9,8,8,7, 9,8,7,
  9,9,9,8,8, 9,8,8,
  9,9,9,9,9, 9,9,9,
  10,8,6,3,1, 10,3,1,
  10,8,6,4,2, 10,4,2,
  10,8,7,5,3, 10,5,3,
  10,9,7,6,4, 10,6,4,
  10,9,8,6,5, 10,6,5,
  10,9,8,7,6, 10,7,6,
  10,9,9,8,7, 10,8,7,
  10,10,9,9,8, 10,9,8,
  10,10,10,9,9, 10,9,9,
  10,10,10,10,10, 10,10,10,
  11,9,6,4,1, 11,4,1,
  11,9,7,4,2, 11,4,2,
  11,9,7,5,3, 11,5,3,
  11,9,8,6,4, 11,6,4,
  11,10,8,7,5, 11,7,5,
  11,10,9,7,6, 11,7,6,
  11,10,9,8,7, 11,8,7,
  11,10,10,9,8, 11,9,8,
  11,11,10,10,9, 11,10,9,
  11,11,11,10,10, 11,10,10,
  11,11,11,11,11, 11,11,11,
  12,9,7,4,1, 12,4,1,
  12,10,7,5,2, 12,5,2,
  12,10,8,5,3, 12,5,3,
  12,10,8,6,4, 12,6,4,
  12,10,9,7,5, 12,7,5,
  12,11,9,8,6, 12,8,6,
  12,11,10,8,7, 12,8,7,
  12,11,10,9,8, 12,9,8,
  12,11,11,10,9, 12,10,9,
  12,12,11,11,10, 12,11,10,
  12,12,12,11,11, 12,11,11,
  12,12,12,12,12, 12,12,12,
};

// Backmap a single desired probability into an entry in kLgProbV2Tbl
// Subscript p (1..12) yields the first-group entry index at which hi == p
// starts (the bracketed indices annotated in the table); subscript 0 also
// maps to entry 0.
static const uint8 kLgProbV2TblBackmap[13] = {
  0,
  0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
};


// Always
advances one UTF-8 character 905 static const uint8 kAdvanceOneChar[256] = { 906 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 907 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 908 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 909 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 910 911 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 912 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 913 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 914 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 915 }; 916 917 // Does not advance past space or cr/lf/nul 918 static const uint8 kAdvanceOneCharButSpace[256] = { 919 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 920 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 921 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 922 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 923 924 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 925 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 926 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 927 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 928 }; 929 930 // Advances *only* on space or ASCII vowel (or illegal byte) 931 static const uint8 kAdvanceOneCharSpaceVowel[256] = { 932 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 933 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 934 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, 935 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, 936 937 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 938 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 939 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 940 
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 941 }; 942 943 // Advances *only* on space (or illegal byte) 944 static const uint8 kAdvanceOneCharSpace[256] = { 945 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 946 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 947 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 948 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 949 950 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 951 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 952 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 953 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 954 }; 955 956 957 //------------------------------------------------------------------------------ 958 // General 959 //------------------------------------------------------------------------------ minint(int a,int b)960 static inline int minint(int a, int b) {return (a < b) ? a: b;} maxint(int a,int b)961 static inline int maxint(int a, int b) {return (a > b) ? a: b;} 962 963 // Here to make available for debugging 964 int ReliabilityDelta(int value1, int value2, int count); 965 int ReliabilityMainstream(int topscore, int len, int mean_score); 966 967 // Returns "0" for too small MyExtLanguageCode(Language lang)968 inline const char* MyExtLanguageCode(Language lang) { 969 return ExtLanguageCode(lang); 970 } 971 972 // Map script into Latin, Cyrillic, Arabic, Other. Used in keeping track of 973 // amount of training data for language-script combinations LScript4(UnicodeLScript lscript)974 inline int LScript4(UnicodeLScript lscript) { 975 if (lscript == ULScript_Latin) {return 0;} 976 if (lscript == ULScript_Cyrillic) {return 1;} 977 if (lscript == ULScript_Arabic) {return 2;} 978 return 3; 979 } 980 981 982 // Routines to access 3 or 5 log probabilities in a single byte. 
983 984 // Return address of 8-byte entry[i] LgProb2TblEntry(int i)985 inline const uint8* LgProb2TblEntry(int i) { 986 return &kLgProbV2Tbl[i * 8]; 987 } 988 989 // Return one of five probabilities in an entry 990 // CURRENTLY UNUSED LgProb5(const uint8 * entry,int j)991 inline uint8 LgProb5(const uint8* entry, int j) { 992 return entry[j]; 993 } 994 995 // Return one of three probabilities in an entry LgProb3(const uint8 * entry,int j)996 inline uint8 LgProb3(const uint8* entry, int j) { 997 return entry[j + 5]; 998 } 999 1000 1001 1002 //------------------------------------------------------------------------------ 1003 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores 1004 //------------------------------------------------------------------------------ 1005 1006 // Pick up 1..12 bytes and hash them via mask/shift/add. NO pre/post 1007 // OVERSHOOTS up to 3 bytes 1008 uint32 BiHashV25(const char* word_ptr, int bytecount); 1009 1010 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 1011 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1012 uint32 QuadHashV25(const char* word_ptr, int bytecount); 1013 1014 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add 1015 // OVERSHOOTS up to 3 bytes 1016 uint32 QuadHashV25Underscore(const char* word_ptr, int bytecount); 1017 1018 1019 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 1020 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1021 // For runtime use of tables V3 1022 uint64 OctaHash40(const char* word_ptr, int bytecount); 1023 1024 uint64 OctaHash40underscore(const char* word_ptr, int bytecount); 1025 1026 1027 // From 32-bit gram FP, return hash table subscript and remaining key QuadFPJustHash(uint32 quadhash,uint32 keymask,int bucketcount,uint32 * subscr,uint32 * hashkey)1028 inline void QuadFPJustHash(uint32 quadhash, 1029 uint32 keymask, 1030 int bucketcount, 1031 uint32* subscr, uint32* hashkey) { 1032 *subscr = 
(quadhash + (quadhash >> 12)) & (bucketcount - 1); 1033 *hashkey = quadhash & keymask; 1034 } 1035 1036 // Look up 32-bit gram FP in caller-passed table 1037 // Typical size 256K entries (1.5MB) 1038 // Two-byte hashkey QuadHashV3Lookup4(const cld::CLDTableSummary * gram_obj,uint32 quadhash)1039 inline const uint32 QuadHashV3Lookup4(const cld::CLDTableSummary* gram_obj, 1040 uint32 quadhash) { 1041 1042 uint32 subscr, hashkey; 1043 const IndirectProbBucket4* quadtable = gram_obj->kCLDTable; 1044 uint32 keymask = gram_obj->kCLDTableKeyMask; 1045 int bucketcount = gram_obj->kCLDTableSize; 1046 QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey); 1047 const IndirectProbBucket4* bucket_ptr = &quadtable[subscr]; 1048 // Four-way associative, 4 compares 1049 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { 1050 return bucket_ptr->keyvalue[0]; 1051 } 1052 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { 1053 return bucket_ptr->keyvalue[1]; 1054 } 1055 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { 1056 return bucket_ptr->keyvalue[2]; 1057 } 1058 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { 1059 return bucket_ptr->keyvalue[3]; 1060 } 1061 return 0; 1062 } 1063 1064 1065 // Map 40 bits to subscript, hashkey, expected 18-22 bit subscript (min 16) 1066 // wwwwwwww xxxxxxxx xxxxxxxx yyyyyyyy yyyyyyyy 1067 // + ........ ....wwww wwwwxxxx xxxxxxxx xxxxyyyy 1068 // 00000000 00000000 00000011 11111111 11111111 (18-bit bucketcount-1) 1069 // 1070 // hashkey: 1071 // wwwwxxxx xxxxxxxx xxxx.... ........ 
(20-bit keymask) 1072 // 12-bit shift in subscript mixes in ~4 letters x 4 bits each 1073 1074 // From 40-bit gram FP, return hash table subscript and remaining key OctaFPJustHash(uint64 longwordhash,uint32 keymask,int bucketcount,uint32 * subscr,uint32 * hashkey)1075 inline void OctaFPJustHash(uint64 longwordhash, 1076 uint32 keymask, 1077 int bucketcount, 1078 uint32* subscr, uint32* hashkey) { 1079 uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1); 1080 *subscr = temp; 1081 temp = longwordhash >> 4; 1082 *hashkey = temp & keymask; 1083 } 1084 1085 // Look up 40-bit gram FP in caller-passed table 1086 // Typical size 256K-4M entries (1-16MB) 1087 // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs 1088 // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect OctaHashV3Lookup4(const cld::CLDTableSummary * gram_obj,uint64 longwordhash)1089 inline const uint32 OctaHashV3Lookup4(const cld::CLDTableSummary* gram_obj, 1090 uint64 longwordhash) { 1091 uint32 subscr, hashkey; 1092 const IndirectProbBucket4* octatable = gram_obj->kCLDTable; 1093 uint32 keymask = gram_obj->kCLDTableKeyMask; 1094 int bucketcount = gram_obj->kCLDTableSize; 1095 OctaFPJustHash(longwordhash, keymask, bucketcount, 1096 &subscr, &hashkey); 1097 const IndirectProbBucket4* bucket_ptr = &octatable[subscr]; 1098 // Four-way associative, 4 compares 1099 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { 1100 return bucket_ptr->keyvalue[0]; 1101 } 1102 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { 1103 return bucket_ptr->keyvalue[1]; 1104 } 1105 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { 1106 return bucket_ptr->keyvalue[2]; 1107 } 1108 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { 1109 return bucket_ptr->keyvalue[3]; 1110 } 1111 return 0; 1112 } 1113 1114 1115 1116 //------------------------------------------------------------------------------ 1117 // Scoring single groups of letters 1118 
//------------------------------------------------------------------------------ 1119 1120 // UNIGRAM score one => tote 1121 // Input: 1-byte entry of subscript into unigram probs, plus 1122 // an accumulator tote. 1123 // Output: running sums in tote updated 1124 void ProcessProbV25UniTote(int propval, Tote* tote); 1125 1126 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote 1127 // Input: 4-byte entry of 3 language numbers and one probability subscript, 1128 // plus an accumulator tote. (language 0 means unused entry) 1129 // Output: running sums in tote updated 1130 void ProcessProbV25Tote(uint32 probs, Tote* tote); 1131 1132 1133 //------------------------------------------------------------------------------ 1134 // Routines to accumulate probabilities 1135 //------------------------------------------------------------------------------ 1136 1137 // Score up to n=gram_limit unigrams, returning number of bytes consumed 1138 // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj 1139 int DoUniScoreV3(const UTF8PropObj* unigram_obj, 1140 const char* isrc, int srclen, int advance_by, 1141 int* tote_grams, int gram_limit, Tote* chunk_tote); 1142 1143 1144 // Score all words in isrc, using languages that have bigrams (CJK) 1145 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj 1146 // Return number of bigrams that hit in the hash table 1147 int DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj, 1148 const char* isrc, int srclen, Tote* chunk_tote); 1149 1150 1151 // Score up to n=gram_limit quadgrams, returning number of bytes consumed 1152 // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj 1153 int DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, 1154 const char* isrc, int srclen, int advance_by, 1155 int* tote_grams, int gram_limit, Tote* chunk_tote); 1156 1157 // Score all octagrams (words) in isrc, using languages that have quadgrams 1158 // Caller supplies table, such as 
&kLongWord8Table_obj 1159 // Return number of words that hit in the hash table 1160 int DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj, 1161 const char* isrc, int srclen, Tote* chunk_tote); 1162 1163 //------------------------------------------------------------------------------ 1164 // Reliability calculations, for single language and between languages 1165 //------------------------------------------------------------------------------ 1166 1167 // Reliability = 0..100 1168 static const int kMinReliable = 75; 1169 1170 // Calculate ratio of score per 1KB vs. expected score per 1KB 1171 double GetNormalizedScore(Language lang, UnicodeLScript lscript, 1172 int bytes, int score); 1173 1174 // Calculate reliablity of len bytes of script lscript with chunk_tote 1175 int GetReliability(int len, UnicodeLScript lscript, const Tote* chunk_tote); 1176 1177 1178 //------------------------------------------------------------------------------ 1179 // Miscellaneous 1180 //------------------------------------------------------------------------------ 1181 1182 // Make languages packed into uint32 values non-zero 1183 // These routines later could remap so languages not in QuadHash tables are not 1184 // represented, and so that any thrashing in accumulation is eliminated PackLanguage(Language lang)1185 uint8 inline PackLanguage(Language lang) { 1186 return static_cast<uint8>(lang + 1);} 1187 UnpackLanguage(int ilang)1188 Language inline UnpackLanguage(int ilang) { 1189 return static_cast<Language>(ilang - 1);} 1190 1191 // Useful single-byte tests IsUTF8ContinueByte(char c)1192 bool inline IsUTF8ContinueByte(char c) { 1193 return static_cast<signed char>(c) < -64;} IsUTF8HighByte(char c)1194 bool inline IsUTF8HighByte(char c) { 1195 return static_cast<signed char>(c) < 0;} 1196 1197 1198 // Demote all languages except Top40 and plus_one 1199 // Do this just before sorting 1200 void DemoteNotTop40(Tote* chunk_tote, int packed_plus_one); 1201 1202 } // End namespace cld 
1203 1204 1205 #endif // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_ 1206