• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <stdio.h>
6 #include <string.h>
7 //#include <sys/time.h>                               // for gettimeofday
8 #include <string>
9 
10 #include "encodings/lang_enc.h"
11 
12 #include "encodings/compact_lang_det/compact_lang_det.h"
13 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
14 #include "encodings/compact_lang_det/getonescriptspan.h"
15 #include "encodings/compact_lang_det/letterscript_enum.h"
16 #include "encodings/compact_lang_det/tote.h"
17 #include "encodings/compact_lang_det/utf8propjustletter.h"
18 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
19 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
20 
21 #include "encodings/compact_lang_det/cldutil_dbg.h"
22 
23 #include "encodings/compact_lang_det/win/cld_basictypes.h"
24 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
25 #include "encodings/compact_lang_det/win/cld_google.h"
26 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
27 
28 // Linker supplies the right tables
// The four objects below are only declared here; their definitions live in
// other translation units and are resolved at link time.
// NOTE(review): judging by the names these are the CJK letter-property table
// and the CJK-bigram, quadgram and long-word scoring tables -- confirm against
// the generated table sources.
29 extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
30 extern const cld::CLDTableSummary kCjkBiTable_obj;
31 extern const cld::CLDTableSummary kQuadTable_obj;
32 extern const cld::CLDTableSummary kLongWord8Table_obj;
33 
// Debug/tuning flags for the detector.  Each DEFINE_* takes the flag name, its
// default value and a help string.
// NOTE(review): the DEFINE_* macros presumably come from
// win/cld_commandlineflags.h -- confirm how the flags are exposed there.
34 DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
35 DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
36 
37 DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
38 DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
// Default of 160 is in KB, per the help string.
39 DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
40 // 20 quadgrams is about 80 bytes or about 12 words in real text
41 DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
42 
43 
// File-local tuning constants.  Units are given in the trailing comments;
// the consumers of these values are outside this part of the file.

// Boost applied to hinted languages.
44 static const int kLangHintInitial = 12;  // Boost language by N initially
45 static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram
46 
// Short-span handling.
47 static const int kShortSpanThresh = 32;       // Bytes
48 static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans
49 
// Cheap pre-test that decides whether squeezing is attempted at all.
50 static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
51                                                   // after this many text bytes
52 static const int kCheapSqueezeTestLen = 256;   // Bytes to test to trigger sqz
53 static const int kSpacesTriggerPercent = 25;   // Trigger sqz if >=25% spaces
54 static const int kPredictTriggerPercent = 67;  // Trigger sqz if >=67% predicted
55 
// Per-chunk squeeze parameters.
56 static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
57 static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
58 static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted
59 
60 static const int kMaxSpaceScan = 32;          // Bytes
61 
// NOTE(review): presumably percent thresholds for calling the top one / top
// two languages a good result -- confirm at the point of use.
62 static const int kGoodLang1Percent = 70;
63 static const int kGoodLang1and2Percent = 93;
64 static const int kShortTextThresh = 256;      // Bytes
65 
66 static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
67 static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads
68 
69 static const int kDefaultWordSpan = 256;      // Scan at least this many initial
70                                               // bytes with word scoring
71 static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text
72 
73 static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable
74 
75 static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
76                                                 // cheap compressor
77 
78 //
79 // Generated by dsites 2008.07.07 from 10% of Base
80 //
81 
82 // Three packed language probs, subscripted by Encoding
// One uint32 per Encoding value, in enum order; the COMPILE_ASSERT after the
// table pins the entry count to NUM_ENCODINGS.  Each row comment gives the
// encoding name followed by up to three LANGUAGE.weight pairs; rows whose
// comment lists no language are 0x00000000.
// NOTE(review): the values look like three one-byte language codes above a
// low byte that selects the weights -- confirm against the unpacking code
// before editing any row by hand.  The table is generated data.
83 static const uint32 kEncodingHintProbs[] = {
84   0x00000000,  // ASCII
85   0x18120cd5,  // Latin2   POLISH.11 CZECH.5 HUNGARIAN.3
86   0x1d3a4bc9,  // Latin3   AZERBAIJANI.10 BASQUE.3 CROATIAN.1
87   0x030819d4,  // Latin4   ESTONIAN.11 ITALIAN.4 DUTCH.2
88   0x00000000,  // ISO-8859-5
89   0x00003742,  // Arabic   ARABIC.12
90   0x00000000,  // Greek
91   0x00000742,  // Hebrew   HEBREW.12
92   0x00002242,  // Latin5   TURKISH.12
93   0x060419c9,  // Latin6   ESTONIAN.10 FINNISH.3 GERMAN.1
94   0x00000942,  // EUC-JP   Japanese.12
95   0x00000942,  // SJS   Japanese.12
96   0x00000942,  // JIS   Japanese.12
97   0x00004642,  // BIG5   ChineseT.12
98   0x00001142,  // GB   Chinese.12
99   0x46295fcd,  // EUC-CN   UIGHUR.10 MALAY.6 ChineseT.5
100   0x00000a42,  // KSC   Korean.12
101   0x00000000,  // Unicode
102   0x03104674,  // EUC   ChineseT.9 SWEDISH.8 DUTCH.3
103   0x00000000,  // CNS
104   0x0f1146c3,  // BIG5-CP950   ChineseT.9 Chinese.5 SPANISH.4
105   0x00000942,  // CP932   Japanese.12
106   0x00000000,  // UTF8
107   0x00000000,  // Unknown
108   0x00000000,  // ASCII-7-bit
109   0x00000000,  // KOI8R
110   0x00000000,  // CP1251
111   0x00000000,  // CP1252
112   0x00000000,  // KOI8U
113   0x451d12cd,  // CP1250   CZECH.10 CROATIAN.6 SLOVAK.5
114   0x0d06052a,  // ISO-8859-15   FRENCH.9 GERMAN.8 PORTUGUESE.7
115   0x00002242,  // CP1254   TURKISH.12
116   0x191516be,  // CP1257   LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
117   0x08003642,  // ISO-8859-11   THAI.12 ITALIAN.1
118   0x00000000,  // CP874
119   0x00003742,  // CP1256   ARABIC.12
120   0x00000742,  // CP1255   HEBREW.12
121   0x00000000,  // ISO-8859-8-I
122   0x00000000,  // VISUAL
123   0x00000000,  // CP852
124   0x39001242,  // CSN_369103   CZECH.12 ESPERANTO.1
125   0x00000000,  // CP1253
126   0x00000000,  // CP866
127   0x2e001944,  // ISO-8859-13   ESTONIAN.12 ALBANIAN.3
128   0x08090a74,  // ISO-2022-KR   Korean.9 Japanese.8 ITALIAN.3
129   0x00001142,  // GBK   Chinese.12
130   0x4600113d,  // GB18030   Chinese.11 ChineseT.7
131   0x00004642,  // BIG5_HKSCS   ChineseT.12
132   0x00000000,  // ISO_2022_CN
133   0x00000000,  // TSCII
134   0x00000000,  // TAM
135   0x00000000,  // TAB
136   0x00000000,  // JAGRAN
137   0x00000000,  // MACINTOSH
138   0x00000000,  // UTF7
139   0x00000000,  // BHASKAR
140   0x00000000,  // HTCHANAKYA
141   0x090646ca,  // UTF-16BE   ChineseT.10 GERMAN.4 Japanese.2
142   0x00000000,  // UTF-16LE
143   0x00000000,  // UTF-32BE
144   0x00000000,  // UTF-32LE
145   0x00000000,  // X-BINARYENC
146   0x06001142,  // HZ-GB-2312   Chinese.12 GERMAN.1
147   0x461109c2,  // X-UTF8UTF8   Japanese.9 Chinese.5 ChineseT.3
148   0x00000000,  // X-TAM-ELANGO
149   0x00000000,  // X-TAM-LTTMBARANI
150   0x00000000,  // X-TAM-SHREE
151   0x00000000,  // X-TAM-TBOOMIS
152   0x00000000,  // X-TAM-TMNEWS
153   0x00000000,  // X-TAM-WEBTAMIL
154   0x00000000,  // X-KDDI-Shift_JIS
155   0x00000000,  // X-DoCoMo-Shift_JIS
156   0x00000000,  // X-SoftBank-Shift_JIS
157   0x00000000,  // X-KDDI-ISO-2022-JP
158   0x00000000,  // X-SoftBank-ISO-2022-JP
159 };
160 
// Build-time check: the table must have exactly one entry per Encoding.
161 COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,
162                kEncodingHintProbs_has_incorrect_size);
163 
164 //
165 // Generated by dsites 2008.07.07 from 10% of Base
166 //
167 
168 // Three packed language probs, subscripted by (anchor) language
// One uint32 per Language value, in enum order; the COMPILE_ASSERT after the
// table pins the entry count to NUM_LANGUAGES.  Each row comment gives the
// anchor language followed by up to three LANGUAGE.weight pairs; rows whose
// comment lists no language are 0x00000000.
// NOTE(review): several rows hint only languages other than their anchor
// (KHMER -> TURKISH, INUKTITUT -> FRENCH, PUNJABI -> POLISH/GERMAN/FRENCH,
// ARMENIAN -> BELARUSIAN/Chinese/FRENCH, LAOTHIAN -> FRENCH/SPANISH/DANISH).
// These are presumably artifacts of the generating corpus -- verify before
// relying on them.  The table is generated data; do not reorder rows.
169 static const uint32 kLanguageHintProbs[] = {
170   0x00000000,  // ENGLISH
171   0x00000242,  // DANISH   DANISH.12
172   0x00000342,  // DUTCH   DUTCH.12
173   0x00000442,  // FINNISH   FINNISH.12
174   0x00000542,  // FRENCH   FRENCH.12
175   0x00000642,  // GERMAN   GERMAN.12
176   0x00000742,  // HEBREW   HEBREW.12
177   0x00000842,  // ITALIAN   ITALIAN.12
178   0x00000942,  // Japanese   Japanese.12
179   0x00000a42,  // Korean   Korean.12
180   0x51000b43,  // NORWEGIAN   NORWEGIAN.12 NORWEGIAN_N.2
181   0x00000c42,  // POLISH   POLISH.12
182   0x00000d42,  // PORTUGUESE   PORTUGUESE.12
183   0x00000000,  // RUSSIAN
184   0x00000f42,  // SPANISH   SPANISH.12
185   0x00001042,  // SWEDISH   SWEDISH.12
186   0x00001142,  // Chinese   Chinese.12
187   0x00001242,  // CZECH   CZECH.12
188   0x00000000,  // GREEK
189   0x47001442,  // ICELANDIC   ICELANDIC.12 FAROESE.1
190   0x00001542,  // LATVIAN   LATVIAN.12
191   0x00001642,  // LITHUANIAN   LITHUANIAN.12
192   0x00001742,  // ROMANIAN   ROMANIAN.12
193   0x00001842,  // HUNGARIAN   HUNGARIAN.12
194   0x00001942,  // ESTONIAN   ESTONIAN.12
195   0x00000000,  // TG_UNKNOWN_LANGUAGE
196   0x00000000,  // Unknown
197   0x00001c42,  // BULGARIAN   BULGARIAN.12
198   0x00001d42,  // CROATIAN   CROATIAN.12
199   0x1e001d46,  // SERBIAN   CROATIAN.12 SERBIAN.5
200   0x00000000,  // IRISH
201   0x0f00203d,  // GALICIAN   GALICIAN.11 SPANISH.7
202   0x5e00213a,  // TAGALOG   TAGALOG.11 SOMALI.4
203   0x00002242,  // TURKISH   TURKISH.12
204   0x00002342,  // UKRAINIAN   UKRAINIAN.12
205   0x00000000,  // HINDI
206   0x1c1e25d4,  // MACEDONIAN   MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
207   0x00002642,  // BENGALI   BENGALI.12
208   0x00002742,  // INDONESIAN   INDONESIAN.12
209   0x00000000,  // LATIN
210   0x2700293c,  // MALAY   MALAY.11 INDONESIAN.6
211   0x00000000,  // MALAYALAM
212   0x00000000,  // WELSH
213   0x00000000,  // NEPALI
214   0x00000000,  // TELUGU
215   0x00002e42,  // ALBANIAN   ALBANIAN.12
216   0x00000000,  // TAMIL
217   0x00003042,  // BELARUSIAN   BELARUSIAN.12
218   0x00000000,  // JAVANESE
219   0x00000000,  // OCCITAN
220   0x375f3330,  // URDU   URDU.10 UIGHUR.7 ARABIC.4
221   0x41003436,  // BIHARI   BIHARI.10 MARATHI.10
222   0x00000000,  // GUJARATI
223   0x0a4636b2,  // THAI   THAI.7 ChineseT.3 Korean.2
224   0x00003742,  // ARABIC   ARABIC.12
225   0x00003842,  // CATALAN   CATALAN.12
226   0x00003942,  // ESPERANTO   ESPERANTO.12
227   0x00003a42,  // BASQUE   BASQUE.12
228   0x00000000,  // INTERLINGUA
229   0x00000000,  // KANNADA
230   0x05060cca,  // PUNJABI   POLISH.10 GERMAN.4 FRENCH.2
231   0x00000000,  // SCOTS_GAELIC
232   0x00003f42,  // SWAHILI   SWAHILI.12
233   0x00004042,  // SLOVENIAN   SLOVENIAN.12
234   0x00004142,  // MARATHI   MARATHI.12
235   0x00004242,  // MALTESE   MALTESE.12
236   0x00004342,  // VIETNAMESE   VIETNAMESE.12
237   0x00000000,  // FRISIAN
238   0x12004543,  // SLOVAK   SLOVAK.12 CZECH.2
239   0x00004642,  // ChineseT   ChineseT.12
240   0x00000000,  // FAROESE
241   0x00000000,  // SUNDANESE
242   0x79004944,  // UZBEK   UZBEK.12 TAJIK.3
243   0x4d004a46,  // AMHARIC   AMHARIC.12 TIGRINYA.5
244   0x00004b42,  // AZERBAIJANI   AZERBAIJANI.12
245   0x00000000,  // GEORGIAN
246   0x00000000,  // TIGRINYA
247   0x00004e42,  // PERSIAN   PERSIAN.12
248   0x00000000,  // BOSNIAN
249   0x00000000,  // SINHALESE
250   0x00000000,  // NORWEGIAN_N
251   0x00000000,  // PORTUGUESE_P
252   0x00000000,  // PORTUGUESE_B
253   0x00000000,  // XHOSA
254   0x00000000,  // ZULU
255   0x00000000,  // GUARANI
256   0x00000000,  // SESOTHO
257   0x00000000,  // TURKMEN
258   0x7a005933,  // KYRGYZ   KYRGYZ.10 TATAR.7
259   0x00000000,  // BRETON
260   0x00000000,  // TWI
261   0x00000000,  // YIDDISH
262   0x00000000,  // SERBO_CROATIAN
263   0x00000000,  // SOMALI
264   0x00005f42,  // UIGHUR   UIGHUR.12
265   0x00006042,  // KURDISH   KURDISH.12
266   0x00006142,  // MONGOLIAN   MONGOLIAN.12
267   0x051130c9,  // ARMENIAN   BELARUSIAN.10 Chinese.3 FRENCH.1
268   0x020f0521,  // LAOTHIAN   FRENCH.8 SPANISH.7 DANISH.6
269   0x64004e35,  // SINDHI   PERSIAN.10 SINDHI.9
270   0x00000000,  // RHAETO_ROMANCE
271   0x00006642,  // AFRIKAANS   AFRIKAANS.12
272   0x00000000,  // LUXEMBOURGISH
273   0x00006842,  // BURMESE   BURMESE.12
274   0x00002242,  // KHMER   TURKISH.12
275   0x88006a3c,  // TIBETAN   TIBETAN.11 DZONGKHA.6
276   0x00000000,  // DHIVEHI
277   0x00000000,  // CHEROKEE
278   0x00000000,  // SYRIAC
279   0x00000000,  // LIMBU
280   0x00000000,  // ORIYA
281   0x00000000,  // ASSAMESE
282   0x00000000,  // CORSICAN
283   0x00000000,  // INTERLINGUE
284   0x00007342,  // KAZAKH   KAZAKH.12
285   0x00000000,  // LINGALA
286   0x00000000,  // MOLDAVIAN
287   0x5f007645,  // PASHTO   PASHTO.12 UIGHUR.4
288   0x00000000,  // QUECHUA
289   0x00000000,  // SHONA
290   0x00007942,  // TAJIK   TAJIK.12
291   0x00000000,  // TATAR
292   0x00000000,  // TONGA
293   0x00000000,  // YORUBA
294   0x00000000,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
295   0x00000000,  // CREOLES_AND_PIDGINS_FRENCH_BASED
296   0x00000000,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
297   0x00000000,  // CREOLES_AND_PIDGINS_OTHER
298   0x00000000,  // MAORI
299   0x00000000,  // WOLOF
300   0x00000000,  // ABKHAZIAN
301   0x00000000,  // AFAR
302   0x00000000,  // AYMARA
303   0x00000000,  // BASHKIR
304   0x00000000,  // BISLAMA
305   0x00000000,  // DZONGKHA
306   0x00000000,  // FIJIAN
307   0x00000000,  // GREENLANDIC
308   0x00000000,  // HAUSA
309   0x00000000,  // HAITIAN_CREOLE
310   0x00000000,  // INUPIAK
311   0x00000542,  // INUKTITUT   FRENCH.12
312   0x00000000,  // KASHMIRI
313   0x00000000,  // KINYARWANDA
314   0x00000000,  // MALAGASY
315   0x00000000,  // NAURU
316   0x00000000,  // OROMO
317   0x00000000,  // RUNDI
318   0x00000000,  // SAMOAN
319   0x00000000,  // SANGO
320   0x344197d3,  // SANSKRIT   SANSKRIT.11 MARATHI.4 BIHARI.1
321   0x00000000,  // SISWANT
322   0x00000000,  // TSONGA
323   0x00000000,  // TSWANA
324   0x00000000,  // VOLAPUK
325   0x00000000,  // ZHUANG
326   0x00000000,  // KHASI
327   0x00000000,  // SCOTS
328   0x00000000,  // GANDA
329   0x00000000,  // MANX
330   0x00000000,  // MONTENEGRIN
331   // Add new language hints just before here (just use 0x00000000)
332 };
333 
// Build-time check: the table must have exactly one entry per Language.
334 COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,
335                kLanguageHintProbs_has_incorrect_size);
336 
337 //
338 // Generated by dsites 2008.07.07 from 10% of Base
339 //
340 
// One row of a hint table: a fixed-width lookup key paired with its packed
// language probabilities.
341 struct HintEntry {
342   char key[4];     // Exactly four bytes; the tables use all four, no NUL
343   uint32 probs;    // Three packed language probs
344 };
345 
346 
347 // Massaged TLD, followed by three packed language probs
348 // Hand-removed 4 items dsites 2008.07.15
// Each key is a lowercase top-level domain padded with '_' (0x5f) to exactly
// four bytes; "trav" is a four-byte key with no padding.  The row comment
// spells out the key and the LANGUAGE.weight pairs packed into the value.
// Rows are in ascending key order.
// NOTE(review): presumably the lookup code (not visible here) binary-searches
// this table, so keep new rows sorted -- confirm against the lookup routine.
// kTLDHintProbsSize must equal the number of active rows: the four
// commented-out rows (coop, edu_, gov_, jobs) are not counted.  Because the
// array is declared with an explicit size, too few rows would silently leave
// zero-filled entries at the end rather than fail to compile.
349 static const int kTLDHintProbsSize = 201;
350 static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = {   // MaxRange 12
351   {{0x61,0x63,0x5f,0x5f}, 0x0a000945},   // ac__ Japanese.12 Korean.4
352   {{0x61,0x64,0x5f,0x5f}, 0x00003842},   // ad__ CATALAN.12
353   {{0x61,0x65,0x5f,0x5f}, 0x00003742},   // ae__ ARABIC.12
354   {{0x61,0x66,0x5f,0x5f}, 0x4e00763d},   // af__ PASHTO.11 PERSIAN.7
355   {{0x61,0x67,0x5f,0x5f}, 0x09000643},   // ag__ GERMAN.12 Japanese.2
356   {{0x61,0x69,0x5f,0x5f}, 0x0c180938},   // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
357   {{0x61,0x6c,0x5f,0x5f}, 0x00002e42},   // al__ ALBANIAN.12
358   {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d},   // an__ DUTCH.11 LIMBU.7
359   {{0x61,0x6f,0x5f,0x5f}, 0x05000d42},   // ao__ PORTUGUESE.12 FRENCH.1
360   {{0x61,0x71,0x5f,0x5f}, 0x05000f29},   // aq__ SPANISH.9 FRENCH.6
361   {{0x61,0x72,0x5f,0x5f}, 0x00000f42},   // ar__ SPANISH.12
362   {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd},   // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
363   {{0x61,0x74,0x5f,0x5f}, 0x00000642},   // at__ GERMAN.12
364   {{0x61,0x77,0x5f,0x5f}, 0x0f000345},   // aw__ DUTCH.12 SPANISH.4
365   {{0x61,0x78,0x5f,0x5f}, 0x00001042},   // ax__ SWEDISH.12
366   {{0x61,0x7a,0x5f,0x5f}, 0x00004b42},   // az__ AZERBAIJANI.12
367   {{0x62,0x61,0x5f,0x5f}, 0x00001d42},   // ba__ CROATIAN.12
368   {{0x62,0x62,0x5f,0x5f}, 0x00002842},   // bb__ LATIN.12
369   {{0x62,0x64,0x5f,0x5f}, 0x00002642},   // bd__ BENGALI.12
370   {{0x62,0x65,0x5f,0x5f}, 0x05000335},   // be__ DUTCH.10 FRENCH.9
371   {{0x62,0x66,0x5f,0x5f}, 0x00000542},   // bf__ FRENCH.12
372   {{0x62,0x67,0x5f,0x5f}, 0x00001c42},   // bg__ BULGARIAN.12
373   {{0x62,0x68,0x5f,0x5f}, 0x00003742},   // bh__ ARABIC.12
374   {{0x62,0x69,0x5f,0x5f}, 0x0f00053f},   // bi__ FRENCH.11 SPANISH.9
375   {{0x62,0x6a,0x5f,0x5f}, 0x00000542},   // bj__ FRENCH.12
376   {{0x62,0x6d,0x5f,0x5f}, 0x98043929},   // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
377   {{0x62,0x6e,0x5f,0x5f}, 0x00002942},   // bn__ MALAY.12
378   {{0x62,0x6f,0x5f,0x5f}, 0x00000f42},   // bo__ SPANISH.12
379   {{0x62,0x72,0x5f,0x5f}, 0x00000d42},   // br__ PORTUGUESE.12
380   {{0x62,0x74,0x5f,0x5f}, 0x00008842},   // bt__ DZONGKHA.12
381   {{0x62,0x77,0x5f,0x5f}, 0x06059ac4},   // bw__ TSWANA.9 FRENCH.6 GERMAN.5
382   {{0x62,0x79,0x5f,0x5f}, 0x00003024},   // by__ BELARUSIAN.9
383   {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924},   // bz__ Japanese.9 Korean.5 SPANISH.1
384   {{0x63,0x61,0x5f,0x5f}, 0x00000542},   // ca__ FRENCH.12
385   {{0x63,0x61,0x74,0x5f}, 0x00003842},   // cat_ CATALAN.12
386   {{0x63,0x64,0x5f,0x5f}, 0x06051224},   // cd__ CZECH.9 FRENCH.5 GERMAN.1
387   {{0x63,0x66,0x5f,0x5f}, 0x00000542},   // cf__ FRENCH.12
388   {{0x63,0x67,0x5f,0x5f}, 0x00000542},   // cg__ FRENCH.12
389   {{0x63,0x68,0x5f,0x5f}, 0x08050638},   // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
390   {{0x63,0x69,0x5f,0x5f}, 0x00000542},   // ci__ FRENCH.12
391   {{0x63,0x6c,0x5f,0x5f}, 0x00000f42},   // cl__ SPANISH.12
392   {{0x63,0x6d,0x5f,0x5f}, 0x00000542},   // cm__ FRENCH.12
393   {{0x63,0x6e,0x5f,0x5f}, 0x00001142},   // cn__ Chinese.12
394   {{0x63,0x6f,0x5f,0x5f}, 0x00000f42},   // co__ SPANISH.12
395 // {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd},   // coop Japanese.10 FRENCH.6 SPANISH.5
396   {{0x63,0x72,0x5f,0x5f}, 0x00000f42},   // cr__ SPANISH.12
397   {{0x63,0x75,0x5f,0x5f}, 0x00000f42},   // cu__ SPANISH.12
398   {{0x63,0x76,0x5f,0x5f}, 0x00000d42},   // cv__ PORTUGUESE.12
399   {{0x63,0x78,0x5f,0x5f}, 0x223a091f},   // cx__ Japanese.8 BASQUE.6 TURKISH.4
400   {{0x63,0x79,0x5f,0x5f}, 0x150622ba},   // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
401   {{0x63,0x7a,0x5f,0x5f}, 0x00001242},   // cz__ CZECH.12
402   {{0x64,0x65,0x5f,0x5f}, 0x00000642},   // de__ GERMAN.12
403   {{0x64,0x6b,0x5f,0x5f}, 0x00000242},   // dk__ DANISH.12
404   {{0x64,0x6f,0x5f,0x5f}, 0x21000f42},   // do__ SPANISH.12 TAGALOG.1
405   {{0x64,0x7a,0x5f,0x5f}, 0x37000535},   // dz__ FRENCH.10 ARABIC.9
406   {{0x65,0x63,0x5f,0x5f}, 0x00000f42},   // ec__ SPANISH.12
407 // {{0x65,0x64,0x75,0x5f}, 0x2e0f3873},   // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
408   {{0x65,0x65,0x5f,0x5f}, 0x00001942},   // ee__ ESTONIAN.12
409   {{0x65,0x67,0x5f,0x5f}, 0x05003742},   // eg__ ARABIC.12 FRENCH.1
410   {{0x65,0x72,0x5f,0x5f}, 0x00000b42},   // er__ NORWEGIAN.12
411   {{0x65,0x73,0x5f,0x5f}, 0x38200fd4},   // es__ SPANISH.11 GALICIAN.4 CATALAN.2
412   {{0x65,0x74,0x5f,0x5f}, 0x39004a39},   // et__ AMHARIC.11 ESPERANTO.3
413   {{0x66,0x69,0x5f,0x5f}, 0x10000444},   // fi__ FINNISH.12 SWEDISH.3
414   {{0x66,0x6a,0x5f,0x5f}, 0x050489e0},   // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
415   {{0x66,0x6f,0x5f,0x5f}, 0x00004742},   // fo__ FAROESE.12
416   {{0x66,0x72,0x5f,0x5f}, 0x00000542},   // fr__ FRENCH.12
417   {{0x67,0x61,0x5f,0x5f}, 0x00000542},   // ga__ FRENCH.12
418   {{0x67,0x64,0x5f,0x5f}, 0x061d05d5},   // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
419   {{0x67,0x65,0x5f,0x5f}, 0x00004c2d},   // ge__ GEORGIAN.10
420   {{0x67,0x66,0x5f,0x5f}, 0x00000542},   // gf__ FRENCH.12
421   {{0x67,0x67,0x5f,0x5f}, 0x06002244},   // gg__ TURKISH.12 GERMAN.3
422   {{0x67,0x68,0x5f,0x5f}, 0x05000436},   // gh__ FINNISH.10 FRENCH.10
423   {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce},   // gi__ CATALAN.10 FRENCH.7 SPANISH.6
424   {{0x67,0x6c,0x5f,0x5f}, 0x398a0238},   // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
425   {{0x67,0x6d,0x5f,0x5f}, 0x0600043e},   // gm__ FINNISH.11 GERMAN.8
426   {{0x67,0x6e,0x5f,0x5f}, 0x00000542},   // gn__ FRENCH.12
427 // {{0x67,0x6f,0x76,0x5f}, 0x05000f25},   // gov_ SPANISH.9 FRENCH.2
428   {{0x67,0x70,0x5f,0x5f}, 0x00000542},   // gp__ FRENCH.12
429   {{0x67,0x71,0x5f,0x5f}, 0x0f000547},   // gq__ FRENCH.12 SPANISH.6
430   {{0x67,0x73,0x5f,0x5f}, 0x00000942},   // gs__ Japanese.12
431   {{0x67,0x74,0x5f,0x5f}, 0x00000f42},   // gt__ SPANISH.12
432   {{0x68,0x6b,0x5f,0x5f}, 0x11004643},   // hk__ ChineseT.12 Chinese.2
433   {{0x68,0x6d,0x5f,0x5f}, 0x4606092e},   // hm__ Japanese.10 GERMAN.6 ChineseT.2
434   {{0x68,0x6e,0x5f,0x5f}, 0x00000f42},   // hn__ SPANISH.12
435   {{0x68,0x72,0x5f,0x5f}, 0x00001d42},   // hr__ CROATIAN.12
436   {{0x68,0x74,0x5f,0x5f}, 0x0f000542},   // ht__ FRENCH.12 SPANISH.1
437   {{0x68,0x75,0x5f,0x5f}, 0x00001842},   // hu__ HUNGARIAN.12
438   {{0x69,0x64,0x5f,0x5f}, 0x00002742},   // id__ INDONESIAN.12
439   {{0x69,0x65,0x5f,0x5f}, 0x050c1f24},   // ie__ IRISH.9 POLISH.5 FRENCH.1
440   {{0x69,0x6c,0x5f,0x5f}, 0x00000742},   // il__ HEBREW.12
441   {{0x69,0x6e,0x74,0x5f}, 0x0f060574},   // int_ FRENCH.9 GERMAN.8 SPANISH.3
442   {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5},   // io__ SPANISH.11 Japanese.5 Chinese.3
443   {{0x69,0x71,0x5f,0x5f}, 0x60003744},   // iq__ ARABIC.12 KURDISH.3
444   {{0x69,0x72,0x5f,0x5f}, 0x00004e42},   // ir__ PERSIAN.12
445   {{0x69,0x73,0x5f,0x5f}, 0x00001442},   // is__ ICELANDIC.12
446   {{0x69,0x74,0x5f,0x5f}, 0x00000842},   // it__ ITALIAN.12
447   {{0x6a,0x65,0x5f,0x5f}, 0x29050328},   // je__ DUTCH.9 FRENCH.7 MALAY.5
448   {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576},   // jm__ FRENCH.9 SPANISH.8 FINNISH.5
449   {{0x6a,0x6f,0x5f,0x5f}, 0x00003742},   // jo__ ARABIC.12
450 // {{0x6a,0x6f,0x62,0x73}, 0x0f060329},   // jobs DUTCH.9 GERMAN.8 SPANISH.6
451   {{0x6a,0x70,0x5f,0x5f}, 0x00000942},   // jp__ Japanese.12
452   {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3},   // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
453   {{0x6b,0x69,0x5f,0x5f}, 0x04000643},   // ki__ GERMAN.12 FINNISH.2
454   {{0x6b,0x6d,0x5f,0x5f}, 0x00000542},   // km__ FRENCH.12
455   {{0x6b,0x70,0x5f,0x5f}, 0x00000a42},   // kp__ Korean.12
456   {{0x6b,0x72,0x5f,0x5f}, 0x00000a42},   // kr__ Korean.12
457   {{0x6b,0x77,0x5f,0x5f}, 0x00003742},   // kw__ ARABIC.12
458   {{0x6b,0x79,0x5f,0x5f}, 0x0500083f},   // ky__ ITALIAN.11 FRENCH.9
459   {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d},   // kz__ KAZAKH.10
460   {{0x6c,0x62,0x5f,0x5f}, 0x05003747},   // lb__ ARABIC.12 FRENCH.6
461   {{0x6c,0x63,0x5f,0x5f}, 0x09000645},   // lc__ GERMAN.12 Japanese.4
462   {{0x6c,0x69,0x5f,0x5f}, 0x1600063d},   // li__ GERMAN.11 LITHUANIAN.7
463   {{0x6c,0x73,0x5f,0x5f}, 0x00005742},   // ls__ SESOTHO.12
464   {{0x6c,0x74,0x5f,0x5f}, 0x00001642},   // lt__ LITHUANIAN.12
465   {{0x6c,0x75,0x5f,0x5f}, 0x0600053d},   // lu__ FRENCH.11 GERMAN.7
466   {{0x6c,0x76,0x5f,0x5f}, 0x00001542},   // lv__ LATVIAN.12
467   {{0x6c,0x79,0x5f,0x5f}, 0x05003744},   // ly__ ARABIC.12 FRENCH.3
468   {{0x6d,0x61,0x5f,0x5f}, 0x3700053d},   // ma__ FRENCH.11 ARABIC.7
469   {{0x6d,0x63,0x5f,0x5f}, 0x00000542},   // mc__ FRENCH.12
470   {{0x6d,0x64,0x5f,0x5f}, 0x00001724},   // md__ ROMANIAN.9
471   {{0x6d,0x65,0x5f,0x5f}, 0x00001d42},   // me__ CROATIAN.12
472   {{0x6d,0x67,0x5f,0x5f}, 0x00000542},   // mg__ FRENCH.12
473   {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543},   // mk__ MACEDONIAN.12 BULGARIAN.2
474   {{0x6d,0x6c,0x5f,0x5f}, 0x00000542},   // ml__ FRENCH.12
475   {{0x6d,0x6e,0x5f,0x5f}, 0x00006142},   // mn__ MONGOLIAN.12
476   {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631},   // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
477   {{0x6d,0x71,0x5f,0x5f}, 0x00000542},   // mq__ FRENCH.12
478   {{0x6d,0x72,0x5f,0x5f}, 0x37000535},   // mr__ FRENCH.10 ARABIC.9
479   {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5},   // ms__ GERMAN.11 SPANISH.5 Japanese.3
480   {{0x6d,0x74,0x5f,0x5f}, 0x00004242},   // mt__ MALTESE.12
481   {{0x6d,0x75,0x5f,0x5f}, 0x05000934},   // mu__ Japanese.10 FRENCH.8
482   {{0x6d,0x76,0x5f,0x5f}, 0x28000436},   // mv__ FINNISH.10 LATIN.10
483   {{0x6d,0x77,0x5f,0x5f}, 0x0611092a},   // mw__ Japanese.9 Chinese.8 GERMAN.7
484   {{0x6d,0x78,0x5f,0x5f}, 0x00000f42},   // mx__ SPANISH.12
485   {{0x6d,0x79,0x5f,0x5f}, 0x00002942},   // my__ MALAY.12
486   {{0x6d,0x7a,0x5f,0x5f}, 0x00000d42},   // mz__ PORTUGUESE.12
487   {{0x6e,0x61,0x5f,0x5f}, 0x06006644},   // na__ AFRIKAANS.12 GERMAN.3
488   {{0x6e,0x63,0x5f,0x5f}, 0x00000542},   // nc__ FRENCH.12
489   {{0x6e,0x65,0x5f,0x5f}, 0x8b000542},   // ne__ FRENCH.12 HAUSA.1
490   {{0x6e,0x66,0x5f,0x5f}, 0x00000542},   // nf__ FRENCH.12
491   {{0x6e,0x69,0x5f,0x5f}, 0x00000f42},   // ni__ SPANISH.12
492   {{0x6e,0x6c,0x5f,0x5f}, 0x00000342},   // nl__ DUTCH.12
493   {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43},   // no__ NORWEGIAN.12 NORWEGIAN_N.2
494   {{0x6e,0x75,0x5f,0x5f}, 0x0300103b},   // nu__ SWEDISH.11 DUTCH.5
495   {{0x6f,0x6d,0x5f,0x5f}, 0x00003742},   // om__ ARABIC.12
496   {{0x70,0x61,0x5f,0x5f}, 0x00000f42},   // pa__ SPANISH.12
497   {{0x70,0x65,0x5f,0x5f}, 0x00000f42},   // pe__ SPANISH.12
498   {{0x70,0x66,0x5f,0x5f}, 0x00000542},   // pf__ FRENCH.12
499   {{0x70,0x67,0x5f,0x5f}, 0x00000f24},   // pg__ SPANISH.9
500   {{0x70,0x68,0x5f,0x5f}, 0x00002142},   // ph__ TAGALOG.12
501   {{0x70,0x6b,0x5f,0x5f}, 0x00003342},   // pk__ URDU.12
502   {{0x70,0x6c,0x5f,0x5f}, 0x30000c42},   // pl__ POLISH.12 BELARUSIAN.1
503   {{0x70,0x6e,0x5f,0x5f}, 0x04000644},   // pn__ GERMAN.12 FINNISH.3
504   {{0x70,0x72,0x5f,0x5f}, 0x00000f42},   // pr__ SPANISH.12
505   {{0x70,0x72,0x6f,0x5f}, 0x46050fd5},   // pro_ SPANISH.11 FRENCH.5 ChineseT.3
506   {{0x70,0x73,0x5f,0x5f}, 0x00003742},   // ps__ ARABIC.12
507   {{0x70,0x74,0x5f,0x5f}, 0x00000d42},   // pt__ PORTUGUESE.12
508   {{0x70,0x79,0x5f,0x5f}, 0x00000f42},   // py__ SPANISH.12
509   {{0x71,0x61,0x5f,0x5f}, 0x00003742},   // qa__ ARABIC.12
510   {{0x72,0x65,0x5f,0x5f}, 0x00000542},   // re__ FRENCH.12
511   {{0x72,0x6f,0x5f,0x5f}, 0x00001742},   // ro__ ROMANIAN.12
512   {{0x72,0x73,0x5f,0x5f}, 0x00001d42},   // rs__ CROATIAN.12
513   {{0x72,0x77,0x5f,0x5f}, 0x9000053e},   // rw__ FRENCH.11 KINYARWANDA.8
514   {{0x73,0x61,0x5f,0x5f}, 0x00003742},   // sa__ ARABIC.12
515   {{0x73,0x62,0x5f,0x5f}, 0x00000442},   // sb__ FINNISH.12
516   {{0x73,0x63,0x5f,0x5f}, 0x060f092f},   // sc__ Japanese.10 SPANISH.7 GERMAN.3
517   {{0x73,0x64,0x5f,0x5f}, 0x00003742},   // sd__ ARABIC.12
518   {{0x73,0x65,0x5f,0x5f}, 0x00001042},   // se__ SWEDISH.12
519   {{0x73,0x69,0x5f,0x5f}, 0x00004042},   // si__ SLOVENIAN.12
520   {{0x73,0x6b,0x5f,0x5f}, 0x12004543},   // sk__ SLOVAK.12 CZECH.2
521   {{0x73,0x6d,0x5f,0x5f}, 0x00000842},   // sm__ ITALIAN.12
522   {{0x73,0x6e,0x5f,0x5f}, 0x00000542},   // sn__ FRENCH.12
523   {{0x73,0x72,0x5f,0x5f}, 0x03001e44},   // sr__ SERBIAN.12 DUTCH.3
524   {{0x73,0x76,0x5f,0x5f}, 0x00000f42},   // sv__ SPANISH.12
525   {{0x73,0x79,0x5f,0x5f}, 0x00003742},   // sy__ ARABIC.12
526   {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd},   // tc__ GERMAN.10 TURKISH.6 Korean.5
527   {{0x74,0x66,0x5f,0x5f}, 0x00000642},   // tf__ GERMAN.12
528   {{0x74,0x67,0x5f,0x5f}, 0x00000542},   // tg__ FRENCH.12
529   {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9},   // th__ THAI.10 Japanese.3 SCOTS.1
530   {{0x74,0x6a,0x5f,0x5f}, 0x00007924},   // tj__ TAJIK.9
531   {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd},   // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
532   {{0x74,0x6e,0x5f,0x5f}, 0x3700053e},   // tn__ FRENCH.11 ARABIC.8
533   {{0x74,0x6f,0x5f,0x5f}, 0x064609c5},   // to__ Japanese.9 ChineseT.7 GERMAN.6
534   {{0x74,0x70,0x5f,0x5f}, 0x06000944},   // tp__ Japanese.12 GERMAN.3
535   {{0x74,0x72,0x5f,0x5f}, 0x00002242},   // tr__ TURKISH.12
536   {{0x74,0x72,0x61,0x76}, 0x064509c3},   // trav Japanese.9 SLOVAK.5 GERMAN.4
537   {{0x74,0x74,0x5f,0x5f}, 0x0f00063e},   // tt__ GERMAN.11 SPANISH.8
538   {{0x74,0x77,0x5f,0x5f}, 0x00004642},   // tw__ ChineseT.12
539   {{0x74,0x7a,0x5f,0x5f}, 0x00003f42},   // tz__ SWAHILI.12
540   {{0x75,0x61,0x5f,0x5f}, 0x0000232d},   // ua__ UKRAINIAN.10
541   {{0x75,0x79,0x5f,0x5f}, 0x00000f42},   // uy__ SPANISH.12
542   {{0x75,0x7a,0x5f,0x5f}, 0x0000492d},   // uz__ UZBEK.10
543   {{0x76,0x61,0x5f,0x5f}, 0x060f0828},   // va__ ITALIAN.9 SPANISH.7 GERMAN.5
544   {{0x76,0x63,0x5f,0x5f}, 0x0d000939},   // vc__ Japanese.11 PORTUGUESE.3
545   {{0x76,0x65,0x5f,0x5f}, 0x00000f42},   // ve__ SPANISH.12
546   {{0x76,0x67,0x5f,0x5f}, 0x09000f43},   // vg__ SPANISH.12 Japanese.2
547   {{0x76,0x69,0x5f,0x5f}, 0x00002942},   // vi__ MALAY.12
548   {{0x76,0x6e,0x5f,0x5f}, 0x00004342},   // vn__ VIETNAMESE.12
549   {{0x76,0x75,0x5f,0x5f}, 0x00000642},   // vu__ GERMAN.12
550   {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624},   // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
551   {{0x79,0x65,0x5f,0x5f}, 0x00003742},   // ye__ ARABIC.12
552   {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d},   // yu__ CROATIAN.11 SERBIAN.7
553   {{0x7a,0x61,0x5f,0x5f}, 0x00006642},   // za__ AFRIKAANS.12
554   {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435},   // zm__ FINNISH.10 NORWEGIAN.9
555   {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e},   // zw__ SHONA.11 SWAHILI.8
556 };
557 
558 
559 // Statistically closest language, based on quadgram table
560 // Those that are far from other languages map to UNKNOWN_LANGUAGE
561 // Subscripted by Language
562 //
563 // From lang_correlation.txt and hand-edits
564 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
565 //   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
566 //   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
567 //
// Minimum correlation (in percent) an entry of kClosestAltLanguage must reach
// for its alternate language to be kept; entries whose leading literal is
// below this value resolve to UNKNOWN_LANGUAGE instead.
568 static const int kMinCorrPercent = 24;        // Pick off how close you want
569                                               // 24 catches PERSIAN <== ARABIC
570                                               // but not SPANISH <== PORTUGUESE
// Placeholder named by kClosestAltLanguage rows that have no correlated
// language.  Declared const so it cannot be modified after start-up and so
// the table's initializers remain constant expressions whichever branch of
// the conditional is selected.
571 static const Language Unknown = UNKNOWN_LANGUAGE;
572 
573 // Subscripted by Language
574 static const Language kClosestAltLanguage[] = {
575   (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
576   (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
577   (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
578   (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
579   (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
580   (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
581   (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
582   (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
583   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
584   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
585   (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
586   ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
587   (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
588   (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
589   (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
590   (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
591   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
592   (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
593   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
594   (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
595   ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
596   ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
597   ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
598   ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
599   (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
600   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
601   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
602   (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
603   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
604   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
605   (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
606   (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
607   ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
608   (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
609   (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
610   (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
611   (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
612   (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
613   (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
614   ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
615   (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
616   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
617   ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
618   ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
619   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
620   ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
621   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
622   (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
623   (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
624   (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
625   (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
626   (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
627   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
628   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
629   (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
630   (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
631   ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
632   ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
633   ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
634   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
635   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
636   (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
637   ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
638   (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
639   (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
640   ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
641   ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
642   (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
643   (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
644   // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
645   (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
646   (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
647   (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
648   (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
649   ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
650   (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
651   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
652   ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
653   (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
654   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
655   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
656   (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
657   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
658   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
659   (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
660   (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
661   ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
662   (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
663   ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
664   ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
665   ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
666   ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
667   (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
668   (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
669   (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
670   ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
671   (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
672   ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
673   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
674   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
675   ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
676   (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
677   (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
678   (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
679   ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
680   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
681   (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
682   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
683   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
684   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
685   ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
686   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
687   (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
688   (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
689   ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
690   ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
691   ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
692   (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
693   (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
694   ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
695   ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
696   (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
697   (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
698   (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
699   ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
700   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
701   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
702   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
703   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
704   ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
705   ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
706   ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
707   ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
708   ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
709   (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
710   ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
711   (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
712   ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
713   ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
714   ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
715   ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
716   ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
717   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
718   ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
719   (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
720   ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
721   (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
722   (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
723   (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
724   (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
725   ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
726   (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
727   (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
728   ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
729   (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
730   ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
731   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
732   ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
733   (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
734   (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
735   ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
736   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN
737 };
738 
739 COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
740                kClosestAltLanguage_has_incorrect_size);
741 
742 
FlagFinish(int flags)743 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
FlagSqueeze(int flags)744 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
FlagRepeats(int flags)745 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
FlagTop40(int flags)746 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
FlagShort(int flags)747 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
FlagHint(int flags)748 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
FlagUseWords(int flags)749 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
750 
751 
752 
753 
754 //------------------------------------------------------------------------------
755 // For --cld_html debugging output. Not thread safe
756 //------------------------------------------------------------------------------
757 static Language prior_lang = UNKNOWN_LANGUAGE;
758 static bool prior_unreliable = false;
759 
760 //------------------------------------------------------------------------------
761 // End For --cld_html debugging output
762 //------------------------------------------------------------------------------
763 
764 
765 // Backscan to word boundary, returning how many bytes n to go back
766 // so that src - n is non-space ans src - n - 1 is space.
767 // If not found in kMaxSpaceScan bytes, return 0
BackscanToSpace(const char * src,int limit)768 int BackscanToSpace(const char* src, int limit) {
769   int n = 0;
770   limit = cld::minint(limit, kMaxSpaceScan);
771   while (n < limit) {
772     if (src[-n - 1] == ' ') {return n;}    // We are at _X
773     ++n;
774   }
775   return 0;
776 }
777 
778 // Forwardscan to word boundary, returning how many bytes n to go forward
779 // so that src + n is non-space ans src + n - 1 is space.
780 // If not found in kMaxSpaceScan bytes, return 0
ForwardscanToSpace(const char * src,int limit)781 int ForwardscanToSpace(const char* src, int limit) {
782   int n = 0;
783   limit = cld::minint(limit, kMaxSpaceScan);
784   while (n < limit) {
785     if (src[n] == ' ') {return n + 1;}    // We are at _X
786     ++n;
787   }
788   return 0;
789 }
790 
791 
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compresses
// highly all the time when done with a byte-based count.
//
// To allow running prediction across multiple chunks, the caller passes in the
// current 12-bit hash value (*hash, updated on return) and an int[4096]
// prediction table (tbl, updated in place). Caller inits both to 0.
//
// Returns the number of *bytes* correctly predicted: each correctly-predicted
// character adds its own length, 1..4, to the result. Callers compare this
// against thresholds expressed in bytes, so counting characters instead would
// under-report repetition in multi-byte text.
//
// NOTE: Reads up to three bytes past isrc + srclen when the last lead byte
// announces more bytes than remain. Not a problem with valid UTF-8 text.
//
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const srclimit = src + srclen;
  int local_hash = *hash;
  int p_count = 0;

  while (src < srclimit) {
    int c = src[0];
    int incr = 1;

    // Pick up one char and its length in bytes
    if (c < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Taken as a one-byte character
    } else if ((c & 0xe0) == 0xc0) {
      // Two-byte
      c = (c << 8) | src[1];
      incr = 2;
    } else if ((c & 0xf0) == 0xe0) {
      // Three-byte
      c = (c << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    const int p = tbl[local_hash];      // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count *bytes* of good predictions
    }

    // Fold the character into the hash; the mask keeps it a valid tbl index
    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;
  return p_count;
}
844 
845 
846 
// Count the ASCII spaces in src, handling four bytes per step.
// Only the first (src_len rounded down to a multiple of 4) bytes are
// examined; the 0..3 leftover bytes at the end are not counted.
int CountSpaces4(const char* src, int src_len) {
  const int rounded_len = src_len & ~3;
  int spaces = 0;
  int pos = 0;
  while (pos < rounded_len) {
    for (int k = 0; k < 4; ++k) {
      if (src[pos + k] == ' ') {
        ++spaces;
      }
    }
    pos += 4;
  }
  return spaces;
}
859 
860 // Remove words of text that have more than half their letters predicted
861 // correctly by our cheap predictor, moving the remaining words in-place
862 // to the front of the input buffer.
863 //
864 // To allow running prediction across multiple chunks, caller passes in current
865 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
866 //
867 // Return the new, possibly-shorter length
868 //
869 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
870 // if input does
871 //
CheapRepWordsInplace(char * isrc,int srclen,int * hash,int * tbl)872 int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
873   const uint8* src = reinterpret_cast<const uint8*>(isrc);
874   const uint8* srclimit = src + srclen;
875   char* dst = isrc;
876   int local_hash = *hash;
877   char* word_dst = dst;           // Start of next word
878   int good_predict_bytes = 0;
879   int word_length_bytes = 0;
880 
881   while (src < srclimit) {
882     int c = src[0];
883     int incr = 1;
884     *dst++ = c;
885 
886     if (c == ' ') {
887       if ((good_predict_bytes * 2) > word_length_bytes) {
888         // Word is well-predicted: backup to start of this word
889         dst = word_dst;
890         if (FLAGS_cld_showme) {
891           // Mark the deletion point with period
892           // Don't repeat multiple periods
893           // Cannot mark with more bytes or may overwrite unseen input
894           if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
895             *dst++ = '.';
896             *dst++ = ' ';
897           }
898         }
899       }
900       word_dst = dst;              // Start of next word
901       good_predict_bytes = 0;
902       word_length_bytes = 0;
903     }
904 
905     // Pick up one char and length
906     if (c < 0xc0) {
907       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
908       // Do nothing more
909     } else if ((c & 0xe0) == 0xc0) {
910       // Two-byte
911       *dst++ = src[1];
912       c = (c << 8) | src[1];
913       incr = 2;
914     } else if ((c & 0xf0) == 0xe0) {
915       // Three-byte
916       *dst++ = src[1];
917       *dst++ = src[2];
918       c = (c << 16) | (src[1] << 8) | src[2];
919       incr = 3;
920     } else {
921       // Four-byte
922       *dst++ = src[1];
923       *dst++ = src[2];
924       *dst++ = src[3];
925       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
926       incr = 4;
927     }
928     src += incr;
929     word_length_bytes += incr;
930 
931     int p = tbl[local_hash];            // Prediction
932     tbl[local_hash] = c;                // Update prediction
933     if (c == p) {
934       good_predict_bytes += incr;       // Count good predictions
935     }
936 
937     local_hash = ((local_hash << 4) ^ c) & 0xfff;
938   }
939 
940   *hash = local_hash;
941 
942   if ((dst - isrc) < (srclen - 3)) {
943     // Pad and make last char clean UTF-8 by putting following spaces
944     dst[0] = ' ';
945     dst[1] = ' ';
946     dst[2] = ' ';
947     dst[3] = '\0';
948   } else   if ((dst - isrc) < srclen) {
949     // Make last char clean UTF-8 by putting following space off the end
950     dst[0] = ' ';
951   }
952 
953   return static_cast<int>(dst - isrc);
954 }
955 
956 
957 // Remove portions of text that have a high density of spaces, or that are
958 // overly repetitive, squeezing the remaining text in-place to the front of the
959 // input buffer.
960 //
961 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
962 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
963 //
964 // Return the new, possibly-shorter length
965 //
966 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
967 // if input does
968 //
CheapSqueezeInplace(char * isrc,int srclen,int ichunksize)969 int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
970                                             int srclen,
971                                             int ichunksize) {
972   char* src = isrc;
973   char* dst = src;
974   char* srclimit = src + srclen;
975   bool skipping = false;
976 
977   int hash = 0;
978   // Allocate local prediction table.
979   int* predict_tbl = new int[kPredictionTableSize];
980   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
981 
982   int chunksize = ichunksize;
983   if (chunksize == 0) {chunksize = kChunksizeDefault;}
984   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
985   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
986 
987   while (src < srclimit) {
988     int remaining_bytes = srclimit - src;
989     int len = cld::minint(chunksize, remaining_bytes);
990     // Make len land us on a UTF-8 character boundary, and also fix
991     // mispredictions because we could get out of phase.
992     // Loop always terminates at trailing space in buffer.
993     while ((src[len] & 0xc0) == 0x80)
994       ++len; // Move past continuation bytes
995 
996     int space_n = CountSpaces4(src, len);
997     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
998     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
999       // Skip the text
1000       if (!skipping) {
1001         // Keeping-to-skipping transition; do it at a space
1002         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
1003         dst -= n;
1004         skipping = true;
1005         if (FLAGS_cld_showme) {
1006           // Mark the deletion point with black square U+25A0
1007           *dst++ = 0xe2;
1008           *dst++ = 0x96;
1009           *dst++ = 0xa0;
1010           *dst++ = ' ';
1011         }
1012         if (dst == isrc) {
1013           // Force a leading space if the first chunk is deleted
1014           *dst++ = ' ';
1015         }
1016       }
1017     } else {
1018       // Keep the text
1019       if (skipping) {
1020         // Skipping-to-keeping transition; do it at a space
1021         int n = ForwardscanToSpace(src, len);
1022         src += n;
1023         remaining_bytes -= n;   // Shrink remaining length
1024         len -= n;
1025         skipping = false;
1026       }
1027       // "len" can be negative in some cases
1028       if (len > 0) {
1029         memmove(dst, src, len);
1030         dst += len;
1031       }
1032     }
1033     src += len;
1034   }
1035 
1036   if ((dst - isrc) < (srclen - 3)) {
1037     // Pad and make last char clean UTF-8 by putting following spaces
1038     dst[0] = ' ';
1039     dst[1] = ' ';
1040     dst[2] = ' ';
1041     dst[3] = '\0';
1042   } else   if ((dst - isrc) < srclen) {
1043     // Make last char clean UTF-8 by putting following space off the end
1044     dst[0] = ' ';
1045   }
1046 
1047   // Deallocate local prediction table
1048   delete[] predict_tbl;
1049   return static_cast<int>(dst - isrc);
1050 }
1051 
1052 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
1053 //  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
1054 //  Just CountSpaces is about 340 MB/sec
1055 //  Byte-only CountPredictedBytes is about 150 MB/sec
1056 //  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
1057 //  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
1058 //  Unjammed byte-only both = 170 MB/sec
1059 //  Jammed byte-only both = 120 MB/sec
1060 //  Back to original w/slight updates, 110 MB/sec
1061 //
CheapSqueezeTriggerTest(const char * src,int srclen,int testsize)1062 bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
1063   // Don't trigger at all on short text
1064   if (srclen < testsize) {return false;}
1065   int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
1066   int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
1067   int hash = 0;
1068   // Allocate local prediction table.
1069   int* predict_tbl = new int[kPredictionTableSize];
1070   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1071 
1072   bool retval = false;
1073   if ((CountSpaces4(src, testsize) >= space_thresh) ||
1074       (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
1075        predict_thresh)) {
1076     retval = true;
1077   }
1078   // Deallocate local prediction table
1079   delete[] predict_tbl;
1080   return retval;
1081 }
1082 
1083 
1084 
1085 // Close pairs (correlation) language_enum/language_enum
1086 //  id/ms (0.47)    38/40    [1]
1087 //  bo/dz (0.46)    105/135  [2]
1088 //  cz/sk (0.43)    17/68    [3]
1089 //  no/nn (0.42)    10/80    [4]
1090 //  hi/mr (0.38)    35/64    [5]
1091 //  xh/zu (0.37)    83/84    [6]
1092 // Subscripted by packed language, gives 0 or a subscript in closepair
1093 // scoring array inside doc_tote
1094 static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
1095   0,
1096   0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1097   0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1098   5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
1099   0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1100   0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1101   // Add new language close-pair number just before here (just use 0)
1102 };
1103 
1104 
1105 // Delete any extended languages from doc_tote
RemoveExtendedLanguages(ToteWithReliability * doc_tote)1106 void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
1107   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1108     if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
1109       // Effectively remove the extended language by setting key&score to zero
1110       if (FLAGS_dbgscore) {
1111         fprintf(stderr, "{-%s} ",
1112                 ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
1113       }
1114 
1115       // Delete entry
1116       doc_tote->SetKey(sub, 0);
1117       doc_tote->SetValue(sub, 0);
1118       doc_tote->SetReliability(sub, 0);
1119     }
1120   }
1121 }
1122 
1123 static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this
1124 
1125 // For Tier3 languages, require a minimum number of bytes to be first-place lang
1126 static const int kGoodFirstT3MinBytes = 24;         // <this => no first
1127 
1128 // Move bytes for unreliable langs to another lang or UNKNOWN
1129 // doc_tote is sorted, so cannot Add
1130 //
1131 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
1132 // merge both into CHINESE.
1133 //
1134 //dsites 2009.03.19
1135 // we also want to remove Tier3 languages as the first lang if there is very
1136 // little text like ej1 ej2 ej3 ej4
1137 // maybe fold this back in earlier
1138 //
RemoveUnreliableLanguages(ToteWithReliability * doc_tote)1139 void RemoveUnreliableLanguages(ToteWithReliability* doc_tote) {
1140   // Prepass to merge some low-reliablility languages
1141   int total_bytes = 0;
1142   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1143     int plang = doc_tote->Key(sub);
1144     if (plang == 0) {continue;}                     // Empty slot
1145 
1146     Language lang = cld::UnpackLanguage(plang);
1147     int bytes = doc_tote->Value(sub);
1148     int reli = doc_tote->Reliability(sub);
1149     if (bytes == 0) {continue;}                     // Zero bytes
1150     total_bytes += bytes;
1151 
1152     // Reliable percent is stored reliable score over stored bytecount
1153     int reliable_percent = reli / bytes;
1154     if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
1155 
1156     // This language is too unreliable to keep, but we might merge it.
1157     Language altlang = UNKNOWN_LANGUAGE;
1158     if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
1159     if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
1160 
1161     // Look for alternative in doc_tote
1162     int altsub = doc_tote->Find(cld::PackLanguage(altlang));
1163     if (altsub < 0) {continue;}                     // No alternative text
1164 
1165     int bytes2 = doc_tote->Value(altsub);
1166     int reli2 = doc_tote->Reliability(altsub);
1167     if (bytes2 == 0) {continue;}                    // Zero bytes
1168 
1169     // Reliable percent is stored reliable score over stored bytecount
1170     int reliable_percent2 = reli2 / bytes2;
1171 
1172     // Merge one language into the other. Break ties toward lower lang #
1173     int tosub = altsub;
1174     int fromsub = sub;
1175     bool into_lang = false;
1176     if ((reliable_percent2 < reliable_percent) ||
1177         ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
1178       tosub = sub;
1179       fromsub = altsub;
1180       into_lang = true;
1181     }
1182 
1183     // Make sure reliability doesn't drop and is enough to avoid delete
1184     int newpercent = cld::maxint(reliable_percent, reliable_percent2);
1185     newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
1186     int newbytes = bytes + bytes2;
1187     int newreli = newpercent * newbytes;
1188 
1189     doc_tote->SetKey(fromsub, 0);
1190     doc_tote->SetValue(fromsub, 0);
1191     doc_tote->SetReliability(fromsub, 0);
1192     doc_tote->SetValue(tosub, newbytes);
1193     doc_tote->SetReliability(tosub, newreli);
1194 
1195     // Show fate of unreliable languages if at least 10 bytes
1196     if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
1197       if (into_lang) {
1198         fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1199                 ExtLanguageCode(altlang), reliable_percent2, bytes2,
1200                 ExtLanguageCode(lang));
1201       } else {
1202         fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1203                 ExtLanguageCode(lang), reliable_percent, bytes,
1204                 ExtLanguageCode(altlang));
1205       }
1206     }
1207   }
1208 
1209 
1210   // Pass to delete any remaining unreliable languages
1211   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1212     int plang = doc_tote->Key(sub);
1213     if (plang == 0) {continue;}                     // Empty slot
1214 
1215     Language lang = cld::UnpackLanguage(plang);
1216     int bytes = doc_tote->Value(sub);
1217     int reli = doc_tote->Reliability(sub);
1218     if (bytes == 0) {continue;}                     // Zero bytes
1219 
1220     bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
1221     if (is_tier3 &&
1222         (bytes < kGoodFirstT3MinBytes) &&
1223         (bytes < total_bytes)) {
1224       reli = 0;                                     // Too-short tier3
1225     }
1226 
1227     // Reliable percent is stored as reliable score over stored bytecount
1228     int reliable_percent = reli / bytes;
1229     if (reliable_percent >= kMinReliableKeepPercent) {continue;}  // Keeper
1230 
1231     // Delete unreliable entry
1232     doc_tote->SetKey(sub, 0);
1233     doc_tote->SetValue(sub, 0);
1234     doc_tote->SetReliability(sub, 0);
1235 
1236     // Show fate of unreliable languages if at least 10 bytes
1237     if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
1238       fprintf(stderr, "{Unreli %s.%d(%dB)} ",
1239               ExtLanguageCode(lang), reliable_percent, bytes);
1240     }
1241   }
1242 
1243   if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
1244 }
1245 
1246 
1247 // Move less likely byte count to more likely for close pairs of languages
RefineScoredClosePairs(ToteWithReliability * doc_tote)1248 void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
1249   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1250     int close_packedlang = doc_tote->Key(sub);
1251     int subscr = kClosePair[close_packedlang];
1252     if (subscr == 0) {continue;}
1253 
1254     // We have a close pair language -- if the other one is also scored and the
1255     // longword score differs enough, put all our eggs into one basket
1256 
1257     // Nonzero longword score: Go look for the other of this pair
1258     for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1259       if (kClosePair[doc_tote->Key(sub2)] == subscr) {
1260         // We have a matching pair
1261         int close_packedlang2 = doc_tote->Key(sub2);
1262 
1263         // Move all the text bytes from lower byte-count to higher one
1264         int from_sub, to_sub;
1265         Language from_lang, to_lang;
1266         if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1267           from_sub = sub;
1268           to_sub = sub2;
1269           from_lang = cld::UnpackLanguage(close_packedlang);
1270           to_lang = cld::UnpackLanguage(close_packedlang2);
1271         } else {
1272           from_sub = sub2;
1273           to_sub = sub;
1274           from_lang = cld::UnpackLanguage(close_packedlang2);
1275           to_lang = cld::UnpackLanguage(close_packedlang);
1276         }
1277 
1278         // Move all the bytes smaller => larger of the pair
1279         if (FLAGS_cld_html || FLAGS_dbgscore) {
1280           // Show fate of closepair language
1281           int val = doc_tote->Value(from_sub);
1282           int reli = doc_tote->Reliability(from_sub);
1283           int reliable_percent = reli / (val ? val : 1);  // avoid zdiv
1284           fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
1285                   ExtLanguageCode(from_lang),
1286                   reliable_percent,
1287                   doc_tote->Value(from_sub),
1288                   ExtLanguageCode(to_lang));
1289         }
1290         int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
1291         doc_tote->SetValue(to_sub, sum);
1292         doc_tote->SetReliability(to_sub, 100 * sum);
1293 
1294         // Delete old entry
1295         doc_tote->SetKey(from_sub, 0);
1296         doc_tote->SetValue(from_sub, 0);
1297         doc_tote->SetReliability(from_sub, 0);
1298 
1299         break;    // Exit inner for sub2 loop
1300       }
1301     }     // End for sub2
1302   }   // End for sub
1303 }
1304 
1305 
ApplyLanguageHints(Tote * chunk_tote,int tote_grams,uint8 * lang_hint_boost)1306 void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
1307                         uint8* lang_hint_boost) {
1308   // Need 8 quad/unigrams to give full hint boost, else derate linearly
1309   if (tote_grams > 8) {
1310     tote_grams = 8;
1311   }
1312   for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
1313     // Hint boosts are per packed subscript
1314     int lang_sub = chunk_tote->Key(sub);
1315     int new_value = chunk_tote->Value(sub) +
1316       ((lang_hint_boost[lang_sub] * tote_grams) >> 3);
1317     chunk_tote->SetValue(sub, new_value);
1318     if (FLAGS_dbgscore && (lang_hint_boost[lang_sub] > 0)) {
1319       fprintf(stderr, "[%s+=%d*%d/8] ",
1320               ExtLanguageCode(cld::UnpackLanguage(lang_sub)),
1321               lang_hint_boost[lang_sub], tote_grams);
1322     }
1323   }
1324 }
1325 
1326 
// Write the first len bytes of txt to f, replacing the five HTML-special
// characters < > & ' " with their entity names, then append "<br>\n".
// A len of zero or less writes only the trailing "<br>\n".
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (int pos = 0; pos < len; ++pos) {
    const char ch = txt[pos];
    switch (ch) {
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '&':
        fputs("&amp;", f);
        break;
      case '\'':
        fputs("&apos;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        // Everything else passes through unchanged
        fputc(ch, f);
        break;
    }
  }
  fputs("<br>\n", f);
}
1346 
1347 
1348 // Add one chunk's score to running document score
1349 // If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
1350 // positively identify text to be ignored, such as link farms.
1351 // Sort before scoring and reinit afterward
1352 //
1353 // src and srclen are just for debug output
ScoreChunkIntoDoc(const char * src,int srclen,int advance_by,UnicodeLScript lscript,Tote * chunk_tote,ToteWithReliability * doc_tote,int tote_grams,uint8 * lang_hint_boost)1354 void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
1355                        UnicodeLScript lscript,
1356                        Tote* chunk_tote,
1357                        ToteWithReliability* doc_tote,
1358                        int tote_grams,
1359                        uint8* lang_hint_boost) {
1360   // Apply hints before sorting
1361   if (lang_hint_boost) {
1362     ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
1363   }
1364 
1365   // Sort to get top two languages
1366   chunk_tote->Sort(2);
1367   Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
1368 
1369   // Return if empty
1370   if (cur_lang < 0) {
1371     chunk_tote->Reinit();
1372     return;
1373   }
1374 
1375   bool cur_unreliable = false;
1376 
1377   // Reliability is a function of mean script score per KB of text
1378   int len = chunk_tote->GetByteCount();
1379   int reliability = cld::GetReliability((len * 2) / advance_by,
1380                                         lscript,
1381                                         chunk_tote);
1382   cur_unreliable = (reliability < cld::kMinReliable);
1383 
1384   // If tote_grams=0, always reliable
1385   // If tote_grams=1, always unreliable
1386   if (tote_grams == 0) {
1387     reliability = 100;
1388     cur_unreliable = false;
1389   } else if (tote_grams == 1) {
1390     reliability = 0;
1391     cur_unreliable = true;
1392   }
1393 
1394 #if 0
1395   // TEMP
1396   if (FLAGS_cld_html) {
1397     if (reliability >= kMinReliableKeepPercent) {
1398       fprintf(stderr, "R%d%% ", reliability);
1399     } else {
1400       fprintf(stderr, "--R%d%% ", reliability);
1401     }
1402   }
1403 #endif
1404 
1405   // Track the sequence of language fragments [result currently unused]
1406   ////if (reliability >= kMinReliableSeq) {
1407   ////  doc_tote->AddSeq(chunk_tote->Key(0));
1408   ////}
1409 
1410   if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
1411     // Unreliable and two top contenders, split byte count 5/8 - 3/8
1412     int top_len = ((len * 5) + 4) >> 3;
1413     int second_len = len - top_len;
1414 
1415     doc_tote->Add(chunk_tote->Key(0),
1416                   top_len, chunk_tote->Value(0), reliability);
1417     doc_tote->Add(chunk_tote->Key(1),
1418                   second_len, chunk_tote->Value(1), reliability);
1419     if (FLAGS_dbgscore) {
1420       fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
1421               ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1422               chunk_tote->Value(0),
1423               reliability,
1424               top_len,
1425               ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
1426               chunk_tote->Value(1),
1427               reliability,
1428               second_len);
1429     }
1430   } else {
1431     // Reliable or single contender
1432     doc_tote->Add(chunk_tote->Key(0),
1433                   len, chunk_tote->Value(0), reliability);
1434     if (FLAGS_dbgscore) {
1435       fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
1436               ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1437               chunk_tote->Value(0),
1438               reliability,
1439               len);
1440     }
1441   }
1442 
1443   if (FLAGS_cld_html) {
1444     if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
1445     cld::PrintLang(stderr, chunk_tote,
1446               cur_lang, cur_unreliable,
1447               prior_lang, prior_unreliable);
1448     prior_lang = cur_lang;
1449     prior_unreliable = cur_unreliable;
1450 
1451     string temp(src, srclen);
1452     if (temp[0] == '=') {
1453       // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
1454       temp = "=Buffered_";
1455       temp.append(UnicodeLScriptCode(lscript));
1456       temp.append("=");
1457     }
1458     cld::PrintText(stderr, cur_lang, temp);
1459   }
1460 
1461   chunk_tote->Reinit();
1462 }
1463 
1464 
PrintTopLang(Language top_lang)1465 void PrintTopLang(Language top_lang) {
1466   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1467     fprintf(stderr, "[] ");
1468   } else {
1469     fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1470     prior_lang = top_lang;
1471   }
1472 }
1473 
PrintTopLangSpeculative(Language top_lang)1474 void PrintTopLangSpeculative(Language top_lang) {
1475   fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1476   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1477     fprintf(stderr, "[] ");
1478   } else {
1479     fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1480     prior_lang = top_lang;
1481   }
1482   fprintf(stderr, "</span>\n");
1483 }
1484 
1485 
1486 // Add one chunk's score to running document score
1487 // Convenience function with constant src text
ScoreChunkIntoDoc2(const char * src,int advance_by,UnicodeLScript lscript,Tote * chunk_tote,ToteWithReliability * doc_tote,int tote_grams,uint8 * lang_hint_boost)1488 void ScoreChunkIntoDoc2(const char* src, int advance_by,
1489                        UnicodeLScript lscript,
1490                        Tote* chunk_tote,
1491                        ToteWithReliability* doc_tote,
1492                        int tote_grams,
1493                        uint8* lang_hint_boost) {
1494   int srclen = static_cast<int>(strlen(src));
1495   ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
1496                     doc_tote, tote_grams, lang_hint_boost);
1497 }
1498 
1499 
1500 // Score one scriptspan using the only language for that script
ScoreNilgrams(getone::LangSpan * scriptspan,int lang,ToteWithReliability * doc_tote,uint8 * lang_hint_boost,int flags,Language plus_one)1501 void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
1502                   ToteWithReliability* doc_tote,
1503                   uint8* lang_hint_boost,
1504                   int flags, Language plus_one) {
1505   // For debugging only. Not thread-safe
1506   prior_lang = UNKNOWN_LANGUAGE;
1507   prior_unreliable = false;
1508 
1509   const char* src = scriptspan->text;
1510   int len = scriptspan->text_bytes;
1511 
1512   Tote chunk_tote;
1513   // Score 1000 for 1000 bytes
1514   chunk_tote.AddGram();
1515   chunk_tote.Add(lang, scriptspan->text_bytes);
1516   chunk_tote.AddBytes(scriptspan->text_bytes);
1517   int advance_by = 2;
1518   int tote_grams = 0;   // Indicates fully reliable
1519   ScoreChunkIntoDoc(src, len, advance_by,
1520                     scriptspan->script, &chunk_tote,
1521                     doc_tote, tote_grams, lang_hint_boost);
1522 }
1523 
1524 // Score one scriptspan using unigrams
1525 // Updates tote_grams
ScoreUnigrams(const UTF8PropObj * unigram_obj,getone::LangSpan * scriptspan,int * tote_grams,int gram_limit,Tote * chunk_tote,ToteWithReliability * doc_tote,uint8 * lang_hint_boost,int advance_by,int flags,int * initial_word_span,Language plus_one)1526 static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
1527                       getone::LangSpan* scriptspan,
1528                       int* tote_grams, int gram_limit,
1529                       Tote* chunk_tote,
1530                       ToteWithReliability* doc_tote,
1531                       uint8* lang_hint_boost,
1532                       int advance_by, int flags,
1533                    int* initial_word_span, Language plus_one) {
1534   // chunk_tote may have partial sum coming in
1535   const char* src = scriptspan->text;
1536   const char* srclimit = src + scriptspan->text_bytes;
1537 
1538   // For debugging only. Not thread-safe
1539   prior_lang = UNKNOWN_LANGUAGE;
1540   prior_unreliable = false;
1541 
1542   // Break text up into multiple chunks and score each
1543   while (src < srclimit) {
1544     // Updates tote_grams
1545     int len = cld::DoUniScoreV3(unigram_obj,
1546                                  src, srclimit - src, advance_by,
1547                                  tote_grams, gram_limit, chunk_tote);
1548     if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1549       // Use bigram scoring in addition to quadgrams
1550       cld::DoBigramScoreV3(&kCjkBiTable_obj,
1551                            src, len, chunk_tote);
1552     }
1553     chunk_tote->AddBytes(len);
1554     *initial_word_span -= len;
1555 
1556     if (*tote_grams >= gram_limit) {
1557       // Add this chunk to doc totals
1558       // Remove all but top40 if asked
1559       if (FlagTop40(flags)) {
1560         cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1561       }
1562 
1563       // Sort, accumulate into doc total, reinit
1564       ScoreChunkIntoDoc(src, len, advance_by,
1565                         scriptspan->script, chunk_tote,
1566                         doc_tote, *tote_grams, lang_hint_boost);
1567       *tote_grams = 0;
1568     } else {
1569       if (FLAGS_cld_html) {
1570         string temp(src, len);
1571         Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1572         PrintTopLangSpeculative(top_lang);
1573         cld::PrintText(stderr, top_lang, temp);
1574       }
1575     }
1576     src += len;
1577   }
1578   // chunk_tote may have partial sum going out
1579 }
1580 
1581 // Back up one UTF-8 character
BackOneUTF8(const uint8 * p)1582 const uint8* BackOneUTF8(const uint8* p) {
1583   const uint8* retval = p - 1;
1584   if ((*retval & 0xc0) == 0x80) {--retval;}
1585   if ((*retval & 0xc0) == 0x80) {--retval;}
1586   if ((*retval & 0xc0) == 0x80) {--retval;}
1587   return retval;
1588 }
1589 
1590 
1591 // Score one scriptspan using quadgrams
1592 // Incoming chunk_tote may have partial accumulation
ScoreQuadgrams(const cld::CLDTableSummary * quadgram_obj,getone::LangSpan * scriptspan,int * tote_grams,int gram_limit,Tote * chunk_tote,ToteWithReliability * doc_tote,uint8 * lang_hint_boost,int advance_by,int flags,int * initial_word_span,Language plus_one)1593 static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
1594                        getone::LangSpan* scriptspan,
1595                        int* tote_grams, int gram_limit,
1596                        Tote* chunk_tote,
1597                        ToteWithReliability* doc_tote,
1598                        uint8* lang_hint_boost,
1599                        int advance_by, int flags,
1600                        int* initial_word_span, Language plus_one) {
1601   // chunk_tote may have partial sum coming in
1602   const char* src = scriptspan->text;
1603   const char* srclimit = src + scriptspan->text_bytes;
1604   const char* lastscored_src = src;
1605 
1606   // For debugging only. Not thread-safe
1607   prior_lang = UNKNOWN_LANGUAGE;
1608   prior_unreliable = false;
1609 
1610   // Break text up into multiple chunks and score each
1611   while (src < srclimit) {
1612     // Updates tote_grams
1613     int len = cld::DoQuadScoreV3(quadgram_obj,
1614                                  src, srclimit - src, advance_by,
1615                                  tote_grams, gram_limit, chunk_tote);
1616     if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1617       // Use word scoring in addition to quadgrams
1618       cld::DoOctaScoreV3(&kLongWord8Table_obj,
1619                          src, len, chunk_tote);
1620     }
1621     chunk_tote->AddBytes(len);
1622     *initial_word_span -= len;
1623 
1624     if (*tote_grams >= gram_limit) {
1625       // Remove all but top40 if asked
1626       if (FlagTop40(flags)) {
1627         cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1628       }
1629 
1630       // Sort, accumulate into doc total, reinit
1631       ScoreChunkIntoDoc(src, len, advance_by,
1632                         scriptspan->script, chunk_tote,
1633                         doc_tote, *tote_grams, lang_hint_boost);
1634       lastscored_src = src + len;
1635       *tote_grams = 0;
1636     } else {
1637       if (FLAGS_cld_html) {
1638         string temp(src, len);
1639         Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1640         PrintTopLangSpeculative(top_lang);
1641         cld::PrintText(stderr, top_lang, temp);
1642       }
1643     }
1644     src += len;
1645   }
1646 }
1647 
1648 
1649 
PrintLangs(FILE * f,const Language * language3,const int * percent3,const int * text_bytes,const bool * is_reliable)1650 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1651                 const int* text_bytes, const bool* is_reliable) {
1652   fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1653   if (language3[0] != UNKNOWN_LANGUAGE) {
1654     fprintf(f, "%s%s(%d%%)  ",
1655             ExtLanguageName(language3[0]),
1656             *is_reliable ? "" : "*",
1657             percent3[0]);
1658   }
1659   if (language3[1] != UNKNOWN_LANGUAGE) {
1660     fprintf(f, "%s(%d%%)  ", ExtLanguageName(language3[1]), percent3[1]);
1661   }
1662   if (language3[2] != UNKNOWN_LANGUAGE) {
1663     fprintf(f, "%s(%d%%)  ", ExtLanguageName(language3[2]), percent3[2]);
1664   }
1665   fprintf(f, "%d bytes \n", *text_bytes);
1666 
1667   fprintf(f, "<br>\n");
1668 }
1669 
1670 
1671 // Start the tote with a count of one for the default language for script
InitScriptToteLang(Tote * script_tote,UnicodeLScript lscript)1672 void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
1673   Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
1674   script_tote->Add(cld::PackLanguage(defaultlang), 1);
1675   script_tote->AddBytes(1);
1676 #if 0
1677   if (FLAGS_cld_html) {
1678     cld::PrintLang(stderr, script_tote,
1679               defaultlang, false,
1680               UNKNOWN_LANGUAGE, false);
1681     prior_lang = cur_lang;
1682     string temp("+1");
1683     cld::PrintText(stderr, defaultlang, temp);
1684   }
1685 #endif
1686 }
1687 
// Marker strings of the form =Name=, four entries per table.
// NOTE(review): presumably indexed by script-tote number; the users are
// outside this part of the file.
static const char* const kToteName[4] = {
  "=Latn=",
  "=Hani=",
  "=Script2=",
  "=Script3=",
};
static const char* const kToteSwitch[4] = {
  "=Latn=",
  "=Hani=",
  "=Switch2=",
  "=Switch3=",
};
1692 
1693 
1694 
// Upper to lower, keep digits, everything else to minus '-' (2d)
// Indexed by byte value 0..255; one row per 16 byte values (ASCII).
static const char kCharsetToLowerTbl[256] = {
  // 0x00 - 0x2f: controls, space, punctuation
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  // 0x30 - 0x3f: digits are kept
  '0','1','2','3','4','5','6','7', '8','9','-','-','-','-','-','-',

  // 0x40 - 0x5f: uppercase letters map to lowercase
  '-','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z','-','-','-','-','-',
  // 0x60 - 0x7f: lowercase letters are kept
  '-','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z','-','-','-','-','-',

  // 0x80 - 0xbf
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',

  // 0xc0 - 0xff
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
};
1717 
1718 
// 1 for the ASCII letters A-Z and a-z, 0 for every other byte value.
// Indexed by byte value 0..255; one row per 16 byte values.
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x10
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x20
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x30
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x40  @ A-O
  1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,   // 0x50  P-Z
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x60  ` a-o
  1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,   // 0x70  p-z

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x90
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xa0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xb0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xc0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xd0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xe0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xf0
};
1730 
// 1 for the ASCII digits 0-9, 0 for every other byte value.
// Indexed by byte value 0..255; one row per 16 byte values.
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x10
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x20
  1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,   // 0x30  0-9
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x40
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x50
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x60
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x70

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x90
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xa0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xb0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xc0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xd0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xe0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xf0
};
1742 
1743 // Normalize ASCII string to first 4 alphabetic/digit chars
1744 // Letters are forced to lowercase ASCII
1745 // Used to normalize TLD values
MakeChar4(const char * str,char * norm)1746 void MakeChar4(const char* str, char* norm) {
1747   memcpy(norm, "____", 4);     // four underscores
1748   int l_ptr = 0;
1749   for (int i = 0; i < strlen(str); ++i) {
1750     uint8 uc = static_cast<uint8>(str[i]);
1751     if (kIsAlpha[uc] | kIsDigit[uc]) {
1752       if (l_ptr < 4) {                  // Else ignore
1753         norm[l_ptr] = kCharsetToLowerTbl[uc];
1754         l_ptr++;
1755       }
1756     }
1757   }
1758 }
1759 
1760 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
HintBinaryLookup4(const HintEntry * hintprobs,int hintprobssize,const char * norm_key)1761 static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1762                      const char* norm_key) {
1763   // Key is always in range [lo..hi)
1764   int lo = 0;
1765   int hi = hintprobssize;
1766   while (lo < hi) {
1767     int mid = (lo + hi) >> 1;
1768     int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
1769     if (comp < 0) {
1770       lo = mid + 1;
1771     } else if (comp > 0) {
1772       hi = mid;
1773     } else {
1774       return mid;
1775     }
1776   }
1777   return -1;
1778 }
1779 
1780 
1781 // Increment the initial probabilities based on a per-TLD probs entry
ApplyTLDHint(uint8 * lang_hint_boost,const char * tld_hint)1782 void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
1783   if (FLAGS_dbgscore) {
1784     fprintf(stderr, "TLD hint %s\n", tld_hint);
1785   }
1786   char normalized_tld[8];
1787   MakeChar4(tld_hint, normalized_tld);
1788   int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1789                            normalized_tld);
1790   // TLD is four bytes, probability entry is 4 bytes
1791   if (n >= 0) {
1792     uint32 probs = kTLDHintProbs[n].probs;
1793 
1794     uint8 prob123 = (probs >> 0) & 0xff;
1795     const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1796     uint8 top1 = (probs >> 8) & 0xff;
1797     if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1798     uint8 top2 = (probs >> 16) & 0xff;
1799     if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1800     uint8 top3 = (probs >> 24) & 0xff;
1801     if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1802   }
1803 }
1804 
1805 
1806 // Increment the initial probabilities based on a per-encoding probs entry
ApplyEncodingHint(uint8 * lang_hint_boost,int encoding_hint)1807 void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
1808   if (FLAGS_dbgscore) {
1809     Encoding tempenc = static_cast<Encoding>(encoding_hint);
1810     fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
1811   }
1812   if (encoding_hint < ISO_8859_1) {return;}
1813   if (encoding_hint >= NUM_ENCODINGS) {return;}
1814   uint32 probs = kEncodingHintProbs[encoding_hint];
1815 
1816   uint8 prob123 = (probs >> 0) & 0xff;
1817   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1818   uint8 top1 = (probs >> 8) & 0xff;
1819   if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1820   uint8 top2 = (probs >> 16) & 0xff;
1821   if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1822   uint8 top3 = (probs >> 24) & 0xff;
1823   if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1824 }
1825 
1826 
1827 // Increment the initial probability for given language by fixed amount
1828 // Does not recognize extended languages as hints
ApplyLanguageHint(uint8 * lang_hint_boost,Language language_hint)1829 void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
1830   if (FLAGS_dbgscore) {
1831     fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
1832   }
1833   if (language_hint < ENGLISH) {return;}
1834   if (language_hint >= NUM_LANGUAGES) {return;}
1835   uint32 probs = kLanguageHintProbs[language_hint];
1836 
1837   uint8 prob123 = (probs >> 0) & 0xff;
1838   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1839   uint8 top1 = (probs >> 8) & 0xff;
1840   if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1841   uint8 top2 = (probs >> 16) & 0xff;
1842   if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1843   uint8 top3 = (probs >> 24) & 0xff;
1844   if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1845 }
1846 
1847 // Extract return values before fixups
ExtractLangEtc(ToteWithReliability * doc_tote,int total_text_bytes,int * reliable_percent3,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)1848 void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
1849                     int* reliable_percent3, Language* language3, int* percent3,
1850                     double*  normalized_score3,
1851                     int* text_bytes, bool* is_reliable) {
1852   reliable_percent3[0] = 0;
1853   reliable_percent3[1] = 0;
1854   reliable_percent3[2] = 0;
1855   language3[0] = UNKNOWN_LANGUAGE;
1856   language3[1] = UNKNOWN_LANGUAGE;
1857   language3[2] = UNKNOWN_LANGUAGE;
1858   percent3[0] = 100;
1859   percent3[1] = 0;
1860   percent3[2] = 0;
1861   normalized_score3[0] = 0.0;
1862   normalized_score3[1] = 0.0;
1863   normalized_score3[2] = 0.0;
1864 
1865   *text_bytes = total_text_bytes;
1866   *is_reliable = false;
1867 
1868   int bytecount1 = total_text_bytes;
1869   int bytecount2 = 0;
1870   int bytecount3 = 0;
1871 
1872   int lang1 = doc_tote->Key(0);
1873   if (lang1 != 0) {
1874     // We have a top language
1875     language3[0] = cld::UnpackLanguage(lang1);
1876     bytecount1 = doc_tote->Value(0);
1877     int reli1 = doc_tote->Reliability(0);
1878     reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv
1879     normalized_score3[0] = cld::GetNormalizedScore(language3[0],
1880                                                   ULScript_Common,
1881                                                   bytecount1,
1882                                                   doc_tote->Score(0));
1883   }
1884 
1885   int lang2 = doc_tote->Key(1);
1886   if (lang2 != 0) {
1887     language3[1] = cld::UnpackLanguage(lang2);
1888     bytecount2 = doc_tote->Value(1);
1889     int reli2 = doc_tote->Reliability(1);
1890     reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv
1891     normalized_score3[1] = cld::GetNormalizedScore(language3[1],
1892                                                   ULScript_Common,
1893                                                   bytecount2,
1894                                                   doc_tote->Score(1));
1895   }
1896 
1897   int lang3 = doc_tote->Key(2);
1898   if (lang3 != 0) {
1899     language3[2] = cld::UnpackLanguage(lang3);
1900     bytecount3 = doc_tote->Value(2);
1901     int reli3 = doc_tote->Reliability(2);
1902     reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv
1903     normalized_score3[2] = cld::GetNormalizedScore(language3[2],
1904                                                   ULScript_Common,
1905                                                   bytecount3,
1906                                                   doc_tote->Score(2));
1907   }
1908 
1909   // Increase total bytes to sum (top 3) if low for some reason
1910   int total_bytecount12 = bytecount1 + bytecount2;
1911   int total_bytecount123 = total_bytecount12 + bytecount3;
1912   if (total_text_bytes < total_bytecount123) {
1913     total_text_bytes = total_bytecount123;
1914     *text_bytes = total_text_bytes;
1915   }
1916 
1917   // Sum minus previous % gives better roundoff behavior than bytecount/total
1918   int total_text_bytes_div = cld::maxint(1, total_text_bytes);    // Avoid zdiv
1919   percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1920   percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1921   percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1922   percent3[2] -= percent3[1];
1923   percent3[1] -= percent3[0];
1924 
1925   // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1926   // Fix this explicitly
1927   if (percent3[1] < percent3[2]) {
1928     ++percent3[1];
1929     --percent3[2];
1930   }
1931   if (percent3[0] < percent3[1]) {
1932     ++percent3[0];
1933     --percent3[1];
1934   }
1935 
1936   *text_bytes = total_text_bytes;
1937 
1938   if (lang1 != 0) {
1939     // We have a top language
1940     // Its reliability is overal result reliability
1941     int bytecount = doc_tote->Value(0);
1942     int reli = doc_tote->Reliability(0);
1943     int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv
1944     *is_reliable = reliable_percent >= cld::kMinReliable;
1945   } else {
1946     // No top language at all. This can happen with zero text or 100% Klingon
1947     // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
1948     *is_reliable = true;
1949   }
1950 }
1951 
IsFIGS(Language lang)1952 bool IsFIGS(Language lang) {
1953   if (lang == FRENCH) {return true;}
1954   if (lang == ITALIAN) {return true;}
1955   if (lang == GERMAN) {return true;}
1956   if (lang == SPANISH) {return true;}
1957   return false;
1958 }
1959 
IsEFIGS(Language lang)1960 bool IsEFIGS(Language lang) {
1961   if (lang == ENGLISH) {return true;}
1962   if (lang == FRENCH) {return true;}
1963   if (lang == ITALIAN) {return true;}
1964   if (lang == GERMAN) {return true;}
1965   if (lang == SPANISH) {return true;}
1966   return false;
1967 }
1968 
// Percentage thresholds used when picking the summary language.
// Second language must reach this percent to replace a first-place ENGLISH
static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
// Second language must reach this percent to replace a first-place FIGS
static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
static const int kGoodFirstMinPercent = 26;           // <this => UNK
static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
static const int kIgnoreMaxPercent = 95;              // >this => unreli
static const int kKeepMinPercent = 2;                 // <this => unreli

// Minimum bytes of second-language text needed before it may override
// the first-place language. Tier3 languages (those not in the packed
// top-40 table) require more bytes.
static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
static const int kGoodSecondT3MinBytes = 128;         // <this => no second
1981 
1982 // Calculate a single summary language for the document, and its reliability.
1983 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1984 // This is the heart of matching human-rater perception.
1985 // reliable_percent3[] is currently unused
1986 //
1987 // Do not return Tier3 second language unless there are at least 128 bytes
CalcSummaryLang(ToteWithReliability * doc_tote,int total_text_bytes,const int * reliable_percent3,const Language * language3,const int * percent3,Language * summary_lang,bool * is_reliable)1988 void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
1989                      const int* reliable_percent3,
1990                      const Language* language3,
1991                      const int* percent3,
1992                      Language* summary_lang, bool* is_reliable) {
1993   // Vector of active languages; changes if we delete some
1994   int slot_count = 3;
1995   int active_slot[3] = {0, 1, 2};
1996 
1997   int ignore_percent = 0;
1998   int return_percent = percent3[0];   // Default to top lang
1999   *summary_lang = language3[0];
2000   *is_reliable = true;
2001   if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
2002 
2003   // If any of top 3 is IGNORE, remove it and increment ignore_percent
2004   for (int i = 0; i < 3; ++i) {
2005     if (language3[i] == TG_UNKNOWN_LANGUAGE) {
2006       ignore_percent += percent3[i];
2007       // Move the rest up, levaing input vectors unchanged
2008       for (int j=i+1; j < 3; ++j) {
2009         active_slot[j - 1] = active_slot[j];
2010       }
2011       -- slot_count;
2012       // Logically remove Ignore from percentage-text calculation
2013       // (extra 1 in 101 avoids zdiv, biases slightly small)
2014       return_percent = (percent3[0] * 100) / (101 - ignore_percent);
2015       *summary_lang = language3[active_slot[0]];
2016       if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
2017     }
2018   }
2019 
2020 
2021   // If English and X, where X (not UNK) is big enough,
2022   // assume the English is boilerplate and return X.
2023   // Logically remove English from percentage-text calculation
2024   int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
2025   // Require more bytes of text for Tier3 languages
2026   int minbytesneeded = kGoodSecondT1T2MinBytes;
2027   int plang_second =  cld::PackLanguage(language3[active_slot[1]]);
2028   bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
2029   if (is_tier3) {
2030     minbytesneeded = kGoodSecondT3MinBytes;
2031   }
2032 
2033   if ((language3[active_slot[0]] == ENGLISH) &&
2034       (language3[active_slot[1]] != ENGLISH) &&
2035       (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2036       (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
2037       (second_bytes >= minbytesneeded)) {
2038     ignore_percent += percent3[active_slot[0]];
2039     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2040     *summary_lang = language3[active_slot[1]];
2041     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2042 
2043   // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
2044   // assume the FIGS is boilerplate and return X.
2045   // Logically remove FIGS from percentage-text calculation
2046   } else if (IsFIGS(language3[active_slot[0]]) &&
2047              !IsEFIGS(language3[active_slot[1]]) &&
2048              (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2049              (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
2050              (second_bytes >= minbytesneeded)) {
2051     ignore_percent += percent3[active_slot[0]];
2052     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2053     *summary_lang = language3[active_slot[1]];
2054     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2055 
2056   // Else we are returning the first language, but want to improve its
2057   // return_percent if the second language should be ignored
2058   } else  if ((language3[active_slot[1]] == ENGLISH) &&
2059               (language3[active_slot[0]] != ENGLISH)) {
2060     ignore_percent += percent3[active_slot[1]];
2061     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2062   } else  if (IsFIGS(language3[active_slot[1]]) &&
2063               !IsEFIGS(language3[active_slot[0]])) {
2064     ignore_percent += percent3[active_slot[1]];
2065     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2066   }
2067 
2068   // If return percent is too small (too many languages), return UNKNOWN
2069   if ((return_percent < kGoodFirstMinPercent)) {
2070     *summary_lang = UNKNOWN_LANGUAGE;
2071     *is_reliable = false;
2072   }
2073 
2074   // If return percent is small, return language but set unreliable.
2075   if ((return_percent < kGoodFirstReliableMinPercent)) {
2076     *is_reliable = false;
2077   }
2078 
2079   // If ignore percent is too large, set unreliable.
2080   if ((ignore_percent > kIgnoreMaxPercent)) {
2081     *is_reliable = false;
2082   }
2083 
2084   // If we removed all the active languages, return UNKNOWN
2085   if (slot_count == 0) {
2086     *summary_lang = UNKNOWN_LANGUAGE;
2087     *is_reliable = false;
2088   }
2089 }
2090 
2091 
2092 
2093 // Result vector must be exactly three items
DetectLanguageSummaryV25(const CompactLangDet::DetectionTables * tables,const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,bool allow_extended_lang,int flags,Language plus_one,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)2094 Language CompactLangDetImpl::DetectLanguageSummaryV25(
2095                         const CompactLangDet::DetectionTables* tables,
2096                         const char* buffer,
2097                         int buffer_length,
2098                         bool is_plain_text,
2099                         const char* tld_hint,       // "id" boosts Indonesian
2100                         int encoding_hint,          // SJS boosts Japanese
2101                         Language language_hint,     // ITALIAN boosts it
2102                         bool allow_extended_lang,
2103                         int flags,
2104                         Language plus_one,
2105                         Language* language3,
2106                         int* percent3,
2107                         double* normalized_score3,
2108                         int* text_bytes,
2109                         bool* is_reliable) {
2110   if (!tables) {
2111     static const CompactLangDet::DetectionTables default_cld_tables = {
2112       &kQuadTable_obj,
2113       &compact_lang_det_generated_ctjkvz_b1_obj
2114     };
2115     tables = &default_cld_tables;
2116   }
2117   language3[0] = UNKNOWN_LANGUAGE;
2118   language3[1] = UNKNOWN_LANGUAGE;
2119   language3[2] = UNKNOWN_LANGUAGE;
2120   percent3[0] = 100;
2121   percent3[1] = 0;
2122   percent3[2] = 0;
2123   normalized_score3[0] = 0.0;
2124   normalized_score3[1] = 0.0;
2125   normalized_score3[2] = 0.0;
2126   *text_bytes = 0;
2127   *is_reliable = false;
2128 
2129   // Document totals
2130   ToteWithReliability doc_tote;   // Reliability = 0..100
2131 
2132   // Vector of packed per-language boosts (just one filled in from hints)
2133   uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
2134   memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
2135 
2136   // Apply hints,if any
2137   if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
2138     ApplyTLDHint(lang_hint_boost, tld_hint);
2139   }
2140   if (encoding_hint != UNKNOWN_ENCODING) {
2141     ApplyEncodingHint(lang_hint_boost, encoding_hint);
2142   }
2143   if (language_hint != UNKNOWN_LANGUAGE) {
2144     ApplyLanguageHint(lang_hint_boost, language_hint);
2145   }
2146 
2147 
2148   // Four individual script totals, Latin, Han, other2, other3
2149   int next_other_tote = 2;
2150 
2151   // Four totes for up to four different scripts pending at once
2152   Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
2153   bool tote_seen[4] = {false, false, false, false};
2154   int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
2155   UnicodeLScript tote_script[4] =
2156     {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
2157 
2158   // Loop through text spans in a single script
2159   ScriptScanner ss(buffer, buffer_length, is_plain_text);
2160   getone::LangSpan scriptspan;
2161 
2162   scriptspan.text = NULL;
2163   scriptspan.text_bytes = 0;
2164   scriptspan.offset = 0;
2165   scriptspan.script = ULScript_Common;
2166   scriptspan.lang = UNKNOWN_LANGUAGE;
2167 
2168   int total_text_bytes = 0;
2169   int textlimit = FLAGS_cld_textlimit << 10;    // in KB
2170   if (textlimit == 0) {textlimit = 0x7fffffff;}
2171 
2172   int advance_by = 2;                   // Advance 2 bytes
2173   int advance_limit = textlimit >> 3;   // For first 1/8 of max document
2174 
2175   int initial_word_span = kDefaultWordSpan;
2176   if (FLAGS_cld_forcewords) {
2177     initial_word_span = kReallyBigWordSpan;
2178   }
2179 
2180   // Pick up chunk sizes
2181   // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
2182   // Sanity check -- force into a reasonable range
2183   int chunksizequads = FLAGS_cld_smoothwidth;
2184   chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
2185                                kMaxChunkSizeQuads);
2186   int chunksizeunis = (chunksizequads * 5) >> 1;
2187 
2188   // Varying short-span limit doesn't work well -- skips too much beyond 20KB
2189   // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
2190   int spantooshortlimit = kShortSpanThresh;
2191 
2192   // For debugging only. Not thread-safe
2193   prior_lang = UNKNOWN_LANGUAGE;
2194   prior_unreliable = false;
2195 
2196   // Allocate full-document prediction table for finding repeating words
2197   int hash = 0;
2198   int* predict_tbl = new int[kPredictionTableSize];
2199   if (FlagRepeats(flags)) {
2200     memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
2201   }
2202 
2203   // Loop through scriptspans accumulating number of text bytes in each language
2204   while (ss.GetOneScriptSpanLower(&scriptspan)) {
2205     UnicodeLScript lscript = scriptspan.script;
2206 
2207     // Echo text if asked to
2208     if (FLAGS_cld_echotext) {
2209       PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
2210     }
2211 
2212     // Squeeze out big chunks of text span if asked to
2213     if (FlagSqueeze(flags)) {
2214       // Remove repetitive or mostly-spaces chunks
2215       int newlen;
2216       int chunksize = 0;    // Use the default
2217       newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
2218                                    chunksize);
2219       scriptspan.text_bytes = newlen;
2220     } else {
2221       // Check now and then to see if we should be squeezing
2222       if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
2223           !FlagFinish(flags) &&
2224           ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
2225           CheapSqueezeTriggerTest(scriptspan.text,
2226                                     scriptspan.text_bytes,
2227                                     kCheapSqueezeTestLen)) {
2228         // Recursive call with big-chunk squeezing set
2229         if (FLAGS_cld_html || FLAGS_dbgscore) {
2230           fprintf(stderr,
2231                   "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
2232                   total_text_bytes);
2233         }
2234         // Deallocate full-document prediction table
2235         delete[] predict_tbl;
2236 
2237         return DetectLanguageSummaryV25(
2238                           tables,
2239                           buffer,
2240                           buffer_length,
2241                           is_plain_text,
2242                           tld_hint,               // "id" boosts Indonesian
2243                           encoding_hint,          // SJS boosts Japanese
2244                           language_hint,          // ITALIAN boosts it
2245                           allow_extended_lang,
2246                           flags | kCLDFlagSqueeze,
2247                           plus_one,
2248                           language3,
2249                           percent3,
2250                           normalized_score3,
2251                           text_bytes,
2252                           is_reliable);
2253       }
2254     }
2255 
2256     // Remove repetitive words if asked to
2257     if (FlagRepeats(flags)) {
2258       // Remove repetitive words
2259       int newlen;
2260       newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
2261                                     &hash, predict_tbl);
2262       scriptspan.text_bytes = newlen;
2263     }
2264 
2265     // The real scoring
2266     // Accumulate directly into the document total, or accmulate in one of four
2267     // chunk totals. The purpose of the multiple chunk totals is to piece
2268     // together short choppy pieces of text in alternating scripts. One total is
2269     // dedicated to Latin text, one to Han text, and the other two are dynamicly
2270     // assigned.
2271     Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
2272 
2273     if (onlylang != UNKNOWN_LANGUAGE) {
2274       // This entire script run is in a single language.
2275       ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
2276                    lang_hint_boost, flags, plus_one);
2277     } else if (cld::kScoreUniPerLScript[lscript] != 0) {
2278       // This entire script run's languages can be distinguished by uni-grams
2279       // Accumulate in hani_tote
2280       int tote_num = 1;
2281       if (!tote_seen[tote_num]) {
2282         tote_seen[tote_num] = true;
2283         // Default language gets 1 byte
2284         total_text_bytes += 1;
2285         InitScriptToteLang(&totes[tote_num], lscript);
2286       }
2287       ScoreUnigrams(tables->unigram_obj,
2288                        &scriptspan, &tote_grams[tote_num], chunksizeunis,
2289                        &totes[tote_num],
2290                        &doc_tote, lang_hint_boost,
2291                        advance_by, flags, &initial_word_span, plus_one);
2292     } else {
2293       // This entire script-run's languages can be distinguished by quad-grams
2294       // Accumulate in latn_tote or script0/1_tote
2295       int tote_num = -1;
2296       for (int t = 0; t < 4; ++t) {
2297         if (lscript == tote_script[t]) {
2298           tote_num = t;
2299           break;
2300         }
2301       }
2302       if (tote_num < 0) {
2303         // Need to allocate other0/1
2304         tote_num = next_other_tote;
2305         next_other_tote ^= 1;     // Round-robin
2306         if (tote_seen[tote_num]) {
2307           // Flush previous
2308           ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
2309                              tote_script[tote_num], &totes[tote_num],
2310                              &doc_tote, tote_grams[tote_num], lang_hint_boost);
2311           totes[tote_num].Reinit();
2312         }
2313         tote_script[tote_num] = lscript;
2314       }
2315 
2316       if (!tote_seen[tote_num]) {
2317         tote_seen[tote_num] = true;
2318         // Default language gets 1 byte
2319         total_text_bytes += 1;
2320         InitScriptToteLang(&totes[tote_num], lscript);
2321       }
2322 
2323       // The actual accumulation, possibly with word scoring also
2324       ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
2325                         chunksizequads,
2326                         &totes[tote_num],
2327                         &doc_tote, lang_hint_boost,
2328                         advance_by, flags, &initial_word_span, plus_one);
2329     }
2330 
2331     total_text_bytes += scriptspan.text_bytes;
2332 
2333     // For long documents, do less-dense samples the further along we go.
2334     // This is to keep speed sublinear in document size.
2335     if (total_text_bytes > advance_limit) {
2336       if (total_text_bytes > textlimit) {
2337         // Don't look at rest of doc
2338         if (FLAGS_cld_html || FLAGS_dbgscore) {
2339           fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
2340                   total_text_bytes, textlimit);
2341         }
2342         break;
2343       }
2344       advance_by <<= 1;         // Double advance bytes
2345       advance_limit <<= 1;      // Double limit until next change
2346       spantooshortlimit <<= 1;  // Double short-span size
2347       if (FLAGS_cld_html || FLAGS_dbgscore) {
2348         fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
2349                 total_text_bytes, advance_by);
2350       }
2351     }
2352   }     // End while (ss.GetOneScriptSpanLower())
2353 
2354   // Deallocate full-document prediction table
2355   delete[] predict_tbl;
2356 
2357   // Flush pending totals
2358   for (int tote_num = 0; tote_num < 4; ++tote_num) {
2359     if (tote_seen[tote_num]) {
2360       ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
2361                          tote_script[tote_num], &totes[tote_num], &doc_tote,
2362                          tote_grams[tote_num], lang_hint_boost);
2363     }
2364   }
2365 
2366   // If extended langauges are disallowed, remove them here
2367   if (!allow_extended_lang) {
2368     RemoveExtendedLanguages(&doc_tote);
2369   }
2370 
2371   // Force close pairs to one or the other
2372   RefineScoredClosePairs(&doc_tote);
2373 
2374 
2375   // Calculate return results
2376   // Find top three byte counts in tote heap
2377   int reliable_percent3[3];
2378 
2379 
2380   // Cannot use Add, etc. after sorting
2381   doc_tote.Sort(3);
2382 
2383   ExtractLangEtc(&doc_tote, total_text_bytes,
2384                  reliable_percent3, language3, percent3, normalized_score3,
2385                  text_bytes, is_reliable);
2386 
2387   bool have_good_answer = false;
2388   if (FlagFinish(flags)) {
2389     // Force a result
2390     have_good_answer = true;
2391   } else if (total_text_bytes <= kShortTextThresh) {
2392     // Don't recurse on short text -- we already did word scores
2393     have_good_answer = true;
2394   } else if (*is_reliable &&
2395              (percent3[0] >= kGoodLang1Percent)) {
2396     have_good_answer = true;
2397   } else if (*is_reliable &&
2398              ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
2399     have_good_answer = true;
2400   }
2401 
2402 
2403   if (have_good_answer) {
2404     // This is the real, non-recursive return
2405 
2406     // Move bytes for unreliable langs to another lang or UNKNOWN
2407     RemoveUnreliableLanguages(&doc_tote);
2408 
2409     // Redo the result extraction after the removal above
2410     doc_tote.Sort(3);
2411     ExtractLangEtc(&doc_tote, total_text_bytes,
2412                    reliable_percent3, language3, percent3, normalized_score3,
2413                    text_bytes, is_reliable);
2414 
2415 #if 0
2416     // OLD code, replaced by CalcSummaryLang
2417     //
2418     // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
2419     // Force it to English if first language
2420     if (language3[2] == TG_UNKNOWN_LANGUAGE) {
2421       reliable_percent3[2] = 0;
2422       language3[2] = UNKNOWN_LANGUAGE;
2423       percent3[2] = 0;
2424     } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
2425       // Move up lower language
2426       reliable_percent3[1] = reliable_percent3[2];
2427       language3[1] = language3[2];
2428       percent3[1] = percent3[2];
2429       reliable_percent3[2] = 0;
2430       language3[2] = UNKNOWN_LANGUAGE;
2431       percent3[2] = 0;
2432     } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
2433       language3[0] = ENGLISH;
2434     }
2435 
2436     if (language3[0] == UNKNOWN_LANGUAGE) {
2437       // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
2438       // Force it to English (should not happen)
2439       language3[0] = ENGLISH;
2440       percent3[0] = 100;
2441       *is_reliable = true;
2442     }
2443 #endif
2444 
2445 
2446 #if 0
2447     // Scaffolding to reveal subset sequence lang distribution across doc text
2448     // Track the sequence of language fragments [result currently unused]
2449     if (FLAGS_cld_html) {
2450       static const int kMaxSubsetSeq = 12;
2451       uint8 subseq[kMaxSubsetSeq];
2452       doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
2453 
2454       fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
2455       for (int i = 0; i < kMaxSubsetSeq; ++i) {
2456         fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
2457         if ((i % 4) == 3) {fprintf(stderr, "&nbsp; ");}
2458       }
2459       fprintf(stderr, "&nbsp;&nbsp; ");
2460 
2461       for (int i = 0; i < 3; ++i) {
2462         if (language3[i] != UNKNOWN_LANGUAGE) {
2463           fprintf(stderr, "%s.%d(%d%%) ",
2464                   ExtLanguageCode(language3[i]),
2465                   reliable_percent3[i],
2466                   percent3[i]);
2467         }
2468       }
2469 
2470       fprintf(stderr, "%d B ", total_text_bytes);
2471       fprintf(stderr, "<br>\n");
2472     }
2473     // End Scaffolding to reveal subset sequence lang distribution
2474 #endif
2475 
2476     Language summary_lang;
2477     CalcSummaryLang(&doc_tote, total_text_bytes,
2478                     reliable_percent3, language3, percent3,
2479                     &summary_lang, is_reliable);
2480 
2481     if (FLAGS_cld_html) {
2482       for (int i = 0; i < 3; ++i) {
2483         if (language3[i] != UNKNOWN_LANGUAGE) {
2484           fprintf(stderr, "%s.%d(%d%%) ",
2485                   ExtLanguageCode(language3[i]),
2486                   reliable_percent3[i],
2487                   percent3[i]);
2488         }
2489       }
2490 
2491       fprintf(stderr, "%d B ", total_text_bytes);
2492       fprintf(stderr, "= %s%c ",
2493               ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
2494       fprintf(stderr, "<br>\n");
2495     }
2496 
2497     return summary_lang;
2498   }
2499 
2500   // Not a good answer -- do recursive call to refine
2501   if (FLAGS_cld_html || FLAGS_dbgscore) {
2502     // This is what we hope to improve on in the recursive call, if any
2503     PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
2504   }
2505 
2506   // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
2507   // For this purpose, we treate "Ignore" as top40
2508   Language new_plus_one = UNKNOWN_LANGUAGE;
2509   if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
2510     new_plus_one = language3[0];
2511   } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
2512     new_plus_one = language3[1];
2513   }
2514 
2515   if (total_text_bytes < kShortTextThresh) {
2516       // Short text: Recursive call with top40 and short set
2517       if (FLAGS_cld_html || FLAGS_dbgscore) {
2518         fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
2519                 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2520                 total_text_bytes);
2521       }
2522       return DetectLanguageSummaryV25(
2523                         tables,
2524                         buffer,
2525                         buffer_length,
2526                         is_plain_text,
2527                         tld_hint,               // "id" boosts Indonesian
2528                         encoding_hint,          // SJS boosts Japanese
2529                         language_hint,          // ITALIAN boosts it
2530                         allow_extended_lang,
2531                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2532                           kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2533                         new_plus_one,
2534                         language3,
2535                         percent3,
2536                         normalized_score3,
2537                         text_bytes,
2538                         is_reliable);
2539   }
2540 
2541   // Longer text: Recursive call with top40 set
2542   if (FLAGS_cld_html || FLAGS_dbgscore) {
2543     fprintf(stderr,
2544             "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2545             total_text_bytes);
2546   }
2547   return DetectLanguageSummaryV25(
2548                         tables,
2549                         buffer,
2550                         buffer_length,
2551                         is_plain_text,
2552                         tld_hint,               // "id" boosts Indonesian
2553                         encoding_hint,          // SJS boosts Japanese
2554                         language_hint,          // ITALIAN boosts it
2555                         allow_extended_lang,
2556                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2557                           kCLDFlagFinish,
2558                         new_plus_one,
2559                         language3,
2560                         percent3,
2561                         normalized_score3,
2562                         text_bytes,
2563                         is_reliable);
2564 }   // End CompactLangDetImpl::DetectLanguageSummaryV25
2565