• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <string>
6 #include "encodings/compact_lang_det/cldutil.h"
7 
8 #include "base/basictypes.h"
9 #include "encodings/compact_lang_det/cldutil_dbg.h"
10 #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
11 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
12 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
13 #include "encodings/compact_lang_det/win/cld_logging.h"
14 #include "encodings/compact_lang_det/win/cld_unilib.h"
15 #include "encodings/compact_lang_det/win/cld_utf.h"
16 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
17 
18 // Runtime routines for hashing, looking up, and scoring
19 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
20 // Unigrams and bigrams are for CJK languages only, including simplified/
21 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
22 // Zhuang Han characters. Surrounding spaces are not considered.
23 // Quadgrams and octagrams for for non-CJK and include two bits indicating
24 // preceding and trailing spaces (word boundaries).
25 
26 
27 // Indicator bits for leading/trailing space around quad/octagram
28 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
29 // 1-, 2-, or 3-bytes each.
30 static const uint32 kPreSpaceIndicator =  0x00004444;
31 static const uint32 kPostSpaceIndicator = 0x44440000;
32 
33 // Little-endian masks for 0..24 bytes picked up as uint32's
34 static const uint32 kWordMask0[4] = {
35   0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
36 };
37 
38 static const int kMinCJKUTF8CharBytes = 3;
39 
40 static const int kMinGramCount = 3;
41 static const int kMaxGramCount = 16;
42 
43 
44 
45 
46 // Routines to access a hash table of <key:wordhash, value:probs> pairs
47 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
48 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
49 // bucket subscript.
50 // Probs is a packed: three languages plus a subscript for probability table
51 // Buckets have all the keys together, then all the values.Key array never
52 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
53 // Match case may sometimes take an additional cache miss on value access.
54 //
55 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
56 // byte buckets with single cache miss.
57 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
58 //------------------------------------------------------------------------------
59 
60 
61 //------------------------------------------------------------------------------
62 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
63 //------------------------------------------------------------------------------
64 
65 // Design principles for these hash functions
66 // - Few operations
67 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
68 //   Latin script expect 1- and 2-byte mixtures.
69 // - Last byte of each character has about 5 bits of information
70 // - Spread good bits around so they can interact in at least two ways
71 //   with other characters
72 // - Use add for additional mixing thorugh carries
73 
74 // CJK Three-byte bigram
75 //   ....dddd..cccccc..bbbbbb....aaaa
76 //   ..................ffffff..eeeeee
77 // make
78 //   ....dddd..cccccc..bbbbbb....aaaa
79 //   000....dddd..cccccc..bbbbbb....a
80 //   ..................ffffff..eeeeee
81 //   ffffff..eeeeee000000000000000000
82 //
83 // CJK Four-byte bigram
84 //   ..dddddd..cccccc....bbbb....aaaa
85 //   ..hhhhhh..gggggg....ffff....eeee
86 // make
87 //   ..dddddd..cccccc....bbbb....aaaa
88 //   000..dddddd..cccccc....bbbb....a
89 //   ..hhhhhh..gggggg....ffff....eeee
90 //   ..ffff....eeee000000000000000000
91 
92 // BIGRAM
93 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
94 // OVERSHOOTS up to 3 bytes
95 // For runtime use of tables
BiHashV25(const char * word_ptr,int bytecount)96 uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
97   if (bytecount == 0) {
98     return 0;
99   }
100   uint32 word0, word1;
101   if (bytecount <= 4) {
102     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
103     word0 = word0 ^ (word0 >> 3);
104     return word0;
105   }
106   // Else do 8 bytes
107   word0 = UnalignedLoad32(word_ptr);
108   word0 = word0 ^ (word0 >> 3);
109   word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
110   word1 = word1 ^ (word1 << 18);
111   return word0 + word1;
112 }
113 
114 //
115 // Ascii-7 One-byte chars
116 //   ...ddddd...ccccc...bbbbb...aaaaa
117 // make
118 //   ...ddddd...ccccc...bbbbb...aaaaa
119 //   000...ddddd...ccccc...bbbbb...aa
120 //
121 // Latin 1- and 2-byte chars
122 //   ...ddddd...ccccc...bbbbb...aaaaa
123 //   ...................fffff...eeeee
124 // make
125 //   ...ddddd...ccccc...bbbbb...aaaaa
126 //   000...ddddd...ccccc...bbbbb...aa
127 //   ...................fffff...eeeee
128 //   ...............fffff...eeeee0000
129 //
130 // Non-CJK Two-byte chars
131 //   ...ddddd...........bbbbb........
132 //   ...hhhhh...........fffff........
133 // make
134 //   ...ddddd...........bbbbb........
135 //   000...ddddd...........bbbbb.....
136 //   ...hhhhh...........fffff........
137 //   hhhh...........fffff........0000
138 //
139 // Non-CJK Three-byte chars
140 //   ...........ccccc................
141 //   ...................fffff........
142 //   ...lllll...................iiiii
143 // make
144 //   ...........ccccc................
145 //   000...........ccccc.............
146 //   ...................fffff........
147 //   ...............fffff........0000
148 //   ...lllll...................iiiii
149 //   .lllll...................iiiii00
150 //
151 
152 // QUADGRAM
153 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
154 // OVERSHOOTS up to 3 bytes
155 // For runtime use of tables
QuadHashV25Mix(const char * word_ptr,int bytecount,uint32 prepost)156 uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
157   uint32 word0, word1, word2;
158   if (bytecount <= 4) {
159     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
160     word0 = word0 ^ (word0 >> 3);
161     return word0 ^ prepost;
162   } else if (bytecount <= 8) {
163     word0 = UnalignedLoad32(word_ptr);
164     word0 = word0 ^ (word0 >> 3);
165     word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
166     word1 = word1 ^ (word1 << 4);
167     return (word0 ^ prepost) + word1;
168   }
169   // else do 12 bytes
170   word0 = UnalignedLoad32(word_ptr);
171   word0 = word0 ^ (word0 >> 3);
172   word1 = UnalignedLoad32(word_ptr + 4);
173   word1 = word1 ^ (word1 << 4);
174   word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
175   word2 = word2 ^ (word2 << 2);
176   return (word0 ^ prepost) + word1 + word2;
177 }
178 
179 
180 // QUADGRAM wrapper with surrounding spaces
181 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
182 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
183 // For runtime use of tables
QuadHashV25(const char * word_ptr,int bytecount)184 uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
185   if (bytecount == 0) {
186     return 0;
187   }
188   uint32 prepost = 0;
189   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
190   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
191   return QuadHashV25Mix(word_ptr, bytecount, prepost);
192 }
193 
194 // QUADGRAM wrapper with surrounding underscores (offline use)
195 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
196 // OVERSHOOTS up to 3 bytes
197 // For offline construction of tables
QuadHashV25Underscore(const char * word_ptr,int bytecount)198 uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
199   if (bytecount == 0) {
200     return 0;
201   }
202   const char* local_word_ptr = word_ptr;
203   int local_bytecount = bytecount;
204   uint32 prepost = 0;
205   if (local_word_ptr[0] == '_') {
206     prepost |= kPreSpaceIndicator;
207     ++local_word_ptr;
208     --local_bytecount;
209   }
210   if (local_word_ptr[local_bytecount - 1] == '_') {
211     prepost |= kPostSpaceIndicator;
212     --local_bytecount;
213   }
214   return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
215 }
216 
217 
218 // OCTAGRAM
219 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
220 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
221 //
222 // The low 32 bits follow the pattern from above, tuned to different scripts
223 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
224 // For runtime use of tables V3
OctaHash40Mix(const char * word_ptr,int bytecount,uint64 prepost)225 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
226   uint64 word0;
227   uint64 word1;
228   uint64 sum;
229 
230   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
231   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
232   switch ((bytecount - 1) >> 2) {
233   case 0:       // 1..4 bytes
234     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
235     sum = word0;
236     word0 = word0 ^ (word0 >> 3);
237     break;
238   case 1:       // 5..8 bytes
239     word0 = UnalignedLoad32(word_ptr);
240     sum = word0;
241     word0 = word0 ^ (word0 >> 3);
242     word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
243     sum += word1;
244     word1 = word1 ^ (word1 << 4);
245     word0 += word1;
246     break;
247   case 2:       // 9..12 bytes
248     word0 = UnalignedLoad32(word_ptr);
249     sum = word0;
250     word0 = word0 ^ (word0 >> 3);
251     word1 = UnalignedLoad32(word_ptr + 4);
252     sum += word1;
253     word1 = word1 ^ (word1 << 4);
254     word0 += word1;
255     word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
256     sum += word1;
257     word1 = word1 ^ (word1 << 2);
258     word0 += word1;
259     break;
260   case 3:       // 13..16 bytes
261     word0 = UnalignedLoad32(word_ptr);
262     sum = word0;
263     word0 = word0 ^ (word0 >> 3);
264     word1 = UnalignedLoad32(word_ptr + 4);
265     sum += word1;
266     word1 = word1 ^ (word1 << 4);
267     word0 += word1;
268     word1 = UnalignedLoad32(word_ptr + 8);
269     sum += word1;
270     word1 = word1 ^ (word1 << 2);
271     word0 += word1;
272     word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
273     sum += word1;
274     word1 = word1 ^ (word1 >> 8);
275     word0 += word1;
276     break;
277   case 4:       // 17..20 bytes
278     word0 = UnalignedLoad32(word_ptr);
279     sum = word0;
280     word0 = word0 ^ (word0 >> 3);
281     word1 = UnalignedLoad32(word_ptr + 4);
282     sum += word1;
283     word1 = word1 ^ (word1 << 4);
284     word0 += word1;
285     word1 = UnalignedLoad32(word_ptr + 8);
286     sum += word1;
287     word1 = word1 ^ (word1 << 2);
288     word0 += word1;
289     word1 = UnalignedLoad32(word_ptr + 12);
290     sum += word1;
291     word1 = word1 ^ (word1 >> 8);
292     word0 += word1;
293     word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
294     sum += word1;
295     word1 = word1 ^ (word1 >> 4);
296     word0 += word1;
297     break;
298   default:      // 21..24 bytes and higher (ignores beyond 24)
299     word0 = UnalignedLoad32(&word_ptr);
300     sum = word0;
301     word0 = word0 ^ (word0 >> 3);
302     word1 = UnalignedLoad32(word_ptr + 4);
303     sum += word1;
304     word1 = word1 ^ (word1 << 4);
305     word0 += word1;
306     word1 = UnalignedLoad32(word_ptr + 8);
307     sum += word1;
308     word1 = word1 ^ (word1 << 2);
309     word0 += word1;
310     word1 = UnalignedLoad32(word_ptr + 12);
311     sum += word1;
312     word1 = word1 ^ (word1 >> 8);
313     word0 += word1;
314     word1 = UnalignedLoad32(word_ptr + 16);
315     sum += word1;
316     word1 = word1 ^ (word1 >> 4);
317     word0 += word1;
318     word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
319     sum += word1;
320     word1 = word1 ^ (word1 >> 6);
321     word0 += word1;
322     break;
323   }
324 
325   sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
326   sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
327   sum = (sum & 0xff) << 32;
328   return (word0 ^ prepost) + sum;
329 }
330 
331 // OCTAGRAM wrapper with surrounding spaces
332 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
333 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
334 //
335 // The low 32 bits follow the pattern from above, tuned to different scripts
336 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
337 // For runtime use of tables V3
OctaHash40(const char * word_ptr,int bytecount)338 uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
339   if (bytecount == 0) {
340     return 0;
341   }
342   uint64 prepost = 0;
343   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
344   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
345   return OctaHash40Mix(word_ptr, bytecount, prepost);
346 }
347 
348 
349 // OCTAGRAM wrapper with surrounding underscores (offline use)
350 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
351 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
352 //
353 // The low 32 bits follow the pattern from above, tuned to different scripts
354 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
355 // For offline construction of tables
OctaHash40underscore(const char * word_ptr,int bytecount)356 uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
357   if (bytecount == 0) {
358     return 0;
359   }
360   const char* local_word_ptr = word_ptr;
361   int local_bytecount = bytecount;
362   uint64 prepost = 0;
363   if (local_word_ptr[0] == '_') {
364     prepost |= kPreSpaceIndicator;
365     ++local_word_ptr;
366     --local_bytecount;
367   }
368   if (local_word_ptr[local_bytecount - 1] == '_') {
369     prepost |= kPostSpaceIndicator;
370     --local_bytecount;
371   }
372   return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
373 }
374 
375 
376 
377 
378 //------------------------------------------------------------------------------
379 // Scoring single groups of letters
380 //------------------------------------------------------------------------------
381 
382 // UNIGRAM score one => tote
383 // Input: 1-byte entry of subscript into unigram probs, plus
384 //  an accumulator tote.
385 // Output: running sums in tote updated
ProcessProbV25UniTote(int propval,Tote * tote)386 void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
387   tote->AddGram();
388   const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
389   if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
390   if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
391   if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
392   if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
393   if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
394   if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
395 }
396 
397 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
398 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
399 //  an accumulator tote. (language 0 means unused entry)
400 // Output: running sums in tote updated
ProcessProbV25Tote(uint32 probs,Tote * tote)401 void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
402   tote->AddGram();
403   uint8 prob123 = (probs >> 0) & 0xff;
404   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
405 
406   uint8 top1 = (probs >> 8) & 0xff;
407   if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
408   uint8 top2 = (probs >> 16) & 0xff;
409   if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
410   uint8 top3 = (probs >> 24) & 0xff;
411   if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
412 }
413 
414 
415 //------------------------------------------------------------------------------
416 // Routines to accumulate probabilities
417 //------------------------------------------------------------------------------
418 
419 
420 // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
421 // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
422 // Score up to n unigrams, returning number of bytes consumed
423 // Updates tote_grams
DoUniScoreV3(const UTF8PropObj * unigram_obj,const char * isrc,int srclen,int advance_by,int * tote_grams,int gram_limit,Tote * chunk_tote)424 int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
425                       const char* isrc, int srclen, int advance_by,
426                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
427   const char* src = isrc;
428   if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
429 
430   // Property-based CJK unigram lookup
431   if (src[0] == ' ') {++src; --srclen;}
432 
433   const uint8* usrc = reinterpret_cast<const uint8*>(src);
434   int usrclen = srclen;
435 
436   while (usrclen > 0) {
437     int len = kAdvanceOneChar[usrc[0]];
438     // Look up property of one UTF-8 character and advance over it
439     // Return 0 if input length is zero
440     // Return 0 and advance one byte if input is ill-formed
441 
442     int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
443 
444     if (FLAGS_dbglookup) {
445       DbgUniTermToStderr(propval, usrc, len);
446     }
447 
448     if (propval > 0) {
449       ProcessProbV25UniTote(propval, chunk_tote);
450       ++(*tote_grams);
451       if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
452     }
453 
454     // Advance by 1/2/4/8 characters (half of quad advance)
455     if (advance_by == 2) {
456       // Already advanced by 1
457     } else if (advance_by == 4) {
458       // Advance by 2 chars total, if not at end
459       if (UTFmax <= usrclen) {
460         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
461       }
462     } else if (advance_by == 8) {
463       // Advance by 4 chars total, if not at end
464       if ((UTFmax * 3) <= usrclen) {
465         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
466         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
467         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
468       }
469     } else {
470       // Advance by 8 chars total, if not at end
471       if ((UTFmax * 7) <= usrclen) {
472         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
473         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
474         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
475         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
476         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
477         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
478         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
479       }
480     }
481     DCHECK(usrclen >= 0);
482 
483     if (*tote_grams >= gram_limit) {
484       break;
485     }
486   }
487   if (FLAGS_dbgscore) {
488     // With advance_by>2, we consume more input to get the same number of quads
489     int len = src - isrc;
490     DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
491     DbgScoreFlush();
492   }
493 
494   int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
495   return consumed2;
496 }
497 
498 
499 // BIGRAM, using hash table, always advancing by 1 char
500 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
501 // Score all bigrams in isrc, using languages that have bigrams (CJK)
502 // Return number of bigrams that hit in the hash table
DoBigramScoreV3(const cld::CLDTableSummary * bigram_obj,const char * isrc,int srclen,Tote * chunk_tote)503 int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
504                          const char* isrc, int srclen, Tote* chunk_tote) {
505   int hit_count = 0;
506   const char* src = isrc;
507 
508   // Hashtable-based CJK bigram lookup
509   const uint8* usrc = reinterpret_cast<const uint8*>(src);
510   const uint8* usrclimit1 = usrc + srclen - UTFmax;
511   if (FLAGS_dbgscore) {
512     fprintf(stderr, "  " );
513   }
514 
515   while (usrc < usrclimit1) {
516     int len = kAdvanceOneChar[usrc[0]];
517     int len2 = kAdvanceOneChar[usrc[len]] + len;
518 
519     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
520       // Lookup and score this bigram
521       // Always ignore pre/post spaces
522       uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
523       uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
524       // Now go indirect on the subscript
525       probs = bigram_obj->kCLDTableInd[probs &
526         ~bigram_obj->kCLDTableKeyMask];
527 
528       // Process the bigram
529       if (FLAGS_dbglookup) {
530         const char* ssrc = reinterpret_cast<const char*>(usrc);
531         DbgBiTermToStderr(bihash, probs, ssrc, len2);
532         DbgScoreRecord(NULL, probs, len2);
533       } else if (FLAGS_dbgscore && (probs != 0)) {
534         const char* ssrc = reinterpret_cast<const char*>(usrc);
535         DbgScoreRecord(NULL, probs, len2);
536         string temp(ssrc, len2);
537         fprintf(stderr, "%s ", temp.c_str());
538       }
539 
540       if (probs != 0) {
541         ProcessProbV25Tote(probs, chunk_tote);
542         ++hit_count;
543       }
544     }
545     usrc += len;  // Advance by one char
546   }
547 
548   if (FLAGS_dbgscore) {
549     fprintf(stderr, "[%d bigrams scored]\n", hit_count);
550     DbgScoreState();
551   }
552   return hit_count;
553 }
554 
555 
556 
557 // QUADGRAM, using hash table, advancing by 2/4/8/16 chars
558 // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
559 // Score up to n quadgrams, returning number of bytes consumed
560 // Updates tote_grams
DoQuadScoreV3(const cld::CLDTableSummary * quadgram_obj,const char * isrc,int srclen,int advance_by,int * tote_grams,int gram_limit,Tote * chunk_tote)561 int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
562                        const char* isrc, int srclen, int advance_by,
563                        int* tote_grams, int gram_limit, Tote* chunk_tote) {
564   const char* src = isrc;
565   const char* srclimit = src + srclen;
566   // Limit is end, which has extra 20 20 20 00 past len
567   const char* srclimit7 = src + srclen - (UTFmax * 7);
568   const char* srclimit15 = src + srclen - (UTFmax * 15);
569 
570   if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
571 
572   // Run a little cache of last hits to catch overly-repetitive "text"
573   int next_prior = 0;
574   uint32 prior_quads[2] = {0, 0};
575 
576   // Visit all quadgrams
577   if (src[0] == ' ') {++src;}
578   while (src < srclimit) {
579     // Find one quadgram
580     const char* src_end = src;
581     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
582     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
583     const char* src_mid = src_end;
584     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
585     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
586     int len = src_end - src;
587 
588     // Lookup and score this quadgram
589     uint32 quadhash = QuadHashV25(src, len);
590     uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
591     // Now go indirect on the subscript
592     probs = quadgram_obj->kCLDTableInd[probs &
593       ~quadgram_obj->kCLDTableKeyMask];
594 
595     // Process the quadgram
596     if (FLAGS_dbglookup) {
597       DbgQuadTermToStderr(quadhash, probs, src, len);
598     }
599     if (probs != 0) {
600       // Filter out recent repeats. If this works out, use in the other lookups
601       if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
602         prior_quads[next_prior] = quadhash;
603         next_prior = (next_prior + 1) & 1;
604         ProcessProbV25Tote(probs, chunk_tote);
605         ++(*tote_grams);
606         if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
607       }
608     }
609 
610     // Advance all the way past word if at end-of-word
611     if (src_end[0] == ' ') {
612       src_mid = src_end;
613     }
614 
615     // Advance by 2/4/8/16 characters
616     if (advance_by == 2) {
617       src = src_mid;
618     } else if (advance_by == 4) {
619       src = src_end;
620     } else if (advance_by == 8) {
621       // Advance by 8 chars total (4 more), if not at end
622       if (src < srclimit7) {
623         src_end += kAdvanceOneChar[(uint8)src_end[0]];
624         src_end += kAdvanceOneChar[(uint8)src_end[0]];
625         src_end += kAdvanceOneChar[(uint8)src_end[0]];
626         src_end += kAdvanceOneChar[(uint8)src_end[0]];
627       }
628       src = src_end;
629     } else {
630       // Advance by 16 chars total (12 more), if not at end
631       if (src < srclimit15) {
632         // Advance by ~16 chars by adding 3 * current bytelen
633         int fourcharlen = src_end - src;
634         src = src_end + (3 * fourcharlen);
635         // Advance a bit more if mid-character
636         src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
637         src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
638       } else {
639         src = src_end;
640       }
641     }
642     DCHECK(src < srclimit);
643     src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
644 
645     if (*tote_grams >= gram_limit) {
646       break;
647     }
648   }
649 
650   if (FLAGS_dbgscore) {
651     // With advance_by>2, we consume more input to get the same number of quads
652     int len = src - isrc;
653     DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
654     DbgScoreFlush();
655   }
656 
657   int consumed = src - isrc;
658 
659   // If advancing by more than 2, src may have overshot srclimit
660   if (consumed > srclen) {
661     consumed = srclen;
662   }
663 
664   return consumed;
665 }
666 
667 
668 // OCTAGRAM, using hash table, always advancing by 1 word
669 // Caller supplies table, such as &kLongWord8Table_obj
670 // Score all words in isrc, using languages that have quadgrams
671 // We don't normally use this routine except on the first quadgram run,
672 // but it can be used to resolve unreliable pages.
673 // This routine does not have an optimized advance_by
674 // SOON: Uses indirect language/probability longword
675 //
676 // Return number of words that hit in the hash table
DoOctaScoreV3(const cld::CLDTableSummary * octagram_obj,const char * isrc,int srclen,Tote * chunk_tote)677 int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
678                        const char* isrc, int srclen, Tote* chunk_tote) {
679   int hit_count = 0;
680   const char* src = isrc;
681   const char* srclimit = src + srclen + 1;
682   // Limit is end+1, to include extra space char (0x20) off the end
683   //
684   // Score all words truncated to 8 characters
685   int charcount = 0;
686   // Skip any initial space
687   if (src[0] == ' ') {++src;}
688   const char* word_ptr = src;
689   const char* word_end = word_ptr;
690   if (FLAGS_dbgscore) {
691     fprintf(stderr, "  " );
692   }
693   while (src < srclimit) {
694     // Terminate previous word or continue current word
695     if (src[0] == ' ') {
696       int bytecount = word_end - word_ptr;
697       if (bytecount == 0)
698         break;
699       // Lookup and score this word
700       uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
701       uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
702       // Now go indirect on the subscript
703       probs = octagram_obj->kCLDTableInd[probs &
704         ~octagram_obj->kCLDTableKeyMask];
705 
706       // // Lookup and score this word
707       // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
708       // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
709       //                                kLongWord8TableSize);
710       //
711       if (FLAGS_dbglookup) {
712         DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
713         DbgScoreRecord(NULL, probs, bytecount);
714       } else if (FLAGS_dbgscore && (probs != 0)) {
715         DbgScoreRecord(NULL, probs, bytecount);
716         string temp(word_ptr, bytecount);
717         fprintf(stderr, "%s ", temp.c_str());
718       }
719 
720       if (probs != 0) {
721         ProcessProbV25Tote(probs, chunk_tote);
722         ++hit_count;
723       }
724       charcount = 0;
725       word_ptr = src + 1;   // Over the space
726       word_end = word_ptr;
727     } else {
728       ++charcount;
729     }
730 
731     // Advance to next char
732     src += cld_UniLib::OneCharLen(src);
733     if (charcount <= 8) {
734       word_end = src;
735     }
736   }
737 
738   if (FLAGS_dbgscore) {
739     fprintf(stderr, "[%d words scored]\n", hit_count);
740     DbgScoreState();
741   }
742   return hit_count;
743 }
744 
745 
746 
747 //------------------------------------------------------------------------------
748 // Reliability calculations, for single language and between languages
749 //------------------------------------------------------------------------------
750 
751 // Return reliablity of result 0..100 for top two scores
752 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
753 // (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
754 // Threshold is uni/quadgram increment count, bounded above and below.
755 //
756 // Requiring a factor of 3 improvement (e.g. +1 log base 3)
757 // for each scored quadgram is too stringent, so I've backed this off to a
758 // factor of 2 (e.g. +5/8 log base 3).
759 //
760 // I also somewhat lowered the Min/MaxGramCount limits above
761 //
762 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
763 //
ReliabilityDelta(int value1,int value2,int gramcount)764 int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
765   int max_reliability_percent = 100;
766   if (gramcount < 8) {
767     max_reliability_percent = 12 * gramcount;
768   }
769   int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
770   if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
771     fully_reliable_thresh = kMinGramCount;
772   } else if (fully_reliable_thresh > kMaxGramCount) {
773     fully_reliable_thresh = kMaxGramCount;
774   }
775 
776   int delta = value1 - value2;
777   if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
778   if (delta <= 0) {return 0;}
779   return cld::minint(max_reliability_percent,
780                      (100 * delta) / fully_reliable_thresh);
781 }
782 
783 // Return reliablity of result 0..100 for top score vs. mainsteam score
784 // Values are score per 1024 bytes of input
785 // ratio = max(top/mainstream, mainstream/top)
786 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
787 // Change: short-text word scoring can give unusually good results.
788 //  Let top exceed mainstream by 4x at 50% reliable
ReliabilityMainstream(int topscore,int len,int mean_score)789 int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
790   if (mean_score == 0) {return 100;}    // No reliability data available yet
791   if (topscore == 0) {return 0;}        // zero score = unreliable
792   if (len == 0) {return 0;}             // zero len = unreliable
793   int top_kb = (topscore << 10) / len;
794   double ratio;
795   double ratio_cutoff;
796   if (top_kb > mean_score) {
797     ratio = (1.0 * top_kb) / mean_score;
798     ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
799   } else {
800     ratio = (1.0 * mean_score) / top_kb;
801     ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
802   }
803   if (ratio <= ratio_cutoff - 2.0) {return 100;}
804   if (ratio > ratio_cutoff) {return 0;}
805 
806   int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
807   return iratio;
808 }
809 
810 // Calculate ratio of score per 1KB vs. expected score per 1KB
GetNormalizedScore(Language lang,UnicodeLScript lscript,int bytes,int score)811 double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
812                           int bytes, int score) {
813   // Average training-data score for this language-script combo, per 1KB
814   int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
815   if (lscript == ULScript_Common) {
816     // We don't know the script (only happens with second-chance score)
817     // Look for first non-zero mean value
818     for (int i = 2; i >= 0; --i) {
819       if (kMeanScore[lang * 4 + i] > 0) {
820         expected_score = kMeanScore[lang * 4 + i];
821         break;
822       }
823     }
824   }
825   if (expected_score < 100) {
826       expected_score = 1000;
827   }
828 
829   // Our score per 1KB
830   double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
831   double ratio = our_score / expected_score;
832 
833   // Just the raw count normalized as though each language has mean=1000;
834   ratio = (score * 1000.0) /  expected_score;
835   return ratio;
836 }
837 
838 // Calculate reliablity of len bytes of script lscript with chunk_tote
GetReliability(int len,UnicodeLScript lscript,const Tote * chunk_tote)839 int cld::GetReliability(int len, UnicodeLScript lscript,
840                    const Tote* chunk_tote) {
841   Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
842   // Average score for this language-script combo
843   int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
844   if (lscript == ULScript_Common) {
845     // We don't know the script (only happens with second-chance score)
846     // Look for first non-zero mean value
847     for (int i = 2; i >= 0; --i) {
848       if (kMeanScore[cur_lang * 4 + i] > 0) {
849         mean_score = kMeanScore[cur_lang * 4 + i];
850         break;
851       }
852     }
853   }
854   int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
855                                            chunk_tote->Value(1),
856                                            chunk_tote->GetGramCount());
857 
858   int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
859                                                len,
860                                                mean_score);
861 
862   int reliability_min = minint(reliability_delta, reliability_main);
863 
864 
865   if (FLAGS_dbgreli) {
866     char temp1[4];
867     char temp2[4];
868     cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
869     if (temp1[2] == ' ') {temp1[2] = '\0';}
870     cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
871     if (temp2[2] == ' ') {temp2[2] = '\0';}
872     int srclen = len;
873     fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
874                    "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
875                    "lang/lscript %d %d\n",
876            chunk_tote->GetGramCount(),
877            chunk_tote->GetIncrCount(),
878            srclen,
879            temp1, chunk_tote->Value(0),
880            temp2, chunk_tote->Value(1),
881            (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
882            mean_score,
883            reliability_delta,
884            reliability_main,
885            cur_lang, lscript);
886   }
887 
888   return reliability_min;
889 }
890 
891 
892 //------------------------------------------------------------------------------
893 // Miscellaneous
894 //------------------------------------------------------------------------------
895 
896 // Demote all languages except Top40 and plus_one
897 // Do this just before sorting chunk_tote results
DemoteNotTop40(Tote * chunk_tote,int packed_plus_one)898 void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
899   for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
900     if (chunk_tote->Key(sub) == 0) continue;
901     if (chunk_tote->Key(sub) == packed_plus_one) continue;
902     if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
903     // Quarter the score of others
904     chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
905   }
906 }
907