• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "encodings/compact_lang_det/tote.h"
6 #include <string.h>     // memset
7 
8 #include "encodings/compact_lang_det/win/cld_logging.h"
9 
10 
11 // Take a set of <key, value> pairs and tote them up.
12 // After explicitly sorting, retrieve top key, value pairs
Tote()13 Tote::Tote() {
14   gram_count_ = 0;
15   incr_count_ = 0;
16   byte_count_ = 0;
17   memset(key_, 0, sizeof(key_));
18   // No need to initialize values
19 }
20 
~Tote()21 Tote::~Tote() {
22 }
23 
Reinit()24 void Tote::Reinit() {
25   gram_count_ = 0;
26   incr_count_ = 0;
27   byte_count_ = 0;
28   memset(key_, 0, sizeof(key_));
29   // No need to initialize values
30 }
31 
32 // Increment count of quadgrams/trigrams/unigrams scored
AddGram()33 void Tote::AddGram() {
34   ++gram_count_;
35 }
36 
37 // Three-way associative, guaranteeing that the largest two counts are always
38 // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39 // subscript calculations here, which are for 8 sets of 3-way associative
40 // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41 // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42 // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43 // script), this entry matches and we update/return. The second probe is
44 // the other of [N] and [N+8]. The third probe is only used as a fallback to
45 // these two, and is there only for the rare case that there are three or more
46 // languages with Language enum values equal mod 8, contending within the same
47 // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48 // the other scripts have fewer than 17 languages total.
49 // If you change kMaxSize, change the constants 7/8/15/16 below
Add(uint8 ikey,int idelta)50 void Tote::Add(uint8 ikey, int idelta) {
51   DCHECK(ikey != 0);
52   ++incr_count_;
53 
54   // Look for existing entry
55   int sub0 = ikey & 15;
56   if (key_[sub0] == ikey) {
57     value_[sub0] += idelta;
58     return;
59   }
60   int sub1 = sub0 ^ 8;
61   if (key_[sub1] == ikey) {
62     value_[sub1] += idelta;
63     return;
64   }
65   int sub2 = (ikey & 7) + 16;
66   if (key_[sub2] == ikey) {
67     value_[sub2] += idelta;
68     return;
69   }
70 
71   // Allocate new entry
72   int alloc = -1;
73   if (key_[sub0] == 0) {
74     alloc = sub0;
75   } else if (key_[sub1] == 0) {
76     alloc = sub1;
77   } else if (key_[sub2] == 0) {
78     alloc = sub2;
79   } else {
80     // All choices allocated, need to replace smallest one
81     alloc = sub0;
82     if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83     if (value_[sub2] < value_[alloc]) {alloc = sub2;}
84   }
85   key_[alloc] = ikey;
86   value_[alloc] = idelta;
87   return;
88 }
89 
90 // Return current top key
CurrentTopKey()91 int Tote::CurrentTopKey() {
92   int top_key = 0;
93   int top_value = -1;
94   for (int sub = 0; sub < kMaxSize_; ++sub) {
95     if (key_[sub] == 0) {continue;}
96     if (top_value < value_[sub]) {
97       top_value = value_[sub];
98       top_key = key_[sub];
99     }
100   }
101   return top_key;
102 }
103 
104 
105 // Sort first n entries by decreasing order of value
106 // If key==0 other fields are not valid, treat value as -1
Sort(int n)107 void Tote::Sort(int n) {
108   // This is n**2, but n is small
109   for (int sub = 0; sub < n; ++sub) {
110     if (key_[sub] == 0) {value_[sub] = -1;}
111 
112     // Bubble sort key[sub] and entry[sub]
113     for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114       if (key_[sub2] == 0) {value_[sub2] = -1;}
115       if (value_[sub] < value_[sub2]) {
116         // swap
117         uint8 tmpk = key_[sub];
118         key_[sub] = key_[sub2];
119         key_[sub2] = tmpk;
120         int tmpv = value_[sub];
121         value_[sub] = value_[sub2];
122         value_[sub2] = tmpv;
123       }
124     }
125   }
126 }
127 
Dump(FILE * f)128 void Tote::Dump(FILE* f) {
129   for (int sub = 0; sub < kMaxSize_; ++sub) {
130     if (key_[sub] > 0) {
131       fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
132     }
133   }
134   fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
135 }
136 
137 
138 
139 
140 // Take a set of <key, value> pairs and tote them up.
141 // After explicitly sorting, retrieve top key, value pairs
ToteWithReliability()142 ToteWithReliability::ToteWithReliability() {
143   // No need to initialize score_ or value_
144   incr_count_ = 0;
145   sorted_ = 0;
146   memset(closepair_, 0, sizeof(closepair_));
147   memset(key_, 0, sizeof(key_));
148 }
149 
~ToteWithReliability()150 ToteWithReliability::~ToteWithReliability() {
151 }
152 
Reinit()153 void ToteWithReliability::Reinit() {
154   // No need to initialize score_ or value_
155   incr_count_ = 0;
156   sorted_ = 0;
157   memset(closepair_, 0, sizeof(closepair_));
158   memset(key_, 0, sizeof(key_));
159   ////ss_.Init();
160 }
161 
162 // Weight reliability by ibytes
163 // Also see three-way associative comments above for Tote
Add(uint8 ikey,int ibytes,int score,int ireliability)164 void ToteWithReliability::Add(uint8 ikey, int ibytes,
165                               int score, int ireliability) {
166   DCHECK(ikey != 0);
167   CHECK(sorted_ == 0);
168   ++incr_count_;
169 
170   // Look for existing entry
171   int sub0 = ikey & 15;
172   if (key_[sub0] == ikey) {
173     value_[sub0] += ibytes;
174     score_[sub0] += score;
175     reliability_[sub0] += ireliability * ibytes;
176     return;
177   }
178   int sub1 = sub0 ^ 8;
179   if (key_[sub1] == ikey) {
180     value_[sub1] += ibytes;
181     score_[sub1] += score;
182     reliability_[sub1] += ireliability * ibytes;
183     return;
184   }
185   int sub2 = (ikey & 7) + 16;
186   if (key_[sub2] == ikey) {
187     value_[sub2] += ibytes;
188     score_[sub2] += score;
189     reliability_[sub2] += ireliability * ibytes;
190     return;
191   }
192 
193   // Allocate new entry
194   int alloc = -1;
195   if (key_[sub0] == 0) {
196     alloc = sub0;
197   } else if (key_[sub1] == 0) {
198     alloc = sub1;
199   } else if (key_[sub2] == 0) {
200     alloc = sub2;
201   } else {
202     // All choices allocated, need to replace smallest one
203     alloc = sub0;
204     if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205     if (value_[sub2] < value_[alloc]) {alloc = sub2;}
206   }
207   key_[alloc] = ikey;
208   value_[alloc] = ibytes;
209   score_[alloc] = score;
210   reliability_[alloc] = ireliability * ibytes;
211   return;
212 }
213 
214 // Find subscript of a given packed language, or -1
Find(uint8 ikey)215 int ToteWithReliability::Find(uint8 ikey) {
216   DCHECK(ikey != 0);
217 
218   if (sorted_) {
219     // Linear search if sorted
220     for (int sub = 0; sub < kMaxSize_; ++sub) {
221       if (key_[sub] == ikey) {return sub;}
222     }
223     return -1;
224   }
225 
226   // Look for existing entry
227   int sub0 = ikey & 15;
228   if (key_[sub0] == ikey) {
229     return sub0;
230   }
231   int sub1 = sub0 ^ 8;
232   if (key_[sub1] == ikey) {
233     return sub1;
234   }
235   int sub2 = (ikey & 7) + 16;
236   if (key_[sub2] == ikey) {
237     return sub2;
238   }
239 
240   return -1;
241 }
242 
243 // Return current top key
CurrentTopKey()244 int ToteWithReliability::CurrentTopKey() {
245   int top_key = 0;
246   int top_value = -1;
247   for (int sub = 0; sub < kMaxSize_; ++sub) {
248     if (key_[sub] == 0) {continue;}
249     if (top_value < value_[sub]) {
250       top_value = value_[sub];
251       top_key = key_[sub];
252     }
253   }
254   return top_key;
255 }
256 
257 
258 // Sort first n entries by decreasing order of value
259 // If key==0 other fields are not valid, treat value as -1
Sort(int n)260 void ToteWithReliability::Sort(int n) {
261   // This is n**2, but n is small
262   for (int sub = 0; sub < n; ++sub) {
263     if (key_[sub] == 0) {value_[sub] = -1;}
264 
265     // Bubble sort key[sub] and entry[sub]
266     for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267       if (key_[sub2] == 0) {value_[sub2] = -1;}
268       if (value_[sub] < value_[sub2]) {
269         // swap
270         uint8 tmpk = key_[sub];
271         key_[sub] = key_[sub2];
272         key_[sub2] = tmpk;
273 
274         int tmpv = value_[sub];
275         value_[sub] = value_[sub2];
276         value_[sub2] = tmpv;
277 
278         int tmps = score_[sub];
279         score_[sub] = score_[sub2];
280         score_[sub2] = tmps;
281 
282         int tmpr = reliability_[sub];
283         reliability_[sub] = reliability_[sub2];
284         reliability_[sub2] = tmpr;
285       }
286     }
287   }
288   sorted_ = 1;
289 }
290 
Dump(FILE * f)291 void ToteWithReliability::Dump(FILE* f) {
292   for (int sub = 0; sub < kMaxSize_; ++sub) {
293     if (key_[sub] > 0) {
294       fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295               sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
296     }
297   }
298   fprintf(f, "  %d#\n", incr_count_);
299 }
300