1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "encodings/compact_lang_det/tote.h"
6 #include <string.h> // memset
7
8 #include "encodings/compact_lang_det/win/cld_logging.h"
9
10
11 // Take a set of <key, value> pairs and tote them up.
12 // After explicitly sorting, retrieve top key, value pairs
Tote()13 Tote::Tote() {
14 gram_count_ = 0;
15 incr_count_ = 0;
16 byte_count_ = 0;
17 memset(key_, 0, sizeof(key_));
18 // No need to initialize values
19 }
20
~Tote()21 Tote::~Tote() {
22 }
23
Reinit()24 void Tote::Reinit() {
25 gram_count_ = 0;
26 incr_count_ = 0;
27 byte_count_ = 0;
28 memset(key_, 0, sizeof(key_));
29 // No need to initialize values
30 }
31
32 // Increment count of quadgrams/trigrams/unigrams scored
AddGram()33 void Tote::AddGram() {
34 ++gram_count_;
35 }
36
37 // Three-way associative, guaranteeing that the largest two counts are always
38 // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39 // subscript calculations here, which are for 8 sets of 3-way associative
40 // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41 // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42 // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43 // script), this entry matches and we update/return. The second probe is
44 // the other of [N] and [N+8]. The third probe is only used as a fallback to
45 // these two, and is there only for the rare case that there are three or more
46 // languages with Language enum values equal mod 8, contending within the same
47 // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48 // the other scripts have fewer than 17 languages total.
49 // If you change kMaxSize, change the constants 7/8/15/16 below
Add(uint8 ikey,int idelta)50 void Tote::Add(uint8 ikey, int idelta) {
51 DCHECK(ikey != 0);
52 ++incr_count_;
53
54 // Look for existing entry
55 int sub0 = ikey & 15;
56 if (key_[sub0] == ikey) {
57 value_[sub0] += idelta;
58 return;
59 }
60 int sub1 = sub0 ^ 8;
61 if (key_[sub1] == ikey) {
62 value_[sub1] += idelta;
63 return;
64 }
65 int sub2 = (ikey & 7) + 16;
66 if (key_[sub2] == ikey) {
67 value_[sub2] += idelta;
68 return;
69 }
70
71 // Allocate new entry
72 int alloc = -1;
73 if (key_[sub0] == 0) {
74 alloc = sub0;
75 } else if (key_[sub1] == 0) {
76 alloc = sub1;
77 } else if (key_[sub2] == 0) {
78 alloc = sub2;
79 } else {
80 // All choices allocated, need to replace smallest one
81 alloc = sub0;
82 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
84 }
85 key_[alloc] = ikey;
86 value_[alloc] = idelta;
87 return;
88 }
89
90 // Return current top key
CurrentTopKey()91 int Tote::CurrentTopKey() {
92 int top_key = 0;
93 int top_value = -1;
94 for (int sub = 0; sub < kMaxSize_; ++sub) {
95 if (key_[sub] == 0) {continue;}
96 if (top_value < value_[sub]) {
97 top_value = value_[sub];
98 top_key = key_[sub];
99 }
100 }
101 return top_key;
102 }
103
104
105 // Sort first n entries by decreasing order of value
106 // If key==0 other fields are not valid, treat value as -1
Sort(int n)107 void Tote::Sort(int n) {
108 // This is n**2, but n is small
109 for (int sub = 0; sub < n; ++sub) {
110 if (key_[sub] == 0) {value_[sub] = -1;}
111
112 // Bubble sort key[sub] and entry[sub]
113 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114 if (key_[sub2] == 0) {value_[sub2] = -1;}
115 if (value_[sub] < value_[sub2]) {
116 // swap
117 uint8 tmpk = key_[sub];
118 key_[sub] = key_[sub2];
119 key_[sub2] = tmpk;
120 int tmpv = value_[sub];
121 value_[sub] = value_[sub2];
122 value_[sub2] = tmpv;
123 }
124 }
125 }
126 }
127
Dump(FILE * f)128 void Tote::Dump(FILE* f) {
129 for (int sub = 0; sub < kMaxSize_; ++sub) {
130 if (key_[sub] > 0) {
131 fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
132 }
133 }
134 fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
135 }
136
137
138
139
140 // Take a set of <key, value> pairs and tote them up.
141 // After explicitly sorting, retrieve top key, value pairs
ToteWithReliability()142 ToteWithReliability::ToteWithReliability() {
143 // No need to initialize score_ or value_
144 incr_count_ = 0;
145 sorted_ = 0;
146 memset(closepair_, 0, sizeof(closepair_));
147 memset(key_, 0, sizeof(key_));
148 }
149
~ToteWithReliability()150 ToteWithReliability::~ToteWithReliability() {
151 }
152
Reinit()153 void ToteWithReliability::Reinit() {
154 // No need to initialize score_ or value_
155 incr_count_ = 0;
156 sorted_ = 0;
157 memset(closepair_, 0, sizeof(closepair_));
158 memset(key_, 0, sizeof(key_));
159 ////ss_.Init();
160 }
161
162 // Weight reliability by ibytes
163 // Also see three-way associative comments above for Tote
Add(uint8 ikey,int ibytes,int score,int ireliability)164 void ToteWithReliability::Add(uint8 ikey, int ibytes,
165 int score, int ireliability) {
166 DCHECK(ikey != 0);
167 CHECK(sorted_ == 0);
168 ++incr_count_;
169
170 // Look for existing entry
171 int sub0 = ikey & 15;
172 if (key_[sub0] == ikey) {
173 value_[sub0] += ibytes;
174 score_[sub0] += score;
175 reliability_[sub0] += ireliability * ibytes;
176 return;
177 }
178 int sub1 = sub0 ^ 8;
179 if (key_[sub1] == ikey) {
180 value_[sub1] += ibytes;
181 score_[sub1] += score;
182 reliability_[sub1] += ireliability * ibytes;
183 return;
184 }
185 int sub2 = (ikey & 7) + 16;
186 if (key_[sub2] == ikey) {
187 value_[sub2] += ibytes;
188 score_[sub2] += score;
189 reliability_[sub2] += ireliability * ibytes;
190 return;
191 }
192
193 // Allocate new entry
194 int alloc = -1;
195 if (key_[sub0] == 0) {
196 alloc = sub0;
197 } else if (key_[sub1] == 0) {
198 alloc = sub1;
199 } else if (key_[sub2] == 0) {
200 alloc = sub2;
201 } else {
202 // All choices allocated, need to replace smallest one
203 alloc = sub0;
204 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
206 }
207 key_[alloc] = ikey;
208 value_[alloc] = ibytes;
209 score_[alloc] = score;
210 reliability_[alloc] = ireliability * ibytes;
211 return;
212 }
213
214 // Find subscript of a given packed language, or -1
Find(uint8 ikey)215 int ToteWithReliability::Find(uint8 ikey) {
216 DCHECK(ikey != 0);
217
218 if (sorted_) {
219 // Linear search if sorted
220 for (int sub = 0; sub < kMaxSize_; ++sub) {
221 if (key_[sub] == ikey) {return sub;}
222 }
223 return -1;
224 }
225
226 // Look for existing entry
227 int sub0 = ikey & 15;
228 if (key_[sub0] == ikey) {
229 return sub0;
230 }
231 int sub1 = sub0 ^ 8;
232 if (key_[sub1] == ikey) {
233 return sub1;
234 }
235 int sub2 = (ikey & 7) + 16;
236 if (key_[sub2] == ikey) {
237 return sub2;
238 }
239
240 return -1;
241 }
242
243 // Return current top key
CurrentTopKey()244 int ToteWithReliability::CurrentTopKey() {
245 int top_key = 0;
246 int top_value = -1;
247 for (int sub = 0; sub < kMaxSize_; ++sub) {
248 if (key_[sub] == 0) {continue;}
249 if (top_value < value_[sub]) {
250 top_value = value_[sub];
251 top_key = key_[sub];
252 }
253 }
254 return top_key;
255 }
256
257
258 // Sort first n entries by decreasing order of value
259 // If key==0 other fields are not valid, treat value as -1
Sort(int n)260 void ToteWithReliability::Sort(int n) {
261 // This is n**2, but n is small
262 for (int sub = 0; sub < n; ++sub) {
263 if (key_[sub] == 0) {value_[sub] = -1;}
264
265 // Bubble sort key[sub] and entry[sub]
266 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267 if (key_[sub2] == 0) {value_[sub2] = -1;}
268 if (value_[sub] < value_[sub2]) {
269 // swap
270 uint8 tmpk = key_[sub];
271 key_[sub] = key_[sub2];
272 key_[sub2] = tmpk;
273
274 int tmpv = value_[sub];
275 value_[sub] = value_[sub2];
276 value_[sub2] = tmpv;
277
278 int tmps = score_[sub];
279 score_[sub] = score_[sub2];
280 score_[sub2] = tmps;
281
282 int tmpr = reliability_[sub];
283 reliability_[sub] = reliability_[sub2];
284 reliability_[sub2] = tmpr;
285 }
286 }
287 }
288 sorted_ = 1;
289 }
290
Dump(FILE * f)291 void ToteWithReliability::Dump(FILE* f) {
292 for (int sub = 0; sub < kMaxSize_; ++sub) {
293 if (key_[sub] > 0) {
294 fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295 sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
296 }
297 }
298 fprintf(f, " %d#\n", incr_count_);
299 }
300