1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Remember a subset of a sequence of values, using a modest amount of memory
6
7 /***
8 Design:
9 Accumulate in powers of three, using 3-way median to collapse entries.
10 At any given time, there is one most-dense (highest power of 3) range of
11 entries and a series of less-dense ranges that hold 0..2 entries each. There
12 is a bounded-size storage array of S cells for all the entries.
13
14 The overflow detect is set up so that a new higher power of 3, K+1, is
15 triggered precisely when range K has 3n entries and all ranges < K have
16 zero entries.
17
18 In general, think of the range sizes as a multi-digit base 3 number, except
19 the highest digit may exceed 2:
20
21 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
22 0 0 0 0 3n-1 2 2 unused:1
23
24 There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25 one more than that, and we add a new 3**0 entry and "carry" by performing
26 medians on any group of 3 elements:
27
28 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
29 0 0 0 0 3n-1 2 3 unused:0
30 0 0 0 0 3n-1 3 0 carry unused:2
31 0 0 0 0 3n 0 0 carry unused:4
32
33 To accumulate 2 entries at all levels < K and 3 just before the first carry at
34 level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35 in this case. Since we only have 4 cells in the example above, we need to
36 make room by starting a new power of three:
37
38 3**6 3**5 3**4 3**3 3**2 3**1 3**0
39 0 0 0 0 3n 0 0 K=2 unused:4
40 0 0 0 n 0 0 0 K=3 unused:2n+4
41
42 In the code below, we don't worry about overflow from the topmost place.
43
44
45 ***/
46
47 #include "encodings/compact_lang_det/subsetsequence.h"
48 #include <stdio.h>
49
50 #include "encodings/compact_lang_det/win/cld_logging.h"
51
DumpInts(const char * label,const int * v,int n)52 void DumpInts(const char* label, const int* v, int n) {
53 printf("%s ", label);
54 for (int i = 0; i < n; ++i) {
55 printf("%d ", v[i]);
56 }
57 printf("\n");
58 }
59
DumpUint8s(const char * label,const uint8 * v,int n)60 void DumpUint8s(const char* label, const uint8* v, int n) {
61 printf("%s ", label);
62 for (int i = 0; i < n; ++i) {
63 printf("%d ", v[i]);
64 }
65 printf("\n");
66 }
67
68 // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
Median3(int sub)69 uint8 SubsetSequence::Median3(int sub) {
70 if (seq_[sub] == seq_[sub + 1]) {
71 return seq_[sub];
72 }
73 if (seq_[sub] == seq_[sub + 2]) {
74 return seq_[sub];
75 }
76 return seq_[sub + 1];
77 }
78
Init()79 void SubsetSequence::Init() {
80 // printf("Init\n");
81
82 k_ = 0;
83 count_[0] = 0;
84 next_e_ = 0;
85 seq_[0] = 0; // Default value if no calls to Add
86
87 // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88 int reserve = (2 * k_ + 1);
89 level_limit_e_ = kMaxSeq_ - reserve;
90 level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
91 limit_e_ = level_limit_e_;
92 }
93
94 // Compress level k by 3x, creating level k+1
NewLevel()95 void SubsetSequence::NewLevel() {
96 // printf("NewLevel 3 ** %d\n", k_ + 1);
97 //DumpUint8s("count[k]", count_, k_ + 1);
98 //DumpUint8s("seq[next]", seq_, next_e_);
99
100 // Incoming level must be an exact multiple of three in size
101 CHECK((count_[k_] % 3) == 0);
102 int k_size = count_[k_];
103 int new_size = k_size / 3;
104
105 // Compress down by 3x, via median
106 for (int j = 0; j < new_size; ++j) {
107 seq_[j] = Median3(j * 3);
108 }
109
110 // Update counts
111 count_[k_] = 0;
112 // Else Overflow -- just continue with 3x dense Level K
113 if (k_ < (kMaxLevel_ - 1)) {++k_;}
114 count_[k_] = new_size;
115
116 // Update limits
117 next_e_ = new_size;
118 limit_e_ = next_e_ + 3;
119
120 // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121 int reserve = (2 * k_ + 1);
122 level_limit_e_ = kMaxSeq_ - reserve;
123 level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
124 //
125 //DumpUint8s("after: count[k]", count_, k_ + 1);
126 //DumpUint8s("after: seq[next]", seq_, next_e_);
127 }
128
DoCarries()129 void SubsetSequence::DoCarries() {
130 CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
131 // Make room by carrying
132
133 //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134 //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
135
136 int i = 0;
137 while (count_[i] == 3) {
138 next_e_ -= 3;
139 seq_[next_e_] = Median3(next_e_);
140 ++next_e_;
141 count_[i] = 0;
142 ++count_[i + 1];
143 ++i;
144 }
145 limit_e_ = next_e_ + 3;
146
147 //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148 //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
149
150 // If we just fully carried into level K,
151 // Make sure there is now enough room, else start level K + 1
152 if (i >= k_) {
153 CHECK(count_[k_] == next_e_);
154 if (next_e_ >= level_limit_e_) {
155 NewLevel();
156 }
157 }
158 }
159
Add(uint8 e)160 void SubsetSequence::Add(uint8 e) {
161 // Add an entry then carry as needed
162 seq_[next_e_] = e;
163 ++next_e_;
164 ++count_[0];
165
166 if (next_e_ >= limit_e_) {
167 DoCarries();
168 }
169 }
170
171
172 // Collapse tail end by simple median across disparate-weight values,
173 // dropping or duplicating last value if need be.
174 // This routine is idempotent.
Flush()175 void SubsetSequence::Flush() {
176 // printf("Flush %d\n", count_[k_]);
177 int start_tail = count_[k_];
178 int size_tail = next_e_ - start_tail;
179 if ((size_tail % 3) == 2) {
180 seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
181 ++size_tail;
182 }
183
184 // Compress tail down by 3x, via median
185 int new_size = size_tail / 3; // May delete last value
186 for (int j = 0; j < new_size; ++j) {
187 seq_[start_tail + j] = Median3(start_tail + j * 3);
188 }
189
190 next_e_ = start_tail + new_size;
191 count_[k_] = next_e_;
192 }
193
194
195 // Extract representative pattern of exactly N values into dst[0..n-1]
196 // This routine may be called multiple times, but it may downsample as a
197 // side effect, causing subsequent calls with larger N to get poor answers.
Extract(int to_n,uint8 * dst)198 void SubsetSequence::Extract(int to_n, uint8* dst) {
199 // Collapse partial-carries in tail
200 Flush();
201
202 // Just use Bresenham to resample
203 int from_n = next_e_;
204 if (to_n >= from_n) {
205 // Up-sample from_n => to_n
206 int err = to_n - 1; // bias toward no overshoot
207 int j = 0;
208 for (int i = 0; i < to_n; ++i) {
209 dst[i] = seq_[j];
210 err -= from_n;
211 if (err < 0) {
212 ++j;
213 err += to_n;
214 }
215 }
216 } else {
217 // Get to the point that the number of samples is <= 3 * to_n
218 while (next_e_ > (to_n * 3)) {
219 // Compress down by 3x, via median
220 // printf("Extract, median %d / 3\n", next_e_);
221 if ((next_e_ % 3) == 2) {
222 seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
223 ++next_e_;
224 }
225 int new_size = next_e_ / 3; // May delete last value
226 for (int j = 0; j < new_size; ++j) {
227 seq_[j] = Median3(j * 3);
228 }
229 next_e_ = new_size;
230 count_[k_] = next_e_;
231 }
232 from_n = next_e_;
233
234 if (to_n == from_n) {
235 // Copy verbatim
236 for (int i = 0; i < to_n; ++i) {
237 dst[i] = seq_[i];
238 }
239 return;
240 }
241
242 // Down-sample from_n => to_n, using medians
243 int err = 0; // Bias to immediate median sample
244 int j = 0;
245 for (int i = 0; i < from_n; ++i) {
246 err -= to_n;
247 if (err < 0) {
248 if (i <= (next_e_ - 2)) {
249 dst[j] = Median3(i);
250 } else {
251 dst[j] = seq_[i];
252 }
253 ++j;
254 err += from_n;
255 }
256 }
257 }
258
259 }
260