• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Remember a subset of a sequence of values, using a modest amount of memory
6 
7 /***
8   Design:
9   Accumulate in powers of three, using 3-way median to collapse entries.
10   At any given time, there is one most-dense (highest power of 3) range of
11   entries and a series of less-dense ranges that hold 0..2 entries each. There
12   is a bounded-size storage array of S cells for all the entries.
13 
14   The overflow detect is set up so that a new higher power of 3, K+1, is
15   triggered precisely when range K has 3n entries and all ranges < K have
16   zero entries.
17 
18   In general, think of the range sizes as a multi-digit base 3 number, except
19   the highest digit may exceed 2:
20 
21   3**6  3**5  3**4  3**3  3**2  3**1  3**0  K=2
22     0     0     0     0   3n-1    2     2   unused:1
23 
24   There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25   one more than that, and we add a new 3**0 entry and "carry" by performing
26   medians on any group of 3 elements:
27 
28   3**6  3**5  3**4  3**3  3**2  3**1  3**0       K=2
29     0     0     0     0   3n-1    2     3        unused:0
30     0     0     0     0   3n-1    3     0 carry  unused:2
31     0     0     0     0    3n     0     0 carry  unused:4
32 
33   To accumulate 2 entries at all levels < K and 3 just before the first carry at
34   level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35   in this case. Since we only have 4 cells in the example above, we need to
36   make room by starting a new power of three:
37 
38   3**6  3**5  3**4  3**3  3**2  3**1  3**0
39     0     0     0     0    3n     0     0        K=2 unused:4
40     0     0     0     n     0     0     0        K=3 unused:2n+4
41 
42   In the code below, we don't worry about overflow from the topmost place.
43 
44 
45 ***/
46 
47 #include "encodings/compact_lang_det/subsetsequence.h"
48 #include <stdio.h>
49 
50 #include "encodings/compact_lang_det/win/cld_logging.h"
51 
DumpInts(const char * label,const int * v,int n)52 void DumpInts(const char* label, const int* v, int n) {
53   printf("%s ", label);
54   for (int i = 0; i < n; ++i) {
55     printf("%d ", v[i]);
56   }
57   printf("\n");
58 }
59 
DumpUint8s(const char * label,const uint8 * v,int n)60 void DumpUint8s(const char* label, const uint8* v, int n) {
61   printf("%s ", label);
62   for (int i = 0; i < n; ++i) {
63     printf("%d ", v[i]);
64   }
65   printf("\n");
66 }
67 
68 // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
Median3(int sub)69 uint8 SubsetSequence::Median3(int sub) {
70   if (seq_[sub] == seq_[sub + 1]) {
71     return seq_[sub];
72   }
73   if (seq_[sub] == seq_[sub + 2]) {
74     return seq_[sub];
75   }
76   return seq_[sub + 1];
77 }
78 
Init()79 void SubsetSequence::Init() {
80   // printf("Init\n");
81 
82   k_ = 0;
83   count_[0] = 0;
84   next_e_ = 0;
85   seq_[0] = 0;    // Default value if no calls to Add
86 
87   // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88   int reserve = (2 * k_ + 1);
89   level_limit_e_ = kMaxSeq_ - reserve;
90   level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
91   limit_e_ = level_limit_e_;
92 }
93 
94 // Compress level k by 3x, creating level k+1
NewLevel()95 void SubsetSequence::NewLevel() {
96   // printf("NewLevel 3 ** %d\n", k_ + 1);
97   //DumpUint8s("count[k]", count_, k_ + 1);
98   //DumpUint8s("seq[next]", seq_, next_e_);
99 
100   // Incoming level must be an exact multiple of three in size
101   CHECK((count_[k_] % 3) == 0);
102   int k_size = count_[k_];
103   int new_size = k_size / 3;
104 
105   // Compress down by 3x, via median
106   for (int j = 0; j < new_size; ++j) {
107     seq_[j] = Median3(j * 3);
108   }
109 
110   // Update counts
111   count_[k_] = 0;
112   // Else Overflow -- just continue with 3x dense Level K
113   if (k_ < (kMaxLevel_ - 1)) {++k_;}
114   count_[k_] = new_size;
115 
116   // Update limits
117   next_e_ = new_size;
118   limit_e_ = next_e_ + 3;
119 
120   // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121   int reserve = (2 * k_ + 1);
122   level_limit_e_ = kMaxSeq_ - reserve;
123   level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
124                                               //
125   //DumpUint8s("after: count[k]", count_, k_ + 1);
126   //DumpUint8s("after: seq[next]", seq_, next_e_);
127 }
128 
DoCarries()129 void SubsetSequence::DoCarries() {
130   CHECK(count_[k_] > 3);    // We depend on count_[k_] being > 3 to stop while
131   // Make room by carrying
132 
133   //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134   //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
135 
136   int i = 0;
137   while (count_[i] == 3) {
138     next_e_ -= 3;
139     seq_[next_e_] = Median3(next_e_);
140     ++next_e_;
141     count_[i] = 0;
142     ++count_[i + 1];
143     ++i;
144   }
145   limit_e_ = next_e_ + 3;
146 
147   //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148   //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
149 
150   // If we just fully carried into level K,
151   // Make sure there is now enough room, else start level K + 1
152   if (i >= k_) {
153     CHECK(count_[k_] == next_e_);
154     if (next_e_ >= level_limit_e_) {
155       NewLevel();
156     }
157   }
158 }
159 
Add(uint8 e)160 void SubsetSequence::Add(uint8 e) {
161   // Add an entry then carry as needed
162   seq_[next_e_] = e;
163   ++next_e_;
164   ++count_[0];
165 
166   if (next_e_ >= limit_e_) {
167     DoCarries();
168   }
169 }
170 
171 
172 // Collapse tail end by simple median across disparate-weight values,
173 // dropping or duplicating last value if need be.
174 // This routine is idempotent.
Flush()175 void SubsetSequence::Flush() {
176   // printf("Flush %d\n", count_[k_]);
177   int start_tail = count_[k_];
178   int size_tail = next_e_ - start_tail;
179   if ((size_tail % 3) == 2) {
180     seq_[next_e_] = seq_[next_e_ - 1];      // Duplicate last value
181     ++size_tail;
182   }
183 
184   // Compress tail down by 3x, via median
185   int new_size = size_tail / 3;             // May delete last value
186   for (int j = 0; j < new_size; ++j) {
187     seq_[start_tail + j] = Median3(start_tail + j * 3);
188   }
189 
190   next_e_ = start_tail + new_size;
191   count_[k_] = next_e_;
192 }
193 
194 
195 // Extract representative pattern of exactly N values into dst[0..n-1]
196 // This routine may be called multiple times, but it may downsample as a
197 // side effect, causing subsequent calls with larger N to get poor answers.
Extract(int to_n,uint8 * dst)198 void SubsetSequence::Extract(int to_n, uint8* dst) {
199   // Collapse partial-carries in tail
200   Flush();
201 
202   // Just use Bresenham to resample
203   int from_n = next_e_;
204   if (to_n >= from_n) {
205     // Up-sample from_n => to_n
206     int err = to_n - 1;          // bias toward no overshoot
207     int j = 0;
208     for (int i = 0; i < to_n; ++i) {
209       dst[i] = seq_[j];
210       err -= from_n;
211       if (err < 0) {
212         ++j;
213         err += to_n;
214       }
215     }
216   } else {
217     // Get to the point that the number of samples is <= 3 * to_n
218     while (next_e_ > (to_n * 3)) {
219       // Compress down by 3x, via median
220       // printf("Extract, median %d / 3\n", next_e_);
221       if ((next_e_ % 3) == 2) {
222         seq_[next_e_] = seq_[next_e_ - 1];    // Duplicate last value
223         ++next_e_;
224       }
225       int new_size = next_e_ / 3;             // May delete last value
226       for (int j = 0; j < new_size; ++j) {
227         seq_[j] = Median3(j * 3);
228       }
229       next_e_ = new_size;
230       count_[k_] = next_e_;
231     }
232     from_n = next_e_;
233 
234     if (to_n == from_n) {
235       // Copy verbatim
236       for (int i = 0; i < to_n; ++i) {
237         dst[i] = seq_[i];
238       }
239       return;
240     }
241 
242     // Down-sample from_n => to_n, using medians
243     int err = 0;          // Bias to immediate median sample
244     int j = 0;
245     for (int i = 0; i < from_n; ++i) {
246       err -= to_n;
247       if (err < 0) {
248         if (i <= (next_e_ - 2)) {
249           dst[j] = Median3(i);
250         } else {
251           dst[j] = seq_[i];
252         }
253         ++j;
254         err += from_n;
255       }
256     }
257   }
258 
259 }
260