• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 //     http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 // Copyright 2005-2010 Google, Inc.
15 // Author: sorenj@google.com (Jeffrey Sorensen)
16 
17 #ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
18 #define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
19 
20 #include <vector>
21 using std::vector;
22 
23 #include <fst/compat.h>
24 
25 // This class is a bitstring storage class with an index that allows
26 // seeking to the Nth set or clear bit in time O(Log(N)) where N is
27 // the length of the bit vector.  In addition, it allows counting set or
28 // clear bits over ranges in constant time.
29 //
30 // This is accomplished by maintaining an "secondary" index of limited
31 // size in bits that maintains a running count of the number of bits set
32 // in each block of bitmap data.  A block is defined as the number of
33 // uint64 values that can fit in the secondary index before an overflow
34 // occurs.
35 //
36 // To handle overflows, a "primary" index containing a running count of
37 // bits set in each block is created using the type uint64.
38 
39 namespace fst {
40 
41 class BitmapIndex {
42  public:
StorageSize(size_t size)43   static size_t StorageSize(size_t size) {
44     return ((size + kStorageBlockMask) >> kStorageLogBitSize);
45   }
46 
BitmapIndex()47   BitmapIndex() : bits_(NULL), size_(0) { }
48 
Get(size_t index)49   bool Get(size_t index) const {
50     return (bits_[index >> kStorageLogBitSize] &
51             (kOne << (index & kStorageBlockMask))) != 0;
52   }
53 
Set(uint64 * bits,size_t index)54   static void Set(uint64* bits, size_t index) {
55     bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
56   }
57 
Clear(uint64 * bits,size_t index)58   static void Clear(uint64* bits, size_t index) {
59     bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
60   }
61 
Bits()62   size_t Bits() const {
63     return size_;
64   }
65 
ArraySize()66   size_t ArraySize() const {
67     return StorageSize(size_);
68   }
69 
70   // Returns the number of one bits in the bitmap
GetOnesCount()71   size_t GetOnesCount() const {
72     return primary_index_[primary_index_size() - 1];
73   }
74 
75   // Returns the number of one bits in positions 0 to limit - 1.
76   // REQUIRES: limit <= Bits()
77   size_t Rank1(size_t end) const;
78 
79   // Returns the number of one bits in the range start to end - 1.
80   // REQUIRES: limit <= Bits()
GetOnesCountInRange(size_t start,size_t end)81   size_t GetOnesCountInRange(size_t start, size_t end) const {
82     return Rank1(end) - Rank1(start);
83   }
84 
85   // Returns the number of zero bits in positions 0 to limit - 1.
86   // REQUIRES: limit <= Bits()
Rank0(size_t end)87   size_t Rank0(size_t end) const {
88     return end - Rank1(end);
89   }
90 
91   // Returns the number of zero bits in the range start to end - 1.
92   // REQUIRES: limit <= Bits()
GetZeroesCountInRange(size_t start,size_t end)93   size_t GetZeroesCountInRange(size_t start, size_t end) const {
94     return end - start - GetOnesCountInRange(start, end);
95   }
96 
97   // Return true if any bit between begin inclusive and end exclusive
98   // is set.  0 <= begin <= end <= Bits() is required.
99   //
TestRange(size_t start,size_t end)100   bool TestRange(size_t start, size_t end) const {
101     return Rank1(end) > Rank1(start);
102   }
103 
104   // Returns the offset to the nth set bit (zero based)
105   // or Bits() if index >= number of ones
106   size_t Select1(size_t bit_index) const;
107 
108   // Returns the offset to the nth clear bit (zero based)
109   // or Bits() if index > number of
110   size_t Select0(size_t bit_index) const;
111 
112   // Rebuilds from index for the associated Bitmap, should be called
113   // whenever changes have been made to the Bitmap or else behavior
114   // of the indexed bitmap methods will be undefined.
115   void BuildIndex(const uint64 *bits, size_t size);
116 
117   // the secondary index accumulates counts until it can possibly overflow
118   // this constant computes the number of uint64 units that can fit into
119   // units the size of uint16.
120   static const uint64 kOne = 1;
121   static const uint32 kStorageBitSize = 64;
122   static const uint32 kStorageLogBitSize = 6;
123   static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
124       >> kStorageLogBitSize;
125 
126  private:
127   static const uint32 kStorageBlockMask = kStorageBitSize - 1;
128 
129   // returns, from the index, the count of ones up to array_index
130   size_t get_index_ones_count(size_t array_index) const;
131 
132   // because the indexes, both primary and secondary, contain a running
133   // count of the population of one bits contained in [0,i), there is
134   // no reason to have an element in the zeroth position as this value would
135   // necessarily be zero.  (The bits are indexed in a zero based way.)  Thus
136   // we don't store the 0th element in either index.  Both of the following
137   // functions, if greater than 0, must be decremented by one before retreiving
138   // the value from the corresponding array.
139   // returns the 1 + the block that contains the bitindex in question
140   // the inverted version works the same but looks for zeros using an inverted
141   // view of the index
142   size_t find_primary_block(size_t bit_index) const;
143 
144   size_t find_inverted_primary_block(size_t bit_index) const;
145 
146   // similarly, the secondary index (which resets its count to zero at
147   // the end of every kSecondaryBlockSize entries) does not store the element
148   // at 0.  Note that the rem_bit_index parameter is the number of bits
149   // within the secondary block, after the bits accounted for by the primary
150   // block have been removed (i.e. the remaining bits)  And, because we
151   // reset to zero with each new block, there is no need to store those
152   // actual zeros.
153   // returns 1 + the secondary block that contains the bitindex in question
154   size_t find_secondary_block(size_t block, size_t rem_bit_index) const;
155 
156   size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
157       const;
158 
159   // We create a primary index based upon the number of secondary index
160   // blocks.  The primary index uses fields wide enough to accomodate any
161   // index of the bitarray so cannot overflow
162   // The primary index is the actual running
163   // count of one bits set for all blocks (and, thus, all uint64s).
primary_index_size()164   size_t primary_index_size() const {
165     return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
166   }
167 
168   const uint64* bits_;
169   size_t size_;
170 
171   // The primary index contains the running popcount of all blocks
172   // which means the nth value contains the popcounts of
173   // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
174   vector<uint32> primary_index_;
175   // The secondary index contains the running popcount of the associated
176   // bitmap.  It is the same length (in units of uint16) as the
177   // bitmap's map is in units of uint64s.
178   vector<uint16> secondary_index_;
179 };
180 
181 }  // end namespace fst
182 
183 #endif  // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
184