• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  bitset.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2007jan15
12 *   created by: Markus Scherer
13 *
14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15 *   using a folded bit set consisting of a 1k-entry index table and a
16 *   compacted array of 64-bit words.
17 *   Uses a simple hash table for compaction.
18 *   Uses the original set for supplementary code points.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicont.h"
23 
24 /*
25  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
26  * Hashes 64-bit words and maps them to 16-bit integers which are
27  * assigned in order of new incoming words for subsequent storage
28  * in a contiguous array.
29  */
30 struct BMPBitHash : public UObject {
31     int64_t keys[0x800];  // 2k
32     uint16_t values[0x800];
33     uint16_t reverse[0x400];
34     uint16_t count;
35     const int32_t prime=1301;  // Less than 2k.
36 
BMPBitHashBMPBitHash37     BMPBitHash() : count(0) {
38         // Fill values[] with 0xffff.
39         uprv_memset(values, 0xff, sizeof(values));
40     }
41 
42     /*
43      * Map a key to an integer count.
44      * Map at most 1k=0x400 different keys with this data structure.
45      */
mapBMPBitHash46     uint16_t map(int64_t key) {
47         int32_t hash=(int32_t)(key>>55)&0x1ff;
48         hash^=(int32_t)(key>>44)&0x7ff;
49         hash^=(int32_t)(key>>33)&0x7ff;
50         hash^=(int32_t)(key>>22)&0x7ff;
51         hash^=(int32_t)(key>>11)&0x7ff;
52         hash^=(int32_t)key&0x7ff;
53         for(;;) {
54             if(values[hash]==0xffff) {
55                 // Unused slot.
56                 keys[hash]=key;
57                 reverse[count]=hash;
58                 return values[hash]=count++;
59             } else if(keys[hash]==key) {
60                 // Found a slot with this key.
61                 return values[hash];
62             } else {
63                 // Used slot with a different key, move to another slot.
64                 hash=(hash+prime)&0x7ff;
65             }
66         }
67     }
68 
countKeysBMPBitHash69     uint16_t countKeys() const { return count; }
70 
71     /*
72      * Invert the hash map: Fill an array of length countKeys() with the keys
73      * indexed by their mapped values.
74      */
invertBMPBitHash75     void invert(int64_t *k) const {
76         uint16_t i;
77 
78         for(i=0; i<count; ++i) {
79             k[i]=keys[reverse[i]];
80         }
81     }
82 };
83 
84 class BitSet : public UObject, public UnicodeContainable {
85 public:
BitSet(const UnicodeSet & set,UErrorCode & errorCode)86     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
87         if(U_FAILURE(errorCode)) {
88             return;
89         }
90         BMPBitHash *bitHash=new BMPBitHash;
91         if(bitHash==NULL || restSet==NULL) {
92             errorCode=U_MEMORY_ALLOCATION_ERROR;
93             return;
94         }
95 
96         UnicodeSetIterator iter(set);
97         int64_t b;
98         UChar32 start, end;
99         int32_t prevIndex, i, j;
100 
101         b=0;  // Not necessary but makes compilers happy.
102         prevIndex=-1;
103         for(;;) {
104             if(iter.nextRange() && !iter.isString()) {
105                 start=iter.getCodepoint();
106                 end=iter.getCodepointEnd();
107             } else {
108                 start=0x10000;
109             }
110             i=start>>6;
111             if(prevIndex!=i) {
112                 // Finish the end of the previous range.
113                 if(prevIndex<0) {
114                     prevIndex=0;
115                 } else {
116                     index[prevIndex++]=bitHash->map(b);
117                 }
118                 // Fill all-zero entries between ranges.
119                 if(prevIndex<i) {
120                     uint16_t zero=bitHash->map(0);
121                     do {
122                         index[prevIndex++]=zero;
123                     } while(prevIndex<i);
124                 }
125                 b=0;
126             }
127             if(start>0xffff) {
128                 break;
129             }
130             b|=~((INT64_C(1)<<(start&0x3f))-1);
131             j=end>>6;
132             if(i<j) {
133                 // Set bits for the start of the range.
134                 index[i++]=bitHash->map(b);
135                 // Fill all-one entries inside the range.
136                 if(i<j) {
137                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
138                     do {
139                         index[i++]=all;
140                     } while(i<j);
141                 }
142                 b=INT64_C(0xffffffffffffffff);
143             }
144             /* i==j */
145             b&=(INT64_C(1)<<(end&0x3f))-1;
146             prevIndex=j;
147         }
148 
149         if(bitHash->countKeys()>LENGTHOF(shortBits)) {
150             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
151         }
152         if(bits!=NULL) {
153             bitHash->invert(bits);
154         } else {
155             bits=shortBits;
156             errorCode=U_MEMORY_ALLOCATION_ERROR;
157             return;
158         }
159 
160         latin1Set[0]=(uint32_t)bits[0];
161         latin1Set[1]=(uint32_t)(bits[0]>>32);
162         latin1Set[2]=(uint32_t)bits[1];
163         latin1Set[3]=(uint32_t)(bits[1]>>32);
164         latin1Set[4]=(uint32_t)bits[2];
165         latin1Set[5]=(uint32_t)(bits[2]>>32);
166         latin1Set[6]=(uint32_t)bits[3];
167         latin1Set[7]=(uint32_t)(bits[3]>>32);
168 
169         restSet.remove(0, 0xffff);
170     }
171 
~BitSet()172     ~BitSet() {
173         if(bits!=shortBits) {
174             uprv_free(bits);
175         }
176         delete restSet;
177     }
178 
contains(UChar32 c) const179     UBool contains(UChar32 c) const {
180         if((uint32_t)c<=0xff) {
181             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
182         } else if((uint32_t)c<0xffff) {
183             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
184         } else {
185             return restSet->contains(c);
186         }
187     }
188 
189 private:
190     uint16_t index[0x400];
191     int64_t shortBits[32];
192     int64_t *bits;
193 
194     uint32_t latin1Bits[8];
195 
196     UnicodeSet *restSet;
197 };
198