• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  bitset.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2007jan15
12 *   created by: Markus Scherer
13 *
14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15 *   using a folded bit set consisting of a 1k-entry index table and a
16 *   compacted array of 64-bit words.
17 *   Uses a simple hash table for compaction.
18 *   Uses the original set for supplementary code points.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicont.h"
23 #include "cmemory.h" // for UPRV_LENGTHOF
24 
25 /*
26  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
27  * Hashes 64-bit words and maps them to 16-bit integers which are
28  * assigned in order of new incoming words for subsequent storage
29  * in a contiguous array.
30  */
31 struct BMPBitHash : public UObject {
32     int64_t keys[0x800];  // 2k
33     uint16_t values[0x800];
34     uint16_t reverse[0x400];
35     uint16_t count;
36     const int32_t prime=1301;  // Less than 2k.
37 
BMPBitHashBMPBitHash38     BMPBitHash() : count(0) {
39         // Fill values[] with 0xffff.
40         uprv_memset(values, 0xff, sizeof(values));
41     }
42 
43     /*
44      * Map a key to an integer count.
45      * Map at most 1k=0x400 different keys with this data structure.
46      */
mapBMPBitHash47     uint16_t map(int64_t key) {
48         int32_t hash=(int32_t)(key>>55)&0x1ff;
49         hash^=(int32_t)(key>>44)&0x7ff;
50         hash^=(int32_t)(key>>33)&0x7ff;
51         hash^=(int32_t)(key>>22)&0x7ff;
52         hash^=(int32_t)(key>>11)&0x7ff;
53         hash^=(int32_t)key&0x7ff;
54         for(;;) {
55             if(values[hash]==0xffff) {
56                 // Unused slot.
57                 keys[hash]=key;
58                 reverse[count]=hash;
59                 return values[hash]=count++;
60             } else if(keys[hash]==key) {
61                 // Found a slot with this key.
62                 return values[hash];
63             } else {
64                 // Used slot with a different key, move to another slot.
65                 hash=(hash+prime)&0x7ff;
66             }
67         }
68     }
69 
countKeysBMPBitHash70     uint16_t countKeys() const { return count; }
71 
72     /*
73      * Invert the hash map: Fill an array of length countKeys() with the keys
74      * indexed by their mapped values.
75      */
invertBMPBitHash76     void invert(int64_t *k) const {
77         uint16_t i;
78 
79         for(i=0; i<count; ++i) {
80             k[i]=keys[reverse[i]];
81         }
82     }
83 };
84 
85 class BitSet : public UObject, public UnicodeContainable {
86 public:
BitSet(const UnicodeSet & set,UErrorCode & errorCode)87     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
88         if(U_FAILURE(errorCode)) {
89             return;
90         }
91         BMPBitHash *bitHash=new BMPBitHash;
92         if(bitHash==NULL || restSet==NULL) {
93             errorCode=U_MEMORY_ALLOCATION_ERROR;
94             return;
95         }
96 
97         UnicodeSetIterator iter(set);
98         int64_t b;
99         UChar32 start, end;
100         int32_t prevIndex, i, j;
101 
102         b=0;  // Not necessary but makes compilers happy.
103         prevIndex=-1;
104         for(;;) {
105             if(iter.nextRange() && !iter.isString()) {
106                 start=iter.getCodepoint();
107                 end=iter.getCodepointEnd();
108             } else {
109                 start=0x10000;
110             }
111             i=start>>6;
112             if(prevIndex!=i) {
113                 // Finish the end of the previous range.
114                 if(prevIndex<0) {
115                     prevIndex=0;
116                 } else {
117                     index[prevIndex++]=bitHash->map(b);
118                 }
119                 // Fill all-zero entries between ranges.
120                 if(prevIndex<i) {
121                     uint16_t zero=bitHash->map(0);
122                     do {
123                         index[prevIndex++]=zero;
124                     } while(prevIndex<i);
125                 }
126                 b=0;
127             }
128             if(start>0xffff) {
129                 break;
130             }
131             b|=~((INT64_C(1)<<(start&0x3f))-1);
132             j=end>>6;
133             if(i<j) {
134                 // Set bits for the start of the range.
135                 index[i++]=bitHash->map(b);
136                 // Fill all-one entries inside the range.
137                 if(i<j) {
138                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
139                     do {
140                         index[i++]=all;
141                     } while(i<j);
142                 }
143                 b=INT64_C(0xffffffffffffffff);
144             }
145             /* i==j */
146             b&=(INT64_C(1)<<(end&0x3f))-1;
147             prevIndex=j;
148         }
149 
150         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
151             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
152         }
153         if(bits!=NULL) {
154             bitHash->invert(bits);
155         } else {
156             bits=shortBits;
157             errorCode=U_MEMORY_ALLOCATION_ERROR;
158             return;
159         }
160 
161         latin1Set[0]=(uint32_t)bits[0];
162         latin1Set[1]=(uint32_t)(bits[0]>>32);
163         latin1Set[2]=(uint32_t)bits[1];
164         latin1Set[3]=(uint32_t)(bits[1]>>32);
165         latin1Set[4]=(uint32_t)bits[2];
166         latin1Set[5]=(uint32_t)(bits[2]>>32);
167         latin1Set[6]=(uint32_t)bits[3];
168         latin1Set[7]=(uint32_t)(bits[3]>>32);
169 
170         restSet.remove(0, 0xffff);
171     }
172 
~BitSet()173     ~BitSet() {
174         if(bits!=shortBits) {
175             uprv_free(bits);
176         }
177         delete restSet;
178     }
179 
contains(UChar32 c) const180     UBool contains(UChar32 c) const {
181         if((uint32_t)c<=0xff) {
182             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
183         } else if((uint32_t)c<0xffff) {
184             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
185         } else {
186             return restSet->contains(c);
187         }
188     }
189 
190 private:
191     uint16_t index[0x400];
192     int64_t shortBits[32];
193     int64_t *bits;
194 
195     uint32_t latin1Bits[8];
196 
197     UnicodeSet *restSet;
198 };
199