• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *************************************************************************
3 *   © 2016 and later: Unicode, Inc. and others.
4 *   License & terms of use: http://www.unicode.org/copyright.html
5 *************************************************************************
6 *************************************************************************
7 *   Copyright (C) 2014, International Business Machines
8 *   Corporation and others.  All Rights Reserved.
9 *************************************************************************
10 *   file name:  bitset.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007jan15
16 *   created by: Markus Scherer
17 *
18 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
19 *   using a folded bit set consisting of a 1k-entry index table and a
20 *   compacted array of 64-bit words.
21 *   Uses a simple hash table for compaction.
22 *   Uses the original set for supplementary code points.
23 */
24 
25 #include "unicode/uniset.h"
26 #include "unicode/uobject.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/utypes.h"
29 #include "unicont.h"
30 #include "cmemory.h" // for UPRV_LENGTHOF
31 
32 using icu::UObject;
33 using icu::UnicodeSet;
34 using icu::UnicodeSetIterator;
35 
36 /*
37  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
38  * Hashes 64-bit words and maps them to 16-bit integers which are
39  * assigned in order of new incoming words for subsequent storage
40  * in a contiguous array.
41  */
42 struct BMPBitHash : public UObject {
43     int64_t keys[0x800];  // 2k
44     uint16_t values[0x800];
45     uint16_t reverse[0x400];
46     uint16_t count;
47     const int32_t prime=1301;  // Less than 2k.
48 
BMPBitHashBMPBitHash49     BMPBitHash() : count(0) {
50         // Fill values[] with 0xffff.
51         uprv_memset(values, 0xff, sizeof(values));
52     }
53 
54     /*
55      * Map a key to an integer count.
56      * Map at most 1k=0x400 different keys with this data structure.
57      */
mapBMPBitHash58     uint16_t map(int64_t key) {
59         int32_t hash=(int32_t)(key>>55)&0x1ff;
60         hash^=(int32_t)(key>>44)&0x7ff;
61         hash^=(int32_t)(key>>33)&0x7ff;
62         hash^=(int32_t)(key>>22)&0x7ff;
63         hash^=(int32_t)(key>>11)&0x7ff;
64         hash^=(int32_t)key&0x7ff;
65         for(;;) {
66             if(values[hash]==0xffff) {
67                 // Unused slot.
68                 keys[hash]=key;
69                 reverse[count]=hash;
70                 return values[hash]=count++;
71             } else if(keys[hash]==key) {
72                 // Found a slot with this key.
73                 return values[hash];
74             } else {
75                 // Used slot with a different key, move to another slot.
76                 hash=(hash+prime)&0x7ff;
77             }
78         }
79     }
80 
countKeysBMPBitHash81     uint16_t countKeys() const { return count; }
82 
83     /*
84      * Invert the hash map: Fill an array of length countKeys() with the keys
85      * indexed by their mapped values.
86      */
invertBMPBitHash87     void invert(int64_t *k) const {
88         uint16_t i;
89 
90         for(i=0; i<count; ++i) {
91             k[i]=keys[reverse[i]];
92         }
93     }
94 };
95 
96 class BitSet : public UObject, public UnicodeContainable {
97 public:
BitSet(const UnicodeSet & set,UErrorCode & errorCode)98     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
99         if(U_FAILURE(errorCode)) {
100             return;
101         }
102         BMPBitHash *bitHash=new BMPBitHash;
103         if(bitHash==nullptr || restSet==nullptr) {
104             errorCode=U_MEMORY_ALLOCATION_ERROR;
105             return;
106         }
107 
108         UnicodeSetIterator iter(set);
109         int64_t b;
110         UChar32 start, end;
111         int32_t prevIndex, i, j;
112 
113         b=0;  // Not necessary but makes compilers happy.
114         prevIndex=-1;
115         for(;;) {
116             if(iter.nextRange() && !iter.isString()) {
117                 start=iter.getCodepoint();
118                 end=iter.getCodepointEnd();
119             } else {
120                 start=0x10000;
121             }
122             i=start>>6;
123             if(prevIndex!=i) {
124                 // Finish the end of the previous range.
125                 if(prevIndex<0) {
126                     prevIndex=0;
127                 } else {
128                     index[prevIndex++]=bitHash->map(b);
129                 }
130                 // Fill all-zero entries between ranges.
131                 if(prevIndex<i) {
132                     uint16_t zero=bitHash->map(0);
133                     do {
134                         index[prevIndex++]=zero;
135                     } while(prevIndex<i);
136                 }
137                 b=0;
138             }
139             if(start>0xffff) {
140                 break;
141             }
142             b|=~((INT64_C(1)<<(start&0x3f))-1);
143             j=end>>6;
144             if(i<j) {
145                 // Set bits for the start of the range.
146                 index[i++]=bitHash->map(b);
147                 // Fill all-one entries inside the range.
148                 if(i<j) {
149                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
150                     do {
151                         index[i++]=all;
152                     } while(i<j);
153                 }
154                 b=INT64_C(0xffffffffffffffff);
155             }
156             /* i==j */
157             b&=(INT64_C(1)<<(end&0x3f))-1;
158             prevIndex=j;
159         }
160 
161         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
162             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
163         }
164         if(bits!=nullptr) {
165             bitHash->invert(bits);
166         } else {
167             bits=shortBits;
168             errorCode=U_MEMORY_ALLOCATION_ERROR;
169             return;
170         }
171 
172         latin1Set[0]=(uint32_t)bits[0];
173         latin1Set[1]=(uint32_t)(bits[0]>>32);
174         latin1Set[2]=(uint32_t)bits[1];
175         latin1Set[3]=(uint32_t)(bits[1]>>32);
176         latin1Set[4]=(uint32_t)bits[2];
177         latin1Set[5]=(uint32_t)(bits[2]>>32);
178         latin1Set[6]=(uint32_t)bits[3];
179         latin1Set[7]=(uint32_t)(bits[3]>>32);
180 
181         restSet->remove(0, 0xffff);
182     }
183 
~BitSet()184     ~BitSet() {
185         if(bits!=shortBits) {
186             uprv_free(bits);
187         }
188         delete restSet;
189     }
190 
contains(UChar32 c) const191     UBool contains(UChar32 c) const {
192         if((uint32_t)c<=0xff) {
193             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
194         } else if((uint32_t)c<0xffff) {
195             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
196         } else {
197             return restSet->contains(c);
198         }
199     }
200 
201 private:
202     uint16_t index[0x400];
203     int64_t shortBits[32];
204     int64_t *bits;
205 
206     uint32_t latin1Set[8];
207 
208     UnicodeSet *restSet;
209 };
210