1 /* 2 ************************************************************************* 3 * © 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html 5 ************************************************************************* 6 ************************************************************************* 7 * Copyright (C) 2014, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ************************************************************************* 10 * file name: bitset.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007jan15 16 * created by: Markus Scherer 17 * 18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 19 * using a folded bit set consisting of a 1k-entry index table and a 20 * compacted array of 64-bit words. 21 * Uses a simple hash table for compaction. 22 * Uses the original set for supplementary code points. 23 */ 24 25 #include "unicode/uniset.h" 26 #include "unicode/uobject.h" 27 #include "unicode/usetiter.h" 28 #include "unicode/utypes.h" 29 #include "unicont.h" 30 #include "cmemory.h" // for UPRV_LENGTHOF 31 32 using icu::UObject; 33 using icu::UnicodeSet; 34 using icu::UnicodeSetIterator; 35 36 /* 37 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. 38 * Hashes 64-bit words and maps them to 16-bit integers which are 39 * assigned in order of new incoming words for subsequent storage 40 * in a contiguous array. 41 */ 42 struct BMPBitHash : public UObject { 43 int64_t keys[0x800]; // 2k 44 uint16_t values[0x800]; 45 uint16_t reverse[0x400]; 46 uint16_t count; 47 const int32_t prime=1301; // Less than 2k. 48 BMPBitHashBMPBitHash49 BMPBitHash() : count(0) { 50 // Fill values[] with 0xffff. 51 uprv_memset(values, 0xff, sizeof(values)); 52 } 53 54 /* 55 * Map a key to an integer count. 56 * Map at most 1k=0x400 different keys with this data structure. 57 */ mapBMPBitHash58 uint16_t map(int64_t key) { 59 int32_t hash=(int32_t)(key>>55)&0x1ff; 60 hash^=(int32_t)(key>>44)&0x7ff; 61 hash^=(int32_t)(key>>33)&0x7ff; 62 hash^=(int32_t)(key>>22)&0x7ff; 63 hash^=(int32_t)(key>>11)&0x7ff; 64 hash^=(int32_t)key&0x7ff; 65 for(;;) { 66 if(values[hash]==0xffff) { 67 // Unused slot. 68 keys[hash]=key; 69 reverse[count]=hash; 70 return values[hash]=count++; 71 } else if(keys[hash]==key) { 72 // Found a slot with this key. 73 return values[hash]; 74 } else { 75 // Used slot with a different key, move to another slot. 76 hash=(hash+prime)&0x7ff; 77 } 78 } 79 } 80 countKeysBMPBitHash81 uint16_t countKeys() const { return count; } 82 83 /* 84 * Invert the hash map: Fill an array of length countKeys() with the keys 85 * indexed by their mapped values. 86 */ invertBMPBitHash87 void invert(int64_t *k) const { 88 uint16_t i; 89 90 for(i=0; i<count; ++i) { 91 k[i]=keys[reverse[i]]; 92 } 93 } 94 }; 95 96 class BitSet : public UObject, public UnicodeContainable { 97 public: BitSet(const UnicodeSet & set,UErrorCode & errorCode)98 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { 99 if(U_FAILURE(errorCode)) { 100 return; 101 } 102 BMPBitHash *bitHash=new BMPBitHash; 103 if(bitHash==nullptr || restSet==nullptr) { 104 errorCode=U_MEMORY_ALLOCATION_ERROR; 105 return; 106 } 107 108 UnicodeSetIterator iter(set); 109 int64_t b; 110 UChar32 start, end; 111 int32_t prevIndex, i, j; 112 113 b=0; // Not necessary but makes compilers happy. 114 prevIndex=-1; 115 for(;;) { 116 if(iter.nextRange() && !iter.isString()) { 117 start=iter.getCodepoint(); 118 end=iter.getCodepointEnd(); 119 } else { 120 start=0x10000; 121 } 122 i=start>>6; 123 if(prevIndex!=i) { 124 // Finish the end of the previous range. 125 if(prevIndex<0) { 126 prevIndex=0; 127 } else { 128 index[prevIndex++]=bitHash->map(b); 129 } 130 // Fill all-zero entries between ranges. 131 if(prevIndex<i) { 132 uint16_t zero=bitHash->map(0); 133 do { 134 index[prevIndex++]=zero; 135 } while(prevIndex<i); 136 } 137 b=0; 138 } 139 if(start>0xffff) { 140 break; 141 } 142 b|=~((INT64_C(1)<<(start&0x3f))-1); 143 j=end>>6; 144 if(i<j) { 145 // Set bits for the start of the range. 146 index[i++]=bitHash->map(b); 147 // Fill all-one entries inside the range. 148 if(i<j) { 149 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); 150 do { 151 index[i++]=all; 152 } while(i<j); 153 } 154 b=INT64_C(0xffffffffffffffff); 155 } 156 /* i==j */ 157 b&=(INT64_C(1)<<(end&0x3f))-1; 158 prevIndex=j; 159 } 160 161 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { 162 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); 163 } 164 if(bits!=nullptr) { 165 bitHash->invert(bits); 166 } else { 167 bits=shortBits; 168 errorCode=U_MEMORY_ALLOCATION_ERROR; 169 return; 170 } 171 172 latin1Set[0]=(uint32_t)bits[0]; 173 latin1Set[1]=(uint32_t)(bits[0]>>32); 174 latin1Set[2]=(uint32_t)bits[1]; 175 latin1Set[3]=(uint32_t)(bits[1]>>32); 176 latin1Set[4]=(uint32_t)bits[2]; 177 latin1Set[5]=(uint32_t)(bits[2]>>32); 178 latin1Set[6]=(uint32_t)bits[3]; 179 latin1Set[7]=(uint32_t)(bits[3]>>32); 180 181 restSet->remove(0, 0xffff); 182 } 183 ~BitSet()184 ~BitSet() { 185 if(bits!=shortBits) { 186 uprv_free(bits); 187 } 188 delete restSet; 189 } 190 contains(UChar32 c) const191 UBool contains(UChar32 c) const { 192 if((uint32_t)c<=0xff) { 193 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); 194 } else if((uint32_t)c<0xffff) { 195 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); 196 } else { 197 return restSet->contains(c); 198 } 199 } 200 201 private: 202 uint16_t index[0x400]; 203 int64_t shortBits[32]; 204 int64_t *bits; 205 206 uint32_t latin1Set[8]; 207 208 UnicodeSet *restSet; 209 }; 210