1 /* 2 ************************************************************************* 3 * © 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html#License 5 ************************************************************************* 6 ************************************************************************* 7 * Copyright (C) 2014, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ************************************************************************* 10 * file name: bitset.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007jan15 16 * created by: Markus Scherer 17 * 18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 19 * using a folded bit set consisting of a 1k-entry index table and a 20 * compacted array of 64-bit words. 21 * Uses a simple hash table for compaction. 22 * Uses the original set for supplementary code points. 23 */ 24 25 #include "unicode/utypes.h" 26 #include "unicont.h" 27 #include "cmemory.h" // for UPRV_LENGTHOF 28 29 /* 30 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. 31 * Hashes 64-bit words and maps them to 16-bit integers which are 32 * assigned in order of new incoming words for subsequent storage 33 * in a contiguous array. 34 */ 35 struct BMPBitHash : public UObject { 36 int64_t keys[0x800]; // 2k 37 uint16_t values[0x800]; 38 uint16_t reverse[0x400]; 39 uint16_t count; 40 const int32_t prime=1301; // Less than 2k. 41 BMPBitHashBMPBitHash42 BMPBitHash() : count(0) { 43 // Fill values[] with 0xffff. 44 uprv_memset(values, 0xff, sizeof(values)); 45 } 46 47 /* 48 * Map a key to an integer count. 49 * Map at most 1k=0x400 different keys with this data structure. 50 */ mapBMPBitHash51 uint16_t map(int64_t key) { 52 int32_t hash=(int32_t)(key>>55)&0x1ff; 53 hash^=(int32_t)(key>>44)&0x7ff; 54 hash^=(int32_t)(key>>33)&0x7ff; 55 hash^=(int32_t)(key>>22)&0x7ff; 56 hash^=(int32_t)(key>>11)&0x7ff; 57 hash^=(int32_t)key&0x7ff; 58 for(;;) { 59 if(values[hash]==0xffff) { 60 // Unused slot. 61 keys[hash]=key; 62 reverse[count]=hash; 63 return values[hash]=count++; 64 } else if(keys[hash]==key) { 65 // Found a slot with this key. 66 return values[hash]; 67 } else { 68 // Used slot with a different key, move to another slot. 69 hash=(hash+prime)&0x7ff; 70 } 71 } 72 } 73 countKeysBMPBitHash74 uint16_t countKeys() const { return count; } 75 76 /* 77 * Invert the hash map: Fill an array of length countKeys() with the keys 78 * indexed by their mapped values. 79 */ invertBMPBitHash80 void invert(int64_t *k) const { 81 uint16_t i; 82 83 for(i=0; i<count; ++i) { 84 k[i]=keys[reverse[i]]; 85 } 86 } 87 }; 88 89 class BitSet : public UObject, public UnicodeContainable { 90 public: BitSet(const UnicodeSet & set,UErrorCode & errorCode)91 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { 92 if(U_FAILURE(errorCode)) { 93 return; 94 } 95 BMPBitHash *bitHash=new BMPBitHash; 96 if(bitHash==NULL || restSet==NULL) { 97 errorCode=U_MEMORY_ALLOCATION_ERROR; 98 return; 99 } 100 101 UnicodeSetIterator iter(set); 102 int64_t b; 103 UChar32 start, end; 104 int32_t prevIndex, i, j; 105 106 b=0; // Not necessary but makes compilers happy. 107 prevIndex=-1; 108 for(;;) { 109 if(iter.nextRange() && !iter.isString()) { 110 start=iter.getCodepoint(); 111 end=iter.getCodepointEnd(); 112 } else { 113 start=0x10000; 114 } 115 i=start>>6; 116 if(prevIndex!=i) { 117 // Finish the end of the previous range. 118 if(prevIndex<0) { 119 prevIndex=0; 120 } else { 121 index[prevIndex++]=bitHash->map(b); 122 } 123 // Fill all-zero entries between ranges. 124 if(prevIndex<i) { 125 uint16_t zero=bitHash->map(0); 126 do { 127 index[prevIndex++]=zero; 128 } while(prevIndex<i); 129 } 130 b=0; 131 } 132 if(start>0xffff) { 133 break; 134 } 135 b|=~((INT64_C(1)<<(start&0x3f))-1); 136 j=end>>6; 137 if(i<j) { 138 // Set bits for the start of the range. 139 index[i++]=bitHash->map(b); 140 // Fill all-one entries inside the range. 141 if(i<j) { 142 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); 143 do { 144 index[i++]=all; 145 } while(i<j); 146 } 147 b=INT64_C(0xffffffffffffffff); 148 } 149 /* i==j */ 150 b&=(INT64_C(1)<<(end&0x3f))-1; 151 prevIndex=j; 152 } 153 154 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { 155 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); 156 } 157 if(bits!=NULL) { 158 bitHash->invert(bits); 159 } else { 160 bits=shortBits; 161 errorCode=U_MEMORY_ALLOCATION_ERROR; 162 return; 163 } 164 165 latin1Set[0]=(uint32_t)bits[0]; 166 latin1Set[1]=(uint32_t)(bits[0]>>32); 167 latin1Set[2]=(uint32_t)bits[1]; 168 latin1Set[3]=(uint32_t)(bits[1]>>32); 169 latin1Set[4]=(uint32_t)bits[2]; 170 latin1Set[5]=(uint32_t)(bits[2]>>32); 171 latin1Set[6]=(uint32_t)bits[3]; 172 latin1Set[7]=(uint32_t)(bits[3]>>32); 173 174 restSet.remove(0, 0xffff); 175 } 176 ~BitSet()177 ~BitSet() { 178 if(bits!=shortBits) { 179 uprv_free(bits); 180 } 181 delete restSet; 182 } 183 contains(UChar32 c) const184 UBool contains(UChar32 c) const { 185 if((uint32_t)c<=0xff) { 186 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); 187 } else if((uint32_t)c<0xffff) { 188 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); 189 } else { 190 return restSet->contains(c); 191 } 192 } 193 194 private: 195 uint16_t index[0x400]; 196 int64_t shortBits[32]; 197 int64_t *bits; 198 199 uint32_t latin1Bits[8]; 200 201 UnicodeSet *restSet; 202 }; 203