1 /* 2 ********************************************************************** 3 * Copyright (C) 2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: bitset.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2007jan15 12 * created by: Markus Scherer 13 * 14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 15 * using a folded bit set consisting of a 1k-entry index table and a 16 * compacted array of 64-bit words. 17 * Uses a simple hash table for compaction. 18 * Uses the original set for supplementary code points. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicont.h" 23 #include "cmemory.h" // for UPRV_LENGTHOF 24 25 /* 26 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. 27 * Hashes 64-bit words and maps them to 16-bit integers which are 28 * assigned in order of new incoming words for subsequent storage 29 * in a contiguous array. 30 */ 31 struct BMPBitHash : public UObject { 32 int64_t keys[0x800]; // 2k 33 uint16_t values[0x800]; 34 uint16_t reverse[0x400]; 35 uint16_t count; 36 const int32_t prime=1301; // Less than 2k. 37 BMPBitHashBMPBitHash38 BMPBitHash() : count(0) { 39 // Fill values[] with 0xffff. 40 uprv_memset(values, 0xff, sizeof(values)); 41 } 42 43 /* 44 * Map a key to an integer count. 45 * Map at most 1k=0x400 different keys with this data structure. 46 */ mapBMPBitHash47 uint16_t map(int64_t key) { 48 int32_t hash=(int32_t)(key>>55)&0x1ff; 49 hash^=(int32_t)(key>>44)&0x7ff; 50 hash^=(int32_t)(key>>33)&0x7ff; 51 hash^=(int32_t)(key>>22)&0x7ff; 52 hash^=(int32_t)(key>>11)&0x7ff; 53 hash^=(int32_t)key&0x7ff; 54 for(;;) { 55 if(values[hash]==0xffff) { 56 // Unused slot. 57 keys[hash]=key; 58 reverse[count]=hash; 59 return values[hash]=count++; 60 } else if(keys[hash]==key) { 61 // Found a slot with this key. 62 return values[hash]; 63 } else { 64 // Used slot with a different key, move to another slot. 65 hash=(hash+prime)&0x7ff; 66 } 67 } 68 } 69 countKeysBMPBitHash70 uint16_t countKeys() const { return count; } 71 72 /* 73 * Invert the hash map: Fill an array of length countKeys() with the keys 74 * indexed by their mapped values. 75 */ invertBMPBitHash76 void invert(int64_t *k) const { 77 uint16_t i; 78 79 for(i=0; i<count; ++i) { 80 k[i]=keys[reverse[i]]; 81 } 82 } 83 }; 84 85 class BitSet : public UObject, public UnicodeContainable { 86 public: BitSet(const UnicodeSet & set,UErrorCode & errorCode)87 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { 88 if(U_FAILURE(errorCode)) { 89 return; 90 } 91 BMPBitHash *bitHash=new BMPBitHash; 92 if(bitHash==NULL || restSet==NULL) { 93 errorCode=U_MEMORY_ALLOCATION_ERROR; 94 return; 95 } 96 97 UnicodeSetIterator iter(set); 98 int64_t b; 99 UChar32 start, end; 100 int32_t prevIndex, i, j; 101 102 b=0; // Not necessary but makes compilers happy. 103 prevIndex=-1; 104 for(;;) { 105 if(iter.nextRange() && !iter.isString()) { 106 start=iter.getCodepoint(); 107 end=iter.getCodepointEnd(); 108 } else { 109 start=0x10000; 110 } 111 i=start>>6; 112 if(prevIndex!=i) { 113 // Finish the end of the previous range. 114 if(prevIndex<0) { 115 prevIndex=0; 116 } else { 117 index[prevIndex++]=bitHash->map(b); 118 } 119 // Fill all-zero entries between ranges. 120 if(prevIndex<i) { 121 uint16_t zero=bitHash->map(0); 122 do { 123 index[prevIndex++]=zero; 124 } while(prevIndex<i); 125 } 126 b=0; 127 } 128 if(start>0xffff) { 129 break; 130 } 131 b|=~((INT64_C(1)<<(start&0x3f))-1); 132 j=end>>6; 133 if(i<j) { 134 // Set bits for the start of the range. 135 index[i++]=bitHash->map(b); 136 // Fill all-one entries inside the range. 137 if(i<j) { 138 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); 139 do { 140 index[i++]=all; 141 } while(i<j); 142 } 143 b=INT64_C(0xffffffffffffffff); 144 } 145 /* i==j */ 146 b&=(INT64_C(1)<<(end&0x3f))-1; 147 prevIndex=j; 148 } 149 150 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { 151 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); 152 } 153 if(bits!=NULL) { 154 bitHash->invert(bits); 155 } else { 156 bits=shortBits; 157 errorCode=U_MEMORY_ALLOCATION_ERROR; 158 return; 159 } 160 161 latin1Set[0]=(uint32_t)bits[0]; 162 latin1Set[1]=(uint32_t)(bits[0]>>32); 163 latin1Set[2]=(uint32_t)bits[1]; 164 latin1Set[3]=(uint32_t)(bits[1]>>32); 165 latin1Set[4]=(uint32_t)bits[2]; 166 latin1Set[5]=(uint32_t)(bits[2]>>32); 167 latin1Set[6]=(uint32_t)bits[3]; 168 latin1Set[7]=(uint32_t)(bits[3]>>32); 169 170 restSet.remove(0, 0xffff); 171 } 172 ~BitSet()173 ~BitSet() { 174 if(bits!=shortBits) { 175 uprv_free(bits); 176 } 177 delete restSet; 178 } 179 contains(UChar32 c) const180 UBool contains(UChar32 c) const { 181 if((uint32_t)c<=0xff) { 182 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); 183 } else if((uint32_t)c<0xffff) { 184 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); 185 } else { 186 return restSet->contains(c); 187 } 188 } 189 190 private: 191 uint16_t index[0x400]; 192 int64_t shortBits[32]; 193 int64_t *bits; 194 195 uint32_t latin1Bits[8]; 196 197 UnicodeSet *restSet; 198 }; 199