1 /* 2 ************************************************************************* 3 * Copyright (C) 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html#License 5 ************************************************************************* 6 ************************************************************************* 7 * Copyright (C) 2007, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ************************************************************************* 10 * file name: trieset.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007jan15 16 * created by: Markus Scherer 17 * 18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 19 * using a UTrie with 8-bit (byte) results per code point. 20 * Modifies the trie index to make the BMP linear, and uses the original set 21 * for supplementary code points. 22 */ 23 24 #include "unicode/utypes.h" 25 #include "unicont.h" 26 27 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH) 28 29 #define UTRIE_GET8_FROM_LEAD(trie, c16) \ 30 ((const uint8_t *)(trie)->data32)[ \ 31 ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \ 32 ((c16)&UTRIE_MASK) \ 33 ] 34 35 class TrieSet : public UObject, public UnicodeContainable { 36 public: TrieSet(const UnicodeSet & set,UErrorCode & errorCode)37 TrieSet(const UnicodeSet &set, UErrorCode &errorCode) 38 : trieData(NULL), latin1(NULL), restSet(set.clone()) { 39 if(U_FAILURE(errorCode)) { 40 return; 41 } 42 if(restSet==NULL) { 43 errorCode=U_MEMORY_ALLOCATION_ERROR; 44 return; 45 } 46 47 UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE); 48 UChar32 start, end; 49 50 UnicodeSetIterator iter(set); 51 52 while(iter.nextRange() && !iter.isString()) { 53 start=iter.getCodepoint(); 54 end=iter.getCodepointEnd(); 55 if(start>0xffff) { 56 break; 57 } 58 if(end>0xffff) { 59 end=0xffff; 60 } 61 if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) { 62 errorCode=U_INTERNAL_PROGRAM_ERROR; 63 return; 64 } 65 } 66 67 // Preflight the trie length. 68 int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode); 69 if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { 70 return; 71 } 72 73 trieData=(uint32_t *)uprv_malloc(length); 74 if(trieData==NULL) { 75 errorCode=U_MEMORY_ALLOCATION_ERROR; 76 return; 77 } 78 79 errorCode=U_ZERO_ERROR; 80 utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode); 81 utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie! 82 83 if(U_SUCCESS(errorCode)) { 84 // Copy the indexes for surrogate code points into the BMP range 85 // for simple access across the entire BMP. 86 uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), 87 trie.index+UTRIE_BMP_INDEX_LENGTH, 88 (0x800>>UTRIE_SHIFT)*2); 89 latin1=UTRIE_GET8_LATIN1(&trie); 90 } 91 92 restSet.remove(0, 0xffff); 93 } 94 ~TrieSet()95 ~TrieSet() { 96 uprv_free(trieData); 97 delete restSet; 98 } 99 contains(UChar32 c) const100 UBool contains(UChar32 c) const { 101 if((uint32_t)c<=0xff) { 102 return (UBool)latin1[c]; 103 } else if((uint32_t)c<0xffff) { 104 return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); 105 } else { 106 return restSet->contains(c); 107 } 108 } 109 110 private: 111 uint32_t *trieData; 112 const uint8_t *latin1; 113 UTrie trie; 114 UnicodeSet *restSet; 115 }; 116