1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbisetb.h 5 /* 6 ********************************************************************** 7 * Copyright (c) 2001-2005, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ********************************************************************** 10 */ 11 12 #ifndef RBBISETB_H 13 #define RBBISETB_H 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/uobject.h" 20 #include "rbbirb.h" 21 #include "utrie2.h" 22 #include "uvector.h" 23 24 U_NAMESPACE_BEGIN 25 26 // 27 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine 28 // from the Unicode Sets appearing in the source RBBI rules, and 29 // creates the TRIE table used to map from Unicode to the 30 // character categories. 31 // 32 33 34 // 35 // RangeDescriptor 36 // 37 // Each of the non-overlapping character ranges gets one of these descriptors. 38 // All of them are strung together in a linked list, which is kept in order 39 // (by character) 40 // 41 class RangeDescriptor : public UMemory { 42 public: 43 UChar32 fStartChar; // Start of range, unicode 32 bit value. 44 UChar32 fEndChar; // End of range, unicode 32 bit value. 45 int32_t fNum; // runtime-mapped input value for this range. 46 UVector *fIncludesSets; // vector of the the original 47 // Unicode sets that include this range. 48 // (Contains ptrs to uset nodes) 49 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 50 51 RangeDescriptor(UErrorCode &status); 52 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 53 ~RangeDescriptor(); 54 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 55 // where appearing in the second (higher) part. 56 void setDictionaryFlag(); // Check whether this range appears as part of 57 // the Unicode set named "dictionary" 58 59 private: 60 RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class 61 RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class 62 }; 63 64 65 // 66 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 67 // 68 // Starting with the rules parse tree from the scanner, 69 // 70 // - Enumerate the set of UnicodeSets that are referenced 71 // by the RBBI rules. 72 // - compute a derived set of non-overlapping UnicodeSets 73 // that will correspond to columns in the state table for 74 // the RBBI execution engine. 75 // - construct the trie table that maps input characters 76 // to set numbers in the non-overlapping set of sets. 77 // 78 79 80 class RBBISetBuilder : public UMemory { 81 public: 82 RBBISetBuilder(RBBIRuleBuilder *rb); 83 ~RBBISetBuilder(); 84 85 void buildRanges(); 86 void buildTrie(); 87 void addValToSets(UVector *sets, uint32_t val); 88 void addValToSet (RBBINode *usetNode, uint32_t val); 89 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 90 // runtime state machine, which are the same as 91 // columns in the DFA state table 92 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 93 void serializeTrie(uint8_t *where); // write out the serialized Trie. 94 UChar32 getFirstChar(int32_t val) const; 95 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 96 // character were encountered. 97 /** 98 * Merge two character categories that have been identified as having equivalent behavior. 99 * The ranges belonging to the second category (table column) will be added to the first. 100 * @param categories the pair of categories to be merged. 101 */ 102 void mergeCategories(IntPair categories); 103 104 static constexpr int32_t DICT_BIT = 0x4000; 105 106 #ifdef RBBI_DEBUG 107 void printSets(); 108 void printRanges(); 109 void printRangeGroups(); 110 #else 111 #define printSets() 112 #define printRanges() 113 #define printRangeGroups() 114 #endif 115 116 private: 117 void numberSets(); 118 119 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 120 UErrorCode *fStatus; 121 122 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 123 124 UTrie2 *fTrie; // The mapping TRIE that is the end result of processing 125 uint32_t fTrieSize; // the Unicode Sets. 126 127 // Groups correspond to character categories - 128 // groups of ranges that are in the same original UnicodeSets. 129 // fGroupCount is the index of the last used group. 130 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 131 // State table column 0 is not used. Column 1 is for end-of-input. 132 // column 2 is for group 0. Funny counting. 133 int32_t fGroupCount; 134 135 UBool fSawBOF; 136 137 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 138 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 139 }; 140 141 142 143 U_NAMESPACE_END 144 145 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 146 147 #endif 148