1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbisetb.h 5 /* 6 ********************************************************************** 7 * Copyright (c) 2001-2005, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ********************************************************************** 10 */ 11 12 #ifndef RBBISETB_H 13 #define RBBISETB_H 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/ucptrie.h" 20 #include "unicode/umutablecptrie.h" 21 #include "unicode/uobject.h" 22 #include "rbbirb.h" 23 #include "uvector.h" 24 25 U_NAMESPACE_BEGIN 26 27 // 28 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine 29 // from the Unicode Sets appearing in the source RBBI rules, and 30 // creates the TRIE table used to map from Unicode to the 31 // character categories. 32 // 33 34 35 // 36 // RangeDescriptor 37 // 38 // Each of the non-overlapping character ranges gets one of these descriptors. 39 // All of them are strung together in a linked list, which is kept in order 40 // (by character) 41 // 42 class RangeDescriptor : public UMemory { 43 public: 44 UChar32 fStartChar {}; // Start of range, unicode 32 bit value. 45 UChar32 fEndChar {}; // End of range, unicode 32 bit value. 46 int32_t fNum {0}; // runtime-mapped input value for this range. 47 bool fIncludesDict {false}; // True if the range includes $dictionary. 48 bool fFirstInGroup {false}; // True if first range in a group with the same fNum. 49 UVector *fIncludesSets {nullptr}; // vector of the the original 50 // Unicode sets that include this range. 51 // (Contains ptrs to uset nodes) 52 RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list. 53 54 RangeDescriptor(UErrorCode &status); 55 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 56 ~RangeDescriptor(); 57 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 58 // where appearing in the second (higher) part. 59 bool isDictionaryRange(); // Check whether this range appears as part of 60 // the Unicode set named "dictionary" 61 62 RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class 63 RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class 64 }; 65 66 67 // 68 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 69 // 70 // Starting with the rules parse tree from the scanner, 71 // 72 // - Enumerate the set of UnicodeSets that are referenced 73 // by the RBBI rules. 74 // - compute a derived set of non-overlapping UnicodeSets 75 // that will correspond to columns in the state table for 76 // the RBBI execution engine. 77 // - construct the trie table that maps input characters 78 // to set numbers in the non-overlapping set of sets. 79 // 80 81 82 class RBBISetBuilder : public UMemory { 83 public: 84 RBBISetBuilder(RBBIRuleBuilder *rb); 85 ~RBBISetBuilder(); 86 87 void buildRanges(); 88 void buildTrie(); 89 void addValToSets(UVector *sets, uint32_t val); 90 void addValToSet (RBBINode *usetNode, uint32_t val); 91 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 92 // runtime state machine, which are the same as 93 // columns in the DFA state table 94 int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or 95 // last category + 1 if there are no dictionary categories. 96 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 97 void serializeTrie(uint8_t *where); // write out the serialized Trie. 98 UChar32 getFirstChar(int32_t val) const; 99 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 100 // character were encountered. 101 /** 102 * Merge two character categories that have been identified as having equivalent behavior. 103 * The ranges belonging to the second category (table column) will be added to the first. 104 * @param categories the pair of categories to be merged. 105 */ 106 void mergeCategories(IntPair categories); 107 108 #ifdef RBBI_DEBUG 109 void printSets(); 110 void printRanges(); 111 void printRangeGroups(); 112 #else 113 #define printSets() 114 #define printRanges() 115 #define printRangeGroups() 116 #endif 117 118 private: 119 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 120 UErrorCode *fStatus; 121 122 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 123 124 UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing 125 UCPTrie *fTrie; // the Unicode Sets. 126 uint32_t fTrieSize; 127 128 // Number of range groups, which are groups of ranges that are in the same original UnicodeSets. 129 int32_t fGroupCount; 130 131 // The number of the first dictionary char category. 132 // If there are no Dictionary categories, set to the last category + 1. 133 int32_t fDictCategoriesStart; 134 135 UBool fSawBOF; 136 137 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 138 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 139 }; 140 141 142 143 U_NAMESPACE_END 144 145 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 146 147 #endif 148