1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbisetb.h 5 /* 6 ********************************************************************** 7 * Copyright (c) 2001-2005, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ********************************************************************** 10 */ 11 12 #ifndef RBBISETB_H 13 #define RBBISETB_H 14 15 #include "unicode/utypes.h" 16 #include "unicode/uobject.h" 17 #include "rbbirb.h" 18 #include "uvector.h" 19 20 struct UNewTrie; 21 22 U_NAMESPACE_BEGIN 23 24 // 25 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine 26 // from the Unicode Sets appearing in the source RBBI rules, and 27 // creates the TRIE table used to map from Unicode to the 28 // character categories. 29 // 30 31 32 // 33 // RangeDescriptor 34 // 35 // Each of the non-overlapping character ranges gets one of these descriptors. 36 // All of them are strung together in a linked list, which is kept in order 37 // (by character) 38 // 39 class RangeDescriptor : public UMemory { 40 public: 41 UChar32 fStartChar; // Start of range, unicode 32 bit value. 42 UChar32 fEndChar; // End of range, unicode 32 bit value. 43 int32_t fNum; // runtime-mapped input value for this range. 44 UVector *fIncludesSets; // vector of the the original 45 // Unicode sets that include this range. 46 // (Contains ptrs to uset nodes) 47 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 48 49 RangeDescriptor(UErrorCode &status); 50 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 51 ~RangeDescriptor(); 52 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 53 // where appearing in the second (higher) part. 54 void setDictionaryFlag(); // Check whether this range appears as part of 55 // the Unicode set named "dictionary" 56 57 private: 58 RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class 59 RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class 60 }; 61 62 63 // 64 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 65 // 66 // Starting with the rules parse tree from the scanner, 67 // 68 // - Enumerate the set of UnicodeSets that are referenced 69 // by the RBBI rules. 70 // - compute a derived set of non-overlapping UnicodeSets 71 // that will correspond to columns in the state table for 72 // the RBBI execution engine. 73 // - construct the trie table that maps input characters 74 // to set numbers in the non-overlapping set of sets. 75 // 76 77 78 class RBBISetBuilder : public UMemory { 79 public: 80 RBBISetBuilder(RBBIRuleBuilder *rb); 81 ~RBBISetBuilder(); 82 83 void build(); 84 void addValToSets(UVector *sets, uint32_t val); 85 void addValToSet (RBBINode *usetNode, uint32_t val); 86 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 87 // runtime state machine, which are the same as 88 // columns in the DFA state table 89 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 90 void serializeTrie(uint8_t *where); // write out the serialized Trie. 91 UChar32 getFirstChar(int32_t val) const; 92 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 93 // character were encountered. 94 #ifdef RBBI_DEBUG 95 void printSets(); 96 void printRanges(); 97 void printRangeGroups(); 98 #else 99 #define printSets() 100 #define printRanges() 101 #define printRangeGroups() 102 #endif 103 104 private: 105 void numberSets(); 106 107 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 108 UErrorCode *fStatus; 109 110 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 111 112 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing 113 uint32_t fTrieSize; // the Unicode Sets. 114 115 // Groups correspond to character categories - 116 // groups of ranges that are in the same original UnicodeSets. 117 // fGroupCount is the index of the last used group. 118 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 119 // State table column 0 is not used. Column 1 is for end-of-input. 120 // column 2 is for group 0. Funny counting. 121 int32_t fGroupCount; 122 123 UBool fSawBOF; 124 125 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 126 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 127 }; 128 129 130 131 U_NAMESPACE_END 132 #endif 133