// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  rbbitblb.h
//

/*
**********************************************************************
*   Copyright (c) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#ifndef RBBITBLB_H
#define RBBITBLB_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "rbbinode.h"


U_NAMESPACE_BEGIN

class RBBIRuleScanner;
class RBBIRuleBuilder;
class UVector32;

//
//  class RBBITableBuilder is part of the RBBI rule compiler.
//                         It builds the state transition table used by the RBBI runtime
//                         from the expression syntax tree generated by the rule scanner.
//
//                         This class is part of the RBBI implementation only.
//                         There is no user-visible public API here.
//
//                         Only declarations appear in this header; the member functions
//                         are defined elsewhere.
//

class RBBITableBuilder : public UMemory {
public:
    /**
     * @param rb       the rule builder this table builder works for.
     * @param rootNode pointer to the root-node pointer of the parse tree to build a table for.
     * @param status   ICU error code, in/out.
     */
    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
    ~RBBITableBuilder();

    /** Build the forward state table.
     *  NOTE(review): no status parameter here, unlike buildSafeReverseTable(); presumably errors
     *  are reported through the fStatus member - confirm against the implementation.
     */
    void     buildForwardTable();

    /** Return the runtime size in bytes of the built state table.  */
    int32_t  getTableSize() const;

    /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
     *  The required size is what getTableSize() reports.
     */
    void     exportTable(void *where);

    /** Use 8 bits to encode the forward table */
    bool     use8BitsForTable() const;

    /**
     *  Find duplicate (redundant) character classes. Begin looking with categories.first.
     *  Duplicates, if found, are returned in the categories parameter.
     *  This is an iterator-like function, used to identify character classes
     *  (state table columns) that can be eliminated.
     *  @param categories in/out parameter, specifies where to start looking for duplicates,
     *                and returns the first pair of duplicates found, if any.
     *  @return true if duplicate char classes were found, false otherwise.
     */
    bool     findDuplCharClassFrom(IntPair *categories);

    /** Remove a column from the state table. Used when two character categories
     *  have been found equivalent, and merged together, to eliminate the unneeded table column.
     */
    void     removeColumn(int32_t column);

    /**
     * Check for, and remove duplicate states (table rows).
     * @return the number of states removed.
     */
    int32_t  removeDuplicateStates();

    /** Build the safe reverse table from the already-constructed forward table. */
    void     buildSafeReverseTable(UErrorCode &status);

    /** Return the runtime size in bytes of the built safe reverse state table. */
    int32_t  getSafeTableSize() const;

    /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
     *  The required size is what getSafeTableSize() reports.
     */
    void     exportSafeTable(void *where);

    /** Use 8 bits to encode the safe reverse table */
    bool     use8BitsForSafeTable() const;

private:
    // Table construction steps.
    // NOTE(review): the calc* names match the nullable / firstpos / lastpos / followpos
    //               functions of the textbook regular-expression-to-DFA construction (the
    //               fDStates comment below cites Aho's terminology); the implementations are
    //               not visible in this header - confirm there.
    void     calcNullable(RBBINode *n);
    void     calcFirstPos(RBBINode *n);
    void     calcLastPos(RBBINode  *n);
    void     calcFollowPos(RBBINode *n);
    void     calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
    void     bofFixup();
    void     buildStateTable();
    void     mapLookAheadRules();
    void     flagAcceptingStates();
    void     flagLookAheadStates();
    void     flagTaggedStates();
    void     mergeRuleStatusVals();

    /**
     * Merge redundant state table columns, eliminating character classes with identical behavior.
     * Done after the state tables are generated, just before converting to their run-time format.
     */
    int32_t  mergeColumns();

    void     addRuleRootNodes(UVector *dest, RBBINode *node);

    /**
     *  Find duplicate (redundant) states, beginning at the specified pair,
     *  within this state table. This is an iterator-like function, used to
     *  identify states (state table rows) that can be eliminated.
     *  @param states in/out parameter, specifies where to start looking for duplicates,
     *                and returns the first pair of duplicates found, if any.
     *  @return true if duplicate states were found, false otherwise.
     */
    bool     findDuplicateState(IntPair *states);

    /** Remove a duplicate state.
     * @param duplStates The duplicate states. The first is kept, the second is removed.
     *                   All references to the second in the state table are retargeted
     *                   to the first.
     */
    void     removeState(IntPair duplStates);

    /** Find the next duplicate state in the safe reverse table. An iterator function.
     *  @param states in/out parameter, specifies where to start looking for duplicates,
     *                and returns the first pair of duplicates found, if any.
     *  @return true if a duplicate pair of states was found.
     */
    bool     findDuplicateSafeState(IntPair *states);

    /** Remove a duplicate state from the safe table.
     * @param duplStates The duplicate states. The first is kept, the second is removed.
     *                   All references to the second in the state table are retargeted
     *                   to the first.
     */
    void     removeSafeState(IntPair duplStates);

    // Set functions for UVector.
    //   TODO:  make a USet subclass of UVector

    void     setAdd(UVector *dest, UVector *source);
    UBool    setEquals(UVector *a, UVector *b);

    void     sortedAdd(UVector **dest, int32_t val);

public:
    // Debug printing. When RBBI_DEBUG is not defined, the print functions are not declared;
    // function-like macros of the same names that expand to nothing take their place, so
    // calls written as statements compile away. Being macros, these names are not scoped to
    // the class: they apply to all code following this header.
#ifdef RBBI_DEBUG
    void     printSet(UVector *s);
    void     printPosSets(RBBINode *n /* = NULL*/);
    void     printStates();
    void     printRuleStatusTable();
    void     printReverseTable();
#else
    #define  printSet(s)
    #define  printPosSets(n)
    #define  printStates()
    #define  printRuleStatusTable()
    #define  printReverseTable()
#endif

private:
    RBBIRuleBuilder  *fRB;
    RBBINode         *&fTree;              // The root node of the parse tree to build a
                                           //   table for.  Held as a reference to a node
                                           //   pointer, so this member aliases a pointer
                                           //   variable stored outside this object.
    UErrorCode       *fStatus;             // NOTE(review): presumably the address of the
                                           //   constructor's status argument - confirm.

    /** State Descriptors, UVector<RBBIStateDescriptor> */
    UVector          *fDStates;            //  D states (Aho's terminology)
                                           //  Index is state number
                                           //  Contents are RBBIStateDescriptor pointers.

    /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */
    UVector          *fSafeTable;

    /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
    UVector32        *fLookAheadRuleMap = nullptr;

    /* Counter used when assigning lookahead rule numbers.
     * Contains the last look-ahead number already in use.
     * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
     * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.   */
    int32_t          fLASlotsInUse = ACCEPTING_UNCONDITIONAL;


    RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
    RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
};

//
//  RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
//                        one for each state.
202 class RBBIStateDescriptor : public UMemory { 203 public: 204 UBool fMarked; 205 uint32_t fAccepting; 206 uint32_t fLookAhead; 207 UVector *fTagVals; 208 int32_t fTagsIdx; 209 UVector *fPositions; // Set of parse tree positions associated 210 // with this state. Unordered (it's a set). 211 // UVector contents are RBBINode * 212 213 UVector32 *fDtran; // Transitions out of this state. 214 // indexed by input character 215 // contents is int index of dest state 216 // in RBBITableBuilder.fDStates 217 218 RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus); 219 ~RBBIStateDescriptor(); 220 221 private: 222 RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class 223 RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class 224 }; 225 226 227 228 U_NAMESPACE_END 229 230 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 231 232 #endif 233