1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbitblb.h 5 // 6 7 /* 8 ********************************************************************** 9 * Copyright (c) 2002-2016, International Business Machines 10 * Corporation and others. All Rights Reserved. 11 ********************************************************************** 12 */ 13 14 #ifndef RBBITBLB_H 15 #define RBBITBLB_H 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include "unicode/uobject.h" 22 #include "unicode/rbbi.h" 23 #include "rbbirb.h" 24 #include "rbbinode.h" 25 26 27 U_NAMESPACE_BEGIN 28 29 class RBBIRuleScanner; 30 class RBBIRuleBuilder; 31 class UVector32; 32 33 // 34 // class RBBITableBuilder is part of the RBBI rule compiler. 35 // It builds the state transition table used by the RBBI runtime 36 // from the expression syntax tree generated by the rule scanner. 37 // 38 // This class is part of the RBBI implementation only. 39 // There is no user-visible public API here. 40 // 41 42 class RBBITableBuilder : public UMemory { 43 public: 44 RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status); 45 ~RBBITableBuilder(); 46 47 void buildForwardTable(); 48 49 /** Return the runtime size in bytes of the built state table. */ 50 int32_t getTableSize() const; 51 52 /** Fill in the runtime state table. Sufficient memory must exist at the specified location. 53 */ 54 void exportTable(void *where); 55 56 /** 57 * Find duplicate (redundant) character classes. Begin looking with categories.first. 58 * Duplicate, if found are returned in the categories parameter. 59 * This is an iterator-like function, used to identify character classes 60 * (state table columns) that can be eliminated. 61 * @param categories in/out parameter, specifies where to start looking for duplicates, 62 * and returns the first pair of duplicates found, if any. 63 * @return true if duplicate char classes were found, false otherwise. 64 */ 65 bool findDuplCharClassFrom(IntPair *categories); 66 67 /** Remove a column from the state table. Used when two character categories 68 * have been found equivalent, and merged together, to eliminate the uneeded table column. 69 */ 70 void removeColumn(int32_t column); 71 72 /** 73 * Check for, and remove dupicate states (table rows). 74 * @return the number of states removed. 75 */ 76 int32_t removeDuplicateStates(); 77 78 /** Build the safe reverse table from the already-constructed forward table. */ 79 void buildSafeReverseTable(UErrorCode &status); 80 81 /** Return the runtime size in bytes of the built safe reverse state table. */ 82 int32_t getSafeTableSize() const; 83 84 /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location. 85 */ 86 void exportSafeTable(void *where); 87 88 89 private: 90 void calcNullable(RBBINode *n); 91 void calcFirstPos(RBBINode *n); 92 void calcLastPos(RBBINode *n); 93 void calcFollowPos(RBBINode *n); 94 void calcChainedFollowPos(RBBINode *n); 95 void bofFixup(); 96 void buildStateTable(); 97 void flagAcceptingStates(); 98 void flagLookAheadStates(); 99 void flagTaggedStates(); 100 void mergeRuleStatusVals(); 101 102 /** 103 * Merge redundant state table columns, eliminating character classes with identical behavior. 104 * Done after the state tables are generated, just before converting to their run-time format. 105 */ 106 int32_t mergeColumns(); 107 108 void addRuleRootNodes(UVector *dest, RBBINode *node); 109 110 /** 111 * Find duplicate (redundant) states, beginning at the specified pair, 112 * within this state table. This is an iterator-like function, used to 113 * identify states (state table rows) that can be eliminated. 114 * @param states in/out parameter, specifies where to start looking for duplicates, 115 * and returns the first pair of duplicates found, if any. 116 * @return true if duplicate states were found, false otherwise. 117 */ 118 bool findDuplicateState(IntPair *states); 119 120 /** Remove a duplicate state. 121 * @param duplStates The duplicate states. The first is kept, the second is removed. 122 * All references to the second in the state table are retargeted 123 * to the first. 124 */ 125 void removeState(IntPair duplStates); 126 127 /** Find the next duplicate state in the safe reverse table. An iterator function. 128 * @param states in/out parameter, specifies where to start looking for duplicates, 129 * and returns the first pair of duplicates found, if any. 130 * @return true if a duplicate pair of states was found. 131 */ 132 bool findDuplicateSafeState(IntPair *states); 133 134 /** Remove a duplicate state from the safe table. 135 * @param duplStates The duplicate states. The first is kept, the second is removed. 136 * All references to the second in the state table are retargeted 137 * to the first. 138 */ 139 void removeSafeState(IntPair duplStates); 140 141 // Set functions for UVector. 142 // TODO: make a USet subclass of UVector 143 144 void setAdd(UVector *dest, UVector *source); 145 UBool setEquals(UVector *a, UVector *b); 146 147 void sortedAdd(UVector **dest, int32_t val); 148 149 public: 150 #ifdef RBBI_DEBUG 151 void printSet(UVector *s); 152 void printPosSets(RBBINode *n /* = NULL*/); 153 void printStates(); 154 void printRuleStatusTable(); 155 void printReverseTable(); 156 #else 157 #define printSet(s) 158 #define printPosSets(n) 159 #define printStates() 160 #define printRuleStatusTable() 161 #define printReverseTable() 162 #endif 163 164 private: 165 RBBIRuleBuilder *fRB; 166 RBBINode *&fTree; // The root node of the parse tree to build a 167 // table for. 168 UErrorCode *fStatus; 169 170 /** State Descriptors, UVector<RBBIStateDescriptor> */ 171 UVector *fDStates; // D states (Aho's terminology) 172 // Index is state number 173 // Contents are RBBIStateDescriptor pointers. 174 175 /** Synthesized safe table, UVector of UnicodeString, one string per table row. */ 176 UVector *fSafeTable; 177 178 179 RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class 180 RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class 181 }; 182 183 // 184 // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors, 185 // one for each state. 186 class RBBIStateDescriptor : public UMemory { 187 public: 188 UBool fMarked; 189 int32_t fAccepting; 190 int32_t fLookAhead; 191 UVector *fTagVals; 192 int32_t fTagsIdx; 193 UVector *fPositions; // Set of parse tree positions associated 194 // with this state. Unordered (it's a set). 195 // UVector contents are RBBINode * 196 197 UVector32 *fDtran; // Transitions out of this state. 198 // indexed by input character 199 // contents is int index of dest state 200 // in RBBITableBuilder.fDStates 201 202 RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus); 203 ~RBBIStateDescriptor(); 204 205 private: 206 RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class 207 RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class 208 }; 209 210 211 212 U_NAMESPACE_END 213 214 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 215 216 #endif 217