1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbirb.h 5 // 6 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains declarations for several classes from the 10 // Rule Based Break Iterator rule builder. 11 // 12 13 14 #ifndef RBBIRB_H 15 #define RBBIRB_H 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include <utility> 22 23 #include "unicode/uobject.h" 24 #include "unicode/rbbi.h" 25 #include "unicode/uniset.h" 26 #include "unicode/parseerr.h" 27 #include "uhash.h" 28 #include "uvector.h" 29 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 30 // looks up references to $variables within a set. 31 32 33 U_NAMESPACE_BEGIN 34 35 class RBBIRuleScanner; 36 struct RBBIRuleTableEl; 37 class RBBISetBuilder; 38 class RBBINode; 39 class RBBITableBuilder; 40 41 42 43 //-------------------------------------------------------------------------------- 44 // 45 // RBBISymbolTable. Implements SymbolTable interface that is used by the 46 // UnicodeSet parser to resolve references to $variables. 47 // 48 //-------------------------------------------------------------------------------- 49 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 50 public: // of these structs for each entry. 51 RBBISymbolTableEntry(); 52 UnicodeString key; 53 RBBINode *val; 54 ~RBBISymbolTableEntry(); 55 56 private: 57 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class 58 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class 59 }; 60 61 62 class RBBISymbolTable : public UMemory, public SymbolTable { 63 private: 64 const UnicodeString &fRules; 65 UHashtable *fHashTable; 66 RBBIRuleScanner *fRuleScanner; 67 68 // These next two fields are part of the mechanism for passing references to 69 // already-constructed UnicodeSets back to the UnicodeSet constructor 70 // when the pattern includes $variable references. 71 const UnicodeString ffffString; // = "/uffff" 72 UnicodeSet *fCachedSetLookup; 73 74 public: 75 // API inherited from class SymbolTable 76 virtual const UnicodeString* lookup(const UnicodeString& s) const override; 77 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override; 78 virtual UnicodeString parseReference(const UnicodeString& text, 79 ParsePosition& pos, int32_t limit) const override; 80 81 // Additional Functions 82 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 83 virtual ~RBBISymbolTable(); 84 85 virtual RBBINode *lookupNode(const UnicodeString &key) const; 86 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 87 88 #ifdef RBBI_DEBUG 89 virtual void rbbiSymtablePrint() const; 90 #else 91 // A do-nothing inline function for non-debug builds. Member funcs can't be empty 92 // or the call sites won't compile. 93 int32_t fFakeField; 94 #define rbbiSymtablePrint() fFakeField=0; 95 #endif 96 97 private: 98 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 99 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 100 }; 101 102 103 //-------------------------------------------------------------------------------- 104 // 105 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 106 // 107 //-------------------------------------------------------------------------------- 108 class RBBIRuleBuilder : public UMemory { 109 public: 110 111 // Create a rule based break iterator from a set of rules. 112 // This function is the main entry point into the rule builder. The 113 // public ICU API for creating RBBIs uses this function to do the actual work. 114 // 115 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 116 UParseError *parseError, 117 UErrorCode &status); 118 119 public: 120 // The "public" functions and data members that appear below are accessed 121 // (and shared) by the various parts that make up the rule builder. They 122 // are NOT intended to be accessed by anything outside of the 123 // rule builder implementation. 124 RBBIRuleBuilder(const UnicodeString &rules, 125 UParseError *parseErr, 126 UErrorCode &status 127 ); 128 129 virtual ~RBBIRuleBuilder(); 130 131 /** 132 * Build the state tables and char class Trie from the source rules. 133 */ 134 RBBIDataHeader *build(UErrorCode &status); 135 136 137 /** 138 * Fold together redundant character classes (table columns) and 139 * redundant states (table rows). Done after initial table generation, 140 * before serializing the result. 141 */ 142 void optimizeTables(); 143 144 char *fDebugEnv; // controls debug trace output 145 UErrorCode *fStatus; // Error reporting. Keeping status 146 UParseError *fParseError; // here avoids passing it everywhere. 147 const UnicodeString &fRules; // The rule string that we are compiling 148 UnicodeString fStrippedRules; // The rule string, with comments stripped. 149 150 RBBIRuleScanner *fScanner; // The scanner. 151 RBBINode *fForwardTree; // The parse trees, generated by the scanner, 152 RBBINode *fReverseTree; // then manipulated by subsequent steps. 153 RBBINode *fSafeFwdTree; 154 RBBINode *fSafeRevTree; 155 156 RBBINode **fDefaultTree; // For rules not qualified with a ! 157 // the tree to which they belong to. 158 159 UBool fChainRules; // True for chained Unicode TR style rules. 160 // False for traditional regexp rules. 161 162 UBool fLBCMNoChain; // True: suppress chaining of rules on 163 // chars with LineBreak property == CM. 164 165 UBool fLookAheadHardBreak; // True: Look ahead matches cause an 166 // immediate break, no continuing for the 167 // longest match. 168 169 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 170 UVector *fUSetNodes; // Vector of all uset nodes. 171 172 RBBITableBuilder *fForwardTable; // State transition table, build time form. 173 174 UVector *fRuleStatusVals; // The values that can be returned 175 // from getRuleStatus(). 176 177 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 178 // data tables.. 179 private: 180 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class 181 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class 182 }; 183 184 185 186 187 //---------------------------------------------------------------------------- 188 // 189 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 190 // been encountered. The val Node will be of nodetype uset 191 // and contain pointers to the actual UnicodeSets. 192 // The Key is the source string for initializing the set. 193 // 194 // The hash table is used to avoid creating duplicate 195 // unnamed (not $var references) UnicodeSets. 196 // 197 // Memory Management: 198 // The Hash Table owns these RBBISetTableEl structs and 199 // the key strings. It does NOT own the val nodes. 200 // 201 //---------------------------------------------------------------------------- 202 struct RBBISetTableEl { 203 UnicodeString *key; 204 RBBINode *val; 205 }; 206 207 /** 208 * A pair of ints, used to bundle pairs of states or pairs of character classes. 209 */ 210 typedef std::pair<int32_t, int32_t> IntPair; 211 212 213 //---------------------------------------------------------------------------- 214 // 215 // RBBIDebugPrintf Printf equivalent, for debugging output. 216 // Conditional compilation of the implementation lets us 217 // get rid of the stdio dependency in environments where it 218 // is unavailable. 219 // 220 //---------------------------------------------------------------------------- 221 #ifdef RBBI_DEBUG 222 #include <stdio.h> 223 #define RBBIDebugPrintf printf 224 #define RBBIDebugPuts puts 225 #else 226 #undef RBBIDebugPrintf 227 #define RBBIDebugPuts(arg) 228 #endif 229 230 U_NAMESPACE_END 231 232 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 233 234 #endif 235 236 237 238