1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbirb.h 5 // 6 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains declarations for several classes from the 10 // Rule Based Break Iterator rule builder. 11 // 12 13 14 #ifndef RBBIRB_H 15 #define RBBIRB_H 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include "unicode/uobject.h" 22 #include "unicode/rbbi.h" 23 #include "unicode/uniset.h" 24 #include "unicode/parseerr.h" 25 #include "uhash.h" 26 #include "uvector.h" 27 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 28 // looks up references to $variables within a set. 29 30 31 32 U_NAMESPACE_BEGIN 33 34 class RBBIRuleScanner; 35 struct RBBIRuleTableEl; 36 class RBBISetBuilder; 37 class RBBINode; 38 class RBBITableBuilder; 39 40 41 42 //-------------------------------------------------------------------------------- 43 // 44 // RBBISymbolTable. Implements SymbolTable interface that is used by the 45 // UnicodeSet parser to resolve references to $variables. 46 // 47 //-------------------------------------------------------------------------------- 48 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 49 public: // of these structs for each entry. 50 RBBISymbolTableEntry(); 51 UnicodeString key; 52 RBBINode *val; 53 ~RBBISymbolTableEntry(); 54 55 private: 56 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class 57 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class 58 }; 59 60 61 class RBBISymbolTable : public UMemory, public SymbolTable { 62 private: 63 const UnicodeString &fRules; 64 UHashtable *fHashTable; 65 RBBIRuleScanner *fRuleScanner; 66 67 // These next two fields are part of the mechanism for passing references to 68 // already-constructed UnicodeSets back to the UnicodeSet constructor 69 // when the pattern includes $variable references. 70 const UnicodeString ffffString; // = "/uffff" 71 UnicodeSet *fCachedSetLookup; 72 73 public: 74 // API inherited from class SymbolTable 75 virtual const UnicodeString* lookup(const UnicodeString& s) const; 76 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 77 virtual UnicodeString parseReference(const UnicodeString& text, 78 ParsePosition& pos, int32_t limit) const; 79 80 // Additional Functions 81 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 82 virtual ~RBBISymbolTable(); 83 84 virtual RBBINode *lookupNode(const UnicodeString &key) const; 85 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 86 87 #ifdef RBBI_DEBUG 88 virtual void rbbiSymtablePrint() const; 89 #else 90 // A do-nothing inline function for non-debug builds. Member funcs can't be empty 91 // or the call sites won't compile. 92 int32_t fFakeField; 93 #define rbbiSymtablePrint() fFakeField=0; 94 #endif 95 96 private: 97 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 98 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 99 }; 100 101 102 //-------------------------------------------------------------------------------- 103 // 104 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 105 // 106 //-------------------------------------------------------------------------------- 107 class RBBIRuleBuilder : public UMemory { 108 public: 109 110 // Create a rule based break iterator from a set of rules. 111 // This function is the main entry point into the rule builder. The 112 // public ICU API for creating RBBIs uses this function to do the actual work. 113 // 114 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 115 UParseError *parseError, 116 UErrorCode &status); 117 118 public: 119 // The "public" functions and data members that appear below are accessed 120 // (and shared) by the various parts that make up the rule builder. They 121 // are NOT intended to be accessed by anything outside of the 122 // rule builder implementation. 123 RBBIRuleBuilder(const UnicodeString &rules, 124 UParseError *parseErr, 125 UErrorCode &status 126 ); 127 128 virtual ~RBBIRuleBuilder(); 129 char *fDebugEnv; // controls debug trace output 130 UErrorCode *fStatus; // Error reporting. Keeping status 131 UParseError *fParseError; // here avoids passing it everywhere. 132 const UnicodeString &fRules; // The rule string that we are compiling 133 134 RBBIRuleScanner *fScanner; // The scanner. 135 RBBINode *fForwardTree; // The parse trees, generated by the scanner, 136 RBBINode *fReverseTree; // then manipulated by subsequent steps. 137 RBBINode *fSafeFwdTree; 138 RBBINode *fSafeRevTree; 139 140 RBBINode **fDefaultTree; // For rules not qualified with a ! 141 // the tree to which they belong to. 142 143 UBool fChainRules; // True for chained Unicode TR style rules. 144 // False for traditional regexp rules. 145 146 UBool fLBCMNoChain; // True: suppress chaining of rules on 147 // chars with LineBreak property == CM. 148 149 UBool fLookAheadHardBreak; // True: Look ahead matches cause an 150 // immediate break, no continuing for the 151 // longest match. 152 153 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 154 UVector *fUSetNodes; // Vector of all uset nodes. 155 156 RBBITableBuilder *fForwardTables; // State transition tables 157 RBBITableBuilder *fReverseTables; 158 RBBITableBuilder *fSafeFwdTables; 159 RBBITableBuilder *fSafeRevTables; 160 161 UVector *fRuleStatusVals; // The values that can be returned 162 // from getRuleStatus(). 163 164 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 165 // data tables.. 166 private: 167 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class 168 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class 169 }; 170 171 172 173 174 //---------------------------------------------------------------------------- 175 // 176 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 177 // been encountered. The val Node will be of nodetype uset 178 // and contain pointers to the actual UnicodeSets. 179 // The Key is the source string for initializing the set. 180 // 181 // The hash table is used to avoid creating duplicate 182 // unnamed (not $var references) UnicodeSets. 183 // 184 // Memory Management: 185 // The Hash Table owns these RBBISetTableEl structs and 186 // the key strings. It does NOT own the val nodes. 187 // 188 //---------------------------------------------------------------------------- 189 struct RBBISetTableEl { 190 UnicodeString *key; 191 RBBINode *val; 192 }; 193 194 195 //---------------------------------------------------------------------------- 196 // 197 // RBBIDebugPrintf Printf equivalent, for debugging output. 198 // Conditional compilation of the implementation lets us 199 // get rid of the stdio dependency in environments where it 200 // is unavailable. 201 // 202 //---------------------------------------------------------------------------- 203 #ifdef RBBI_DEBUG 204 #include <stdio.h> 205 #define RBBIDebugPrintf printf 206 #define RBBIDebugPuts puts 207 #else 208 #undef RBBIDebugPrintf 209 #define RBBIDebugPuts(arg) 210 #endif 211 212 U_NAMESPACE_END 213 214 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 215 216 #endif 217 218 219 220