//
//  rbbiscan.h
//
//  Copyright (C) 2002-2007, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains declarations for class RBBIRuleScanner
//


#ifndef RBBISCAN_H
#define RBBISCAN_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "unicode/symtable.h"   // For UnicodeSet parsing: the interface that
                                //   looks up references to $variables within a set.
#include "rbbinode.h"
//#include "rbbitblb.h"



U_NAMESPACE_BEGIN

class RBBIRuleBuilder;
class RBBISymbolTable;


//--------------------------------------------------------------------------------
//
//  class RBBIRuleScanner does the lowest level, character-at-a-time
//                        scanning of break iterator rules.
//
//                        The output of the scanner is parse trees for
//                        the rule expressions and a list of all Unicode Sets
//                        encountered.
//
//--------------------------------------------------------------------------------
static const int kStackSize = 100;      // The size of the state stack for
                                        //   rules parsing.  Corresponds roughly
                                        //   to the depth of parentheses nesting
                                        //   that is allowed in the rules.
                                        //   Also used below as the size of the
                                        //   node stack (fNodeStack).

class RBBIRuleScanner : public UMemory {
public:

    // One character of rule input, as handed to the parse state machine.
    struct RBBIRuleChar {
        UChar32 fChar;                  // The character itself.
        UBool fEscaped;                 // Presumably TRUE when the character came from
                                        //   an escaped or quoted context - confirm
                                        //   against nextChar() in rbbiscan.cpp.
    };

    // Construct a scanner that is part of the rule builder rb (see fRB below).
    RBBIRuleScanner(RBBIRuleBuilder *rb);


    virtual ~RBBIRuleScanner();

    void nextChar(RBBIRuleChar &c);     // Get the next char from the input stream;
                                        //   the result is delivered through c.
                                        //   The function itself returns void, so
                                        //   end of input is not reported through a
                                        //   return value.

    UBool push(const RBBIRuleChar &c);  // Push (unget) one character.
                                        //   Only a single character may be pushed.

    void parse();                       // Parse the rules, generating two parse
                                        //   trees, one each for the forward and
                                        //   reverse rules,
                                        //   and a list of UnicodeSets encountered.

    /**
     * Return a rules string without unnecessary
     * characters.
     */
    static UnicodeString stripRules(const UnicodeString &rules);
private:

    UBool doParseActions(int32_t a);    // Presumably performs the parse action
                                        //   numbered a for the state machine;
                                        //   see rbbiscan.cpp to confirm.
    void error(UErrorCode e);           // error reporting convenience function.
    void fixOpStack(RBBINode::OpPrecedence p);

    // NOTE(review): from the names, this presumably associates a set with the
    //   given node, using the string s as the lookup key (see fSetTable), and
    //   takes ownership of setToAdopt when one is supplied - confirm against
    //   the implementation.
    void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);

    UChar32 nextCharLL();               // Presumably the low-level character fetch
                                        //   underlying nextChar() - confirm.
#ifdef RBBI_DEBUG
    void printNodeStack(const char *title);     // Debug builds only.
#endif
    RBBINode *pushNewNode(RBBINode::NodeType t);
    void scanSet();


    RBBIRuleBuilder *fRB;               // The rule builder that we are part of.

    int32_t fScanIndex;                 // Index of current character being processed
                                        //   in the rule input string.
    int32_t fNextIndex;                 // Index of the next character, which
                                        //   is the first character not yet scanned.
    UBool fQuoteMode;                   // Scan is in a 'quoted region'
    int32_t fLineNum;                   // Line number in input file.
    int32_t fCharNum;                   // Char position within the line.
    UChar32 fLastChar;                  // Previous char, needed to count CR-LF
                                        //   as a single line, not two.

    RBBIRuleChar fC;                    // Current char for parse state machine
                                        //   processing.
    UnicodeString fVarName;             // $variableName, valid when we've just
                                        //   scanned one.

    // NOTE(review): RBBIRuleTableEl is not declared anywhere in this header;
    //   it presumably comes from rbbirpt.h or is otherwise visible at the
    //   point of inclusion - confirm.
    RBBIRuleTableEl **fStateTable;      // State Transition Table for RBBI Rule
                                        //   parsing.  index by p[state][char-class]

    uint16_t fStack[kStackSize];        // State stack, holds state pushes
    int32_t fStackPtr;                  //   and pops as specified in the state
                                        //   transition rules.

    RBBINode *fNodeStack[kStackSize];   // Node stack, holds nodes created
                                        //   during the parse of a rule
    int32_t fNodeStackPtr;


    UBool fReverseRule;                 // True if the rule currently being scanned
                                        //   is a reverse direction rule (if it
                                        //   starts with a '!')

    UBool fLookAheadRule;               // True if the rule includes a '/'
                                        //   somewhere within it.

    RBBISymbolTable *fSymbolTable;      // symbol table, holds definitions of
                                        //   $variable symbols.

    UHashtable *fSetTable;              // UnicodeSet hash table, holds indexes to
                                        //   the sets created while parsing rules.
                                        //   The key is the string used for creating
                                        //   the set.

    UnicodeSet *fRuleSets[10];          // Unicode Sets that are needed during
                                        //   the scanning of RBBI rules.  The
                                        //   indices for these are assigned by the
                                        //   perl script that builds the state tables.
                                        //   See rbbirpt.h.

    int32_t fRuleNum;                   // Counts each rule as it is scanned.

    int32_t fOptionStart;               // Input index of start of a !!option
                                        //   keyword, while being scanned.

    // Despite the 'g' prefix, these four are ordinary (non-static) data members,
    //   so each scanner instance has its own pointers.
    UnicodeSet *gRuleSet_rule_char;
    UnicodeSet *gRuleSet_white_space;
    UnicodeSet *gRuleSet_name_char;
    UnicodeSet *gRuleSet_name_start_char;

    // Copy construction and assignment are declared private so that code
    //   outside the class cannot copy a scanner.
    RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
    RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
};

U_NAMESPACE_END

#endif