• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  //
2  //  rbbiscan.h
3  //
4  //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
5  //  All Rights Reserved.
6  //
7  //  This file contains declarations for class RBBIRuleScanner
8  //
9  
10  
11  #ifndef RBBISCAN_H
12  #define RBBISCAN_H
13  
14  #include "unicode/utypes.h"
15  #include "unicode/uobject.h"
16  #include "unicode/rbbi.h"
17  #include "unicode/uniset.h"
18  #include "unicode/parseerr.h"
19  #include "uhash.h"
20  #include "uvector.h"
21  #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
22                            //    looks up references to $variables within a set.
23  #include "rbbinode.h"
24  //#include "rbbitblb.h"
25  
26  
27  
28  U_NAMESPACE_BEGIN
29  
30  class   RBBIRuleBuilder;
31  class   RBBISymbolTable;
32  
33  
34  //--------------------------------------------------------------------------------
35  //
36  //  class RBBIRuleScanner does the lowest level, character-at-a-time
37  //                        scanning of break iterator rules.
38  //
39  //                        The output of the scanner is parse trees for
40  //                        the rule expressions and a list of all Unicode Sets
41  //                        encountered.
42  //
43  //--------------------------------------------------------------------------------
44  
45  class RBBIRuleScanner : public UMemory {
46  public:
47  
48      enum {
49          kStackSize = 100            // The size of the state stack for
50      };                              //   rules parsing.  Corresponds roughly
51                                      //   to the depth of parentheses nesting
52                                      //   that is allowed in the rules.
53                                      //   Also dimensions fStack and fNodeStack below.
54      struct RBBIRuleChar {               // One scanned character together with its escape flag.
55          UChar32             fChar;      // The character value.
56          UBool               fEscaped;   // NOTE(review): presumably set when the char came from an escape or quoted region - confirm.
57      };
58  
59      RBBIRuleScanner(RBBIRuleBuilder  *rb);              // rb: the rule builder; NOTE(review): presumably kept in fRB - confirm.
60  
61  
62      virtual    ~RBBIRuleScanner();
63  
64      void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream,
65                                                      //   delivering it through c.  Declared void:
66                                                      //   nothing is returned, even at end of input.
67      UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
68                                                      //   Only a single character may be pushed.
69                                                      //   NOTE(review): meaning of the UBool result is not visible here - confirm.
70      void        parse();                            // Parse the rules, generating two parse
71                                                      //   trees, one each for the forward and
72                                                      //   reverse rules,
73                                                      //   and a list of UnicodeSets encountered.
74  
75      /**
76       * Return a rules string without unnecessary
77       * characters.
78       */
79      static UnicodeString stripRules(const UnicodeString &rules);
80  private:
81  
82      UBool       doParseActions(int32_t a);             // NOTE(review): presumably runs parse action number a; result semantics not visible here.
83      void        error(UErrorCode e);                   // error reporting convenience function.
84      void        fixOpStack(RBBINode::OpPrecedence p);
85                                                         //   Undocumented here; NOTE(review): presumably reduces the node stack for precedence p - confirm.
86      void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
87                                                         //   setToAdopt is optional (defaults to NULL).
88      UChar32     nextCharLL();                          // NOTE(review): presumably the low-level fetch behind nextChar() - confirm.
89  #ifdef RBBI_DEBUG
90      void        printNodeStack(const char *title);     // Declared only when RBBI_DEBUG is defined.
91  #endif
92      RBBINode    *pushNewNode(RBBINode::NodeType  t);   // NOTE(review): presumably creates a node of type t on fNodeStack - confirm.
93      void        scanSet();                             // NOTE(review): presumably scans a UnicodeSet expression from the rules - confirm.
94  
95  
96      RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
97  
98      int32_t                       fScanIndex;        // Index of current character being processed
99                                                       //   in the rule input string.
100      int32_t                       fNextIndex;        // Index of the next character, which
101                                                       //   is the first character not yet scanned.
102      UBool                         fQuoteMode;        // Scan is in a 'quoted region'
103      int32_t                       fLineNum;          // Line number in input file.
104      int32_t                       fCharNum;          // Char position within the line.
105      UChar32                       fLastChar;         // Previous char, needed to count CR-LF
106                                                       //   as a single line, not two.
107  
108      RBBIRuleChar                  fC;                // Current char for parse state machine
109                                                       //   processing.
110      UnicodeString                 fVarName;          // $variableName, valid when we've just
111                                                       //   scanned one.
112  
113      RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
114                                                       //   parsing.  index by p[state][char-class]
115  
116      uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
117      int32_t                       fStackPtr;           //  and pops as specified in the state
118                                                         //  transition rules.
119  
120      RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
121                                                             //  during the parse of a rule
122      int32_t                        fNodeStackPtr;    // Position within fNodeStack; NOTE(review): presumably the top of stack - confirm.
123  
124  
125      UBool                          fReverseRule;     // True if the rule currently being scanned
126                                                       //  is a reverse direction rule (if it
127                                                       //  starts with a '!')
128  
129      UBool                          fLookAheadRule;   // True if the rule includes a '/'
130                                                       //   somewhere within it.
131  
132      RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
133                                                       //   $variable symbols.
134  
135      UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
136                                                       //   the sets created while parsing rules.
137                                                       //   The key is the string used for creating
138                                                       //   the set.
139  
140      UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
141                                                       //  the scanning of RBBI rules.  The
142                                                       //  indices for these are assigned by the
143                                                       //  perl script that builds the state tables.
144                                                       //  See rbbirpt.h.
145  
146      int32_t                        fRuleNum;         // Counts each rule as it is scanned.
147  
148      int32_t                        fOptionStart;     // Input index of start of a !!option
149                                                       //   keyword, while being scanned.
150  
151      UnicodeSet *gRuleSet_rule_char;                  // Per-instance pointers despite the 'g' prefix.
152      UnicodeSet *gRuleSet_white_space;                //   NOTE(review): the sets they point to, and who owns
153      UnicodeSet *gRuleSet_name_char;                  //   them, are not visible in this header - confirm
154      UnicodeSet *gRuleSet_name_start_char;            //   before copying or deleting through these.
155  
156      RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
157      RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
158  };
159  
160  U_NAMESPACE_END
161  
162  #endif
163