• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package android.icu.text;
11 
12 import java.text.ParsePosition;
13 import java.util.HashMap;
14 
15 import android.icu.impl.Assert;
16 import android.icu.impl.Utility;
17 import android.icu.lang.UCharacter;
18 
19 /**
20   *  This class is part of the Rule Based Break Iterator rule compiler.
21   *  It scans the rules and builds the parse tree.
22   *  There is no public API here.
23   */
24 class RBBIRuleScanner {
25 
26     private final static int    kStackSize = 100;               // The size of the state stack for
27     //   rules parsing.  Corresponds roughly
28     //   to the depth of parentheses nesting
29     //   that is allowed in the rules.
30 
31     static class RBBIRuleChar {
32         int             fChar;
33         boolean         fEscaped;
34     }
35 
36 
37 
38     RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
39 
40     int                       fScanIndex;        // Index of current character being processed
41                                                      //   in the rule input string.
42     int                       fNextIndex;        // Index of the next character, which
43                                                      //   is the first character not yet scanned.
44     boolean                  fQuoteMode;        // Scan is in a 'quoted region'
45     int                       fLineNum;          // Line number in input file.
46     int                       fCharNum;          // Char position within the line.
47     int                       fLastChar;         // Previous char, needed to count CR-LF
48                                                      //   as a single line, not two.
49 
50     RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
51                                                      //   processing.
52 
53 
54     short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
55     int                       fStackPtr;           //  and pops as specified in the state
56                                                        //  transition rules.
57 
58     RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
59                                                            //  during the parse of a rule
60     int                        fNodeStackPtr;
61 
62 
63     boolean                    fReverseRule;         // True if the rule currently being scanned
64                                                      //  is a reverse direction rule (if it
65                                                      //  starts with a '!')
66 
67     boolean                    fLookAheadRule;       // True if the rule includes a '/'
68                                                      //   somewhere within it.
69 
70     boolean                    fNoChainInRule;       // True if the current rule starts with a '^'.
71 
72 
73     RBBISymbolTable            fSymbolTable;         // symbol table, holds definitions of
74                                                      //   $variable symbols.
75 
76     HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
77                                                                                        //   the sets created while parsing rules.
78                                                                                        //   The key is the string used for creating
79                                                                                        //   the set.
80 
81     UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
82                                                      //  the scanning of RBBI rules.  The
83                                                      //  indicies for these are assigned by the
84                                                      //  perl script that builds the state tables.
85                                                      //  See rbbirpt.h.
86 
87     int                        fRuleNum;         // Counts each rule as it is scanned.
88 
89     int                        fOptionStart;     // Input index of start of a !!option
90                                                  //   keyword, while being scanned.
91 
92 
93 
94    static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
95    static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
96    static private String gRuleSet_digit_char_pattern      = "[0-9]";
97    static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
98    static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
99    static private String kAny =  "any";
100 
101 
102 
103 
104     //----------------------------------------------------------------------------------------
105     //
106     //  Constructor.
107     //
108     //----------------------------------------------------------------------------------------
RBBIRuleScanner(RBBIRuleBuilder rb)109     RBBIRuleScanner(RBBIRuleBuilder rb) {
110         fRB = rb;
111         fLineNum = 1;
112 
113         //
114         //  Set up the constant Unicode Sets.
115         //     Note: These could be made static and shared among
116         //            all instances of RBBIRuleScanners.
117         fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
118         fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
119         fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
120         fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
121         fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
122 
123         fSymbolTable = new RBBISymbolTable(this);
124     }
125 
126     //----------------------------------------------------------------------------------------
127     //
128     //  doParseAction Do some action during rule parsing.
129     //                       Called by the parse state machine.
130     //                       Actions build the parse tree and Unicode Sets,
131     //                       and maintain the parse stack for nested expressions.
132     //
133     //----------------------------------------------------------------------------------------
doParseActions(int action)134     boolean doParseActions(int action) {
135         RBBINode n = null;
136 
137         boolean returnVal = true;
138 
139         switch (action) {
140 
141         case RBBIRuleParseTable.doExprStart:
142             pushNewNode(RBBINode.opStart);
143             fRuleNum++;
144             break;
145 
146         case RBBIRuleParseTable.doNoChain:
147             // Scanned a '^' while on the rule start state.
148             fNoChainInRule = true;
149             break;
150 
151 
152         case RBBIRuleParseTable.doExprOrOperator: {
153             fixOpStack(RBBINode.precOpCat);
154             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
155             RBBINode orNode = pushNewNode(RBBINode.opOr);
156             orNode.fLeftChild = operandNode;
157             operandNode.fParent = orNode;
158         }
159             break;
160 
161         case RBBIRuleParseTable.doExprCatOperator:
162         // concatenation operator.
163         // For the implicit concatenation of adjacent terms in an expression
164         // that are
165         //   not separated by any other operator. Action is invoked between the
166         //   actions for the two terms.
167         {
168             fixOpStack(RBBINode.precOpCat);
169             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
170             RBBINode catNode = pushNewNode(RBBINode.opCat);
171             catNode.fLeftChild = operandNode;
172             operandNode.fParent = catNode;
173         }
174             break;
175 
176         case RBBIRuleParseTable.doLParen:
177             // Open Paren.
178             //   The openParen node is a dummy operation type with a low
179             // precedence,
180             //     which has the affect of ensuring that any real binary op that
181             //     follows within the parens binds more tightly to the operands than
182             //     stuff outside of the parens.
183             pushNewNode(RBBINode.opLParen);
184             break;
185 
186         case RBBIRuleParseTable.doExprRParen:
187             fixOpStack(RBBINode.precLParen);
188             break;
189 
190         case RBBIRuleParseTable.doNOP:
191             break;
192 
193         case RBBIRuleParseTable.doStartAssign:
194             // We've just scanned "$variable = "
195             // The top of the node stack has the $variable ref node.
196 
197             // Save the start position of the RHS text in the StartExpression
198             // node
199             //   that precedes the $variableReference node on the stack.
200             //   This will eventually be used when saving the full $variable
201             // replacement
202             //   text as a string.
203             n = fNodeStack[fNodeStackPtr - 1];
204             n.fFirstPos = fNextIndex; // move past the '='
205 
206             // Push a new start-of-expression node; needed to keep parse of the
207             //   RHS expression happy.
208             pushNewNode(RBBINode.opStart);
209             break;
210 
211         case RBBIRuleParseTable.doEndAssign: {
212             // We have reached the end of an assignement statement.
213             //   Current scan char is the ';' that terminates the assignment.
214 
215             // Terminate expression, leaves expression parse tree rooted in TOS
216             // node.
217             fixOpStack(RBBINode.precStart);
218 
219             RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
220             RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
221             RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
222 
223             // Save original text of right side of assignment, excluding the
224             // terminating ';'
225             //  in the root of the node for the right-hand-side expression.
226             RHSExprNode.fFirstPos = startExprNode.fFirstPos;
227             RHSExprNode.fLastPos = fScanIndex;
228             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
229             // RHSExprNode.fLastPos, RHSExprNode.fText);
230             RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
231                     RHSExprNode.fLastPos);
232 
233             // Expression parse tree becomes l. child of the $variable reference
234             // node.
235             varRefNode.fLeftChild = RHSExprNode;
236             RHSExprNode.fParent = varRefNode;
237 
238             // Make a symbol table entry for the $variableRef node.
239             fSymbolTable.addEntry(varRefNode.fText, varRefNode);
240 
241             // Clean up the stack.
242             fNodeStackPtr -= 3;
243             break;
244         }
245 
246         case RBBIRuleParseTable.doEndOfRule: {
247             fixOpStack(RBBINode.precStart); // Terminate expression, leaves
248                                             // expression
249 
250             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
251                 printNodeStack("end of rule");
252             }
253             Assert.assrt(fNodeStackPtr == 1);
254             RBBINode thisRule = fNodeStack[fNodeStackPtr];
255 
256             // If this rule includes a look-ahead '/', add a endMark node to the
257             //   expression tree.
258             if (fLookAheadRule) {
259                 RBBINode endNode = pushNewNode(RBBINode.endMark);
260                 RBBINode catNode = pushNewNode(RBBINode.opCat);
261                 fNodeStackPtr -= 2;
262                 catNode.fLeftChild = thisRule;
263                 catNode.fRightChild = endNode;
264                 fNodeStack[fNodeStackPtr] = catNode;
265                 endNode.fVal = fRuleNum;
266                 endNode.fLookAheadEnd = true;
267                 thisRule = catNode;
268 
269                 // TODO: Disable chaining out of look-ahead (hard break) rules.
270                 //   The break on rule match is forced, so there is no point in building up
271                 //   the state table to chain into another rule for a longer match.
272             }
273 
274             // Mark this node as being the root of a rule.
275             thisRule.fRuleRoot = true;
276 
277             // Flag if chaining into this rule is wanted.
278             //
279             if (fRB.fChainRules &&          // If rule chaining is enabled globally via !!chain
280                     !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
281                 thisRule.fChainIn = true;
282             }
283 
284 
285             // All rule expressions are ORed together.
286             // The ';' that terminates an expression really just functions as a
287             // '|' with
288             //   a low operator prededence.
289             //
290             // Each of the four sets of rules are collected separately.
291             //  (forward, reverse, safe_forward, safe_reverse)
292             //  OR this rule into the appropriate group of them.
293             //
294 
295             int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
296 
297             if (fRB.fTreeRoots[destRules] != null) {
298                 // This is not the first rule encountered.
299                 // OR previous stuff (from *destRules)
300                 // with the current rule expression (on the Node Stack)
301                 //  with the resulting OR expression going to *destRules
302                 //
303                 thisRule = fNodeStack[fNodeStackPtr];
304                 RBBINode prevRules = fRB.fTreeRoots[destRules];
305                 RBBINode orNode = pushNewNode(RBBINode.opOr);
306                 orNode.fLeftChild = prevRules;
307                 prevRules.fParent = orNode;
308                 orNode.fRightChild = thisRule;
309                 thisRule.fParent = orNode;
310                 fRB.fTreeRoots[destRules] = orNode;
311             } else {
312                 // This is the first rule encountered (for this direction).
313                 // Just move its parse tree from the stack to *destRules.
314                 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
315             }
316             fReverseRule = false; // in preparation for the next rule.
317             fLookAheadRule = false;
318             fNoChainInRule = false;
319             fNodeStackPtr = 0;
320         }
321             break;
322 
323         case RBBIRuleParseTable.doRuleError:
324             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
325             returnVal = false;
326             break;
327 
328         case RBBIRuleParseTable.doVariableNameExpectedErr:
329             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
330             break;
331 
332         //
333         //  Unary operands + ? *
334         //    These all appear after the operand to which they apply.
335         //    When we hit one, the operand (may be a whole sub expression)
336         //    will be on the top of the stack.
337         //    Unary Operator becomes TOS, with the old TOS as its one child.
338         case RBBIRuleParseTable.doUnaryOpPlus: {
339             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
340             RBBINode plusNode = pushNewNode(RBBINode.opPlus);
341             plusNode.fLeftChild = operandNode;
342             operandNode.fParent = plusNode;
343         }
344             break;
345 
346         case RBBIRuleParseTable.doUnaryOpQuestion: {
347             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
348             RBBINode qNode = pushNewNode(RBBINode.opQuestion);
349             qNode.fLeftChild = operandNode;
350             operandNode.fParent = qNode;
351         }
352             break;
353 
354         case RBBIRuleParseTable.doUnaryOpStar: {
355             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
356             RBBINode starNode = pushNewNode(RBBINode.opStar);
357             starNode.fLeftChild = operandNode;
358             operandNode.fParent = starNode;
359         }
360             break;
361 
362         case RBBIRuleParseTable.doRuleChar:
363         // A "Rule Character" is any single character that is a literal part
364         // of the regular expression. Like a, b and c in the expression "(abc*)
365         // | [:L:]"
366         // These are pretty uncommon in break rules; the terms are more commonly
367         //  sets. To keep things uniform, treat these characters like as
368         // sets that just happen to contain only one character.
369         {
370             n = pushNewNode(RBBINode.setRef);
371             String s = String.valueOf((char)fC.fChar);
372             findSetFor(s, n, null);
373             n.fFirstPos = fScanIndex;
374             n.fLastPos = fNextIndex;
375             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
376             break;
377         }
378 
379         case RBBIRuleParseTable.doDotAny:
380         // scanned a ".", meaning match any single character.
381         {
382             n = pushNewNode(RBBINode.setRef);
383             findSetFor(kAny, n, null);
384             n.fFirstPos = fScanIndex;
385             n.fLastPos = fNextIndex;
386             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
387             break;
388         }
389 
390         case RBBIRuleParseTable.doSlash:
391             // Scanned a '/', which identifies a look-ahead break position in a
392             // rule.
393             n = pushNewNode(RBBINode.lookAhead);
394             n.fVal = fRuleNum;
395             n.fFirstPos = fScanIndex;
396             n.fLastPos = fNextIndex;
397             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
398             fLookAheadRule = true;
399             break;
400 
401         case RBBIRuleParseTable.doStartTagValue:
402             // Scanned a '{', the opening delimiter for a tag value within a
403             // rule.
404             n = pushNewNode(RBBINode.tag);
405             n.fVal = 0;
406             n.fFirstPos = fScanIndex;
407             n.fLastPos = fNextIndex;
408             break;
409 
410         case RBBIRuleParseTable.doTagDigit:
411         // Just scanned a decimal digit that's part of a tag value
412         {
413             n = fNodeStack[fNodeStackPtr];
414             int v = UCharacter.digit((char) fC.fChar, 10);
415             n.fVal = n.fVal * 10 + v;
416             break;
417         }
418 
419         case RBBIRuleParseTable.doTagValue:
420             n = fNodeStack[fNodeStackPtr];
421             n.fLastPos = fNextIndex;
422             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
423             break;
424 
425         case RBBIRuleParseTable.doTagExpectedError:
426             error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
427             returnVal = false;
428             break;
429 
430         case RBBIRuleParseTable.doOptionStart:
431             // Scanning a !!option. At the start of string.
432             fOptionStart = fScanIndex;
433             break;
434 
435         case RBBIRuleParseTable.doOptionEnd: {
436             String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
437             if (opt.equals("chain")) {
438                 fRB.fChainRules = true;
439             } else if (opt.equals("LBCMNoChain")) {
440                 fRB.fLBCMNoChain = true;
441             } else if (opt.equals("forward")) {
442                 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
443             } else if (opt.equals("reverse")) {
444                 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
445             } else if (opt.equals("safe_forward")) {
446                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
447             } else if (opt.equals("safe_reverse")) {
448                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
449             } else if (opt.equals("lookAheadHardBreak")) {
450                 fRB.fLookAheadHardBreak = true;
451             } else {
452                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
453             }
454             break;
455         }
456 
457         case RBBIRuleParseTable.doReverseDir:
458             fReverseRule = true;
459             break;
460 
461         case RBBIRuleParseTable.doStartVariableName:
462             n = pushNewNode(RBBINode.varRef);
463             n.fFirstPos = fScanIndex;
464             break;
465 
466         case RBBIRuleParseTable.doEndVariableName:
467             n = fNodeStack[fNodeStackPtr];
468             if (n == null || n.fType != RBBINode.varRef) {
469                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
470                 break;
471             }
472             n.fLastPos = fScanIndex;
473             n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
474             // Look the newly scanned name up in the symbol table
475             //   If there's an entry, set the l. child of the var ref to the
476             // replacement expression.
477             //   (We also pass through here when scanning assignments, but no harm
478             // is done, other
479             //    than a slight wasted effort that seems hard to avoid. Lookup will
480             // be null)
481             n.fLeftChild = fSymbolTable.lookupNode(n.fText);
482             break;
483 
484         case RBBIRuleParseTable.doCheckVarDef:
485             n = fNodeStack[fNodeStackPtr];
486             if (n.fLeftChild == null) {
487                 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
488                 returnVal = false;
489             }
490             break;
491 
492         case RBBIRuleParseTable.doExprFinished:
493             break;
494 
495         case RBBIRuleParseTable.doRuleErrorAssignExpr:
496             error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
497             returnVal = false;
498             break;
499 
500         case RBBIRuleParseTable.doExit:
501             returnVal = false;
502             break;
503 
504         case RBBIRuleParseTable.doScanUnicodeSet:
505             scanSet();
506             break;
507 
508         default:
509             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
510             returnVal = false;
511             break;
512         }
513         return returnVal;
514     }
515 
516     //----------------------------------------------------------------------------------------
517     //
518     //  Error Throw and IllegalArgumentException in response to a rule parse
519     // error.
520     //
521     //----------------------------------------------------------------------------------------
error(int e)522     void error(int e) {
523         String s = "Error " + e + " at line " + fLineNum + " column "
524                 + fCharNum;
525         IllegalArgumentException ex = new IllegalArgumentException(s);
526         throw ex;
527 
528     }
529 
530     //----------------------------------------------------------------------------------------
531     //
532     //  fixOpStack The parse stack holds partially assembled chunks of the parse
533     // tree.
534     //               An entry on the stack may be as small as a single setRef node,
535     //               or as large as the parse tree
536     //               for an entire expression (this will be the one item left on the stack
537     //               when the parsing of an RBBI rule completes.
538     //
539     //               This function is called when a binary operator is encountered.
540     //               It looks back up the stack for operators that are not yet associated
541     //               with a right operand, and if the precedence of the stacked operator >=
542     //               the precedence of the current operator, binds the operand left,
543     //               to the previously encountered operator.
544     //
545     //----------------------------------------------------------------------------------------
fixOpStack(int p)546     void fixOpStack(int p) {
547         RBBINode n;
548         // printNodeStack("entering fixOpStack()");
549         for (;;) {
550             n = fNodeStack[fNodeStackPtr - 1]; // an operator node
551             if (n.fPrecedence == 0) {
552                 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
553                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
554                 return;
555             }
556 
557             if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
558                 // The most recent operand goes with the current operator,
559                 //   not with the previously stacked one.
560                 break;
561             }
562             // Stack operator is a binary op ( '|' or concatenation)
563             //   TOS operand becomes right child of this operator.
564             //   Resulting subexpression becomes the TOS operand.
565             n.fRightChild = fNodeStack[fNodeStackPtr];
566             fNodeStack[fNodeStackPtr].fParent = n;
567             fNodeStackPtr--;
568             // printNodeStack("looping in fixOpStack() ");
569         }
570 
571         if (p <= RBBINode.precLParen) {
572             // Scan is at a right paren or end of expression.
573             //  The scanned item must match the stack, or else there was an
574             // error.
575             //  Discard the left paren (or start expr) node from the stack,
576             //  leaving the completed (sub)expression as TOS.
577             if (n.fPrecedence != p) {
578                 // Right paren encountered matched start of expression node, or
579                 // end of expression matched with a left paren node.
580                 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
581             }
582             fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
583             fNodeStackPtr--;
584             // Delete the now-discarded LParen or Start node.
585             // delete n;
586         }
587         // printNodeStack("leaving fixOpStack()");
588     }
589 
590     //----------------------------------------------------------------------------
591     //
592     //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
593     //                        been encountered. The val Node will be of nodetype uset
594     //                        and contain pointers to the actual UnicodeSets.
595     //                        The Key is the source string for initializing the set.
596     //
597     //                        The hash table is used to avoid creating duplicate
598     //                        unnamed (not $var references) UnicodeSets.
599     //
600     //----------------------------------------------------------------------------
601     static class RBBISetTableEl {
602         String key;
603 
604         RBBINode val;
605     }
606 
607 
608     //----------------------------------------------------------------------------------------
609     //
610     //   findSetFor given a String,
611     //                  - find the corresponding Unicode Set (uset node)
612     //                         (create one if necessary)
613     //                  - Set fLeftChild of the caller's node (should be a setRef node)
614     //                         to the uset node
615     //                 Maintain a hash table of uset nodes, so the same one is always used
616     //                    for the same string.
617     //                 If a "to adopt" set is provided and we haven't seen this key before,
618     //                    add the provided set to the hash table.
619     //                 If the string is one (32 bit) char in length, the set contains
620     //                    just one element which is the char in question.
621     //                 If the string is "any", return a set containing all chars.
622     //
623     //----------------------------------------------------------------------------------------
findSetFor(String s, RBBINode node, UnicodeSet setToAdopt)624     void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
625 
626         RBBISetTableEl el;
627 
628         // First check whether we've already cached a set for this string.
629         // If so, just use the cached set in the new node.
630         //   delete any set provided by the caller, since we own it.
631         el = fSetTable.get(s);
632         if (el != null) {
633             node.fLeftChild = el.val;
634             Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
635             return;
636         }
637 
638         // Haven't seen this set before.
639         // If the caller didn't provide us with a prebuilt set,
640         //   create a new UnicodeSet now.
641         if (setToAdopt == null) {
642             if (s.equals(kAny)) {
643                 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
644             } else {
645                 int c;
646                 c = UTF16.charAt(s, 0);
647                 setToAdopt = new UnicodeSet(c, c);
648             }
649         }
650 
651         //
652         // Make a new uset node to refer to this UnicodeSet
653         // This new uset node becomes the child of the caller's setReference
654         // node.
655         //
656         RBBINode usetNode = new RBBINode(RBBINode.uset);
657         usetNode.fInputSet = setToAdopt;
658         usetNode.fParent = node;
659         node.fLeftChild = usetNode;
660         usetNode.fText = s;
661 
662         //
663         // Add the new uset node to the list of all uset nodes.
664         //
665         fRB.fUSetNodes.add(usetNode);
666 
667         //
668         // Add the new set to the set hash table.
669         //
670         el = new RBBISetTableEl();
671         el.key = s;
672         el.val = usetNode;
673         fSetTable.put(el.key, el);
674 
675         return;
676     }
677 
678     //
679     //  Assorted Unicode character constants.
680     //     Numeric because there is no portable way to enter them as literals.
681     //     (Think EBCDIC).
682     //
683     static final int chNEL = 0x85; //    NEL newline variant
684 
685     static final int chLS = 0x2028; //    Unicode Line Separator
686 
687     //----------------------------------------------------------------------------------------
688     //
689     //  stripRules    Return a rules string without unnecessary
690     //                characters.
691     //
692     //----------------------------------------------------------------------------------------
stripRules(String rules)693     static String stripRules(String rules) {
694         StringBuilder strippedRules = new StringBuilder();
695         int rulesLength = rules.length();
696         for (int idx = 0; idx < rulesLength;) {
697             char ch = rules.charAt(idx++);
698             if (ch == '#') {
699                 while (idx < rulesLength
700                         && ch != '\r' && ch != '\n' && ch != chNEL) {
701                     ch = rules.charAt(idx++);
702                 }
703             }
704             if (!UCharacter.isISOControl(ch)) {
705                 strippedRules.append(ch);
706             }
707         }
708         return strippedRules.toString();
709     }
710 
711     //----------------------------------------------------------------------------------------
712     //
713     //  nextCharLL    Low Level Next Char from rule input source.
714     //                Get a char from the input character iterator,
715     //                keep track of input position for error reporting.
716     //
717     //----------------------------------------------------------------------------------------
nextCharLL()718     int nextCharLL() {
719         int ch;
720 
721         if (fNextIndex >= fRB.fRules.length()) {
722             return -1;
723         }
724         ch = UTF16.charAt(fRB.fRules, fNextIndex);
725         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
726 
727         if (ch == '\r' ||
728             ch == chNEL ||
729             ch == chLS ||
730             ch == '\n' && fLastChar != '\r') {
731             // Character is starting a new line.  Bump up the line number, and
732             //  reset the column to 0.
733             fLineNum++;
734             fCharNum = 0;
735             if (fQuoteMode) {
736                 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
737                 fQuoteMode = false;
738             }
739         } else {
740             // Character is not starting a new line.  Except in the case of a
741             //   LF following a CR, increment the column position.
742             if (ch != '\n') {
743                 fCharNum++;
744             }
745         }
746         fLastChar = ch;
747         return ch;
748     }
749 
750     //---------------------------------------------------------------------------------
751     //
752     //   nextChar     for rules scanning.  At this level, we handle stripping
753     //                out comments and processing backslash character escapes.
754     //                The rest of the rules grammar is handled at the next level up.
755     //
756     //---------------------------------------------------------------------------------
nextChar(RBBIRuleChar c)757     void nextChar(RBBIRuleChar c) {
758 
759         // Unicode Character constants needed for the processing done by nextChar(),
760         //   in hex because literals wont work on EBCDIC machines.
761 
762         fScanIndex = fNextIndex;
763         c.fChar = nextCharLL();
764         c.fEscaped = false;
765 
766         //
767         //  check for '' sequence.
768         //  These are recognized in all contexts, whether in quoted text or not.
769         //
770         if (c.fChar == '\'') {
771             if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
772                 c.fChar = nextCharLL(); // get nextChar officially so character counts
773                 c.fEscaped = true; //   stay correct.
774             } else {
775                 // Single quote, by itself.
776                 //   Toggle quoting mode.
777                 //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
778                 fQuoteMode = !fQuoteMode;
779                 if (fQuoteMode == true) {
780                     c.fChar = '(';
781                 } else {
782                     c.fChar = ')';
783                 }
784                 c.fEscaped = false; // The paren that we return is not escaped.
785                 return;
786             }
787         }
788 
789         if (fQuoteMode) {
790             c.fEscaped = true;
791         } else {
792             // We are not in a 'quoted region' of the source.
793             //
794             if (c.fChar == '#') {
795                 // Start of a comment.  Consume the rest of it.
796                 //  The new-line char that terminates the comment is always returned.
797                 //  It will be treated as white-space, and serves to break up anything
798                 //    that might otherwise incorrectly clump together with a comment in
799                 //    the middle (a variable name, for example.)
800                 for (;;) {
801                     c.fChar = nextCharLL();
802                     if (c.fChar == -1 || // EOF
803                         c.fChar == '\r' ||
804                         c.fChar == '\n' ||
805                         c.fChar == chNEL ||
806                         c.fChar == chLS)
807                     {
808                         break;
809                     }
810                 }
811             }
812             if (c.fChar == -1) {
813                 return;
814             }
815 
816             //
817             //  check for backslash escaped characters.
818             //  Use String.unescapeAt() to handle them.
819             //
820             if (c.fChar == '\\') {
821                 c.fEscaped = true;
822                 int[] unescapeIndex = new int[1];
823                 unescapeIndex[0] = fNextIndex;
824                 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
825                 if (unescapeIndex[0] == fNextIndex) {
826                     error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
827                 }
828 
829                 fCharNum += unescapeIndex[0] - fNextIndex;
830                 fNextIndex = unescapeIndex[0];
831             }
832         }
833         // putc(c.fChar, stdout);
834     }
835 
836     //---------------------------------------------------------------------------------
837     //
838     //  Parse RBBI rules.   The state machine for rules parsing is here.
839     //                      The state tables are hand-written in the file rbbirpt.txt,
840     //                      and converted to the form used here by a perl
841     //                      script rbbicst.pl
842     //
843     //---------------------------------------------------------------------------------
parse()844     void parse() {
845         int state;
846         RBBIRuleParseTable.RBBIRuleTableElement tableEl;
847 
848         state = 1;
849         nextChar(fC);
850         //
851         // Main loop for the rule parsing state machine.
852         //   Runs once per state transition.
853         //   Each time through optionally performs, depending on the state table,
854         //      - an advance to the the next input char
855         //      - an action to be performed.
856         //      - pushing or popping a state to/from the local state return stack.
857         //
858         for (;;) {
859             // Quit if state == 0.  This is the normal way to exit the state machine.
860             //
861             if (state == 0) {
862                 break;
863             }
864 
865             // Find the state table element that matches the input char from the rule, or the
866             //    class of the input character.  Start with the first table row for this
867             //    state, then linearly scan forward until we find a row that matches the
868             //    character.  The last row for each state always matches all characters, so
869             //    the search will stop there, if not before.
870             //
871             tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
872             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
873                 System.out.println("char, line, col = (\'" + (char) fC.fChar
874                         + "\', " + fLineNum + ", " + fCharNum + "    state = "
875                         + tableEl.fStateName);
876             }
877 
878             for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
879                 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
880                 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
881                     System.out.print(".");
882                 }
883                 if (tableEl.fCharClass < 127 && fC.fEscaped == false
884                         && tableEl.fCharClass == fC.fChar) {
885                     // Table row specified an individual character, not a set, and
886                     //   the input character is not escaped, and
887                     //   the input character matched it.
888                     break;
889                 }
890                 if (tableEl.fCharClass == 255) {
891                     // Table row specified default, match anything character class.
892                     break;
893                 }
894                 if (tableEl.fCharClass == 254 && fC.fEscaped) {
895                     // Table row specified "escaped" and the char was escaped.
896                     break;
897                 }
898                 if (tableEl.fCharClass == 253 && fC.fEscaped
899                         && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
900                     // Table row specified "escaped P" and the char is either 'p' or 'P'.
901                     break;
902                 }
903                 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
904                     // Table row specified eof and we hit eof on the input.
905                     break;
906                 }
907 
908                 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
909                         fC.fEscaped == false && //   char is not escaped &&
910                         fC.fChar != -1) { //   char is not EOF
911                     UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
912                     if (uniset.contains(fC.fChar)) {
913                         // Table row specified a character class, or set of characters,
914                         //   and the current char matches it.
915                         break;
916                     }
917                 }
918             }
919 
920             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
921                 System.out.println("");
922             }
923             //
924             // We've found the row of the state table that matches the current input
925             //   character from the rules string.
926             // Perform any action specified  by this row in the state table.
927             if (doParseActions(tableEl.fAction) == false) {
928                 // Break out of the state machine loop if the
929                 //   the action signalled some kind of error, or
930                 //   the action was to exit, occurs on normal end-of-rules-input.
931                 break;
932             }
933 
934             if (tableEl.fPushState != 0) {
935                 fStackPtr++;
936                 if (fStackPtr >= kStackSize) {
937                     System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
938                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
939                 }
940                 fStack[fStackPtr] = tableEl.fPushState;
941             }
942 
943             if (tableEl.fNextChar) {
944                 nextChar(fC);
945             }
946 
947             // Get the next state from the table entry, or from the
948             //   state stack if the next state was specified as "pop".
949             if (tableEl.fNextState != 255) {
950                 state = tableEl.fNextState;
951             } else {
952                 state = fStack[fStackPtr];
953                 fStackPtr--;
954                 if (fStackPtr < 0) {
955                     System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
956                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
957                 }
958             }
959 
960         }
961 
962         // If there are no forward rules throw an error.
963         //
964         if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
965             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
966         }
967 
968         //
969         // If there were NO user specified reverse rules, set up the equivalent of ".*;"
970         //
971         if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
972             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
973             RBBINode operand = pushNewNode(RBBINode.setRef);
974             findSetFor(kAny, operand, null);
975             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
976             operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
977             fNodeStackPtr -= 2;
978         }
979 
980         //
981         // Parsing of the input RBBI rules is complete.
982         // We now have a parse tree for the rule expressions
983         // and a list of all UnicodeSets that are referenced.
984         //
985         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
986             fSymbolTable.rbbiSymtablePrint();
987         }
988         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
989             System.out.println("Completed Forward Rules Parse Tree...");
990             fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
991             System.out.println("\nCompleted Reverse Rules Parse Tree...");
992             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
993             System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
994             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
995                 System.out.println("  -- null -- ");
996             } else {
997                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
998             }
999             System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
1000             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
1001                 System.out.println("  -- null -- ");
1002             } else {
1003                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
1004             }
1005         }
1006     }
1007 
1008     //---------------------------------------------------------------------------------
1009     //
1010     //  printNodeStack     for debugging...
1011     //
1012     //---------------------------------------------------------------------------------
1013     ///CLOVER:OFF
printNodeStack(String title)1014     void printNodeStack(String title) {
1015         int i;
1016         System.out.println(title + ".  Dumping node stack...\n");
1017         for (i = fNodeStackPtr; i > 0; i--) {
1018             fNodeStack[i].printTree(true);
1019         }
1020     }
1021     ///CLOVER:ON
1022 
1023     //---------------------------------------------------------------------------------
1024     //
1025     //  pushNewNode   create a new RBBINode of the specified type and push it
1026     //                onto the stack of nodes.
1027     //
1028     //---------------------------------------------------------------------------------
pushNewNode(int nodeType)1029     RBBINode pushNewNode(int nodeType) {
1030         fNodeStackPtr++;
1031         if (fNodeStackPtr >= kStackSize) {
1032             System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1033             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1034         }
1035         fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1036         return fNodeStack[fNodeStackPtr];
1037     }
1038 
1039     //---------------------------------------------------------------------------------
1040     //
1041     //  scanSet    Construct a UnicodeSet from the text at the current scan
1042     //             position.  Advance the scan position to the first character
1043     //             after the set.
1044     //
1045     //             A new RBBI setref node referring to the set is pushed onto the node
1046     //             stack.
1047     //
1048     //             The scan position is normally under the control of the state machine
1049     //             that controls rule parsing.  UnicodeSets, however, are parsed by
1050     //             the UnicodeSet constructor, not by the RBBI rule parser.
1051     //
1052     //---------------------------------------------------------------------------------
scanSet()1053     void scanSet() {
1054         UnicodeSet uset = null;
1055         int startPos;
1056         ParsePosition pos = new ParsePosition(fScanIndex);
1057         int i;
1058 
1059         startPos = fScanIndex;
1060         try {
1061             uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1062         } catch (Exception e) { // TODO:  catch fewer exception types.
1063             // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1064             error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1065         }
1066 
1067         // Verify that the set contains at least one code point.
1068         //
1069         if (uset.isEmpty()) {
1070             // This set is empty.
1071             //  Make it an error, because it almost certainly is not what the user wanted.
1072             //  Also, avoids having to think about corner cases in the tree manipulation code
1073             //   that occurs later on.
1074             //  TODO:  this shouldn't be an error; it does happen.
1075             error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1076         }
1077 
1078         // Advance the RBBI parse postion over the UnicodeSet pattern.
1079         //   Don't just set fScanIndex because the line/char positions maintained
1080         //   for error reporting would be thrown off.
1081         i = pos.getIndex();
1082         for (;;) {
1083             if (fNextIndex >= i) {
1084                 break;
1085             }
1086             nextCharLL();
1087         }
1088 
1089         RBBINode n;
1090 
1091         n = pushNewNode(RBBINode.setRef);
1092         n.fFirstPos = startPos;
1093         n.fLastPos = fNextIndex;
1094         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1095         //  findSetFor() serves several purposes here:
1096         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1097         //     - Mantains collection of all sets in use, needed later for establishing
1098         //          character categories for run time engine.
1099         //     - Eliminates mulitiple instances of the same set.
1100         //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1101         findSetFor(n.fText, n, uset);
1102     }
1103 
1104 }
1105 
1106