• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5  *******************************************************************************
6  * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package android.icu.text;
11 
12 import java.text.ParsePosition;
13 import java.util.HashMap;
14 
15 import android.icu.impl.Assert;
16 import android.icu.impl.Utility;
17 import android.icu.lang.UCharacter;
18 import android.icu.lang.UProperty;
19 
20 /**
21   *  This class is part of the Rule Based Break Iterator rule compiler.
22   *  It scans the rules and builds the parse tree.
23   *  There is no public API here.
24   */
25 class RBBIRuleScanner {
26 
27     private final static int    kStackSize = 100;               // The size of the state stack for
28     //   rules parsing.  Corresponds roughly
29     //   to the depth of parentheses nesting
30     //   that is allowed in the rules.
31 
32     static class RBBIRuleChar {
33         int             fChar;
34         boolean         fEscaped;
35     }
36 
37 
38 
39     RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
40 
41     int                       fScanIndex;        // Index of current character being processed
42                                                      //   in the rule input string.
43     int                       fNextIndex;        // Index of the next character, which
44                                                      //   is the first character not yet scanned.
45     boolean                  fQuoteMode;        // Scan is in a 'quoted region'
46     int                       fLineNum;          // Line number in input file.
47     int                       fCharNum;          // Char position within the line.
48     int                       fLastChar;         // Previous char, needed to count CR-LF
49                                                      //   as a single line, not two.
50 
51     RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
52                                                      //   processing.
53 
54 
55     short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
56     int                       fStackPtr;           //  and pops as specified in the state
57                                                        //  transition rules.
58 
59     RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
60                                                            //  during the parse of a rule
61     int                        fNodeStackPtr;
62 
63 
64     boolean                    fReverseRule;         // True if the rule currently being scanned
65                                                      //  is a reverse direction rule (if it
66                                                      //  starts with a '!')
67 
68     boolean                    fLookAheadRule;       // True if the rule includes a '/'
69                                                      //   somewhere within it.
70 
71     boolean                    fNoChainInRule;       // True if the current rule starts with a '^'.
72 
73 
74     RBBISymbolTable            fSymbolTable;         // symbol table, holds definitions of
75                                                      //   $variable symbols.
76 
77     HashMap<String, RBBISetTableEl> fSetTable = new HashMap<>(); // UnicocodeSet hash table, holds indexes to
78                                                                                        //   the sets created while parsing rules.
79                                                                                        //   The key is the string used for creating
80                                                                                        //   the set.
81 
82     UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
83                                                      //  the scanning of RBBI rules.  The
84                                                      //  Indices for these are assigned by the
85                                                      //  perl script that builds the state tables.
86                                                      //  See rbbirpt.h.
87 
88     int                        fRuleNum;         // Counts each rule as it is scanned.
89 
90     int                        fOptionStart;     // Input index of start of a !!option
91                                                  //   keyword, while being scanned.
92 
93 
94    // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting.
95    static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
96    static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
97    static private String gRuleSet_digit_char_pattern      = "[0-9]";
98    static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
99    static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
100    static private String kAny =  "any";
101 
102 
103 
104 
105     //----------------------------------------------------------------------------------------
106     //
107     //  Constructor.
108     //
109     //----------------------------------------------------------------------------------------
RBBIRuleScanner(RBBIRuleBuilder rb)110     RBBIRuleScanner(RBBIRuleBuilder rb) {
111         fRB = rb;
112         fLineNum = 1;
113 
114         //
115         //  Set up the constant Unicode Sets.
116         //     Note: These could be made static and shared among
117         //            all instances of RBBIRuleScanners.
118         fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
119         fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
120         fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
121         fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
122         fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
123 
124         fSymbolTable = new RBBISymbolTable(this);
125     }
126 
127     //----------------------------------------------------------------------------------------
128     //
129     //  doParseAction Do some action during rule parsing.
130     //                       Called by the parse state machine.
131     //                       Actions build the parse tree and Unicode Sets,
132     //                       and maintain the parse stack for nested expressions.
133     //
134     //----------------------------------------------------------------------------------------
doParseActions(int action)135     boolean doParseActions(int action) {
136         RBBINode n = null;
137 
138         boolean returnVal = true;
139 
140         switch (action) {
141 
142         case RBBIRuleParseTable.doExprStart:
143             pushNewNode(RBBINode.opStart);
144             fRuleNum++;
145             break;
146 
147         case RBBIRuleParseTable.doNoChain:
148             // Scanned a '^' while on the rule start state.
149             fNoChainInRule = true;
150             break;
151 
152 
153         case RBBIRuleParseTable.doExprOrOperator: {
154             fixOpStack(RBBINode.precOpCat);
155             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
156             RBBINode orNode = pushNewNode(RBBINode.opOr);
157             orNode.fLeftChild = operandNode;
158             operandNode.fParent = orNode;
159         }
160             break;
161 
162         case RBBIRuleParseTable.doExprCatOperator:
163         // concatenation operator.
164         // For the implicit concatenation of adjacent terms in an expression
165         // that are
166         //   not separated by any other operator. Action is invoked between the
167         //   actions for the two terms.
168         {
169             fixOpStack(RBBINode.precOpCat);
170             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
171             RBBINode catNode = pushNewNode(RBBINode.opCat);
172             catNode.fLeftChild = operandNode;
173             operandNode.fParent = catNode;
174         }
175             break;
176 
177         case RBBIRuleParseTable.doLParen:
178             // Open Paren.
179             //   The openParen node is a dummy operation type with a low
180             // precedence,
181             //     which has the affect of ensuring that any real binary op that
182             //     follows within the parens binds more tightly to the operands than
183             //     stuff outside of the parens.
184             pushNewNode(RBBINode.opLParen);
185             break;
186 
187         case RBBIRuleParseTable.doExprRParen:
188             fixOpStack(RBBINode.precLParen);
189             break;
190 
191         case RBBIRuleParseTable.doNOP:
192             break;
193 
194         case RBBIRuleParseTable.doStartAssign:
195             // We've just scanned "$variable = "
196             // The top of the node stack has the $variable ref node.
197 
198             // Save the start position of the RHS text in the StartExpression
199             // node
200             //   that precedes the $variableReference node on the stack.
201             //   This will eventually be used when saving the full $variable
202             // replacement
203             //   text as a string.
204             n = fNodeStack[fNodeStackPtr - 1];
205             n.fFirstPos = fNextIndex; // move past the '='
206 
207             // Push a new start-of-expression node; needed to keep parse of the
208             //   RHS expression happy.
209             pushNewNode(RBBINode.opStart);
210             break;
211 
212         case RBBIRuleParseTable.doEndAssign: {
213             // We have reached the end of an assignment statement.
214             //   Current scan char is the ';' that terminates the assignment.
215 
216             // Terminate expression, leaves expression parse tree rooted in TOS
217             // node.
218             fixOpStack(RBBINode.precStart);
219 
220             RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
221             RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
222             RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
223 
224             // Save original text of right side of assignment, excluding the
225             // terminating ';'
226             //  in the root of the node for the right-hand-side expression.
227             RHSExprNode.fFirstPos = startExprNode.fFirstPos;
228             RHSExprNode.fLastPos = fScanIndex;
229             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
230             // RHSExprNode.fLastPos, RHSExprNode.fText);
231             RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
232                     RHSExprNode.fLastPos);
233 
234             // Expression parse tree becomes l. child of the $variable reference
235             // node.
236             varRefNode.fLeftChild = RHSExprNode;
237             RHSExprNode.fParent = varRefNode;
238 
239             // Make a symbol table entry for the $variableRef node.
240             fSymbolTable.addEntry(varRefNode.fText, varRefNode);
241 
242             // Clean up the stack.
243             fNodeStackPtr -= 3;
244             break;
245         }
246 
247         case RBBIRuleParseTable.doEndOfRule: {
248             fixOpStack(RBBINode.precStart); // Terminate expression, leaves
249                                             // expression
250 
251             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
252                 printNodeStack("end of rule");
253             }
254             Assert.assrt(fNodeStackPtr == 1);
255             RBBINode thisRule = fNodeStack[fNodeStackPtr];
256 
257             // If this rule includes a look-ahead '/', add a endMark node to the
258             //   expression tree.
259             if (fLookAheadRule) {
260                 RBBINode endNode = pushNewNode(RBBINode.endMark);
261                 RBBINode catNode = pushNewNode(RBBINode.opCat);
262                 fNodeStackPtr -= 2;
263                 catNode.fLeftChild = thisRule;
264                 catNode.fRightChild = endNode;
265                 fNodeStack[fNodeStackPtr] = catNode;
266                 endNode.fVal = fRuleNum;
267                 endNode.fLookAheadEnd = true;
268                 thisRule = catNode;
269 
270                 // TODO: Disable chaining out of look-ahead (hard break) rules.
271                 //   The break on rule match is forced, so there is no point in building up
272                 //   the state table to chain into another rule for a longer match.
273             }
274 
275             // Mark this node as being the root of a rule.
276             thisRule.fRuleRoot = true;
277 
278             // Flag if chaining into this rule is wanted.
279             //
280             if (fRB.fChainRules &&          // If rule chaining is enabled globally via !!chain
281                     !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
282                 thisRule.fChainIn = true;
283             }
284 
285 
286             // All rule expressions are ORed together.
287             // The ';' that terminates an expression really just functions as a
288             // '|' with
289             //   a low operator precedence.
290             //
291             // Each of the four sets of rules are collected separately.
292             //  (forward, reverse, safe_forward, safe_reverse)
293             //  OR this rule into the appropriate group of them.
294             //
295 
296             int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree);
297 
298             if (fRB.fTreeRoots[destRules] != null) {
299                 // This is not the first rule encountered.
300                 // OR previous stuff (from *destRules)
301                 // with the current rule expression (on the Node Stack)
302                 //  with the resulting OR expression going to *destRules
303                 //
304                 thisRule = fNodeStack[fNodeStackPtr];
305                 RBBINode prevRules = fRB.fTreeRoots[destRules];
306                 RBBINode orNode = pushNewNode(RBBINode.opOr);
307                 orNode.fLeftChild = prevRules;
308                 prevRules.fParent = orNode;
309                 orNode.fRightChild = thisRule;
310                 thisRule.fParent = orNode;
311                 fRB.fTreeRoots[destRules] = orNode;
312             } else {
313                 // This is the first rule encountered (for this direction).
314                 // Just move its parse tree from the stack to *destRules.
315                 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
316             }
317             fReverseRule = false; // in preparation for the next rule.
318             fLookAheadRule = false;
319             fNoChainInRule = false;
320             fNodeStackPtr = 0;
321         }
322             break;
323 
324         case RBBIRuleParseTable.doRuleError:
325             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
326             returnVal = false;
327             break;
328 
329         case RBBIRuleParseTable.doVariableNameExpectedErr:
330             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
331             break;
332 
333         //
334         //  Unary operands + ? *
335         //    These all appear after the operand to which they apply.
336         //    When we hit one, the operand (may be a whole sub expression)
337         //    will be on the top of the stack.
338         //    Unary Operator becomes TOS, with the old TOS as its one child.
339         case RBBIRuleParseTable.doUnaryOpPlus: {
340             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
341             RBBINode plusNode = pushNewNode(RBBINode.opPlus);
342             plusNode.fLeftChild = operandNode;
343             operandNode.fParent = plusNode;
344         }
345             break;
346 
347         case RBBIRuleParseTable.doUnaryOpQuestion: {
348             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
349             RBBINode qNode = pushNewNode(RBBINode.opQuestion);
350             qNode.fLeftChild = operandNode;
351             operandNode.fParent = qNode;
352         }
353             break;
354 
355         case RBBIRuleParseTable.doUnaryOpStar: {
356             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
357             RBBINode starNode = pushNewNode(RBBINode.opStar);
358             starNode.fLeftChild = operandNode;
359             operandNode.fParent = starNode;
360         }
361             break;
362 
363         case RBBIRuleParseTable.doRuleChar:
364         // A "Rule Character" is any single character that is a literal part
365         // of the regular expression. Like a, b and c in the expression "(abc*)
366         // | [:L:]"
367         // These are pretty uncommon in break rules; the terms are more commonly
368         //  sets. To keep things uniform, treat these characters like as
369         // sets that just happen to contain only one character.
370         {
371             n = pushNewNode(RBBINode.setRef);
372             String s = String.valueOf((char)fC.fChar);
373             findSetFor(s, n, null);
374             n.fFirstPos = fScanIndex;
375             n.fLastPos = fNextIndex;
376             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
377             break;
378         }
379 
380         case RBBIRuleParseTable.doDotAny:
381         // scanned a ".", meaning match any single character.
382         {
383             n = pushNewNode(RBBINode.setRef);
384             findSetFor(kAny, n, null);
385             n.fFirstPos = fScanIndex;
386             n.fLastPos = fNextIndex;
387             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
388             break;
389         }
390 
391         case RBBIRuleParseTable.doSlash:
392             // Scanned a '/', which identifies a look-ahead break position in a
393             // rule.
394             n = pushNewNode(RBBINode.lookAhead);
395             n.fVal = fRuleNum;
396             n.fFirstPos = fScanIndex;
397             n.fLastPos = fNextIndex;
398             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
399             fLookAheadRule = true;
400             break;
401 
402         case RBBIRuleParseTable.doStartTagValue:
403             // Scanned a '{', the opening delimiter for a tag value within a
404             // rule.
405             n = pushNewNode(RBBINode.tag);
406             n.fVal = 0;
407             n.fFirstPos = fScanIndex;
408             n.fLastPos = fNextIndex;
409             break;
410 
411         case RBBIRuleParseTable.doTagDigit:
412         // Just scanned a decimal digit that's part of a tag value
413         {
414             n = fNodeStack[fNodeStackPtr];
415             int v = UCharacter.digit((char) fC.fChar, 10);
416             n.fVal = n.fVal * 10 + v;
417             break;
418         }
419 
420         case RBBIRuleParseTable.doTagValue:
421             n = fNodeStack[fNodeStackPtr];
422             n.fLastPos = fNextIndex;
423             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
424             break;
425 
426         case RBBIRuleParseTable.doTagExpectedError:
427             error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
428             returnVal = false;
429             break;
430 
431         case RBBIRuleParseTable.doOptionStart:
432             // Scanning a !!option. At the start of string.
433             fOptionStart = fScanIndex;
434             break;
435 
436         case RBBIRuleParseTable.doOptionEnd: {
437             String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
438             if (opt.equals("chain")) {
439                 fRB.fChainRules = true;
440             } else if (opt.equals("LBCMNoChain")) {
441                 fRB.fLBCMNoChain = true;
442             } else if (opt.equals("forward")) {
443                 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
444             } else if (opt.equals("reverse")) {
445                 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
446             } else if (opt.equals("safe_forward")) {
447                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
448             } else if (opt.equals("safe_reverse")) {
449                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
450             } else if (opt.equals("lookAheadHardBreak")) {
451                 fRB.fLookAheadHardBreak = true;
452             } else if (opt.equals("quoted_literals_only")) {
453                 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear();
454             } else if (opt.equals("unquoted_literals")) {
455                 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern);
456             } else {
457                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
458             }
459             break;
460         }
461 
462         case RBBIRuleParseTable.doReverseDir:
463             fReverseRule = true;
464             break;
465 
466         case RBBIRuleParseTable.doStartVariableName:
467             n = pushNewNode(RBBINode.varRef);
468             n.fFirstPos = fScanIndex;
469             break;
470 
471         case RBBIRuleParseTable.doEndVariableName:
472             n = fNodeStack[fNodeStackPtr];
473             if (n == null || n.fType != RBBINode.varRef) {
474                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
475                 break;
476             }
477             n.fLastPos = fScanIndex;
478             n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
479             // Look the newly scanned name up in the symbol table
480             //   If there's an entry, set the l. child of the var ref to the
481             // replacement expression.
482             //   (We also pass through here when scanning assignments, but no harm
483             // is done, other
484             //    than a slight wasted effort that seems hard to avoid. Lookup will
485             // be null)
486             n.fLeftChild = fSymbolTable.lookupNode(n.fText);
487             break;
488 
489         case RBBIRuleParseTable.doCheckVarDef:
490             n = fNodeStack[fNodeStackPtr];
491             if (n.fLeftChild == null) {
492                 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
493                 returnVal = false;
494             }
495             break;
496 
497         case RBBIRuleParseTable.doExprFinished:
498             break;
499 
500         case RBBIRuleParseTable.doRuleErrorAssignExpr:
501             error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
502             returnVal = false;
503             break;
504 
505         case RBBIRuleParseTable.doExit:
506             returnVal = false;
507             break;
508 
509         case RBBIRuleParseTable.doScanUnicodeSet:
510             scanSet();
511             break;
512 
513         default:
514             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
515             returnVal = false;
516             break;
517         }
518         return returnVal;
519     }
520 
521     //----------------------------------------------------------------------------------------
522     //
523     //  Error Throw and IllegalArgumentException in response to a rule parse
524     // error.
525     //
526     //----------------------------------------------------------------------------------------
error(int e)527     void error(int e) {
528         String s = "Error " + e + " at line " + fLineNum + " column "
529                 + fCharNum;
530         IllegalArgumentException ex = new IllegalArgumentException(s);
531         throw ex;
532 
533     }
534 
535     //----------------------------------------------------------------------------------------
536     //
537     //  fixOpStack The parse stack holds partially assembled chunks of the parse
538     // tree.
539     //               An entry on the stack may be as small as a single setRef node,
540     //               or as large as the parse tree
541     //               for an entire expression (this will be the one item left on the stack
542     //               when the parsing of an RBBI rule completes.
543     //
544     //               This function is called when a binary operator is encountered.
545     //               It looks back up the stack for operators that are not yet associated
546     //               with a right operand, and if the precedence of the stacked operator >=
547     //               the precedence of the current operator, binds the operand left,
548     //               to the previously encountered operator.
549     //
550     //----------------------------------------------------------------------------------------
fixOpStack(int p)551     void fixOpStack(int p) {
552         RBBINode n;
553         // printNodeStack("entering fixOpStack()");
554         for (;;) {
555             n = fNodeStack[fNodeStackPtr - 1]; // an operator node
556             if (n.fPrecedence == 0) {
557                 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
558                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
559                 return;
560             }
561 
562             if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
563                 // The most recent operand goes with the current operator,
564                 //   not with the previously stacked one.
565                 break;
566             }
567             // Stack operator is a binary op ( '|' or concatenation)
568             //   TOS operand becomes right child of this operator.
569             //   Resulting subexpression becomes the TOS operand.
570             n.fRightChild = fNodeStack[fNodeStackPtr];
571             fNodeStack[fNodeStackPtr].fParent = n;
572             fNodeStackPtr--;
573             // printNodeStack("looping in fixOpStack() ");
574         }
575 
576         if (p <= RBBINode.precLParen) {
577             // Scan is at a right paren or end of expression.
578             //  The scanned item must match the stack, or else there was an
579             // error.
580             //  Discard the left paren (or start expr) node from the stack,
581             //  leaving the completed (sub)expression as TOS.
582             if (n.fPrecedence != p) {
583                 // Right paren encountered matched start of expression node, or
584                 // end of expression matched with a left paren node.
585                 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
586             }
587             fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
588             fNodeStackPtr--;
589             // Delete the now-discarded LParen or Start node.
590             // delete n;
591         }
592         // printNodeStack("leaving fixOpStack()");
593     }
594 
595     //----------------------------------------------------------------------------
596     //
597     //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
598     //                        been encountered. The val Node will be of nodetype uset
599     //                        and contain pointers to the actual UnicodeSets.
600     //                        The Key is the source string for initializing the set.
601     //
602     //                        The hash table is used to avoid creating duplicate
603     //                        unnamed (not $var references) UnicodeSets.
604     //
605     //----------------------------------------------------------------------------
606     static class RBBISetTableEl {
607         String key;
608 
609         RBBINode val;
610     }
611 
612 
613     //----------------------------------------------------------------------------------------
614     //
615     //   findSetFor given a String,
616     //                  - find the corresponding Unicode Set (uset node)
617     //                         (create one if necessary)
618     //                  - Set fLeftChild of the caller's node (should be a setRef node)
619     //                         to the uset node
620     //                 Maintain a hash table of uset nodes, so the same one is always used
621     //                    for the same string.
622     //                 If a "to adopt" set is provided and we haven't seen this key before,
623     //                    add the provided set to the hash table.
624     //                 If the string is one (32 bit) char in length, the set contains
625     //                    just one element which is the char in question.
626     //                 If the string is "any", return a set containing all chars.
627     //
628     //----------------------------------------------------------------------------------------
findSetFor(String s, RBBINode node, UnicodeSet setToAdopt)629     void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
630 
631         RBBISetTableEl el;
632 
633         // First check whether we've already cached a set for this string.
634         // If so, just use the cached set in the new node.
635         //   delete any set provided by the caller, since we own it.
636         el = fSetTable.get(s);
637         if (el != null) {
638             node.fLeftChild = el.val;
639             Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
640             return;
641         }
642 
643         // Haven't seen this set before.
644         // If the caller didn't provide us with a prebuilt set,
645         //   create a new UnicodeSet now.
646         if (setToAdopt == null) {
647             if (s.equals(kAny)) {
648                 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
649             } else {
650                 int c;
651                 c = UTF16.charAt(s, 0);
652                 setToAdopt = new UnicodeSet(c, c);
653             }
654         }
655 
656         //
657         // Make a new uset node to refer to this UnicodeSet
658         // This new uset node becomes the child of the caller's setReference
659         // node.
660         //
661         RBBINode usetNode = new RBBINode(RBBINode.uset);
662         usetNode.fInputSet = setToAdopt;
663         usetNode.fParent = node;
664         node.fLeftChild = usetNode;
665         usetNode.fText = s;
666 
667         //
668         // Add the new uset node to the list of all uset nodes.
669         //
670         fRB.fUSetNodes.add(usetNode);
671 
672         //
673         // Add the new set to the set hash table.
674         //
675         el = new RBBISetTableEl();
676         el.key = s;
677         el.val = usetNode;
678         fSetTable.put(el.key, el);
679 
680         return;
681     }
682 
683     //
684     //  Assorted Unicode character constants.
685     //     Numeric because there is no portable way to enter them as literals.
686     //     (Think EBCDIC).
687     //
688     static final int chNEL = 0x85; //    NEL newline variant
689 
690     static final int chLS = 0x2028; //    Unicode Line Separator
691 
692     //----------------------------------------------------------------------------------------
693     //
694     //  stripRules    Return a rules string without unnecessary
695     //                characters.
696     //
697     //----------------------------------------------------------------------------------------
stripRules(String rules)698     static String stripRules(String rules) {
699         StringBuilder strippedRules = new StringBuilder();
700         int rulesLength = rules.length();
701 
702         for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
703             int cp = rules.codePointAt(idx);
704             boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
705             if (whiteSpace) {
706                 continue;
707             }
708             strippedRules.appendCodePoint(cp);
709         }
710         return strippedRules.toString();
711     }
712 
713     //----------------------------------------------------------------------------------------
714     //
715     //  nextCharLL    Low Level Next Char from rule input source.
716     //                Get a char from the input character iterator,
717     //                keep track of input position for error reporting.
718     //
719     //----------------------------------------------------------------------------------------
nextCharLL()720     int nextCharLL() {
721         int ch;
722 
723         if (fNextIndex >= fRB.fRules.length()) {
724             return -1;
725         }
726         ch = UTF16.charAt(fRB.fRules, fNextIndex);
727         if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) {
728             error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND);
729         }
730         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
731 
732         if (ch == '\r' ||
733             ch == chNEL ||
734             ch == chLS ||
735             ch == '\n' && fLastChar != '\r') {
736             // Character is starting a new line.  Bump up the line number, and
737             //  reset the column to 0.
738             fLineNum++;
739             fCharNum = 0;
740             if (fQuoteMode) {
741                 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
742                 fQuoteMode = false;
743             }
744         } else {
745             // Character is not starting a new line.  Except in the case of a
746             //   LF following a CR, increment the column position.
747             if (ch != '\n') {
748                 fCharNum++;
749             }
750         }
751         fLastChar = ch;
752         return ch;
753     }
754 
755     //---------------------------------------------------------------------------------
756     //
757     //   nextChar     for rules scanning.  At this level, we handle stripping
758     //                out comments and processing backslash character escapes.
759     //                The rest of the rules grammar is handled at the next level up.
760     //
761     //---------------------------------------------------------------------------------
nextChar(RBBIRuleChar c)762     void nextChar(RBBIRuleChar c) {
763 
764         // Unicode Character constants needed for the processing done by nextChar(),
765         //   in hex because literals wont work on EBCDIC machines.
766 
767         fScanIndex = fNextIndex;
768         c.fChar = nextCharLL();
769         c.fEscaped = false;
770 
771         //
772         //  check for '' sequence.
773         //  These are recognized in all contexts, whether in quoted text or not.
774         //
775         if (c.fChar == '\'') {
776             if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
777                 c.fChar = nextCharLL(); // get nextChar officially so character counts
778                 c.fEscaped = true; //   stay correct.
779             } else {
780                 // Single quote, by itself.
781                 //   Toggle quoting mode.
782                 //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
783                 fQuoteMode = !fQuoteMode;
784                 if (fQuoteMode == true) {
785                     c.fChar = '(';
786                 } else {
787                     c.fChar = ')';
788                 }
789                 c.fEscaped = false; // The paren that we return is not escaped.
790                 return;
791             }
792         }
793 
794         if (fQuoteMode) {
795             c.fEscaped = true;
796         } else {
797             // We are not in a 'quoted region' of the source.
798             //
799             if (c.fChar == '#') {
800                 // Start of a comment.  Consume the rest of it.
801                 //  The new-line char that terminates the comment is always returned.
802                 //  It will be treated as white-space, and serves to break up anything
803                 //    that might otherwise incorrectly clump together with a comment in
804                 //    the middle (a variable name, for example.)
805                 int commentStart = fScanIndex;
806                 for (;;) {
807                     c.fChar = nextCharLL();
808                     if (c.fChar == -1 || // EOF
809                         c.fChar == '\r' ||
810                         c.fChar == '\n' ||
811                         c.fChar == chNEL ||
812                         c.fChar == chLS)
813                     {
814                         break;
815                     }
816                 }
817                 for (int i=commentStart; i<fNextIndex-1; ++i) {
818                     fRB.fStrippedRules.setCharAt(i, ' ');
819                 }
820             }
821             if (c.fChar == -1) {
822                 return;
823             }
824 
825             //
826             //  check for backslash escaped characters.
827             //
828             if (c.fChar == '\\') {
829                 c.fEscaped = true;
830                 int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
831                 if (cpAndLength < 0) {
832                     error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
833                 }
834                 c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
835                 int length = Utility.lengthFromCodePointAndLength(cpAndLength);
836 
837                 fCharNum += length;
838                 fNextIndex += length;
839             }
840         }
841         // putc(c.fChar, stdout);
842     }
843 
844     //---------------------------------------------------------------------------------
845     //
846     //  Parse RBBI rules.   The state machine for rules parsing is here.
847     //                      The state tables are hand-written in the file rbbirpt.txt,
848     //                      and converted to the form used here by a perl
849     //                      script rbbicst.pl
850     //
851     //---------------------------------------------------------------------------------
parse()852     void parse() {
853         int state;
854         RBBIRuleParseTable.RBBIRuleTableElement tableEl;
855 
856         state = 1;
857         nextChar(fC);
858         //
859         // Main loop for the rule parsing state machine.
860         //   Runs once per state transition.
861         //   Each time through optionally performs, depending on the state table,
862         //      - an advance to the the next input char
863         //      - an action to be performed.
864         //      - pushing or popping a state to/from the local state return stack.
865         //
866         for (;;) {
867             // Quit if state == 0.  This is the normal way to exit the state machine.
868             //
869             if (state == 0) {
870                 break;
871             }
872 
873             // Find the state table element that matches the input char from the rule, or the
874             //    class of the input character.  Start with the first table row for this
875             //    state, then linearly scan forward until we find a row that matches the
876             //    character.  The last row for each state always matches all characters, so
877             //    the search will stop there, if not before.
878             //
879             tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
880             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
881                 System.out.println("char, line, col = (\'" + (char) fC.fChar
882                         + "\', " + fLineNum + ", " + fCharNum + "    state = "
883                         + tableEl.fStateName);
884             }
885 
886             for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
887                 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
888                 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
889                     System.out.print(".");
890                 }
891                 if (tableEl.fCharClass < 127 && fC.fEscaped == false
892                         && tableEl.fCharClass == fC.fChar) {
893                     // Table row specified an individual character, not a set, and
894                     //   the input character is not escaped, and
895                     //   the input character matched it.
896                     break;
897                 }
898                 if (tableEl.fCharClass == 255) {
899                     // Table row specified default, match anything character class.
900                     break;
901                 }
902                 if (tableEl.fCharClass == 254 && fC.fEscaped) {
903                     // Table row specified "escaped" and the char was escaped.
904                     break;
905                 }
906                 if (tableEl.fCharClass == 253 && fC.fEscaped
907                         && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
908                     // Table row specified "escaped P" and the char is either 'p' or 'P'.
909                     break;
910                 }
911                 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
912                     // Table row specified eof and we hit eof on the input.
913                     break;
914                 }
915 
916                 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
917                         fC.fEscaped == false && //   char is not escaped &&
918                         fC.fChar != -1) { //   char is not EOF
919                     UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
920                     if (uniset.contains(fC.fChar)) {
921                         // Table row specified a character class, or set of characters,
922                         //   and the current char matches it.
923                         break;
924                     }
925                 }
926             }
927 
928             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
929                 System.out.println("");
930             }
931             //
932             // We've found the row of the state table that matches the current input
933             //   character from the rules string.
934             // Perform any action specified  by this row in the state table.
935             if (doParseActions(tableEl.fAction) == false) {
936                 // Break out of the state machine loop if the
937                 //   the action signaled some kind of error, or
938                 //   the action was to exit, occurs on normal end-of-rules-input.
939                 break;
940             }
941 
942             if (tableEl.fPushState != 0) {
943                 fStackPtr++;
944                 if (fStackPtr >= kStackSize) {
945                     System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
946                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
947                 }
948                 fStack[fStackPtr] = tableEl.fPushState;
949             }
950 
951             if (tableEl.fNextChar) {
952                 nextChar(fC);
953             }
954 
955             // Get the next state from the table entry, or from the
956             //   state stack if the next state was specified as "pop".
957             if (tableEl.fNextState != 255) {
958                 state = tableEl.fNextState;
959             } else {
960                 state = fStack[fStackPtr];
961                 fStackPtr--;
962                 if (fStackPtr < 0) {
963                     System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
964                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
965                 }
966             }
967 
968         }
969 
970         // If there are no forward rules throw an error.
971         //
972         if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
973             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
974         }
975 
976         //
977         // Parsing of the input RBBI rules is complete.
978         // We now have a parse tree for the rule expressions
979         // and a list of all UnicodeSets that are referenced.
980         //
981         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
982             fSymbolTable.rbbiSymtablePrint();
983         }
984         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
985             System.out.println("Completed Forward Rules Parse Tree...");
986             fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
987             System.out.println("\nCompleted Reverse Rules Parse Tree...");
988             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
989             System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
990             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
991                 System.out.println("  -- null -- ");
992             } else {
993                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
994             }
995             System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
996             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
997                 System.out.println("  -- null -- ");
998             } else {
999                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
1000             }
1001         }
1002     }
1003 
1004     //---------------------------------------------------------------------------------
1005     //
1006     //  printNodeStack     for debugging...
1007     //
1008     //---------------------------------------------------------------------------------
1009     ///CLOVER:OFF
printNodeStack(String title)1010     void printNodeStack(String title) {
1011         int i;
1012         System.out.println(title + ".  Dumping node stack...\n");
1013         for (i = fNodeStackPtr; i > 0; i--) {
1014             fNodeStack[i].printTree(true);
1015         }
1016     }
1017     ///CLOVER:ON
1018 
1019     //---------------------------------------------------------------------------------
1020     //
1021     //  pushNewNode   create a new RBBINode of the specified type and push it
1022     //                onto the stack of nodes.
1023     //
1024     //---------------------------------------------------------------------------------
pushNewNode(int nodeType)1025     RBBINode pushNewNode(int nodeType) {
1026         fNodeStackPtr++;
1027         if (fNodeStackPtr >= kStackSize) {
1028             System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1029             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1030         }
1031         fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1032         return fNodeStack[fNodeStackPtr];
1033     }
1034 
1035     //---------------------------------------------------------------------------------
1036     //
1037     //  scanSet    Construct a UnicodeSet from the text at the current scan
1038     //             position.  Advance the scan position to the first character
1039     //             after the set.
1040     //
1041     //             A new RBBI setref node referring to the set is pushed onto the node
1042     //             stack.
1043     //
1044     //             The scan position is normally under the control of the state machine
1045     //             that controls rule parsing.  UnicodeSets, however, are parsed by
1046     //             the UnicodeSet constructor, not by the RBBI rule parser.
1047     //
1048     //---------------------------------------------------------------------------------
scanSet()1049     void scanSet() {
1050         UnicodeSet uset = null;
1051         int startPos;
1052         ParsePosition pos = new ParsePosition(fScanIndex);
1053         int i;
1054 
1055         startPos = fScanIndex;
1056         try {
1057             uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1058         } catch (Exception e) { // TODO:  catch fewer exception types.
1059             // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1060             error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1061         }
1062 
1063         // Verify that the set contains at least one code point.
1064         //
1065         if (uset.isEmpty()) {
1066             // This set is empty.
1067             //  Make it an error, because it almost certainly is not what the user wanted.
1068             //  Also, avoids having to think about corner cases in the tree manipulation code
1069             //   that occurs later on.
1070             //  TODO:  this shouldn't be an error; it does happen.
1071             error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1072         }
1073 
1074         // Advance the RBBI parse position over the UnicodeSet pattern.
1075         //   Don't just set fScanIndex because the line/char positions maintained
1076         //   for error reporting would be thrown off.
1077         i = pos.getIndex();
1078         for (;;) {
1079             if (fNextIndex >= i) {
1080                 break;
1081             }
1082             nextCharLL();
1083         }
1084 
1085         RBBINode n;
1086 
1087         n = pushNewNode(RBBINode.setRef);
1088         n.fFirstPos = startPos;
1089         n.fLastPos = fNextIndex;
1090         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1091         //  findSetFor() serves several purposes here:
1092         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1093         //     - Maintains collection of all sets in use, needed later for establishing
1094         //          character categories for run time engine.
1095         //     - Eliminates multiple instances of the same set.
1096         //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1097         findSetFor(n.fText, n, uset);
1098     }
1099 
1100     /**
1101      * @return  the number of rules that have been seen.
1102      */
numRules()1103     int numRules() {
1104         return fRuleNum;
1105     }
1106 }
1107