1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package android.icu.text; 11 12 import java.text.ParsePosition; 13 import java.util.HashMap; 14 15 import android.icu.impl.Assert; 16 import android.icu.impl.Utility; 17 import android.icu.lang.UCharacter; 18 import android.icu.lang.UProperty; 19 20 /** 21 * This class is part of the Rule Based Break Iterator rule compiler. 22 * It scans the rules and builds the parse tree. 23 * There is no public API here. 24 */ 25 class RBBIRuleScanner { 26 27 private final static int kStackSize = 100; // The size of the state stack for 28 // rules parsing. Corresponds roughly 29 // to the depth of parentheses nesting 30 // that is allowed in the rules. 31 32 static class RBBIRuleChar { 33 int fChar; 34 boolean fEscaped; 35 } 36 37 38 39 RBBIRuleBuilder fRB; // The rule builder that we are part of. 40 41 int fScanIndex; // Index of current character being processed 42 // in the rule input string. 43 int fNextIndex; // Index of the next character, which 44 // is the first character not yet scanned. 45 boolean fQuoteMode; // Scan is in a 'quoted region' 46 int fLineNum; // Line number in input file. 47 int fCharNum; // Char position within the line. 48 int fLastChar; // Previous char, needed to count CR-LF 49 // as a single line, not two. 50 51 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 52 // processing. 53 54 55 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 56 int fStackPtr; // and pops as specified in the state 57 // transition rules. 58 59 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 60 // during the parse of a rule 61 int fNodeStackPtr; 62 63 64 boolean fReverseRule; // True if the rule currently being scanned 65 // is a reverse direction rule (if it 66 // starts with a '!') 67 68 boolean fLookAheadRule; // True if the rule includes a '/' 69 // somewhere within it. 70 71 boolean fNoChainInRule; // True if the current rule starts with a '^'. 72 73 74 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 75 // $variable symbols. 76 77 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<>(); // UnicocodeSet hash table, holds indexes to 78 // the sets created while parsing rules. 79 // The key is the string used for creating 80 // the set. 81 82 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 83 // the scanning of RBBI rules. The 84 // Indices for these are assigned by the 85 // perl script that builds the state tables. 86 // See rbbirpt.h. 87 88 int fRuleNum; // Counts each rule as it is scanned. 89 90 int fOptionStart; // Input index of start of a !!option 91 // keyword, while being scanned. 92 93 94 // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting. 95 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 96 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 97 static private String gRuleSet_digit_char_pattern = "[0-9]"; 98 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 99 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 100 static private String kAny = "any"; 101 102 103 104 105 //---------------------------------------------------------------------------------------- 106 // 107 // Constructor. 108 // 109 //---------------------------------------------------------------------------------------- RBBIRuleScanner(RBBIRuleBuilder rb)110 RBBIRuleScanner(RBBIRuleBuilder rb) { 111 fRB = rb; 112 fLineNum = 1; 113 114 // 115 // Set up the constant Unicode Sets. 116 // Note: These could be made static and shared among 117 // all instances of RBBIRuleScanners. 118 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 119 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 120 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 121 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 122 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 123 124 fSymbolTable = new RBBISymbolTable(this); 125 } 126 127 //---------------------------------------------------------------------------------------- 128 // 129 // doParseAction Do some action during rule parsing. 130 // Called by the parse state machine. 131 // Actions build the parse tree and Unicode Sets, 132 // and maintain the parse stack for nested expressions. 133 // 134 //---------------------------------------------------------------------------------------- doParseActions(int action)135 boolean doParseActions(int action) { 136 RBBINode n = null; 137 138 boolean returnVal = true; 139 140 switch (action) { 141 142 case RBBIRuleParseTable.doExprStart: 143 pushNewNode(RBBINode.opStart); 144 fRuleNum++; 145 break; 146 147 case RBBIRuleParseTable.doNoChain: 148 // Scanned a '^' while on the rule start state. 149 fNoChainInRule = true; 150 break; 151 152 153 case RBBIRuleParseTable.doExprOrOperator: { 154 fixOpStack(RBBINode.precOpCat); 155 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 156 RBBINode orNode = pushNewNode(RBBINode.opOr); 157 orNode.fLeftChild = operandNode; 158 operandNode.fParent = orNode; 159 } 160 break; 161 162 case RBBIRuleParseTable.doExprCatOperator: 163 // concatenation operator. 164 // For the implicit concatenation of adjacent terms in an expression 165 // that are 166 // not separated by any other operator. Action is invoked between the 167 // actions for the two terms. 168 { 169 fixOpStack(RBBINode.precOpCat); 170 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 171 RBBINode catNode = pushNewNode(RBBINode.opCat); 172 catNode.fLeftChild = operandNode; 173 operandNode.fParent = catNode; 174 } 175 break; 176 177 case RBBIRuleParseTable.doLParen: 178 // Open Paren. 179 // The openParen node is a dummy operation type with a low 180 // precedence, 181 // which has the affect of ensuring that any real binary op that 182 // follows within the parens binds more tightly to the operands than 183 // stuff outside of the parens. 184 pushNewNode(RBBINode.opLParen); 185 break; 186 187 case RBBIRuleParseTable.doExprRParen: 188 fixOpStack(RBBINode.precLParen); 189 break; 190 191 case RBBIRuleParseTable.doNOP: 192 break; 193 194 case RBBIRuleParseTable.doStartAssign: 195 // We've just scanned "$variable = " 196 // The top of the node stack has the $variable ref node. 197 198 // Save the start position of the RHS text in the StartExpression 199 // node 200 // that precedes the $variableReference node on the stack. 201 // This will eventually be used when saving the full $variable 202 // replacement 203 // text as a string. 204 n = fNodeStack[fNodeStackPtr - 1]; 205 n.fFirstPos = fNextIndex; // move past the '=' 206 207 // Push a new start-of-expression node; needed to keep parse of the 208 // RHS expression happy. 209 pushNewNode(RBBINode.opStart); 210 break; 211 212 case RBBIRuleParseTable.doEndAssign: { 213 // We have reached the end of an assignment statement. 214 // Current scan char is the ';' that terminates the assignment. 215 216 // Terminate expression, leaves expression parse tree rooted in TOS 217 // node. 218 fixOpStack(RBBINode.precStart); 219 220 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 221 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 222 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 223 224 // Save original text of right side of assignment, excluding the 225 // terminating ';' 226 // in the root of the node for the right-hand-side expression. 227 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 228 RHSExprNode.fLastPos = fScanIndex; 229 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 230 // RHSExprNode.fLastPos, RHSExprNode.fText); 231 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 232 RHSExprNode.fLastPos); 233 234 // Expression parse tree becomes l. child of the $variable reference 235 // node. 236 varRefNode.fLeftChild = RHSExprNode; 237 RHSExprNode.fParent = varRefNode; 238 239 // Make a symbol table entry for the $variableRef node. 240 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 241 242 // Clean up the stack. 243 fNodeStackPtr -= 3; 244 break; 245 } 246 247 case RBBIRuleParseTable.doEndOfRule: { 248 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 249 // expression 250 251 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 252 printNodeStack("end of rule"); 253 } 254 Assert.assrt(fNodeStackPtr == 1); 255 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 256 257 // If this rule includes a look-ahead '/', add a endMark node to the 258 // expression tree. 259 if (fLookAheadRule) { 260 RBBINode endNode = pushNewNode(RBBINode.endMark); 261 RBBINode catNode = pushNewNode(RBBINode.opCat); 262 fNodeStackPtr -= 2; 263 catNode.fLeftChild = thisRule; 264 catNode.fRightChild = endNode; 265 fNodeStack[fNodeStackPtr] = catNode; 266 endNode.fVal = fRuleNum; 267 endNode.fLookAheadEnd = true; 268 thisRule = catNode; 269 270 // TODO: Disable chaining out of look-ahead (hard break) rules. 271 // The break on rule match is forced, so there is no point in building up 272 // the state table to chain into another rule for a longer match. 273 } 274 275 // Mark this node as being the root of a rule. 276 thisRule.fRuleRoot = true; 277 278 // Flag if chaining into this rule is wanted. 279 // 280 if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain 281 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 282 thisRule.fChainIn = true; 283 } 284 285 286 // All rule expressions are ORed together. 287 // The ';' that terminates an expression really just functions as a 288 // '|' with 289 // a low operator precedence. 290 // 291 // Each of the four sets of rules are collected separately. 292 // (forward, reverse, safe_forward, safe_reverse) 293 // OR this rule into the appropriate group of them. 294 // 295 296 int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree); 297 298 if (fRB.fTreeRoots[destRules] != null) { 299 // This is not the first rule encountered. 300 // OR previous stuff (from *destRules) 301 // with the current rule expression (on the Node Stack) 302 // with the resulting OR expression going to *destRules 303 // 304 thisRule = fNodeStack[fNodeStackPtr]; 305 RBBINode prevRules = fRB.fTreeRoots[destRules]; 306 RBBINode orNode = pushNewNode(RBBINode.opOr); 307 orNode.fLeftChild = prevRules; 308 prevRules.fParent = orNode; 309 orNode.fRightChild = thisRule; 310 thisRule.fParent = orNode; 311 fRB.fTreeRoots[destRules] = orNode; 312 } else { 313 // This is the first rule encountered (for this direction). 314 // Just move its parse tree from the stack to *destRules. 315 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 316 } 317 fReverseRule = false; // in preparation for the next rule. 318 fLookAheadRule = false; 319 fNoChainInRule = false; 320 fNodeStackPtr = 0; 321 } 322 break; 323 324 case RBBIRuleParseTable.doRuleError: 325 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 326 returnVal = false; 327 break; 328 329 case RBBIRuleParseTable.doVariableNameExpectedErr: 330 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 331 break; 332 333 // 334 // Unary operands + ? * 335 // These all appear after the operand to which they apply. 336 // When we hit one, the operand (may be a whole sub expression) 337 // will be on the top of the stack. 338 // Unary Operator becomes TOS, with the old TOS as its one child. 339 case RBBIRuleParseTable.doUnaryOpPlus: { 340 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 341 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 342 plusNode.fLeftChild = operandNode; 343 operandNode.fParent = plusNode; 344 } 345 break; 346 347 case RBBIRuleParseTable.doUnaryOpQuestion: { 348 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 349 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 350 qNode.fLeftChild = operandNode; 351 operandNode.fParent = qNode; 352 } 353 break; 354 355 case RBBIRuleParseTable.doUnaryOpStar: { 356 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 357 RBBINode starNode = pushNewNode(RBBINode.opStar); 358 starNode.fLeftChild = operandNode; 359 operandNode.fParent = starNode; 360 } 361 break; 362 363 case RBBIRuleParseTable.doRuleChar: 364 // A "Rule Character" is any single character that is a literal part 365 // of the regular expression. Like a, b and c in the expression "(abc*) 366 // | [:L:]" 367 // These are pretty uncommon in break rules; the terms are more commonly 368 // sets. To keep things uniform, treat these characters like as 369 // sets that just happen to contain only one character. 370 { 371 n = pushNewNode(RBBINode.setRef); 372 String s = String.valueOf((char)fC.fChar); 373 findSetFor(s, n, null); 374 n.fFirstPos = fScanIndex; 375 n.fLastPos = fNextIndex; 376 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 377 break; 378 } 379 380 case RBBIRuleParseTable.doDotAny: 381 // scanned a ".", meaning match any single character. 382 { 383 n = pushNewNode(RBBINode.setRef); 384 findSetFor(kAny, n, null); 385 n.fFirstPos = fScanIndex; 386 n.fLastPos = fNextIndex; 387 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 388 break; 389 } 390 391 case RBBIRuleParseTable.doSlash: 392 // Scanned a '/', which identifies a look-ahead break position in a 393 // rule. 394 n = pushNewNode(RBBINode.lookAhead); 395 n.fVal = fRuleNum; 396 n.fFirstPos = fScanIndex; 397 n.fLastPos = fNextIndex; 398 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 399 fLookAheadRule = true; 400 break; 401 402 case RBBIRuleParseTable.doStartTagValue: 403 // Scanned a '{', the opening delimiter for a tag value within a 404 // rule. 405 n = pushNewNode(RBBINode.tag); 406 n.fVal = 0; 407 n.fFirstPos = fScanIndex; 408 n.fLastPos = fNextIndex; 409 break; 410 411 case RBBIRuleParseTable.doTagDigit: 412 // Just scanned a decimal digit that's part of a tag value 413 { 414 n = fNodeStack[fNodeStackPtr]; 415 int v = UCharacter.digit((char) fC.fChar, 10); 416 n.fVal = n.fVal * 10 + v; 417 break; 418 } 419 420 case RBBIRuleParseTable.doTagValue: 421 n = fNodeStack[fNodeStackPtr]; 422 n.fLastPos = fNextIndex; 423 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 424 break; 425 426 case RBBIRuleParseTable.doTagExpectedError: 427 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 428 returnVal = false; 429 break; 430 431 case RBBIRuleParseTable.doOptionStart: 432 // Scanning a !!option. At the start of string. 433 fOptionStart = fScanIndex; 434 break; 435 436 case RBBIRuleParseTable.doOptionEnd: { 437 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 438 if (opt.equals("chain")) { 439 fRB.fChainRules = true; 440 } else if (opt.equals("LBCMNoChain")) { 441 fRB.fLBCMNoChain = true; 442 } else if (opt.equals("forward")) { 443 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 444 } else if (opt.equals("reverse")) { 445 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 446 } else if (opt.equals("safe_forward")) { 447 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 448 } else if (opt.equals("safe_reverse")) { 449 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 450 } else if (opt.equals("lookAheadHardBreak")) { 451 fRB.fLookAheadHardBreak = true; 452 } else if (opt.equals("quoted_literals_only")) { 453 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear(); 454 } else if (opt.equals("unquoted_literals")) { 455 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern); 456 } else { 457 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 458 } 459 break; 460 } 461 462 case RBBIRuleParseTable.doReverseDir: 463 fReverseRule = true; 464 break; 465 466 case RBBIRuleParseTable.doStartVariableName: 467 n = pushNewNode(RBBINode.varRef); 468 n.fFirstPos = fScanIndex; 469 break; 470 471 case RBBIRuleParseTable.doEndVariableName: 472 n = fNodeStack[fNodeStackPtr]; 473 if (n == null || n.fType != RBBINode.varRef) { 474 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 475 break; 476 } 477 n.fLastPos = fScanIndex; 478 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 479 // Look the newly scanned name up in the symbol table 480 // If there's an entry, set the l. child of the var ref to the 481 // replacement expression. 482 // (We also pass through here when scanning assignments, but no harm 483 // is done, other 484 // than a slight wasted effort that seems hard to avoid. Lookup will 485 // be null) 486 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 487 break; 488 489 case RBBIRuleParseTable.doCheckVarDef: 490 n = fNodeStack[fNodeStackPtr]; 491 if (n.fLeftChild == null) { 492 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 493 returnVal = false; 494 } 495 break; 496 497 case RBBIRuleParseTable.doExprFinished: 498 break; 499 500 case RBBIRuleParseTable.doRuleErrorAssignExpr: 501 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 502 returnVal = false; 503 break; 504 505 case RBBIRuleParseTable.doExit: 506 returnVal = false; 507 break; 508 509 case RBBIRuleParseTable.doScanUnicodeSet: 510 scanSet(); 511 break; 512 513 default: 514 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 515 returnVal = false; 516 break; 517 } 518 return returnVal; 519 } 520 521 //---------------------------------------------------------------------------------------- 522 // 523 // Error Throw and IllegalArgumentException in response to a rule parse 524 // error. 525 // 526 //---------------------------------------------------------------------------------------- error(int e)527 void error(int e) { 528 String s = "Error " + e + " at line " + fLineNum + " column " 529 + fCharNum; 530 IllegalArgumentException ex = new IllegalArgumentException(s); 531 throw ex; 532 533 } 534 535 //---------------------------------------------------------------------------------------- 536 // 537 // fixOpStack The parse stack holds partially assembled chunks of the parse 538 // tree. 539 // An entry on the stack may be as small as a single setRef node, 540 // or as large as the parse tree 541 // for an entire expression (this will be the one item left on the stack 542 // when the parsing of an RBBI rule completes. 543 // 544 // This function is called when a binary operator is encountered. 545 // It looks back up the stack for operators that are not yet associated 546 // with a right operand, and if the precedence of the stacked operator >= 547 // the precedence of the current operator, binds the operand left, 548 // to the previously encountered operator. 549 // 550 //---------------------------------------------------------------------------------------- fixOpStack(int p)551 void fixOpStack(int p) { 552 RBBINode n; 553 // printNodeStack("entering fixOpStack()"); 554 for (;;) { 555 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 556 if (n.fPrecedence == 0) { 557 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 558 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 559 return; 560 } 561 562 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 563 // The most recent operand goes with the current operator, 564 // not with the previously stacked one. 565 break; 566 } 567 // Stack operator is a binary op ( '|' or concatenation) 568 // TOS operand becomes right child of this operator. 569 // Resulting subexpression becomes the TOS operand. 570 n.fRightChild = fNodeStack[fNodeStackPtr]; 571 fNodeStack[fNodeStackPtr].fParent = n; 572 fNodeStackPtr--; 573 // printNodeStack("looping in fixOpStack() "); 574 } 575 576 if (p <= RBBINode.precLParen) { 577 // Scan is at a right paren or end of expression. 578 // The scanned item must match the stack, or else there was an 579 // error. 580 // Discard the left paren (or start expr) node from the stack, 581 // leaving the completed (sub)expression as TOS. 582 if (n.fPrecedence != p) { 583 // Right paren encountered matched start of expression node, or 584 // end of expression matched with a left paren node. 585 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 586 } 587 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 588 fNodeStackPtr--; 589 // Delete the now-discarded LParen or Start node. 590 // delete n; 591 } 592 // printNodeStack("leaving fixOpStack()"); 593 } 594 595 //---------------------------------------------------------------------------- 596 // 597 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 598 // been encountered. The val Node will be of nodetype uset 599 // and contain pointers to the actual UnicodeSets. 600 // The Key is the source string for initializing the set. 601 // 602 // The hash table is used to avoid creating duplicate 603 // unnamed (not $var references) UnicodeSets. 604 // 605 //---------------------------------------------------------------------------- 606 static class RBBISetTableEl { 607 String key; 608 609 RBBINode val; 610 } 611 612 613 //---------------------------------------------------------------------------------------- 614 // 615 // findSetFor given a String, 616 // - find the corresponding Unicode Set (uset node) 617 // (create one if necessary) 618 // - Set fLeftChild of the caller's node (should be a setRef node) 619 // to the uset node 620 // Maintain a hash table of uset nodes, so the same one is always used 621 // for the same string. 622 // If a "to adopt" set is provided and we haven't seen this key before, 623 // add the provided set to the hash table. 624 // If the string is one (32 bit) char in length, the set contains 625 // just one element which is the char in question. 626 // If the string is "any", return a set containing all chars. 627 // 628 //---------------------------------------------------------------------------------------- findSetFor(String s, RBBINode node, UnicodeSet setToAdopt)629 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 630 631 RBBISetTableEl el; 632 633 // First check whether we've already cached a set for this string. 634 // If so, just use the cached set in the new node. 635 // delete any set provided by the caller, since we own it. 636 el = fSetTable.get(s); 637 if (el != null) { 638 node.fLeftChild = el.val; 639 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 640 return; 641 } 642 643 // Haven't seen this set before. 644 // If the caller didn't provide us with a prebuilt set, 645 // create a new UnicodeSet now. 646 if (setToAdopt == null) { 647 if (s.equals(kAny)) { 648 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 649 } else { 650 int c; 651 c = UTF16.charAt(s, 0); 652 setToAdopt = new UnicodeSet(c, c); 653 } 654 } 655 656 // 657 // Make a new uset node to refer to this UnicodeSet 658 // This new uset node becomes the child of the caller's setReference 659 // node. 660 // 661 RBBINode usetNode = new RBBINode(RBBINode.uset); 662 usetNode.fInputSet = setToAdopt; 663 usetNode.fParent = node; 664 node.fLeftChild = usetNode; 665 usetNode.fText = s; 666 667 // 668 // Add the new uset node to the list of all uset nodes. 669 // 670 fRB.fUSetNodes.add(usetNode); 671 672 // 673 // Add the new set to the set hash table. 674 // 675 el = new RBBISetTableEl(); 676 el.key = s; 677 el.val = usetNode; 678 fSetTable.put(el.key, el); 679 680 return; 681 } 682 683 // 684 // Assorted Unicode character constants. 685 // Numeric because there is no portable way to enter them as literals. 686 // (Think EBCDIC). 687 // 688 static final int chNEL = 0x85; // NEL newline variant 689 690 static final int chLS = 0x2028; // Unicode Line Separator 691 692 //---------------------------------------------------------------------------------------- 693 // 694 // stripRules Return a rules string without unnecessary 695 // characters. 696 // 697 //---------------------------------------------------------------------------------------- stripRules(String rules)698 static String stripRules(String rules) { 699 StringBuilder strippedRules = new StringBuilder(); 700 int rulesLength = rules.length(); 701 702 for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) { 703 int cp = rules.codePointAt(idx); 704 boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE); 705 if (whiteSpace) { 706 continue; 707 } 708 strippedRules.appendCodePoint(cp); 709 } 710 return strippedRules.toString(); 711 } 712 713 //---------------------------------------------------------------------------------------- 714 // 715 // nextCharLL Low Level Next Char from rule input source. 716 // Get a char from the input character iterator, 717 // keep track of input position for error reporting. 718 // 719 //---------------------------------------------------------------------------------------- nextCharLL()720 int nextCharLL() { 721 int ch; 722 723 if (fNextIndex >= fRB.fRules.length()) { 724 return -1; 725 } 726 ch = UTF16.charAt(fRB.fRules, fNextIndex); 727 if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) { 728 error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND); 729 } 730 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 731 732 if (ch == '\r' || 733 ch == chNEL || 734 ch == chLS || 735 ch == '\n' && fLastChar != '\r') { 736 // Character is starting a new line. Bump up the line number, and 737 // reset the column to 0. 738 fLineNum++; 739 fCharNum = 0; 740 if (fQuoteMode) { 741 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 742 fQuoteMode = false; 743 } 744 } else { 745 // Character is not starting a new line. Except in the case of a 746 // LF following a CR, increment the column position. 747 if (ch != '\n') { 748 fCharNum++; 749 } 750 } 751 fLastChar = ch; 752 return ch; 753 } 754 755 //--------------------------------------------------------------------------------- 756 // 757 // nextChar for rules scanning. At this level, we handle stripping 758 // out comments and processing backslash character escapes. 759 // The rest of the rules grammar is handled at the next level up. 760 // 761 //--------------------------------------------------------------------------------- nextChar(RBBIRuleChar c)762 void nextChar(RBBIRuleChar c) { 763 764 // Unicode Character constants needed for the processing done by nextChar(), 765 // in hex because literals wont work on EBCDIC machines. 766 767 fScanIndex = fNextIndex; 768 c.fChar = nextCharLL(); 769 c.fEscaped = false; 770 771 // 772 // check for '' sequence. 773 // These are recognized in all contexts, whether in quoted text or not. 774 // 775 if (c.fChar == '\'') { 776 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 777 c.fChar = nextCharLL(); // get nextChar officially so character counts 778 c.fEscaped = true; // stay correct. 779 } else { 780 // Single quote, by itself. 781 // Toggle quoting mode. 782 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 783 fQuoteMode = !fQuoteMode; 784 if (fQuoteMode == true) { 785 c.fChar = '('; 786 } else { 787 c.fChar = ')'; 788 } 789 c.fEscaped = false; // The paren that we return is not escaped. 790 return; 791 } 792 } 793 794 if (fQuoteMode) { 795 c.fEscaped = true; 796 } else { 797 // We are not in a 'quoted region' of the source. 798 // 799 if (c.fChar == '#') { 800 // Start of a comment. Consume the rest of it. 801 // The new-line char that terminates the comment is always returned. 802 // It will be treated as white-space, and serves to break up anything 803 // that might otherwise incorrectly clump together with a comment in 804 // the middle (a variable name, for example.) 805 int commentStart = fScanIndex; 806 for (;;) { 807 c.fChar = nextCharLL(); 808 if (c.fChar == -1 || // EOF 809 c.fChar == '\r' || 810 c.fChar == '\n' || 811 c.fChar == chNEL || 812 c.fChar == chLS) 813 { 814 break; 815 } 816 } 817 for (int i=commentStart; i<fNextIndex-1; ++i) { 818 fRB.fStrippedRules.setCharAt(i, ' '); 819 } 820 } 821 if (c.fChar == -1) { 822 return; 823 } 824 825 // 826 // check for backslash escaped characters. 827 // 828 if (c.fChar == '\\') { 829 c.fEscaped = true; 830 int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex); 831 if (cpAndLength < 0) { 832 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 833 } 834 c.fChar = Utility.cpFromCodePointAndLength(cpAndLength); 835 int length = Utility.lengthFromCodePointAndLength(cpAndLength); 836 837 fCharNum += length; 838 fNextIndex += length; 839 } 840 } 841 // putc(c.fChar, stdout); 842 } 843 844 //--------------------------------------------------------------------------------- 845 // 846 // Parse RBBI rules. The state machine for rules parsing is here. 847 // The state tables are hand-written in the file rbbirpt.txt, 848 // and converted to the form used here by a perl 849 // script rbbicst.pl 850 // 851 //--------------------------------------------------------------------------------- parse()852 void parse() { 853 int state; 854 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 855 856 state = 1; 857 nextChar(fC); 858 // 859 // Main loop for the rule parsing state machine. 860 // Runs once per state transition. 861 // Each time through optionally performs, depending on the state table, 862 // - an advance to the the next input char 863 // - an action to be performed. 864 // - pushing or popping a state to/from the local state return stack. 865 // 866 for (;;) { 867 // Quit if state == 0. This is the normal way to exit the state machine. 868 // 869 if (state == 0) { 870 break; 871 } 872 873 // Find the state table element that matches the input char from the rule, or the 874 // class of the input character. Start with the first table row for this 875 // state, then linearly scan forward until we find a row that matches the 876 // character. The last row for each state always matches all characters, so 877 // the search will stop there, if not before. 878 // 879 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 880 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 881 System.out.println("char, line, col = (\'" + (char) fC.fChar 882 + "\', " + fLineNum + ", " + fCharNum + " state = " 883 + tableEl.fStateName); 884 } 885 886 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 887 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 888 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 889 System.out.print("."); 890 } 891 if (tableEl.fCharClass < 127 && fC.fEscaped == false 892 && tableEl.fCharClass == fC.fChar) { 893 // Table row specified an individual character, not a set, and 894 // the input character is not escaped, and 895 // the input character matched it. 896 break; 897 } 898 if (tableEl.fCharClass == 255) { 899 // Table row specified default, match anything character class. 900 break; 901 } 902 if (tableEl.fCharClass == 254 && fC.fEscaped) { 903 // Table row specified "escaped" and the char was escaped. 904 break; 905 } 906 if (tableEl.fCharClass == 253 && fC.fEscaped 907 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 908 // Table row specified "escaped P" and the char is either 'p' or 'P'. 909 break; 910 } 911 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 912 // Table row specified eof and we hit eof on the input. 913 break; 914 } 915 916 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 917 fC.fEscaped == false && // char is not escaped && 918 fC.fChar != -1) { // char is not EOF 919 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 920 if (uniset.contains(fC.fChar)) { 921 // Table row specified a character class, or set of characters, 922 // and the current char matches it. 923 break; 924 } 925 } 926 } 927 928 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 929 System.out.println(""); 930 } 931 // 932 // We've found the row of the state table that matches the current input 933 // character from the rules string. 934 // Perform any action specified by this row in the state table. 935 if (doParseActions(tableEl.fAction) == false) { 936 // Break out of the state machine loop if the 937 // the action signaled some kind of error, or 938 // the action was to exit, occurs on normal end-of-rules-input. 939 break; 940 } 941 942 if (tableEl.fPushState != 0) { 943 fStackPtr++; 944 if (fStackPtr >= kStackSize) { 945 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 946 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 947 } 948 fStack[fStackPtr] = tableEl.fPushState; 949 } 950 951 if (tableEl.fNextChar) { 952 nextChar(fC); 953 } 954 955 // Get the next state from the table entry, or from the 956 // state stack if the next state was specified as "pop". 957 if (tableEl.fNextState != 255) { 958 state = tableEl.fNextState; 959 } else { 960 state = fStack[fStackPtr]; 961 fStackPtr--; 962 if (fStackPtr < 0) { 963 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 964 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 965 } 966 } 967 968 } 969 970 // If there are no forward rules throw an error. 971 // 972 if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) { 973 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 974 } 975 976 // 977 // Parsing of the input RBBI rules is complete. 978 // We now have a parse tree for the rule expressions 979 // and a list of all UnicodeSets that are referenced. 980 // 981 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 982 fSymbolTable.rbbiSymtablePrint(); 983 } 984 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 985 System.out.println("Completed Forward Rules Parse Tree..."); 986 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 987 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 988 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 989 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 990 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 991 System.out.println(" -- null -- "); 992 } else { 993 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 994 } 995 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 996 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 997 System.out.println(" -- null -- "); 998 } else { 999 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 1000 } 1001 } 1002 } 1003 1004 //--------------------------------------------------------------------------------- 1005 // 1006 // printNodeStack for debugging... 1007 // 1008 //--------------------------------------------------------------------------------- 1009 ///CLOVER:OFF printNodeStack(String title)1010 void printNodeStack(String title) { 1011 int i; 1012 System.out.println(title + ". Dumping node stack...\n"); 1013 for (i = fNodeStackPtr; i > 0; i--) { 1014 fNodeStack[i].printTree(true); 1015 } 1016 } 1017 ///CLOVER:ON 1018 1019 //--------------------------------------------------------------------------------- 1020 // 1021 // pushNewNode create a new RBBINode of the specified type and push it 1022 // onto the stack of nodes. 1023 // 1024 //--------------------------------------------------------------------------------- pushNewNode(int nodeType)1025 RBBINode pushNewNode(int nodeType) { 1026 fNodeStackPtr++; 1027 if (fNodeStackPtr >= kStackSize) { 1028 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1029 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1030 } 1031 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1032 return fNodeStack[fNodeStackPtr]; 1033 } 1034 1035 //--------------------------------------------------------------------------------- 1036 // 1037 // scanSet Construct a UnicodeSet from the text at the current scan 1038 // position. Advance the scan position to the first character 1039 // after the set. 1040 // 1041 // A new RBBI setref node referring to the set is pushed onto the node 1042 // stack. 1043 // 1044 // The scan position is normally under the control of the state machine 1045 // that controls rule parsing. UnicodeSets, however, are parsed by 1046 // the UnicodeSet constructor, not by the RBBI rule parser. 1047 // 1048 //--------------------------------------------------------------------------------- scanSet()1049 void scanSet() { 1050 UnicodeSet uset = null; 1051 int startPos; 1052 ParsePosition pos = new ParsePosition(fScanIndex); 1053 int i; 1054 1055 startPos = fScanIndex; 1056 try { 1057 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1058 } catch (Exception e) { // TODO: catch fewer exception types. 1059 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1060 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1061 } 1062 1063 // Verify that the set contains at least one code point. 1064 // 1065 if (uset.isEmpty()) { 1066 // This set is empty. 1067 // Make it an error, because it almost certainly is not what the user wanted. 1068 // Also, avoids having to think about corner cases in the tree manipulation code 1069 // that occurs later on. 1070 // TODO: this shouldn't be an error; it does happen. 1071 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1072 } 1073 1074 // Advance the RBBI parse position over the UnicodeSet pattern. 1075 // Don't just set fScanIndex because the line/char positions maintained 1076 // for error reporting would be thrown off. 1077 i = pos.getIndex(); 1078 for (;;) { 1079 if (fNextIndex >= i) { 1080 break; 1081 } 1082 nextCharLL(); 1083 } 1084 1085 RBBINode n; 1086 1087 n = pushNewNode(RBBINode.setRef); 1088 n.fFirstPos = startPos; 1089 n.fLastPos = fNextIndex; 1090 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1091 // findSetFor() serves several purposes here: 1092 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1093 // - Maintains collection of all sets in use, needed later for establishing 1094 // character categories for run time engine. 1095 // - Eliminates multiple instances of the same set. 1096 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1097 findSetFor(n.fText, n, uset); 1098 } 1099 1100 /** 1101 * @return the number of rules that have been seen. 1102 */ numRules()1103 int numRules() { 1104 return fRuleNum; 1105 } 1106 } 1107