1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ********************************************************************** 6 * Copyright (c) 2001-2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ********************************************************************** 9 */ 10 package ohos.global.icu.text; 11 12 import java.text.ParsePosition; 13 import java.util.ArrayList; 14 import java.util.HashMap; 15 import java.util.List; 16 import java.util.Map; 17 18 import ohos.global.icu.impl.IllegalIcuArgumentException; 19 import ohos.global.icu.impl.PatternProps; 20 import ohos.global.icu.impl.Utility; 21 import ohos.global.icu.lang.UCharacter; 22 import ohos.global.icu.text.RuleBasedTransliterator.Data; 23 24 class TransliteratorParser { 25 26 //---------------------------------------------------------------------- 27 // Data members 28 //---------------------------------------------------------------------- 29 30 /** 31 * PUBLIC data member. 32 * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group 33 * of rules in the rule set 34 */ 35 public List<Data> dataVector; 36 37 /** 38 * PUBLIC data member. 39 * A Vector of Strings containing all of the ID blocks in the rule set 40 */ 41 public List<String> idBlockVector; 42 43 /** 44 * The current data object for which we are parsing rules 45 */ 46 private Data curData; 47 48 /** 49 * PUBLIC data member containing the parsed compound filter, if any. 50 */ 51 public UnicodeSet compoundFilter; 52 53 54 private int direction; 55 56 /** 57 * Temporary symbol table used during parsing. 58 */ 59 private ParseData parseData; 60 61 /** 62 * Temporary vector of set variables. When parsing is complete, this 63 * is copied into the array data.variables. As with data.variables, 64 * element 0 corresponds to character data.variablesBase. 
65 */ 66 private List<Object> variablesVector; 67 68 /** 69 * Temporary table of variable names. When parsing is complete, this is 70 * copied into data.variableNames. 71 */ 72 private Map<String, char[]> variableNames; 73 74 /** 75 * String of standins for segments. Used during the parsing of a single 76 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds 77 * to StringMatcher object segmentObjects.elementAt(0), etc. 78 */ 79 private StringBuffer segmentStandins; 80 81 /** 82 * Vector of StringMatcher objects for segments. Used during the 83 * parsing of a single rule. 84 * segmentStandins.charAt(0) is the standin for "$1" and corresponds 85 * to StringMatcher object segmentObjects.elementAt(0), etc. 86 */ 87 private List<StringMatcher> segmentObjects; 88 89 /** 90 * The next available stand-in for variables. This starts at some point in 91 * the private use area (discovered dynamically) and increments up toward 92 * <code>variableLimit</code>. At any point during parsing, available 93 * variables are <code>variableNext..variableLimit-1</code>. 94 */ 95 private char variableNext; 96 97 /** 98 * The last available stand-in for variables. This is discovered 99 * dynamically. At any point during parsing, available variables are 100 * <code>variableNext..variableLimit-1</code>. During variable definition 101 * we use the special value variableLimit-1 as a placeholder. 102 */ 103 private char variableLimit; 104 105 /** 106 * When we encounter an undefined variable, we do not immediately signal 107 * an error, in case we are defining this variable, e.g., "$a = [a-z];". 108 * Instead, we save the name of the undefined variable, and substitute 109 * in the placeholder char variableLimit - 1, and decrement 110 * variableLimit. 111 */ 112 private String undefinedVariableName; 113 114 /** 115 * The stand-in character for the 'dot' set, represented by '.' in 116 * patterns. This is allocated the first time it is needed, and 117 * reused thereafter. 
118 */ 119 private int dotStandIn = -1; 120 121 //---------------------------------------------------------------------- 122 // Constants 123 //---------------------------------------------------------------------- 124 125 // Indicator for ID blocks 126 private static final String ID_TOKEN = "::"; 127 private static final int ID_TOKEN_LEN = 2; 128 129 /* 130 (reserved for future expansion) 131 // markers for beginning and end of rule groups 132 private static final String BEGIN_TOKEN = "BEGIN"; 133 private static final String END_TOKEN = "END"; 134 */ 135 136 // Operators 137 private static final char VARIABLE_DEF_OP = '='; 138 private static final char FORWARD_RULE_OP = '>'; 139 private static final char REVERSE_RULE_OP = '<'; 140 private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op 141 142 private static final String OPERATORS = "=><\u2190\u2192\u2194"; 143 private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;"; 144 145 // Other special characters 146 private static final char QUOTE = '\''; 147 private static final char ESCAPE = '\\'; 148 private static final char END_OF_RULE = ';'; 149 private static final char RULE_COMMENT_CHAR = '#'; 150 151 private static final char CONTEXT_ANTE = '{'; // ante{key 152 private static final char CONTEXT_POST = '}'; // key}post 153 private static final char CURSOR_POS = '|'; 154 private static final char CURSOR_OFFSET = '@'; 155 private static final char ANCHOR_START = '^'; 156 157 private static final char KLEENE_STAR = '*'; 158 private static final char ONE_OR_MORE = '+'; 159 private static final char ZERO_OR_ONE = '?'; 160 161 private static final char DOT = '.'; 162 private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]"; 163 164 // By definition, the ANCHOR_END special character is a 165 // trailing SymbolTable.SYMBOL_REF character. 166 // private static final char ANCHOR_END = '$'; 167 168 // Segments of the input string are delimited by "(" and ")". 
In the 169 // output string these segments are referenced as "$1", "$2", etc. 170 private static final char SEGMENT_OPEN = '('; 171 private static final char SEGMENT_CLOSE = ')'; 172 173 // A function is denoted &Source-Target/Variant(text) 174 private static final char FUNCTION = '&'; 175 176 // Aliases for some of the syntax characters. These are provided so 177 // transliteration rules can be expressed in XML without clashing with 178 // XML syntax characters '<', '>', and '&'. 179 private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow 180 private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow 181 private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow 182 private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta) 183 184 // Special characters disallowed at the top level 185 private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]"); 186 187 // Special characters disallowed within a segment 188 private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]"); 189 190 // Special characters disallowed within a function argument 191 private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]"); 192 193 //---------------------------------------------------------------------- 194 // class ParseData 195 //---------------------------------------------------------------------- 196 197 /** 198 * This class implements the SymbolTable interface. It is used 199 * during parsing to give UnicodeSet access to variables that 200 * have been defined so far. Note that it uses variablesVector, 201 * _not_ data.variables. 202 */ 203 private class ParseData implements SymbolTable { 204 205 /** 206 * Implement SymbolTable API. 207 */ 208 @Override lookup(String name)209 public char[] lookup(String name) { 210 return variableNames.get(name); 211 } 212 213 /** 214 * Implement SymbolTable API. 
215 */ 216 @Override lookupMatcher(int ch)217 public UnicodeMatcher lookupMatcher(int ch) { 218 // Note that we cannot use data.lookup() because the 219 // set array has not been constructed yet. 220 int i = ch - curData.variablesBase; 221 if (i >= 0 && i < variablesVector.size()) { 222 return (UnicodeMatcher) variablesVector.get(i); 223 } 224 return null; 225 } 226 227 /** 228 * Implement SymbolTable API. Parse out a symbol reference 229 * name. 230 */ 231 @Override parseReference(String text, ParsePosition pos, int limit)232 public String parseReference(String text, ParsePosition pos, int limit) { 233 int start = pos.getIndex(); 234 int i = start; 235 while (i < limit) { 236 char c = text.charAt(i); 237 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) || 238 !UCharacter.isUnicodeIdentifierPart(c)) { 239 break; 240 } 241 ++i; 242 } 243 if (i == start) { // No valid name chars 244 return null; 245 } 246 pos.setIndex(i); 247 return text.substring(start, i); 248 } 249 250 /** 251 * Return true if the given character is a matcher standin or a plain 252 * character (non standin). 253 */ isMatcher(int ch)254 public boolean isMatcher(int ch) { 255 // Note that we cannot use data.lookup() because the 256 // set array has not been constructed yet. 257 int i = ch - curData.variablesBase; 258 if (i >= 0 && i < variablesVector.size()) { 259 return variablesVector.get(i) instanceof UnicodeMatcher; 260 } 261 return true; 262 } 263 264 /** 265 * Return true if the given character is a replacer standin or a plain 266 * character (non standin). 267 */ isReplacer(int ch)268 public boolean isReplacer(int ch) { 269 // Note that we cannot use data.lookup() because the 270 // set array has not been constructed yet. 
271 int i = ch - curData.variablesBase; 272 if (i >= 0 && i < variablesVector.size()) { 273 return variablesVector.get(i) instanceof UnicodeReplacer; 274 } 275 return true; 276 } 277 } 278 279 //---------------------------------------------------------------------- 280 // classes RuleBody, RuleArray, and RuleReader 281 //---------------------------------------------------------------------- 282 283 /** 284 * A private abstract class representing the interface to rule 285 * source code that is broken up into lines. Handles the 286 * folding of lines terminated by a backslash. This folding 287 * is limited; it does not account for comments, quotes, or 288 * escapes, so its use to be limited. 289 */ 290 private static abstract class RuleBody { 291 292 /** 293 * Retrieve the next line of the source, or return null if 294 * none. Folds lines terminated by a backslash into the 295 * next line, without regard for comments, quotes, or 296 * escapes. 297 */ nextLine()298 String nextLine() { 299 String s = handleNextLine(); 300 if (s != null && 301 s.length() > 0 && 302 s.charAt(s.length() - 1) == '\\') { 303 StringBuilder b = new StringBuilder(s); 304 do { 305 b.deleteCharAt(b.length()-1); 306 s = handleNextLine(); 307 if (s == null) { 308 break; 309 } 310 b.append(s); 311 } while (s.length() > 0 && 312 s.charAt(s.length() - 1) == '\\'); 313 s = b.toString(); 314 } 315 return s; 316 } 317 318 /** 319 * Reset to the first line of the source. 320 */ reset()321 abstract void reset(); 322 323 /** 324 * Subclass method to return the next line of the source. 325 */ handleNextLine()326 abstract String handleNextLine(); 327 } 328 329 /** 330 * RuleBody subclass for a String[] array. 331 */ 332 private static class RuleArray extends RuleBody { 333 String[] array; 334 int i; RuleArray(String[] array)335 public RuleArray(String[] array) { this.array = array; i = 0; } 336 @Override handleNextLine()337 public String handleNextLine() { 338 return (i < array.length) ? 
/**
 * A class representing one side of a rule.  This class knows how to
 * parse half of a rule.  It is tightly coupled to the method
 * TransliteratorParser.parseRule().
 *
 * After parse() returns, <code>text</code> holds the accumulated pattern
 * (literal characters plus stand-in characters), and the position fields
 * below are offsets into that text, or -1 if the corresponding marker was
 * not seen.
 */
private static class RuleHalf {

    // Pattern text accumulated by parse(); null until parse() has run.
    public String text;

    public int cursor = -1; // position of cursor in text
    public int ante = -1;   // position of ante context marker '{' in text
    public int post = -1;   // position of post context marker '}' in text

    // Record the offset to the cursor either to the left or to the
    // right of the key.  This is indicated by characters on the output
    // side that allow the cursor to be positioned arbitrarily within
    // the matching text.  For example, abc{def} > | @@@ xyz; changes
    // def to xyz and moves the cursor to before abc.  Offset characters
    // must be at the start or end, and they cannot move the cursor past
    // the ante- or postcontext text.  Placeholders are only valid in
    // output text.  The length of the ante and post context is
    // determined at runtime, because of supplementals and quantifiers.
    //
    // As maintained by parseSection(): counts down (-1, -2, ...) for each
    // '@' that follows a '|' at the very start of the text, and counts up
    // (1, 2, ...) for each '@' seen before any '|'.
    public int cursorOffset = 0; // only nonzero on output side

    // Position of first CURSOR_OFFSET on _right_.  This will be -1
    // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
    // NOTE(review): the values listed above match what parseSection()
    // stores in cursorOffset; what it stores here is buf.length() at the
    // first '@' seen before any '|'.  Confirm and reword.
    private int cursorOffsetPos = 0;

    public boolean anchorStart = false;
    public boolean anchorEnd   = false;

    /**
     * The segment number from 1..n of the next '(' we see
     * during parsing; 1-based.
     */
    private int nextSegmentNumber = 1;

    /**
     * Parse one side of a rule, stopping at either the limit,
     * the END_OF_RULE character, or an operator.  The parsed pattern
     * is stored in <code>text</code>.
     * @param rule the rule source text
     * @param pos index in rule at which to start parsing
     * @param limit index in rule at which parsing must stop
     * @param parser the owning parser, used for stand-in allocation,
     * variable lookup and range checks
     * @return the index after the terminating character, or
     * if limit was reached, limit
     */
    public int parse(String rule, int pos, int limit,
                     TransliteratorParser parser) {
        int start = pos;
        StringBuffer buf = new StringBuffer();
        pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
        text = buf.toString();

        // When '@' characters preceded the cursor (cursorOffset > 0), the
        // '|' must sit exactly where the first '@' was seen.
        if (cursorOffset > 0 && cursor != cursorOffsetPos) {
            syntaxError("Misplaced " + CURSOR_POS, rule, start);
        }

        return pos;
    }

    /**
     * Parse a section of one side of a rule, stopping at either
     * the limit, the END_OF_RULE character, an operator, or a
     * segment close character.  This method parses both a
     * top-level rule half and a segment within such a rule half.
     * It calls itself recursively to parse segments and nested
     * segments.
     * @param rule the rule source text
     * @param pos index in rule at which to start parsing
     * @param limit index in rule at which parsing must stop
     * @param parser the owning parser, used for stand-in allocation,
     * variable lookup and range checks
     * @param buf buffer into which to accumulate the rule pattern
     * characters, either literal characters from the rule or
     * standins for UnicodeMatcher objects including segments.
     * @param illegal the set of special characters that is illegal during
     * this parse.
     * @param isSegment if true, then we've already seen a '(' and
     * pos on entry points right after it.  Accumulate everything
     * up to the closing ')', put it in a segment matcher object,
     * generate a standin for it, and add the standin to buf.  As
     * a side effect, update the segments vector with a reference
     * to the segment matcher.  This works recursively for nested
     * segments.  If isSegment is false, just accumulate
     * characters into buf.
     * @return the index after the terminating character, or
     * if limit was reached, limit
     */
    private int parseSection(String rule, int pos, int limit,
                             TransliteratorParser parser,
                             StringBuffer buf,
                             UnicodeSet illegal,
                             boolean isSegment) {
        int start = pos;
        ParsePosition pp = null;
        int quoteStart = -1; // Most recent 'single quoted string'
        int quoteLimit = -1;
        int varStart = -1; // Most recent $variableReference
        int varLimit = -1;
        int[] iref = new int[1];
        // Length of buf on entry; used to detect a quantifier that
        // immediately follows the opening '(' of this section.
        int bufStart = buf.length();

    main:
        while (pos < limit) {
            // Since all syntax characters are in the BMP, fetching
            // 16-bit code units suffices here.
            char c = rule.charAt(pos++);
            if (PatternProps.isWhiteSpace(c)) {
                continue;
            }
            // HALF_ENDERS is all chars that end a rule half: "<>=;"
            // (and the arrow aliases listed in the constant)
            if (HALF_ENDERS.indexOf(c) >= 0) {
                ///CLOVER:OFF
                // isSegment is always false
                // NOTE(review): the comment above looks stale; this method
                // passes isSegment == true when recursing for '(' and for
                // function arguments.
                if (isSegment) {
                    syntaxError("Unclosed segment", rule, start);
                }
                ///CLOVER:ON
                break main;
            }
            if (anchorEnd) {
                // Text after a presumed end anchor is a syntax err
                syntaxError("Malformed variable reference", rule, start);
            }
            if (UnicodeSet.resemblesPattern(rule, pos-1)) {
                if (pp == null) {
                    pp = new ParsePosition(0);
                }
                pp.setIndex(pos-1); // Backup to opening '['
                buf.append(parser.parseSet(rule, pp));
                pos = pp.getIndex();
                continue;
            }
            // Handle escapes
            if (c == ESCAPE) {
                if (pos == limit) {
                    syntaxError("Trailing backslash", rule, start);
                }
                iref[0] = pos;
                int escaped = Utility.unescapeAt(rule, iref);
                pos = iref[0];
                if (escaped == -1) {
                    syntaxError("Malformed escape", rule, start);
                }
                parser.checkVariableRange(escaped, rule, start);
                UTF16.append(buf, escaped);
                continue;
            }
            // Handle quoted matter
            if (c == QUOTE) {
                int iq = rule.indexOf(QUOTE, pos);
                if (iq == pos) {
                    buf.append(c); // Parse [''] outside quotes as [']
                    ++pos;
                } else {
                    /* This loop picks up a run of quoted text of the
                     * form 'aaaa' each time through.  If this run
                     * hasn't really ended ('aaaa''bbbb') then it keeps
                     * looping, each time adding on a new run.  When it
                     * reaches the final quote it breaks.
                     */
                    quoteStart = buf.length();
                    for (;;) {
                        if (iq < 0) {
                            syntaxError("Unterminated quote", rule, start);
                        }
                        buf.append(rule.substring(pos, iq));
                        pos = iq+1;
                        if (pos < limit && rule.charAt(pos) == QUOTE) {
                            // Parse [''] inside quotes as [']
                            iq = rule.indexOf(QUOTE, pos+1);
                            // Continue looping
                        } else {
                            break;
                        }
                    }
                    quoteLimit = buf.length();

                    // Range-check every character that came from the quote
                    for (iq=quoteStart; iq<quoteLimit; ++iq) {
                        parser.checkVariableRange(buf.charAt(iq), rule, start);
                    }
                }
                continue;
            }

            parser.checkVariableRange(c, rule, start);

            if (illegal.contains(c)) {
                syntaxError("Illegal character '" + c + '\'', rule, start);
            }

            switch (c) {

            //------------------------------------------------------
            // Elements allowed within and out of segments
            //------------------------------------------------------
            case ANCHOR_START:
                // Only accepted once, and only while buf is still empty
                if (buf.length() == 0 && !anchorStart) {
                    anchorStart = true;
                } else {
                    syntaxError("Misplaced anchor start",
                                rule, start);
                }
                break;
            case SEGMENT_OPEN:
                {
                    // bufSegStart is the offset in buf to the first
                    // character of the segment we are parsing.
                    int bufSegStart = buf.length();

                    // Record segment number now, since nextSegmentNumber
                    // will be incremented during the call to parseSection
                    // if there are nested segments.
                    int segmentNumber = nextSegmentNumber++; // 1-based

                    // Parse the segment
                    pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);

                    // After parsing a segment, the relevant characters are
                    // in buf, starting at offset bufSegStart.  Extract them
                    // into a string matcher, and replace them with a
                    // standin for that matcher.
                    StringMatcher m =
                        new StringMatcher(buf.substring(bufSegStart),
                                          segmentNumber, parser.curData);

                    // Record and associate object and segment number
                    parser.setSegmentObject(segmentNumber, m);
                    buf.setLength(bufSegStart);
                    buf.append(parser.getSegmentStandin(segmentNumber));
                }
                break;
            case FUNCTION:
            case ALT_FUNCTION:
                {
                    iref[0] = pos;
                    TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
                    // The next character MUST be a segment open
                    if (single == null ||
                        !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
                        syntaxError("Invalid function", rule, start);
                    }

                    Transliterator t = single.getInstance();
                    if (t == null) {
                        syntaxError("Invalid function ID", rule, start);
                    }

                    // bufSegStart is the offset in buf to the first
                    // character of the segment we are parsing.
                    int bufSegStart = buf.length();

                    // Parse the segment
                    pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);

                    // After parsing a segment, the relevant characters are
                    // in buf, starting at offset bufSegStart.
                    FunctionReplacer r =
                        new FunctionReplacer(t,
                            new StringReplacer(buf.substring(bufSegStart), parser.curData));

                    // Replace the buffer contents with a stand-in
                    buf.setLength(bufSegStart);
                    buf.append(parser.generateStandInFor(r));
                }
                break;
            case SymbolTable.SYMBOL_REF:
                // Handle variable references and segment references "$1" .. "$9"
                {
                    // A variable reference must be followed immediately
                    // by a Unicode identifier start and zero or more
                    // Unicode identifier part characters, or by a digit
                    // 1..9 if it is a segment reference.
                    if (pos == limit) {
                        // A variable ref character at the end acts as
                        // an anchor to the context limit, as in perl.
                        anchorEnd = true;
                        break;
                    }
                    // Parse "$1" "$2" .. "$9" .. (no upper limit)
                    c = rule.charAt(pos);
                    int r = UCharacter.digit(c, 10);
                    if (r >= 1 && r <= 9) {
                        iref[0] = pos;
                        r = Utility.parseNumber(rule, iref, 10);
                        if (r < 0) {
                            syntaxError("Undefined segment reference",
                                        rule, start);
                        }
                        pos = iref[0];
                        buf.append(parser.getSegmentStandin(r));
                    } else {
                        if (pp == null) { // Lazy create
                            pp = new ParsePosition(0);
                        }
                        pp.setIndex(pos);
                        String name = parser.parseData.
                            parseReference(rule, pp, limit);
                        if (name == null) {
                            // This means the '$' was not followed by a
                            // valid name.  Try to interpret it as an
                            // end anchor then.  If this also doesn't work
                            // (if we see a following character) then signal
                            // an error.
                            anchorEnd = true;
                            break;
                        }
                        pos = pp.getIndex();
                        // If this is a variable definition statement,
                        // then the LHS variable will be undefined.  In
                        // that case appendVariableDef() will append the
                        // special placeholder char variableLimit-1.
                        varStart = buf.length();
                        parser.appendVariableDef(name, buf);
                        varLimit = buf.length();
                    }
                }
                break;
            case DOT:
                buf.append(parser.getDotStandIn());
                break;
            case KLEENE_STAR:
            case ONE_OR_MORE:
            case ZERO_OR_ONE:
                // Quantifiers.  We handle single characters, quoted strings,
                // variable references, and segments.
                //  a+      matches  aaa
                //  'foo'+  matches  foofoofoo
                //  $v+     matches  xyxyxy if $v == xy
                //  (seg)+  matches  segsegseg
                {
                    ///CLOVER:OFF
                    // isSegment is always false
                    // NOTE(review): as above, isSegment can be true here.
                    if (isSegment && buf.length() == bufStart) {
                        // The */+ immediately follows '('
                        syntaxError("Misplaced quantifier", rule, start);
                        break;
                    }
                    ///CLOVER:ON

                    // [qstart, qlimit) is the range of buf that the
                    // quantifier applies to.
                    int qstart, qlimit;
                    // The */+ follows an isolated character or quote
                    // or variable reference
                    if (buf.length() == quoteLimit) {
                        // The */+ follows a 'quoted string'
                        qstart = quoteStart;
                        qlimit = quoteLimit;
                    } else if (buf.length() == varLimit) {
                        // The */+ follows a $variableReference
                        qstart = varStart;
                        qlimit = varLimit;
                    } else {
                        // The */+ follows a single character, possibly
                        // a segment standin
                        qstart = buf.length() - 1;
                        qlimit = qstart + 1;
                    }

                    UnicodeMatcher m;
                    try {
                        m = new StringMatcher(buf.toString(), qstart, qlimit,
                                          0, parser.curData);
                    } catch (RuntimeException e) {
                        // Rethrow with up to 50 characters of rule text on
                        // each side of the current position for context.
                        final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
                        final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
                        throw new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
                                + postContext).initCause(e);
                    }
                    int min = 0;
                    int max = Quantifier.MAX;
                    switch (c) {
                    case ONE_OR_MORE:
                        min = 1;
                        break;
                    case ZERO_OR_ONE:
                        min = 0;
                        max = 1;
                        break;
                        // case KLEENE_STAR:
                        //    do nothing -- min, max already set
                    }
                    // Replace the quantified range in buf with a stand-in
                    // for the quantifier.
                    m = new Quantifier(m, min, max);
                    buf.setLength(qstart);
                    buf.append(parser.generateStandInFor(m));
                }
                break;

            //------------------------------------------------------
            // Elements allowed ONLY WITHIN segments
            //------------------------------------------------------
            case SEGMENT_CLOSE:
                // assert(isSegment);
                // We're done parsing a segment.
                break main;

            //------------------------------------------------------
            // Elements allowed ONLY OUTSIDE segments
            //------------------------------------------------------
            case CONTEXT_ANTE:
                if (ante >= 0) {
                    syntaxError("Multiple ante contexts", rule, start);
                }
                ante = buf.length();
                break;
            case CONTEXT_POST:
                if (post >= 0) {
                    syntaxError("Multiple post contexts", rule, start);
                }
                post = buf.length();
                break;
            case CURSOR_POS:
                if (cursor >= 0) {
                    syntaxError("Multiple cursors", rule, start);
                }
                cursor = buf.length();
                break;
            case CURSOR_OFFSET:
                if (cursorOffset < 0) {
                    // Already counting '@' after a leading '|': no
                    // pattern text may have been added in between.
                    if (buf.length() > 0) {
                        syntaxError("Misplaced " + c, rule, start);
                    }
                    --cursorOffset;
                } else if (cursorOffset > 0) {
                    // Already counting '@' before the '|': the run must be
                    // contiguous and the '|' must not have been seen yet.
                    if (buf.length() != cursorOffsetPos || cursor >= 0) {
                        syntaxError("Misplaced " + c, rule, start);
                    }
                    ++cursorOffset;
                } else {
                    // First '@' in this rule half
                    if (cursor == 0 && buf.length() == 0) {
                        cursorOffset = -1;
                    } else if (cursor < 0) {
                        cursorOffsetPos = buf.length();
                        cursorOffset = 1;
                    } else {
                        syntaxError("Misplaced " + c, rule, start);
                    }
                }
                break;

            //------------------------------------------------------
            // Non-special characters
            //------------------------------------------------------
            default:
                // Disallow unquoted characters other than [0-9A-Za-z]
                // in the printable ASCII range.  These characters are
                // reserved for possible future use.
                if (c >= 0x0021 && c <= 0x007E &&
                    !((c >= '0' && c <= '9') ||
                      (c >= 'A' && c <= 'Z') ||
                      (c >= 'a' && c <= 'z'))) {
                    syntaxError("Unquoted " + c, rule, start);
                }
                buf.append(c);
                break;
            }
        }
        return pos;
    }

    /**
     * Remove context.  Trims <code>text</code> to the span between the
     * ante marker (or the start of text if none) and the post marker (or
     * the end of text if none), then clears the ante/post positions and
     * both anchor flags.
     */
    void removeContext() {
        text = text.substring(ante < 0 ? 0 : ante,
                              post < 0 ? text.length() : post);
        ante = post = -1;
        anchorStart = anchorEnd = false;
    }

    /**
     * Return true if this half looks like valid output, that is, does not
     * contain quantifiers or other special input-only elements.
     * Every code point of <code>text</code> must pass
     * parser.parseData.isReplacer().
     */
    public boolean isValidOutput(TransliteratorParser parser) {
        for (int i=0; i<text.length(); ) {
            int c = UTF16.charAt(text, i);
            i += UTF16.getCharCount(c);
            if (!parser.parseData.isReplacer(c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Return true if this half looks like valid input, that is, does not
     * contain functions or other special output-only elements.
     * Every code point of <code>text</code> must pass
     * parser.parseData.isMatcher().
     */
    public boolean isValidInput(TransliteratorParser parser) {
        for (int i=0; i<text.length(); ) {
            int c = UTF16.charAt(text, i);
            i += UTF16.getCharCount(c);
            if (!parser.parseData.isMatcher(c)) {
                return false;
            }
        }
        return true;
    }
}
865 */ TransliteratorParser()866 public TransliteratorParser() { 867 } 868 869 /** 870 * Parse a set of rules. After the parse completes, examine the public 871 * data members for results. 872 */ parse(String rules, int dir)873 public void parse(String rules, int dir) { 874 parseRules(new RuleArray(new String[] { rules }), dir); 875 } 876 877 /* 878 * Parse a set of rules. After the parse completes, examine the public 879 * data members for results. 880 */ 881 /* public void parse(ResourceReader rules, int direction) { 882 parseRules(new RuleReader(rules), direction); 883 }*/ 884 885 //---------------------------------------------------------------------- 886 // PRIVATE methods 887 //---------------------------------------------------------------------- 888 889 /** 890 * Parse an array of zero or more rules. The strings in the array are 891 * treated as if they were concatenated together, with rule terminators 892 * inserted between array elements if not present already. 893 * 894 * Any previous rules are discarded. Typically this method is called exactly 895 * once, during construction. 896 * 897 * The member this.data will be set to null if there are no rules. 898 * 899 * @exception IllegalIcuArgumentException if there is a syntax error in the 900 * rules 901 */ parseRules(RuleBody ruleArray, int dir)902 void parseRules(RuleBody ruleArray, int dir) { 903 boolean parsingIDs = true; 904 int ruleCount = 0; 905 906 dataVector = new ArrayList<Data>(); 907 idBlockVector = new ArrayList<String>(); 908 curData = null; 909 direction = dir; 910 compoundFilter = null; 911 variablesVector = new ArrayList<Object>(); 912 variableNames = new HashMap<String, char[]>(); 913 parseData = new ParseData(); 914 915 List<RuntimeException> errors = new ArrayList<RuntimeException>(); 916 int errorCount = 0; 917 918 ruleArray.reset(); 919 920 StringBuilder idBlockResult = new StringBuilder(); 921 922 // The compound filter offset is an index into idBlockResult. 
923 // If it is 0, then the compound filter occurred at the start, 924 // and it is the offset to the _start_ of the compound filter 925 // pattern. Otherwise it is the offset to the _limit_ of the 926 // compound filter pattern within idBlockResult. 927 this.compoundFilter = null; 928 int compoundFilterOffset = -1; 929 930 main: 931 for (;;) { 932 String rule = ruleArray.nextLine(); 933 if (rule == null) { 934 break; 935 } 936 int pos = 0; 937 int limit = rule.length(); 938 while (pos < limit) { 939 char c = rule.charAt(pos++); 940 if (PatternProps.isWhiteSpace(c)) { 941 continue; 942 } 943 // Skip lines starting with the comment character 944 if (c == RULE_COMMENT_CHAR) { 945 pos = rule.indexOf("\n", pos) + 1; 946 if (pos == 0) { 947 break; // No "\n" found; rest of rule is a commnet 948 } 949 continue; // Either fall out or restart with next line 950 } 951 952 // skip empty rules 953 if (c == END_OF_RULE) 954 continue; 955 956 // Often a rule file contains multiple errors. It's 957 // convenient to the rule author if these are all reported 958 // at once. We keep parsing rules even after a failure, up 959 // to a specified limit, and report all errors at once. 960 try { 961 ++ruleCount; 962 963 // We've found the start of a rule or ID. c is its first 964 // character, and pos points past c. 965 --pos; 966 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 967 // chars left. 
968 if ((pos + ID_TOKEN_LEN + 1) <= limit && 969 rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) { 970 pos += ID_TOKEN_LEN; 971 c = rule.charAt(pos); 972 while (PatternProps.isWhiteSpace(c) && pos < limit) { 973 ++pos; 974 c = rule.charAt(pos); 975 } 976 int[] p = new int[] { pos }; 977 978 if (!parsingIDs) { 979 if (curData != null) { 980 if (direction == Transliterator.FORWARD) 981 dataVector.add(curData); 982 else 983 dataVector.add(0, curData); 984 curData = null; 985 } 986 parsingIDs = true; 987 } 988 989 TransliteratorIDParser.SingleID id = 990 TransliteratorIDParser.parseSingleID( 991 rule, p, direction); 992 if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) { 993 // Successful ::ID parse. 994 995 if (direction == Transliterator.FORWARD) { 996 idBlockResult.append(id.canonID).append(END_OF_RULE); 997 } else { 998 idBlockResult.insert(0, id.canonID + END_OF_RULE); 999 } 1000 1001 } else { 1002 // Couldn't parse an ID. Try to parse a global filter 1003 int[] withParens = new int[] { -1 }; 1004 UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null); 1005 if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) { 1006 if ((direction == Transliterator.FORWARD) == 1007 (withParens[0] == 0)) { 1008 if (compoundFilter != null) { 1009 // Multiple compound filters 1010 syntaxError("Multiple global filters", rule, pos); 1011 } 1012 compoundFilter = f; 1013 compoundFilterOffset = ruleCount; 1014 } 1015 } else { 1016 // Invalid ::id 1017 // Can be parsed as neither an ID nor a global filter 1018 syntaxError("Invalid ::ID", rule, pos); 1019 } 1020 } 1021 1022 pos = p[0]; 1023 } else { 1024 if (parsingIDs) { 1025 if (direction == Transliterator.FORWARD) 1026 idBlockVector.add(idBlockResult.toString()); 1027 else 1028 idBlockVector.add(0, idBlockResult.toString()); 1029 idBlockResult.delete(0, idBlockResult.length()); 1030 parsingIDs = false; 1031 curData = new RuleBasedTransliterator.Data(); 1032 1033 // By default, 
rules use part of the private use area 1034 // E000..F8FF for variables and other stand-ins. Currently 1035 // the range F000..F8FF is typically sufficient. The 'use 1036 // variable range' pragma allows rule sets to modify this. 1037 setVariableRange(0xF000, 0xF8FF); 1038 } 1039 1040 if (resemblesPragma(rule, pos, limit)) { 1041 int ppp = parsePragma(rule, pos, limit); 1042 if (ppp < 0) { 1043 syntaxError("Unrecognized pragma", rule, pos); 1044 } 1045 pos = ppp; 1046 // Parse a rule 1047 } else { 1048 pos = parseRule(rule, pos, limit); 1049 } 1050 } 1051 } catch (IllegalArgumentException e) { 1052 if (errorCount == 30) { 1053 IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched"); 1054 icuEx.initCause(e); 1055 errors.add(icuEx); 1056 break main; 1057 } 1058 e.fillInStackTrace(); 1059 errors.add(e); 1060 ++errorCount; 1061 pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';' 1062 } 1063 } 1064 } 1065 if (parsingIDs && idBlockResult.length() > 0) { 1066 if (direction == Transliterator.FORWARD) 1067 idBlockVector.add(idBlockResult.toString()); 1068 else 1069 idBlockVector.add(0, idBlockResult.toString()); 1070 } 1071 else if (!parsingIDs && curData != null) { 1072 if (direction == Transliterator.FORWARD) 1073 dataVector.add(curData); 1074 else 1075 dataVector.add(0, curData); 1076 } 1077 1078 // Convert the set vector to an array 1079 for (int i = 0; i < dataVector.size(); i++) { 1080 Data data = dataVector.get(i); 1081 data.variables = new Object[variablesVector.size()]; 1082 variablesVector.toArray(data.variables); 1083 data.variableNames = new HashMap<String, char[]>(); 1084 data.variableNames.putAll(variableNames); 1085 } 1086 variablesVector = null; 1087 1088 // Do more syntax checking and index the rules 1089 try { 1090 if (compoundFilter != null) { 1091 if ((direction == Transliterator.FORWARD && 1092 compoundFilterOffset != 1) || 1093 (direction == Transliterator.REVERSE && 1094 
compoundFilterOffset != ruleCount)) { 1095 throw new IllegalIcuArgumentException("Compound filters misplaced"); 1096 } 1097 } 1098 1099 for (int i = 0; i < dataVector.size(); i++) { 1100 Data data = dataVector.get(i); 1101 data.ruleSet.freeze(); 1102 } 1103 1104 if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0) 1105 idBlockVector.remove(0); 1106 1107 } catch (IllegalArgumentException e) { 1108 e.fillInStackTrace(); 1109 errors.add(e); 1110 } 1111 1112 if (errors.size() != 0) { 1113 for (int i = errors.size()-1; i > 0; --i) { 1114 RuntimeException previous = errors.get(i-1); 1115 while (previous.getCause() != null) { 1116 previous = (RuntimeException) previous.getCause(); // chain specially 1117 } 1118 previous.initCause(errors.get(i)); 1119 } 1120 throw errors.get(0); 1121 // if initCause not supported: throw new IllegalArgumentException(errors.toString()); 1122 } 1123 } 1124 1125 /** 1126 * MAIN PARSER. Parse the next rule in the given rule string, starting 1127 * at pos. Return the index after the last character parsed. Do not 1128 * parse characters at or after limit. 1129 * 1130 * Important: The character at pos must be a non-whitespace character 1131 * that is not the comment character. 1132 * 1133 * This method handles quoting, escaping, and whitespace removal. It 1134 * parses the end-of-rule character. It recognizes context and cursor 1135 * indicators. Once it does a lexical breakdown of the rule at pos, it 1136 * creates a rule object and adds it to our rule list. 1137 * 1138 * This method is tightly coupled to the inner class RuleHalf. 
*
     * @param rule  the rule text being parsed
     * @param pos   index at which parsing of this rule starts
     * @param limit index at or after which no characters are parsed
     * @return the index after the last character parsed
     * @throws IllegalIcuArgumentException (raised through syntaxError) when
     *         the rule is malformed
     */
    private int parseRule(String rule, int pos, int limit) {
        // Locate the left side, operator, and right side
        // 'start' is kept only so that error messages can quote the rule
        // from its beginning.
        int start = pos;
        char operator = 0;

        // Set up segments data; both containers are rebuilt for every rule.
        segmentStandins = new StringBuffer();
        segmentObjects = new ArrayList<StringMatcher>();

        RuleHalf left = new RuleHalf();
        RuleHalf right = new RuleHalf();

        // Reset before parsing; appendVariableDef() records at most one
        // undefined variable name per rule here.
        undefinedVariableName = null;
        pos = left.parse(rule, pos, limit, this);

        // The character just before the position returned by left.parse()
        // is taken as the candidate operator; --pos steps back onto it and
        // the ++pos below steps past it again.
        if (pos == limit ||
            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
            syntaxError("No operator pos=" + pos, rule, start);
        }
        ++pos;

        // Found an operator char.  Check for forward-reverse operator.
        if (operator == REVERSE_RULE_OP &&
            (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
            ++pos;
            operator = FWDREV_RULE_OP;
        }

        // Translate alternate op characters.
        switch (operator) {
        case ALT_FORWARD_RULE_OP:
            operator = FORWARD_RULE_OP;
            break;
        case ALT_REVERSE_RULE_OP:
            operator = REVERSE_RULE_OP;
            break;
        case ALT_FWDREV_RULE_OP:
            operator = FWDREV_RULE_OP;
            break;
        }

        pos = right.parse(rule, pos, limit, this);

        // Unless the right half ran to the limit, the character just before
        // the returned position must be the end-of-rule marker.
        if (pos < limit) {
            if (rule.charAt(--pos) == END_OF_RULE) {
                ++pos;
            } else {
                // RuleHalf parser must have terminated at an operator
                syntaxError("Unquoted operator", rule, start);
            }
        }

        if (operator == VARIABLE_DEF_OP) {
            // LHS is the name.  RHS is a single character, either a literal
            // or a set (already parsed).  If RHS is longer than one
            // character, it is either a multi-character string, or multiple
            // sets, or a mixture of chars and sets -- syntax error.

            // We expect to see a single undefined variable (the one being
            // defined).
            if (undefinedVariableName == null) {
                syntaxError("Missing '$' or duplicate definition", rule, start);
            }
            // The LHS must consist solely of the placeholder that
            // appendVariableDef() appended via --variableLimit.
            if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
                syntaxError("Malformed LHS", rule, start);
            }
            if (left.anchorStart || left.anchorEnd ||
                right.anchorStart || right.anchorEnd) {
                syntaxError("Malformed variable def", rule, start);
            }
            // We allow anything on the right, including an empty string.
            int n = right.text.length();
            char[] value = new char[n];
            right.text.getChars(0, n, value, 0);
            variableNames.put(undefinedVariableName, value);

            // Give back the placeholder slot taken by appendVariableDef().
            // NOTE(review): the syntaxError() exits above leave
            // variableLimit decremented -- confirm that is acceptable.
            ++variableLimit;
            return pos;
        }

        // If this is not a variable definition rule, we shouldn't have
        // any undefined variable names.
        if (undefinedVariableName != null) {
            syntaxError("Undefined variable $" + undefinedVariableName,
                        rule, start);
        }

        // Verify segments
        if (segmentStandins.length() > segmentObjects.size()) {
            syntaxError("Undefined segment reference", rule, start);
        }
        for (int i=0; i<segmentStandins.length(); ++i) {
            if (segmentStandins.charAt(i) == 0) {
                syntaxError("Internal error", rule, start); // will never happen
            }
        }
        for (int i=0; i<segmentObjects.size(); ++i) {
            if (segmentObjects.get(i) == null) {
                syntaxError("Internal error", rule, start); // will never happen
            }
        }

        // If the direction we want doesn't match the rule
        // direction, do nothing.
        if (operator != FWDREV_RULE_OP &&
            ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
            return pos;
        }

        // Transform the rule into a forward rule by swapping the
        // sides if necessary.
        if (direction == Transliterator.REVERSE) {
            RuleHalf temp = left;
            left = right;
            right = temp;
        }

        // Remove non-applicable elements in forward-reverse
        // rules.  Bidirectional rules ignore elements that do not
        // apply.
        if (operator == FWDREV_RULE_OP) {
            right.removeContext();
            left.cursor = -1;
            left.cursorOffset = 0;
        }

        // Normalize context
        if (left.ante < 0) {
            left.ante = 0;
        }
        if (left.post < 0) {
            left.post = left.text.length();
        }

        // Context is only allowed on the input side.  Cursors are only
        // allowed on the output side.  Segment delimiters can only appear
        // on the left, and references on the right.  Cursor offset
        // cannot appear without an explicit cursor.  Cursor offset
        // cannot place the cursor outside the limits of the context.
        // Anchors are only allowed on the input side.
        if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
            (right.cursorOffset != 0 && right.cursor < 0) ||
            // - The following two checks were used to ensure that the
            // - the cursor offset stayed within the ante- or postcontext.
            // - However, with the addition of quantifiers, we have to
            // - allow arbitrary cursor offsets and do runtime checking.
            //(right.cursorOffset > (left.text.length() - left.post)) ||
            //(-right.cursorOffset > left.ante) ||
            right.anchorStart || right.anchorEnd ||
            !left.isValidInput(this) || !right.isValidOutput(this) ||
            left.ante > left.post) {
            syntaxError("Malformed rule", rule, start);
        }

        // Flatten segment objects vector to an array
        // (left null when the rule defines no segments).
        UnicodeMatcher[] segmentsArray = null;
        if (segmentObjects.size() > 0) {
            segmentsArray = new UnicodeMatcher[segmentObjects.size()];
            segmentObjects.toArray(segmentsArray);
        }

        curData.ruleSet.addRule(new TransliterationRule(
                left.text, left.ante, left.post,
                right.text, right.cursor, right.cursorOffset,
                segmentsArray,
                left.anchorStart, left.anchorEnd,
                curData));

        return pos;
    }

    /**
     * Set the variable range to [start, end] (inclusive).
1313 */ setVariableRange(int start, int end)1314 private void setVariableRange(int start, int end) { 1315 if (start > end || start < 0 || end > 0xFFFF) { 1316 throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end); 1317 } 1318 1319 curData.variablesBase = (char) start; // first private use 1320 1321 if (dataVector.size() == 0) { 1322 variableNext = (char) start; 1323 variableLimit = (char) (end + 1); 1324 } 1325 } 1326 1327 /** 1328 * Assert that the given character is NOT within the variable range. 1329 * If it is, signal an error. This is neccesary to ensure that the 1330 * variable range does not overlap characters used in a rule. 1331 */ checkVariableRange(int ch, String rule, int start)1332 private void checkVariableRange(int ch, String rule, int start) { 1333 if (ch >= curData.variablesBase && ch < variableLimit) { 1334 syntaxError("Variable range character in rule", rule, start); 1335 } 1336 } 1337 1338 // (The following method is part of an unimplemented feature. 1339 // Remove this clover pragma after the feature is implemented. 1340 // 2003-06-11 ICU 2.6 Alan) 1341 ///CLOVER:OFF 1342 /** 1343 * Set the maximum backup to 'backup', in response to a pragma 1344 * statement. 1345 */ pragmaMaximumBackup(int backup)1346 private void pragmaMaximumBackup(int backup) { 1347 //TODO Finish 1348 throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet"); 1349 } 1350 ///CLOVER:ON 1351 1352 // (The following method is part of an unimplemented feature. 1353 // Remove this clover pragma after the feature is implemented. 1354 // 2003-06-11 ICU 2.6 Alan) 1355 ///CLOVER:OFF 1356 /** 1357 * Begin normalizing all rules using the given mode, in response 1358 * to a pragma statement. 
1359 */ pragmaNormalizeRules(Normalizer.Mode mode)1360 private void pragmaNormalizeRules(Normalizer.Mode mode) { 1361 //TODO Finish 1362 throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet"); 1363 } 1364 ///CLOVER:ON 1365 1366 /** 1367 * Return true if the given rule looks like a pragma. 1368 * @param pos offset to the first non-whitespace character 1369 * of the rule. 1370 * @param limit pointer past the last character of the rule. 1371 */ resemblesPragma(String rule, int pos, int limit)1372 static boolean resemblesPragma(String rule, int pos, int limit) { 1373 // Must start with /use\s/i 1374 return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0; 1375 } 1376 1377 /** 1378 * Parse a pragma. This method assumes resemblesPragma() has 1379 * already returned true. 1380 * @param pos offset to the first non-whitespace character 1381 * of the rule. 1382 * @param limit pointer past the last character of the rule. 1383 * @return the position index after the final ';' of the pragma, 1384 * or -1 on failure. 
1385 */ parsePragma(String rule, int pos, int limit)1386 private int parsePragma(String rule, int pos, int limit) { 1387 int[] array = new int[2]; 1388 1389 // resemblesPragma() has already returned true, so we 1390 // know that pos points to /use\s/i; we can skip 4 characters 1391 // immediately 1392 pos += 4; 1393 1394 // Here are the pragmas we recognize: 1395 // use variable range 0xE000 0xEFFF; 1396 // use maximum backup 16; 1397 // use nfd rules; 1398 int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array); 1399 if (p >= 0) { 1400 setVariableRange(array[0], array[1]); 1401 return p; 1402 } 1403 1404 p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array); 1405 if (p >= 0) { 1406 pragmaMaximumBackup(array[0]); 1407 return p; 1408 } 1409 1410 p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null); 1411 if (p >= 0) { 1412 pragmaNormalizeRules(Normalizer.NFD); 1413 return p; 1414 } 1415 1416 p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null); 1417 if (p >= 0) { 1418 pragmaNormalizeRules(Normalizer.NFC); 1419 return p; 1420 } 1421 1422 // Syntax error: unable to parse pragma 1423 return -1; 1424 } 1425 1426 /** 1427 * Throw an exception indicating a syntax error. Search the rule string 1428 * for the probable end of the rule. Of course, if the error is that 1429 * the end of rule marker is missing, then the rule end will not be found. 1430 * In any case the rule start will be correctly reported. 
1431 * @param msg error description 1432 * @param rule pattern string 1433 * @param start position of first character of current rule 1434 */ syntaxError(String msg, String rule, int start)1435 static final void syntaxError(String msg, String rule, int start) { 1436 int end = ruleEnd(rule, start, rule.length()); 1437 throw new IllegalIcuArgumentException(msg + " in \"" + 1438 Utility.escape(rule.substring(start, end)) + '"'); 1439 } 1440 ruleEnd(String rule, int start, int limit)1441 static final int ruleEnd(String rule, int start, int limit) { 1442 int end = Utility.quotedIndexOf(rule, start, limit, ";"); 1443 if (end < 0) { 1444 end = limit; 1445 } 1446 return end; 1447 } 1448 1449 /** 1450 * Parse a UnicodeSet out, store it, and return the stand-in character 1451 * used to represent it. 1452 */ parseSet(String rule, ParsePosition pos)1453 private final char parseSet(String rule, ParsePosition pos) { 1454 UnicodeSet set = new UnicodeSet(rule, pos, parseData); 1455 if (variableNext >= variableLimit) { 1456 throw new RuntimeException("Private use variables exhausted"); 1457 } 1458 set.compact(); 1459 return generateStandInFor(set); 1460 } 1461 1462 /** 1463 * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer. 1464 * Store the object. 1465 */ generateStandInFor(Object obj)1466 char generateStandInFor(Object obj) { 1467 // assert(obj != null); 1468 1469 // Look up previous stand-in, if any. This is a short list 1470 // (typical n is 0, 1, or 2); linear search is optimal. 1471 for (int i=0; i<variablesVector.size(); ++i) { 1472 if (variablesVector.get(i) == obj) { // [sic] pointer comparison 1473 return (char) (curData.variablesBase + i); 1474 } 1475 } 1476 1477 if (variableNext >= variableLimit) { 1478 throw new RuntimeException("Variable range exhausted"); 1479 } 1480 variablesVector.add(obj); 1481 return variableNext++; 1482 } 1483 1484 /** 1485 * Return the standin for segment seg (1-based). 
1486 */ getSegmentStandin(int seg)1487 public char getSegmentStandin(int seg) { 1488 if (segmentStandins.length() < seg) { 1489 segmentStandins.setLength(seg); 1490 } 1491 char c = segmentStandins.charAt(seg-1); 1492 if (c == 0) { 1493 if (variableNext >= variableLimit) { 1494 throw new RuntimeException("Variable range exhausted"); 1495 } 1496 c = variableNext++; 1497 // Set a placeholder in the master variables vector that will be 1498 // filled in later by setSegmentObject(). We know that we will get 1499 // called first because setSegmentObject() will call us. 1500 variablesVector.add(null); 1501 segmentStandins.setCharAt(seg-1, c); 1502 } 1503 return c; 1504 } 1505 1506 /** 1507 * Set the object for segment seg (1-based). 1508 */ setSegmentObject(int seg, StringMatcher obj)1509 public void setSegmentObject(int seg, StringMatcher obj) { 1510 // Since we call parseSection() recursively, nested 1511 // segments will result in segment i+1 getting parsed 1512 // and stored before segment i; be careful with the 1513 // vector handling here. 1514 while (segmentObjects.size() < seg) { 1515 segmentObjects.add(null); 1516 } 1517 int index = getSegmentStandin(seg) - curData.variablesBase; 1518 if (segmentObjects.get(seg-1) != null || 1519 variablesVector.get(index) != null) { 1520 throw new RuntimeException(); // should never happen 1521 } 1522 segmentObjects.set(seg-1, obj); 1523 variablesVector.set(index, obj); 1524 } 1525 1526 /** 1527 * Return the stand-in for the dot set. It is allocated the first 1528 * time and reused thereafter. 1529 */ getDotStandIn()1530 char getDotStandIn() { 1531 if (dotStandIn == -1) { 1532 dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); 1533 } 1534 return (char) dotStandIn; 1535 } 1536 1537 /** 1538 * Append the value of the given variable name to the given 1539 * StringBuffer. 1540 * @exception IllegalIcuArgumentException if the name is unknown. 
*/
    private void appendVariableDef(String name, StringBuffer buf) {
        char[] definition = variableNames.get(name);
        if (definition != null) {
            buf.append(definition);
            return;
        }

        // We allow one undefined variable so that variable definition
        // statements work.  A second undefined name within the same rule
        // is an error.
        if (undefinedVariableName != null) {
            throw new IllegalIcuArgumentException("Undefined variable $" + name);
        }

        // First undefined variable: remember its name (before the
        // exhaustion check) and substitute the special placeholder
        // variableLimit-1, shrinking the range by one.
        undefinedVariableName = name;
        if (variableNext >= variableLimit) {
            throw new RuntimeException("Private use variables exhausted");
        }
        buf.append(--variableLimit);
    }
}

//eof