1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ********************************************************************** 5 * Copyright (c) 2001-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 package com.ibm.icu.text; 10 11 import java.text.ParsePosition; 12 import java.util.ArrayList; 13 import java.util.HashMap; 14 import java.util.List; 15 import java.util.Map; 16 17 import com.ibm.icu.impl.IllegalIcuArgumentException; 18 import com.ibm.icu.impl.PatternProps; 19 import com.ibm.icu.impl.Utility; 20 import com.ibm.icu.lang.UCharacter; 21 import com.ibm.icu.text.RuleBasedTransliterator.Data; 22 23 class TransliteratorParser { 24 25 //---------------------------------------------------------------------- 26 // Data members 27 //---------------------------------------------------------------------- 28 29 /** 30 * PUBLIC data member. 31 * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group 32 * of rules in the rule set 33 */ 34 public List<Data> dataVector; 35 36 /** 37 * PUBLIC data member. 38 * A Vector of Strings containing all of the ID blocks in the rule set 39 */ 40 public List<String> idBlockVector; 41 42 /** 43 * The current data object for which we are parsing rules 44 */ 45 private Data curData; 46 47 /** 48 * PUBLIC data member containing the parsed compound filter, if any. 49 */ 50 public UnicodeSet compoundFilter; 51 52 53 private int direction; 54 55 /** 56 * Temporary symbol table used during parsing. 57 */ 58 private ParseData parseData; 59 60 /** 61 * Temporary vector of set variables. When parsing is complete, this 62 * is copied into the array data.variables. As with data.variables, 63 * element 0 corresponds to character data.variablesBase. 64 */ 65 private List<Object> variablesVector; 66 67 /** 68 * Temporary table of variable names. When parsing is complete, this is 69 * copied into data.variableNames. 70 */ 71 private Map<String, char[]> variableNames; 72 73 /** 74 * String of standins for segments. Used during the parsing of a single 75 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds 76 * to StringMatcher object segmentObjects.elementAt(0), etc. 77 */ 78 private StringBuffer segmentStandins; 79 80 /** 81 * Vector of StringMatcher objects for segments. Used during the 82 * parsing of a single rule. 83 * segmentStandins.charAt(0) is the standin for "$1" and corresponds 84 * to StringMatcher object segmentObjects.elementAt(0), etc. 85 */ 86 private List<StringMatcher> segmentObjects; 87 88 /** 89 * The next available stand-in for variables. This starts at some point in 90 * the private use area (discovered dynamically) and increments up toward 91 * <code>variableLimit</code>. At any point during parsing, available 92 * variables are <code>variableNext..variableLimit-1</code>. 93 */ 94 private char variableNext; 95 96 /** 97 * The last available stand-in for variables. This is discovered 98 * dynamically. At any point during parsing, available variables are 99 * <code>variableNext..variableLimit-1</code>. During variable definition 100 * we use the special value variableLimit-1 as a placeholder. 101 */ 102 private char variableLimit; 103 104 /** 105 * When we encounter an undefined variable, we do not immediately signal 106 * an error, in case we are defining this variable, e.g., "$a = [a-z];". 107 * Instead, we save the name of the undefined variable, and substitute 108 * in the placeholder char variableLimit - 1, and decrement 109 * variableLimit. 110 */ 111 private String undefinedVariableName; 112 113 /** 114 * The stand-in character for the 'dot' set, represented by '.' in 115 * patterns. This is allocated the first time it is needed, and 116 * reused thereafter. 117 */ 118 private int dotStandIn = -1; 119 120 //---------------------------------------------------------------------- 121 // Constants 122 //---------------------------------------------------------------------- 123 124 // Indicator for ID blocks 125 private static final String ID_TOKEN = "::"; 126 private static final int ID_TOKEN_LEN = 2; 127 128 /* 129 (reserved for future expansion) 130 // markers for beginning and end of rule groups 131 private static final String BEGIN_TOKEN = "BEGIN"; 132 private static final String END_TOKEN = "END"; 133 */ 134 135 // Operators 136 private static final char VARIABLE_DEF_OP = '='; 137 private static final char FORWARD_RULE_OP = '>'; 138 private static final char REVERSE_RULE_OP = '<'; 139 private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op 140 141 private static final String OPERATORS = "=><\u2190\u2192\u2194"; 142 private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;"; 143 144 // Other special characters 145 private static final char QUOTE = '\''; 146 private static final char ESCAPE = '\\'; 147 private static final char END_OF_RULE = ';'; 148 private static final char RULE_COMMENT_CHAR = '#'; 149 150 private static final char CONTEXT_ANTE = '{'; // ante{key 151 private static final char CONTEXT_POST = '}'; // key}post 152 private static final char CURSOR_POS = '|'; 153 private static final char CURSOR_OFFSET = '@'; 154 private static final char ANCHOR_START = '^'; 155 156 private static final char KLEENE_STAR = '*'; 157 private static final char ONE_OR_MORE = '+'; 158 private static final char ZERO_OR_ONE = '?'; 159 160 private static final char DOT = '.'; 161 private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]"; 162 163 // By definition, the ANCHOR_END special character is a 164 // trailing SymbolTable.SYMBOL_REF character. 165 // private static final char ANCHOR_END = '$'; 166 167 // Segments of the input string are delimited by "(" and ")". In the 168 // output string these segments are referenced as "$1", "$2", etc. 169 private static final char SEGMENT_OPEN = '('; 170 private static final char SEGMENT_CLOSE = ')'; 171 172 // A function is denoted &Source-Target/Variant(text) 173 private static final char FUNCTION = '&'; 174 175 // Aliases for some of the syntax characters. These are provided so 176 // transliteration rules can be expressed in XML without clashing with 177 // XML syntax characters '<', '>', and '&'. 178 private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow 179 private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow 180 private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow 181 private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta) 182 183 // Special characters disallowed at the top level 184 private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]"); 185 186 // Special characters disallowed within a segment 187 private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]"); 188 189 // Special characters disallowed within a function argument 190 private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]"); 191 192 //---------------------------------------------------------------------- 193 // class ParseData 194 //---------------------------------------------------------------------- 195 196 /** 197 * This class implements the SymbolTable interface. It is used 198 * during parsing to give UnicodeSet access to variables that 199 * have been defined so far. Note that it uses variablesVector, 200 * _not_ data.variables. 201 */ 202 private class ParseData implements SymbolTable { 203 204 /** 205 * Implement SymbolTable API. 206 */ 207 @Override lookup(String name)208 public char[] lookup(String name) { 209 return variableNames.get(name); 210 } 211 212 /** 213 * Implement SymbolTable API. 214 */ 215 @Override lookupMatcher(int ch)216 public UnicodeMatcher lookupMatcher(int ch) { 217 // Note that we cannot use data.lookup() because the 218 // set array has not been constructed yet. 219 int i = ch - curData.variablesBase; 220 if (i >= 0 && i < variablesVector.size()) { 221 return (UnicodeMatcher) variablesVector.get(i); 222 } 223 return null; 224 } 225 226 /** 227 * Implement SymbolTable API. Parse out a symbol reference 228 * name. 229 */ 230 @Override parseReference(String text, ParsePosition pos, int limit)231 public String parseReference(String text, ParsePosition pos, int limit) { 232 int start = pos.getIndex(); 233 int i = start; 234 while (i < limit) { 235 char c = text.charAt(i); 236 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) || 237 !UCharacter.isUnicodeIdentifierPart(c)) { 238 break; 239 } 240 ++i; 241 } 242 if (i == start) { // No valid name chars 243 return null; 244 } 245 pos.setIndex(i); 246 return text.substring(start, i); 247 } 248 249 /** 250 * Return true if the given character is a matcher standin or a plain 251 * character (non standin). 252 */ isMatcher(int ch)253 public boolean isMatcher(int ch) { 254 // Note that we cannot use data.lookup() because the 255 // set array has not been constructed yet. 256 int i = ch - curData.variablesBase; 257 if (i >= 0 && i < variablesVector.size()) { 258 return variablesVector.get(i) instanceof UnicodeMatcher; 259 } 260 return true; 261 } 262 263 /** 264 * Return true if the given character is a replacer standin or a plain 265 * character (non standin). 266 */ isReplacer(int ch)267 public boolean isReplacer(int ch) { 268 // Note that we cannot use data.lookup() because the 269 // set array has not been constructed yet. 270 int i = ch - curData.variablesBase; 271 if (i >= 0 && i < variablesVector.size()) { 272 return variablesVector.get(i) instanceof UnicodeReplacer; 273 } 274 return true; 275 } 276 } 277 278 //---------------------------------------------------------------------- 279 // classes RuleBody, RuleArray, and RuleReader 280 //---------------------------------------------------------------------- 281 282 /** 283 * A private abstract class representing the interface to rule 284 * source code that is broken up into lines. Handles the 285 * folding of lines terminated by a backslash. This folding 286 * is limited; it does not account for comments, quotes, or 287 * escapes, so its use to be limited. 288 */ 289 private static abstract class RuleBody { 290 291 /** 292 * Retrieve the next line of the source, or return null if 293 * none. Folds lines terminated by a backslash into the 294 * next line, without regard for comments, quotes, or 295 * escapes. 296 */ nextLine()297 String nextLine() { 298 String s = handleNextLine(); 299 if (s != null && 300 s.length() > 0 && 301 s.charAt(s.length() - 1) == '\\') { 302 StringBuilder b = new StringBuilder(s); 303 do { 304 b.deleteCharAt(b.length()-1); 305 s = handleNextLine(); 306 if (s == null) { 307 break; 308 } 309 b.append(s); 310 } while (s.length() > 0 && 311 s.charAt(s.length() - 1) == '\\'); 312 s = b.toString(); 313 } 314 return s; 315 } 316 317 /** 318 * Reset to the first line of the source. 319 */ reset()320 abstract void reset(); 321 322 /** 323 * Subclass method to return the next line of the source. 324 */ handleNextLine()325 abstract String handleNextLine(); 326 } 327 328 /** 329 * RuleBody subclass for a String[] array. 330 */ 331 private static class RuleArray extends RuleBody { 332 String[] array; 333 int i; RuleArray(String[] array)334 public RuleArray(String[] array) { this.array = array; i = 0; } 335 @Override handleNextLine()336 public String handleNextLine() { 337 return (i < array.length) ? array[i++] : null; 338 } 339 @Override reset()340 public void reset() { 341 i = 0; 342 } 343 } 344 345 /* 346 * RuleBody subclass for a ResourceReader. 347 */ 348 /* private static class RuleReader extends RuleBody { 349 ResourceReader reader; 350 public RuleReader(ResourceReader reader) { this.reader = reader; } 351 public String handleNextLine() { 352 try { 353 return reader.readLine(); 354 } catch (java.io.IOException e) {} 355 return null; 356 } 357 public void reset() { 358 reader.reset(); 359 } 360 }*/ 361 362 //---------------------------------------------------------------------- 363 // class RuleHalf 364 //---------------------------------------------------------------------- 365 366 /** 367 * A class representing one side of a rule. This class knows how to 368 * parse half of a rule. It is tightly coupled to the method 369 * TransliteratorParser.parseRule(). 370 */ 371 private static class RuleHalf { 372 373 public String text; 374 375 public int cursor = -1; // position of cursor in text 376 public int ante = -1; // position of ante context marker '{' in text 377 public int post = -1; // position of post context marker '}' in text 378 379 // Record the offset to the cursor either to the left or to the 380 // right of the key. This is indicated by characters on the output 381 // side that allow the cursor to be positioned arbitrarily within 382 // the matching text. For example, abc{def} > | @@@ xyz; changes 383 // def to xyz and moves the cursor to before abc. Offset characters 384 // must be at the start or end, and they cannot move the cursor past 385 // the ante- or postcontext text. Placeholders are only valid in 386 // output text. The length of the ante and post context is 387 // determined at runtime, because of supplementals and quantifiers. 388 public int cursorOffset = 0; // only nonzero on output side 389 390 // Position of first CURSOR_OFFSET on _right_. This will be -1 391 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 392 private int cursorOffsetPos = 0; 393 394 public boolean anchorStart = false; 395 public boolean anchorEnd = false; 396 397 /** 398 * The segment number from 1..n of the next '(' we see 399 * during parsing; 1-based. 400 */ 401 private int nextSegmentNumber = 1; 402 403 /** 404 * Parse one side of a rule, stopping at either the limit, 405 * the END_OF_RULE character, or an operator. 406 * @return the index after the terminating character, or 407 * if limit was reached, limit 408 */ parse(String rule, int pos, int limit, TransliteratorParser parser)409 public int parse(String rule, int pos, int limit, 410 TransliteratorParser parser) { 411 int start = pos; 412 StringBuffer buf = new StringBuffer(); 413 pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false); 414 text = buf.toString(); 415 416 if (cursorOffset > 0 && cursor != cursorOffsetPos) { 417 syntaxError("Misplaced " + CURSOR_POS, rule, start); 418 } 419 420 return pos; 421 } 422 423 /** 424 * Parse a section of one side of a rule, stopping at either 425 * the limit, the END_OF_RULE character, an operator, or a 426 * segment close character. This method parses both a 427 * top-level rule half and a segment within such a rule half. 428 * It calls itself recursively to parse segments and nested 429 * segments. 430 * @param buf buffer into which to accumulate the rule pattern 431 * characters, either literal characters from the rule or 432 * standins for UnicodeMatcher objects including segments. 433 * @param illegal the set of special characters that is illegal during 434 * this parse. 435 * @param isSegment if true, then we've already seen a '(' and 436 * pos on entry points right after it. Accumulate everything 437 * up to the closing ')', put it in a segment matcher object, 438 * generate a standin for it, and add the standin to buf. As 439 * a side effect, update the segments vector with a reference 440 * to the segment matcher. This works recursively for nested 441 * segments. If isSegment is false, just accumulate 442 * characters into buf. 443 * @return the index after the terminating character, or 444 * if limit was reached, limit 445 */ parseSection(String rule, int pos, int limit, TransliteratorParser parser, StringBuffer buf, UnicodeSet illegal, boolean isSegment)446 private int parseSection(String rule, int pos, int limit, 447 TransliteratorParser parser, 448 StringBuffer buf, 449 UnicodeSet illegal, 450 boolean isSegment) { 451 int start = pos; 452 ParsePosition pp = null; 453 int quoteStart = -1; // Most recent 'single quoted string' 454 int quoteLimit = -1; 455 int varStart = -1; // Most recent $variableReference 456 int varLimit = -1; 457 int[] iref = new int[1]; 458 int bufStart = buf.length(); 459 460 main: 461 while (pos < limit) { 462 // Since all syntax characters are in the BMP, fetching 463 // 16-bit code units suffices here. 464 char c = rule.charAt(pos++); 465 if (PatternProps.isWhiteSpace(c)) { 466 continue; 467 } 468 // HALF_ENDERS is all chars that end a rule half: "<>=;" 469 if (HALF_ENDERS.indexOf(c) >= 0) { 470 ///CLOVER:OFF 471 // isSegment is always false 472 if (isSegment) { 473 syntaxError("Unclosed segment", rule, start); 474 } 475 ///CLOVER:ON 476 break main; 477 } 478 if (anchorEnd) { 479 // Text after a presumed end anchor is a syntax err 480 syntaxError("Malformed variable reference", rule, start); 481 } 482 if (UnicodeSet.resemblesPattern(rule, pos-1)) { 483 if (pp == null) { 484 pp = new ParsePosition(0); 485 } 486 pp.setIndex(pos-1); // Backup to opening '[' 487 buf.append(parser.parseSet(rule, pp)); 488 pos = pp.getIndex(); 489 continue; 490 } 491 // Handle escapes 492 if (c == ESCAPE) { 493 if (pos == limit) { 494 syntaxError("Trailing backslash", rule, start); 495 } 496 iref[0] = pos; 497 int escaped = Utility.unescapeAt(rule, iref); 498 pos = iref[0]; 499 if (escaped == -1) { 500 syntaxError("Malformed escape", rule, start); 501 } 502 parser.checkVariableRange(escaped, rule, start); 503 UTF16.append(buf, escaped); 504 continue; 505 } 506 // Handle quoted matter 507 if (c == QUOTE) { 508 int iq = rule.indexOf(QUOTE, pos); 509 if (iq == pos) { 510 buf.append(c); // Parse [''] outside quotes as ['] 511 ++pos; 512 } else { 513 /* This loop picks up a run of quoted text of the 514 * form 'aaaa' each time through. If this run 515 * hasn't really ended ('aaaa''bbbb') then it keeps 516 * looping, each time adding on a new run. When it 517 * reaches the final quote it breaks. 518 */ 519 quoteStart = buf.length(); 520 for (;;) { 521 if (iq < 0) { 522 syntaxError("Unterminated quote", rule, start); 523 } 524 buf.append(rule.substring(pos, iq)); 525 pos = iq+1; 526 if (pos < limit && rule.charAt(pos) == QUOTE) { 527 // Parse [''] inside quotes as ['] 528 iq = rule.indexOf(QUOTE, pos+1); 529 // Continue looping 530 } else { 531 break; 532 } 533 } 534 quoteLimit = buf.length(); 535 536 for (iq=quoteStart; iq<quoteLimit; ++iq) { 537 parser.checkVariableRange(buf.charAt(iq), rule, start); 538 } 539 } 540 continue; 541 } 542 543 parser.checkVariableRange(c, rule, start); 544 545 if (illegal.contains(c)) { 546 syntaxError("Illegal character '" + c + '\'', rule, start); 547 } 548 549 switch (c) { 550 551 //------------------------------------------------------ 552 // Elements allowed within and out of segments 553 //------------------------------------------------------ 554 case ANCHOR_START: 555 if (buf.length() == 0 && !anchorStart) { 556 anchorStart = true; 557 } else { 558 syntaxError("Misplaced anchor start", 559 rule, start); 560 } 561 break; 562 case SEGMENT_OPEN: 563 { 564 // bufSegStart is the offset in buf to the first 565 // character of the segment we are parsing. 566 int bufSegStart = buf.length(); 567 568 // Record segment number now, since nextSegmentNumber 569 // will be incremented during the call to parseSection 570 // if there are nested segments. 571 int segmentNumber = nextSegmentNumber++; // 1-based 572 573 // Parse the segment 574 pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true); 575 576 // After parsing a segment, the relevant characters are 577 // in buf, starting at offset bufSegStart. Extract them 578 // into a string matcher, and replace them with a 579 // standin for that matcher. 580 StringMatcher m = 581 new StringMatcher(buf.substring(bufSegStart), 582 segmentNumber, parser.curData); 583 584 // Record and associate object and segment number 585 parser.setSegmentObject(segmentNumber, m); 586 buf.setLength(bufSegStart); 587 buf.append(parser.getSegmentStandin(segmentNumber)); 588 } 589 break; 590 case FUNCTION: 591 case ALT_FUNCTION: 592 { 593 iref[0] = pos; 594 TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref); 595 // The next character MUST be a segment open 596 if (single == null || 597 !Utility.parseChar(rule, iref, SEGMENT_OPEN)) { 598 syntaxError("Invalid function", rule, start); 599 } 600 601 Transliterator t = single.getInstance(); 602 if (t == null) { 603 syntaxError("Invalid function ID", rule, start); 604 } 605 606 // bufSegStart is the offset in buf to the first 607 // character of the segment we are parsing. 608 int bufSegStart = buf.length(); 609 610 // Parse the segment 611 pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true); 612 613 // After parsing a segment, the relevant characters are 614 // in buf, starting at offset bufSegStart. 615 FunctionReplacer r = 616 new FunctionReplacer(t, 617 new StringReplacer(buf.substring(bufSegStart), parser.curData)); 618 619 // Replace the buffer contents with a stand-in 620 buf.setLength(bufSegStart); 621 buf.append(parser.generateStandInFor(r)); 622 } 623 break; 624 case SymbolTable.SYMBOL_REF: 625 // Handle variable references and segment references "$1" .. "$9" 626 { 627 // A variable reference must be followed immediately 628 // by a Unicode identifier start and zero or more 629 // Unicode identifier part characters, or by a digit 630 // 1..9 if it is a segment reference. 631 if (pos == limit) { 632 // A variable ref character at the end acts as 633 // an anchor to the context limit, as in perl. 634 anchorEnd = true; 635 break; 636 } 637 // Parse "$1" "$2" .. "$9" .. (no upper limit) 638 c = rule.charAt(pos); 639 int r = UCharacter.digit(c, 10); 640 if (r >= 1 && r <= 9) { 641 iref[0] = pos; 642 r = Utility.parseNumber(rule, iref, 10); 643 if (r < 0) { 644 syntaxError("Undefined segment reference", 645 rule, start); 646 } 647 pos = iref[0]; 648 buf.append(parser.getSegmentStandin(r)); 649 } else { 650 if (pp == null) { // Lazy create 651 pp = new ParsePosition(0); 652 } 653 pp.setIndex(pos); 654 String name = parser.parseData. 655 parseReference(rule, pp, limit); 656 if (name == null) { 657 // This means the '$' was not followed by a 658 // valid name. Try to interpret it as an 659 // end anchor then. If this also doesn't work 660 // (if we see a following character) then signal 661 // an error. 662 anchorEnd = true; 663 break; 664 } 665 pos = pp.getIndex(); 666 // If this is a variable definition statement, 667 // then the LHS variable will be undefined. In 668 // that case appendVariableDef() will append the 669 // special placeholder char variableLimit-1. 670 varStart = buf.length(); 671 parser.appendVariableDef(name, buf); 672 varLimit = buf.length(); 673 } 674 } 675 break; 676 case DOT: 677 buf.append(parser.getDotStandIn()); 678 break; 679 case KLEENE_STAR: 680 case ONE_OR_MORE: 681 case ZERO_OR_ONE: 682 // Quantifiers. We handle single characters, quoted strings, 683 // variable references, and segments. 684 // a+ matches aaa 685 // 'foo'+ matches foofoofoo 686 // $v+ matches xyxyxy if $v == xy 687 // (seg)+ matches segsegseg 688 { 689 ///CLOVER:OFF 690 // isSegment is always false 691 if (isSegment && buf.length() == bufStart) { 692 // The */+ immediately follows '(' 693 syntaxError("Misplaced quantifier", rule, start); 694 break; 695 } 696 ///CLOVER:ON 697 698 int qstart, qlimit; 699 // The */+ follows an isolated character or quote 700 // or variable reference 701 if (buf.length() == quoteLimit) { 702 // The */+ follows a 'quoted string' 703 qstart = quoteStart; 704 qlimit = quoteLimit; 705 } else if (buf.length() == varLimit) { 706 // The */+ follows a $variableReference 707 qstart = varStart; 708 qlimit = varLimit; 709 } else { 710 // The */+ follows a single character, possibly 711 // a segment standin 712 qstart = buf.length() - 1; 713 qlimit = qstart + 1; 714 } 715 716 UnicodeMatcher m; 717 try { 718 m = new StringMatcher(buf.toString(), qstart, qlimit, 719 0, parser.curData); 720 } catch (RuntimeException e) { 721 final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos); 722 final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "..."; 723 throw new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$" 724 + postContext).initCause(e); 725 } 726 int min = 0; 727 int max = Quantifier.MAX; 728 switch (c) { 729 case ONE_OR_MORE: 730 min = 1; 731 break; 732 case ZERO_OR_ONE: 733 min = 0; 734 max = 1; 735 break; 736 // case KLEENE_STAR: 737 // do nothing -- min, max already set 738 } 739 m = new Quantifier(m, min, max); 740 buf.setLength(qstart); 741 buf.append(parser.generateStandInFor(m)); 742 } 743 break; 744 745 //------------------------------------------------------ 746 // Elements allowed ONLY WITHIN segments 747 //------------------------------------------------------ 748 case SEGMENT_CLOSE: 749 // assert(isSegment); 750 // We're done parsing a segment. 751 break main; 752 753 //------------------------------------------------------ 754 // Elements allowed ONLY OUTSIDE segments 755 //------------------------------------------------------ 756 case CONTEXT_ANTE: 757 if (ante >= 0) { 758 syntaxError("Multiple ante contexts", rule, start); 759 } 760 ante = buf.length(); 761 break; 762 case CONTEXT_POST: 763 if (post >= 0) { 764 syntaxError("Multiple post contexts", rule, start); 765 } 766 post = buf.length(); 767 break; 768 case CURSOR_POS: 769 if (cursor >= 0) { 770 syntaxError("Multiple cursors", rule, start); 771 } 772 cursor = buf.length(); 773 break; 774 case CURSOR_OFFSET: 775 if (cursorOffset < 0) { 776 if (buf.length() > 0) { 777 syntaxError("Misplaced " + c, rule, start); 778 } 779 --cursorOffset; 780 } else if (cursorOffset > 0) { 781 if (buf.length() != cursorOffsetPos || cursor >= 0) { 782 syntaxError("Misplaced " + c, rule, start); 783 } 784 ++cursorOffset; 785 } else { 786 if (cursor == 0 && buf.length() == 0) { 787 cursorOffset = -1; 788 } else if (cursor < 0) { 789 cursorOffsetPos = buf.length(); 790 cursorOffset = 1; 791 } else { 792 syntaxError("Misplaced " + c, rule, start); 793 } 794 } 795 break; 796 797 //------------------------------------------------------ 798 // Non-special characters 799 //------------------------------------------------------ 800 default: 801 // Disallow unquoted characters other than [0-9A-Za-z] 802 // in the printable ASCII range. These characters are 803 // reserved for possible future use. 804 if (c >= 0x0021 && c <= 0x007E && 805 !((c >= '0' && c <= '9') || 806 (c >= 'A' && c <= 'Z') || 807 (c >= 'a' && c <= 'z'))) { 808 syntaxError("Unquoted " + c, rule, start); 809 } 810 buf.append(c); 811 break; 812 } 813 } 814 return pos; 815 } 816 817 /** 818 * Remove context. 819 */ removeContext()820 void removeContext() { 821 text = text.substring(ante < 0 ? 0 : ante, 822 post < 0 ? text.length() : post); 823 ante = post = -1; 824 anchorStart = anchorEnd = false; 825 } 826 827 /** 828 * Return true if this half looks like valid output, that is, does not 829 * contain quantifiers or other special input-only elements. 830 */ isValidOutput(TransliteratorParser parser)831 public boolean isValidOutput(TransliteratorParser parser) { 832 for (int i=0; i<text.length(); ) { 833 int c = UTF16.charAt(text, i); 834 i += UTF16.getCharCount(c); 835 if (!parser.parseData.isReplacer(c)) { 836 return false; 837 } 838 } 839 return true; 840 } 841 842 /** 843 * Return true if this half looks like valid input, that is, does not 844 * contain functions or other special output-only elements. 845 */ isValidInput(TransliteratorParser parser)846 public boolean isValidInput(TransliteratorParser parser) { 847 for (int i=0; i<text.length(); ) { 848 int c = UTF16.charAt(text, i); 849 i += UTF16.getCharCount(c); 850 if (!parser.parseData.isMatcher(c)) { 851 return false; 852 } 853 } 854 return true; 855 } 856 } 857 858 //---------------------------------------------------------------------- 859 // PUBLIC methods 860 //---------------------------------------------------------------------- 861 862 /** 863 * Constructor. 864 */ TransliteratorParser()865 public TransliteratorParser() { 866 } 867 868 /** 869 * Parse a set of rules. After the parse completes, examine the public 870 * data members for results. 871 */ parse(String rules, int dir)872 public void parse(String rules, int dir) { 873 parseRules(new RuleArray(new String[] { rules }), dir); 874 } 875 876 /* 877 * Parse a set of rules. After the parse completes, examine the public 878 * data members for results. 879 */ 880 /* public void parse(ResourceReader rules, int direction) { 881 parseRules(new RuleReader(rules), direction); 882 }*/ 883 884 //---------------------------------------------------------------------- 885 // PRIVATE methods 886 //---------------------------------------------------------------------- 887 888 /** 889 * Parse an array of zero or more rules. The strings in the array are 890 * treated as if they were concatenated together, with rule terminators 891 * inserted between array elements if not present already. 892 * 893 * Any previous rules are discarded. Typically this method is called exactly 894 * once, during construction. 895 * 896 * The member this.data will be set to null if there are no rules. 897 * 898 * @exception IllegalIcuArgumentException if there is a syntax error in the 899 * rules 900 */ parseRules(RuleBody ruleArray, int dir)901 void parseRules(RuleBody ruleArray, int dir) { 902 boolean parsingIDs = true; 903 int ruleCount = 0; 904 905 dataVector = new ArrayList<Data>(); 906 idBlockVector = new ArrayList<String>(); 907 curData = null; 908 direction = dir; 909 compoundFilter = null; 910 variablesVector = new ArrayList<Object>(); 911 variableNames = new HashMap<String, char[]>(); 912 parseData = new ParseData(); 913 914 List<RuntimeException> errors = new ArrayList<RuntimeException>(); 915 int errorCount = 0; 916 917 ruleArray.reset(); 918 919 StringBuilder idBlockResult = new StringBuilder(); 920 921 // The compound filter offset is an index into idBlockResult. 922 // If it is 0, then the compound filter occurred at the start, 923 // and it is the offset to the _start_ of the compound filter 924 // pattern. Otherwise it is the offset to the _limit_ of the 925 // compound filter pattern within idBlockResult. 926 this.compoundFilter = null; 927 int compoundFilterOffset = -1; 928 929 main: 930 for (;;) { 931 String rule = ruleArray.nextLine(); 932 if (rule == null) { 933 break; 934 } 935 int pos = 0; 936 int limit = rule.length(); 937 while (pos < limit) { 938 char c = rule.charAt(pos++); 939 if (PatternProps.isWhiteSpace(c)) { 940 continue; 941 } 942 // Skip lines starting with the comment character 943 if (c == RULE_COMMENT_CHAR) { 944 pos = rule.indexOf("\n", pos) + 1; 945 if (pos == 0) { 946 break; // No "\n" found; rest of rule is a commnet 947 } 948 continue; // Either fall out or restart with next line 949 } 950 951 // skip empty rules 952 if (c == END_OF_RULE) 953 continue; 954 955 // Often a rule file contains multiple errors. It's 956 // convenient to the rule author if these are all reported 957 // at once. We keep parsing rules even after a failure, up 958 // to a specified limit, and report all errors at once. 959 try { 960 ++ruleCount; 961 962 // We've found the start of a rule or ID. c is its first 963 // character, and pos points past c. 964 --pos; 965 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 966 // chars left. 967 if ((pos + ID_TOKEN_LEN + 1) <= limit && 968 rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) { 969 pos += ID_TOKEN_LEN; 970 c = rule.charAt(pos); 971 while (PatternProps.isWhiteSpace(c) && pos < limit) { 972 ++pos; 973 c = rule.charAt(pos); 974 } 975 int[] p = new int[] { pos }; 976 977 if (!parsingIDs) { 978 if (curData != null) { 979 if (direction == Transliterator.FORWARD) 980 dataVector.add(curData); 981 else 982 dataVector.add(0, curData); 983 curData = null; 984 } 985 parsingIDs = true; 986 } 987 988 TransliteratorIDParser.SingleID id = 989 TransliteratorIDParser.parseSingleID( 990 rule, p, direction); 991 if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) { 992 // Successful ::ID parse. 993 994 if (direction == Transliterator.FORWARD) { 995 idBlockResult.append(id.canonID).append(END_OF_RULE); 996 } else { 997 idBlockResult.insert(0, id.canonID + END_OF_RULE); 998 } 999 1000 } else { 1001 // Couldn't parse an ID. Try to parse a global filter 1002 int[] withParens = new int[] { -1 }; 1003 UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null); 1004 if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) { 1005 if ((direction == Transliterator.FORWARD) == 1006 (withParens[0] == 0)) { 1007 if (compoundFilter != null) { 1008 // Multiple compound filters 1009 syntaxError("Multiple global filters", rule, pos); 1010 } 1011 compoundFilter = f; 1012 compoundFilterOffset = ruleCount; 1013 } 1014 } else { 1015 // Invalid ::id 1016 // Can be parsed as neither an ID nor a global filter 1017 syntaxError("Invalid ::ID", rule, pos); 1018 } 1019 } 1020 1021 pos = p[0]; 1022 } else { 1023 if (parsingIDs) { 1024 if (direction == Transliterator.FORWARD) 1025 idBlockVector.add(idBlockResult.toString()); 1026 else 1027 idBlockVector.add(0, idBlockResult.toString()); 1028 idBlockResult.delete(0, idBlockResult.length()); 1029 parsingIDs = false; 1030 curData = new RuleBasedTransliterator.Data(); 1031 1032 // By default, rules use part of the private use area 1033 // E000..F8FF for variables and other stand-ins. Currently 1034 // the range F000..F8FF is typically sufficient. The 'use 1035 // variable range' pragma allows rule sets to modify this. 1036 setVariableRange(0xF000, 0xF8FF); 1037 } 1038 1039 if (resemblesPragma(rule, pos, limit)) { 1040 int ppp = parsePragma(rule, pos, limit); 1041 if (ppp < 0) { 1042 syntaxError("Unrecognized pragma", rule, pos); 1043 } 1044 pos = ppp; 1045 // Parse a rule 1046 } else { 1047 pos = parseRule(rule, pos, limit); 1048 } 1049 } 1050 } catch (IllegalArgumentException e) { 1051 if (errorCount == 30) { 1052 IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched"); 1053 icuEx.initCause(e); 1054 errors.add(icuEx); 1055 break main; 1056 } 1057 e.fillInStackTrace(); 1058 errors.add(e); 1059 ++errorCount; 1060 pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';' 1061 } 1062 } 1063 } 1064 if (parsingIDs && idBlockResult.length() > 0) { 1065 if (direction == Transliterator.FORWARD) 1066 idBlockVector.add(idBlockResult.toString()); 1067 else 1068 idBlockVector.add(0, idBlockResult.toString()); 1069 } 1070 else if (!parsingIDs && curData != null) { 1071 if (direction == Transliterator.FORWARD) 1072 dataVector.add(curData); 1073 else 1074 dataVector.add(0, curData); 1075 } 1076 1077 // Convert the set vector to an array 1078 for (int i = 0; i < dataVector.size(); i++) { 1079 Data data = dataVector.get(i); 1080 data.variables = new Object[variablesVector.size()]; 1081 variablesVector.toArray(data.variables); 1082 data.variableNames = new HashMap<String, char[]>(); 1083 data.variableNames.putAll(variableNames); 1084 } 1085 variablesVector = null; 1086 1087 // Do more syntax checking and index the rules 1088 try { 1089 if (compoundFilter != null) { 1090 if ((direction == Transliterator.FORWARD && 1091 compoundFilterOffset != 1) || 1092 (direction == Transliterator.REVERSE && 1093 compoundFilterOffset != ruleCount)) { 1094 throw new IllegalIcuArgumentException("Compound filters misplaced"); 1095 } 1096 } 1097 1098 for (int i = 0; i < dataVector.size(); i++) { 1099 Data data = dataVector.get(i); 1100 data.ruleSet.freeze(); 1101 } 1102 1103 if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0) 1104 idBlockVector.remove(0); 1105 1106 } catch (IllegalArgumentException e) { 1107 e.fillInStackTrace(); 1108 errors.add(e); 1109 } 1110 1111 if (errors.size() != 0) { 1112 for (int i = errors.size()-1; i > 0; --i) { 1113 RuntimeException previous = errors.get(i-1); 1114 while (previous.getCause() != null) { 1115 previous = (RuntimeException) previous.getCause(); // chain specially 1116 } 1117 previous.initCause(errors.get(i)); 1118 } 1119 throw errors.get(0); 1120 // if initCause not supported: throw new IllegalArgumentException(errors.toString()); 1121 } 1122 } 1123 1124 /** 1125 * MAIN PARSER. Parse the next rule in the given rule string, starting 1126 * at pos. Return the index after the last character parsed. Do not 1127 * parse characters at or after limit. 1128 * 1129 * Important: The character at pos must be a non-whitespace character 1130 * that is not the comment character. 1131 * 1132 * This method handles quoting, escaping, and whitespace removal. It 1133 * parses the end-of-rule character. It recognizes context and cursor 1134 * indicators. Once it does a lexical breakdown of the rule at pos, it 1135 * creates a rule object and adds it to our rule list. 1136 * 1137 * This method is tightly coupled to the inner class RuleHalf. 1138 */ parseRule(String rule, int pos, int limit)1139 private int parseRule(String rule, int pos, int limit) { 1140 // Locate the left side, operator, and right side 1141 int start = pos; 1142 char operator = 0; 1143 1144 // Set up segments data 1145 segmentStandins = new StringBuffer(); 1146 segmentObjects = new ArrayList<StringMatcher>(); 1147 1148 RuleHalf left = new RuleHalf(); 1149 RuleHalf right = new RuleHalf(); 1150 1151 undefinedVariableName = null; 1152 pos = left.parse(rule, pos, limit, this); 1153 1154 if (pos == limit || 1155 OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) { 1156 syntaxError("No operator pos=" + pos, rule, start); 1157 } 1158 ++pos; 1159 1160 // Found an operator char. Check for forward-reverse operator. 1161 if (operator == REVERSE_RULE_OP && 1162 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1163 ++pos; 1164 operator = FWDREV_RULE_OP; 1165 } 1166 1167 // Translate alternate op characters. 1168 switch (operator) { 1169 case ALT_FORWARD_RULE_OP: 1170 operator = FORWARD_RULE_OP; 1171 break; 1172 case ALT_REVERSE_RULE_OP: 1173 operator = REVERSE_RULE_OP; 1174 break; 1175 case ALT_FWDREV_RULE_OP: 1176 operator = FWDREV_RULE_OP; 1177 break; 1178 } 1179 1180 pos = right.parse(rule, pos, limit, this); 1181 1182 if (pos < limit) { 1183 if (rule.charAt(--pos) == END_OF_RULE) { 1184 ++pos; 1185 } else { 1186 // RuleHalf parser must have terminated at an operator 1187 syntaxError("Unquoted operator", rule, start); 1188 } 1189 } 1190 1191 if (operator == VARIABLE_DEF_OP) { 1192 // LHS is the name. RHS is a single character, either a literal 1193 // or a set (already parsed). If RHS is longer than one 1194 // character, it is either a multi-character string, or multiple 1195 // sets, or a mixture of chars and sets -- syntax error. 1196 1197 // We expect to see a single undefined variable (the one being 1198 // defined). 1199 if (undefinedVariableName == null) { 1200 syntaxError("Missing '$' or duplicate definition", rule, start); 1201 } 1202 if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { 1203 syntaxError("Malformed LHS", rule, start); 1204 } 1205 if (left.anchorStart || left.anchorEnd || 1206 right.anchorStart || right.anchorEnd) { 1207 syntaxError("Malformed variable def", rule, start); 1208 } 1209 // We allow anything on the right, including an empty string. 1210 int n = right.text.length(); 1211 char[] value = new char[n]; 1212 right.text.getChars(0, n, value, 0); 1213 variableNames.put(undefinedVariableName, value); 1214 1215 ++variableLimit; 1216 return pos; 1217 } 1218 1219 // If this is not a variable definition rule, we shouldn't have 1220 // any undefined variable names. 1221 if (undefinedVariableName != null) { 1222 syntaxError("Undefined variable $" + undefinedVariableName, 1223 rule, start); 1224 } 1225 1226 // Verify segments 1227 if (segmentStandins.length() > segmentObjects.size()) { 1228 syntaxError("Undefined segment reference", rule, start); 1229 } 1230 for (int i=0; i<segmentStandins.length(); ++i) { 1231 if (segmentStandins.charAt(i) == 0) { 1232 syntaxError("Internal error", rule, start); // will never happen 1233 } 1234 } 1235 for (int i=0; i<segmentObjects.size(); ++i) { 1236 if (segmentObjects.get(i) == null) { 1237 syntaxError("Internal error", rule, start); // will never happen 1238 } 1239 } 1240 1241 // If the direction we want doesn't match the rule 1242 // direction, do nothing. 1243 if (operator != FWDREV_RULE_OP && 1244 ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) { 1245 return pos; 1246 } 1247 1248 // Transform the rule into a forward rule by swapping the 1249 // sides if necessary. 1250 if (direction == Transliterator.REVERSE) { 1251 RuleHalf temp = left; 1252 left = right; 1253 right = temp; 1254 } 1255 1256 // Remove non-applicable elements in forward-reverse 1257 // rules. Bidirectional rules ignore elements that do not 1258 // apply. 1259 if (operator == FWDREV_RULE_OP) { 1260 right.removeContext(); 1261 left.cursor = -1; 1262 left.cursorOffset = 0; 1263 } 1264 1265 // Normalize context 1266 if (left.ante < 0) { 1267 left.ante = 0; 1268 } 1269 if (left.post < 0) { 1270 left.post = left.text.length(); 1271 } 1272 1273 // Context is only allowed on the input side. Cursors are only 1274 // allowed on the output side. Segment delimiters can only appear 1275 // on the left, and references on the right. Cursor offset 1276 // cannot appear without an explicit cursor. Cursor offset 1277 // cannot place the cursor outside the limits of the context. 1278 // Anchors are only allowed on the input side. 1279 if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || 1280 (right.cursorOffset != 0 && right.cursor < 0) || 1281 // - The following two checks were used to ensure that the 1282 // - the cursor offset stayed within the ante- or postcontext. 1283 // - However, with the addition of quantifiers, we have to 1284 // - allow arbitrary cursor offsets and do runtime checking. 1285 //(right.cursorOffset > (left.text.length() - left.post)) || 1286 //(-right.cursorOffset > left.ante) || 1287 right.anchorStart || right.anchorEnd || 1288 !left.isValidInput(this) || !right.isValidOutput(this) || 1289 left.ante > left.post) { 1290 syntaxError("Malformed rule", rule, start); 1291 } 1292 1293 // Flatten segment objects vector to an array 1294 UnicodeMatcher[] segmentsArray = null; 1295 if (segmentObjects.size() > 0) { 1296 segmentsArray = new UnicodeMatcher[segmentObjects.size()]; 1297 segmentObjects.toArray(segmentsArray); 1298 } 1299 1300 curData.ruleSet.addRule(new TransliterationRule( 1301 left.text, left.ante, left.post, 1302 right.text, right.cursor, right.cursorOffset, 1303 segmentsArray, 1304 left.anchorStart, left.anchorEnd, 1305 curData)); 1306 1307 return pos; 1308 } 1309 1310 /** 1311 * Set the variable range to [start, end] (inclusive). 1312 */ setVariableRange(int start, int end)1313 private void setVariableRange(int start, int end) { 1314 if (start > end || start < 0 || end > 0xFFFF) { 1315 throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end); 1316 } 1317 1318 curData.variablesBase = (char) start; // first private use 1319 1320 if (dataVector.size() == 0) { 1321 variableNext = (char) start; 1322 variableLimit = (char) (end + 1); 1323 } 1324 } 1325 1326 /** 1327 * Assert that the given character is NOT within the variable range. 1328 * If it is, signal an error. This is neccesary to ensure that the 1329 * variable range does not overlap characters used in a rule. 1330 */ checkVariableRange(int ch, String rule, int start)1331 private void checkVariableRange(int ch, String rule, int start) { 1332 if (ch >= curData.variablesBase && ch < variableLimit) { 1333 syntaxError("Variable range character in rule", rule, start); 1334 } 1335 } 1336 1337 // (The following method is part of an unimplemented feature. 1338 // Remove this clover pragma after the feature is implemented. 1339 // 2003-06-11 ICU 2.6 Alan) 1340 ///CLOVER:OFF 1341 /** 1342 * Set the maximum backup to 'backup', in response to a pragma 1343 * statement. 1344 */ pragmaMaximumBackup(int backup)1345 private void pragmaMaximumBackup(int backup) { 1346 //TODO Finish 1347 throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet"); 1348 } 1349 ///CLOVER:ON 1350 1351 // (The following method is part of an unimplemented feature. 1352 // Remove this clover pragma after the feature is implemented. 1353 // 2003-06-11 ICU 2.6 Alan) 1354 ///CLOVER:OFF 1355 /** 1356 * Begin normalizing all rules using the given mode, in response 1357 * to a pragma statement. 1358 */ pragmaNormalizeRules(Normalizer.Mode mode)1359 private void pragmaNormalizeRules(Normalizer.Mode mode) { 1360 //TODO Finish 1361 throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet"); 1362 } 1363 ///CLOVER:ON 1364 1365 /** 1366 * Return true if the given rule looks like a pragma. 1367 * @param pos offset to the first non-whitespace character 1368 * of the rule. 1369 * @param limit pointer past the last character of the rule. 1370 */ resemblesPragma(String rule, int pos, int limit)1371 static boolean resemblesPragma(String rule, int pos, int limit) { 1372 // Must start with /use\s/i 1373 return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0; 1374 } 1375 1376 /** 1377 * Parse a pragma. This method assumes resemblesPragma() has 1378 * already returned true. 1379 * @param pos offset to the first non-whitespace character 1380 * of the rule. 1381 * @param limit pointer past the last character of the rule. 1382 * @return the position index after the final ';' of the pragma, 1383 * or -1 on failure. 1384 */ parsePragma(String rule, int pos, int limit)1385 private int parsePragma(String rule, int pos, int limit) { 1386 int[] array = new int[2]; 1387 1388 // resemblesPragma() has already returned true, so we 1389 // know that pos points to /use\s/i; we can skip 4 characters 1390 // immediately 1391 pos += 4; 1392 1393 // Here are the pragmas we recognize: 1394 // use variable range 0xE000 0xEFFF; 1395 // use maximum backup 16; 1396 // use nfd rules; 1397 int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array); 1398 if (p >= 0) { 1399 setVariableRange(array[0], array[1]); 1400 return p; 1401 } 1402 1403 p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array); 1404 if (p >= 0) { 1405 pragmaMaximumBackup(array[0]); 1406 return p; 1407 } 1408 1409 p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null); 1410 if (p >= 0) { 1411 pragmaNormalizeRules(Normalizer.NFD); 1412 return p; 1413 } 1414 1415 p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null); 1416 if (p >= 0) { 1417 pragmaNormalizeRules(Normalizer.NFC); 1418 return p; 1419 } 1420 1421 // Syntax error: unable to parse pragma 1422 return -1; 1423 } 1424 1425 /** 1426 * Throw an exception indicating a syntax error. Search the rule string 1427 * for the probable end of the rule. Of course, if the error is that 1428 * the end of rule marker is missing, then the rule end will not be found. 1429 * In any case the rule start will be correctly reported. 1430 * @param msg error description 1431 * @param rule pattern string 1432 * @param start position of first character of current rule 1433 */ syntaxError(String msg, String rule, int start)1434 static final void syntaxError(String msg, String rule, int start) { 1435 int end = ruleEnd(rule, start, rule.length()); 1436 throw new IllegalIcuArgumentException(msg + " in \"" + 1437 Utility.escape(rule.substring(start, end)) + '"'); 1438 } 1439 ruleEnd(String rule, int start, int limit)1440 static final int ruleEnd(String rule, int start, int limit) { 1441 int end = Utility.quotedIndexOf(rule, start, limit, ";"); 1442 if (end < 0) { 1443 end = limit; 1444 } 1445 return end; 1446 } 1447 1448 /** 1449 * Parse a UnicodeSet out, store it, and return the stand-in character 1450 * used to represent it. 1451 */ parseSet(String rule, ParsePosition pos)1452 private final char parseSet(String rule, ParsePosition pos) { 1453 UnicodeSet set = new UnicodeSet(rule, pos, parseData); 1454 if (variableNext >= variableLimit) { 1455 throw new RuntimeException("Private use variables exhausted"); 1456 } 1457 set.compact(); 1458 return generateStandInFor(set); 1459 } 1460 1461 /** 1462 * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer. 1463 * Store the object. 1464 */ generateStandInFor(Object obj)1465 char generateStandInFor(Object obj) { 1466 // assert(obj != null); 1467 1468 // Look up previous stand-in, if any. This is a short list 1469 // (typical n is 0, 1, or 2); linear search is optimal. 1470 for (int i=0; i<variablesVector.size(); ++i) { 1471 if (variablesVector.get(i) == obj) { // [sic] pointer comparison 1472 return (char) (curData.variablesBase + i); 1473 } 1474 } 1475 1476 if (variableNext >= variableLimit) { 1477 throw new RuntimeException("Variable range exhausted"); 1478 } 1479 variablesVector.add(obj); 1480 return variableNext++; 1481 } 1482 1483 /** 1484 * Return the standin for segment seg (1-based). 1485 */ getSegmentStandin(int seg)1486 public char getSegmentStandin(int seg) { 1487 if (segmentStandins.length() < seg) { 1488 segmentStandins.setLength(seg); 1489 } 1490 char c = segmentStandins.charAt(seg-1); 1491 if (c == 0) { 1492 if (variableNext >= variableLimit) { 1493 throw new RuntimeException("Variable range exhausted"); 1494 } 1495 c = variableNext++; 1496 // Set a placeholder in the master variables vector that will be 1497 // filled in later by setSegmentObject(). We know that we will get 1498 // called first because setSegmentObject() will call us. 1499 variablesVector.add(null); 1500 segmentStandins.setCharAt(seg-1, c); 1501 } 1502 return c; 1503 } 1504 1505 /** 1506 * Set the object for segment seg (1-based). 1507 */ setSegmentObject(int seg, StringMatcher obj)1508 public void setSegmentObject(int seg, StringMatcher obj) { 1509 // Since we call parseSection() recursively, nested 1510 // segments will result in segment i+1 getting parsed 1511 // and stored before segment i; be careful with the 1512 // vector handling here. 1513 while (segmentObjects.size() < seg) { 1514 segmentObjects.add(null); 1515 } 1516 int index = getSegmentStandin(seg) - curData.variablesBase; 1517 if (segmentObjects.get(seg-1) != null || 1518 variablesVector.get(index) != null) { 1519 throw new RuntimeException(); // should never happen 1520 } 1521 segmentObjects.set(seg-1, obj); 1522 variablesVector.set(index, obj); 1523 } 1524 1525 /** 1526 * Return the stand-in for the dot set. It is allocated the first 1527 * time and reused thereafter. 1528 */ getDotStandIn()1529 char getDotStandIn() { 1530 if (dotStandIn == -1) { 1531 dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); 1532 } 1533 return (char) dotStandIn; 1534 } 1535 1536 /** 1537 * Append the value of the given variable name to the given 1538 * StringBuffer. 1539 * @exception IllegalIcuArgumentException if the name is unknown. 1540 */ appendVariableDef(String name, StringBuffer buf)1541 private void appendVariableDef(String name, StringBuffer buf) { 1542 char[] ch = variableNames.get(name); 1543 if (ch == null) { 1544 // We allow one undefined variable so that variable definition 1545 // statements work. For the first undefined variable we return 1546 // the special placeholder variableLimit-1, and save the variable 1547 // name. 1548 if (undefinedVariableName == null) { 1549 undefinedVariableName = name; 1550 if (variableNext >= variableLimit) { 1551 throw new RuntimeException("Private use variables exhausted"); 1552 } 1553 buf.append(--variableLimit); 1554 } else { 1555 throw new IllegalIcuArgumentException("Undefined variable $" 1556 + name); 1557 } 1558 } else { 1559 buf.append(ch); 1560 } 1561 } 1562 } 1563 1564 //eof 1565