1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2010, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 import android.icu.impl.Utility; 13 14 /** 15 * A transliteration rule used by 16 * <code>RuleBasedTransliterator</code>. 17 * <code>TransliterationRule</code> is an immutable object. 18 * 19 * <p>A rule consists of an input pattern and an output string. When 20 * the input pattern is matched, the output string is emitted. The 21 * input pattern consists of zero or more characters which are matched 22 * exactly (the key) and optional context. Context must match if it 23 * is specified. Context may be specified before the key, after the 24 * key, or both. The key, preceding context, and following context 25 * may contain variables. Variables represent a set of Unicode 26 * characters, such as the letters <i>a</i> through <i>z</i>. 27 * Variables are detected by looking up each character in a supplied 28 * variable list to see if it has been so defined. 29 * 30 * <p>A rule may contain segments in its input string and segment 31 * references in its output string. A segment is a substring of the 32 * input pattern, indicated by an offset and limit. The segment may 33 * be in the preceding or following context. It may not span a 34 * context boundary. A segment reference is a special character in 35 * the output string that causes a segment of the input string (not 36 * the input pattern) to be copied to the output string. The range of 37 * special characters that represent segment references is defined by 38 * RuleBasedTransliterator.Data. 39 * 40 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input 41 * string "abc.123" to "ab1.c23". 42 * 43 * <p>Copyright © IBM Corporation 1999. All rights reserved. 44 * 45 * @author Alan Liu 46 */ 47 class TransliterationRule { 48 49 // TODO Eliminate the pattern and keyLength data members. They 50 // are used only by masks() and getIndexValue() which are called 51 // only during build time, not during run-time. Perhaps these 52 // methods and pattern/keyLength can be isolated into a separate 53 // object. 54 55 /** 56 * The match that must occur before the key, or null if there is no 57 * preceding context. 58 */ 59 private StringMatcher anteContext; 60 61 /** 62 * The matcher object for the key. If null, then the key is empty. 63 */ 64 private StringMatcher key; 65 66 /** 67 * The match that must occur after the key, or null if there is no 68 * following context. 69 */ 70 private StringMatcher postContext; 71 72 /** 73 * The object that performs the replacement if the key, 74 * anteContext, and postContext are matched. Never null. 75 */ 76 private UnicodeReplacer output; 77 78 /** 79 * The string that must be matched, consisting of the anteContext, key, 80 * and postContext, concatenated together, in that order. Some components 81 * may be empty (zero length). 82 * @see anteContextLength 83 * @see keyLength 84 */ 85 private String pattern; 86 87 /** 88 * An array of matcher objects corresponding to the input pattern 89 * segments. If there are no segments this is null. N.B. This is 90 * a UnicodeMatcher for generality, but in practice it is always a 91 * StringMatcher. In the future we may generalize this, but for 92 * now we sometimes cast down to StringMatcher. 93 */ 94 UnicodeMatcher[] segments; 95 96 /** 97 * The length of the string that must match before the key. If 98 * zero, then there is no matching requirement before the key. 99 * Substring [0,anteContextLength) of pattern is the anteContext. 100 */ 101 private int anteContextLength; 102 103 /** 104 * The length of the key. Substring [anteContextLength, 105 * anteContextLength + keyLength) is the key. 106 */ 107 private int keyLength; 108 109 /** 110 * Miscellaneous attributes. 111 */ 112 byte flags; 113 114 /** 115 * Flag attributes. 116 */ 117 static final int ANCHOR_START = 1; 118 static final int ANCHOR_END = 2; 119 120 /** 121 * An alias pointer to the data for this rule. The data provides 122 * lookup services for matchers and segments. 123 */ 124 private final RuleBasedTransliterator.Data data; 125 126 127 /** 128 * Construct a new rule with the given input, output text, and other 129 * attributes. A cursor position may be specified for the output text. 130 * @param input input string, including key and optional ante and 131 * post context 132 * @param anteContextPos offset into input to end of ante context, or -1 if 133 * none. Must be <= input.length() if not -1. 134 * @param postContextPos offset into input to start of post context, or -1 135 * if none. Must be <= input.length() if not -1, and must be >= 136 * anteContextPos. 137 * @param output output string 138 * @param cursorPos offset into output at which cursor is located, or -1 if 139 * none. If less than zero, then the cursor is placed after the 140 * <code>output</code>; that is, -1 is equivalent to 141 * <code>output.length()</code>. If greater than 142 * <code>output.length()</code> then an exception is thrown. 143 * @param cursorOffset an offset to be added to cursorPos to position the 144 * cursor either in the ante context, if < 0, or in the post context, if > 145 * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to 146 * "xyz" and moves the cursor to before "a". It would have a cursorOffset 147 * of -3. 148 * @param segs array of UnicodeMatcher corresponding to input pattern 149 * segments, or null if there are none 150 * @param anchorStart true if the the rule is anchored on the left to 151 * the context start 152 * @param anchorEnd true if the rule is anchored on the right to the 153 * context limit 154 */ TransliterationRule(String input, int anteContextPos, int postContextPos, String output, int cursorPos, int cursorOffset, UnicodeMatcher[] segs, boolean anchorStart, boolean anchorEnd, RuleBasedTransliterator.Data theData)155 public TransliterationRule(String input, 156 int anteContextPos, int postContextPos, 157 String output, 158 int cursorPos, int cursorOffset, 159 UnicodeMatcher[] segs, 160 boolean anchorStart, boolean anchorEnd, 161 RuleBasedTransliterator.Data theData) { 162 data = theData; 163 164 // Do range checks only when warranted to save time 165 if (anteContextPos < 0) { 166 anteContextLength = 0; 167 } else { 168 if (anteContextPos > input.length()) { 169 throw new IllegalArgumentException("Invalid ante context"); 170 } 171 anteContextLength = anteContextPos; 172 } 173 if (postContextPos < 0) { 174 keyLength = input.length() - anteContextLength; 175 } else { 176 if (postContextPos < anteContextLength || 177 postContextPos > input.length()) { 178 throw new IllegalArgumentException("Invalid post context"); 179 } 180 keyLength = postContextPos - anteContextLength; 181 } 182 if (cursorPos < 0) { 183 cursorPos = output.length(); 184 } else if (cursorPos > output.length()) { 185 throw new IllegalArgumentException("Invalid cursor position"); 186 } 187 188 // We don't validate the segments array. The caller must 189 // guarantee that the segments are well-formed (that is, that 190 // all $n references in the output refer to indices of this 191 // array, and that no array elements are null). 192 this.segments = segs; 193 194 pattern = input; 195 flags = 0; 196 if (anchorStart) { 197 flags |= ANCHOR_START; 198 } 199 if (anchorEnd) { 200 flags |= ANCHOR_END; 201 } 202 203 anteContext = null; 204 if (anteContextLength > 0) { 205 anteContext = new StringMatcher(pattern.substring(0, anteContextLength), 206 0, data); 207 } 208 209 key = null; 210 if (keyLength > 0) { 211 key = new StringMatcher(pattern.substring(anteContextLength, anteContextLength + keyLength), 212 0, data); 213 } 214 215 int postContextLength = pattern.length() - keyLength - anteContextLength; 216 postContext = null; 217 if (postContextLength > 0) { 218 postContext = new StringMatcher(pattern.substring(anteContextLength + keyLength), 219 0, data); 220 } 221 222 this.output = new StringReplacer(output, cursorPos + cursorOffset, data); 223 } 224 225 /** 226 * Return the preceding context length. This method is needed to 227 * support the <code>Transliterator</code> method 228 * <code>getMaximumContextLength()</code>. 229 */ getAnteContextLength()230 public int getAnteContextLength() { 231 return anteContextLength + (((flags & ANCHOR_START) != 0) ? 1 : 0); 232 } 233 234 /** 235 * Internal method. Returns 8-bit index value for this rule. 236 * This is the low byte of the first character of the key, 237 * unless the first character of the key is a set. If it's a 238 * set, or otherwise can match multiple keys, the index value is -1. 239 */ getIndexValue()240 final int getIndexValue() { 241 if (anteContextLength == pattern.length()) { 242 // A pattern with just ante context {such as foo)>bar} can 243 // match any key. 244 return -1; 245 } 246 int c = UTF16.charAt(pattern, anteContextLength); 247 return data.lookupMatcher(c) == null ? (c & 0xFF) : -1; 248 } 249 250 /** 251 * Internal method. Returns true if this rule matches the given 252 * index value. The index value is an 8-bit integer, 0..255, 253 * representing the low byte of the first character of the key. 254 * It matches this rule if it matches the first character of the 255 * key, or if the first character of the key is a set, and the set 256 * contains any character with a low byte equal to the index 257 * value. If the rule contains only ante context, as in foo)>bar, 258 * then it will match any key. 259 */ matchesIndexValue(int v)260 final boolean matchesIndexValue(int v) { 261 // Delegate to the key, or if there is none, to the postContext. 262 // If there is neither then we match any key; return true. 263 UnicodeMatcher m = (key != null) ? key : postContext; 264 return (m != null) ? m.matchesIndexValue(v) : true; 265 } 266 267 /** 268 * Return true if this rule masks another rule. If r1 masks r2 then 269 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 270 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 271 * "[c]a>x" masks "[dc]a>y". 272 */ masks(TransliterationRule r2)273 public boolean masks(TransliterationRule r2) { 274 /* Rule r1 masks rule r2 if the string formed of the 275 * antecontext, key, and postcontext overlaps in the following 276 * way: 277 * 278 * r1: aakkkpppp 279 * r2: aaakkkkkpppp 280 * ^ 281 * 282 * The strings must be aligned at the first character of the 283 * key. The length of r1 to the left of the alignment point 284 * must be <= the length of r2 to the left; ditto for the 285 * right. The characters of r1 must equal (or be a superset 286 * of) the corresponding characters of r2. The superset 287 * operation should be performed to check for UnicodeSet 288 * masking. 289 * 290 * Anchors: Two patterns that differ only in anchors only 291 * mask one another if they are exactly equal, and r2 has 292 * all the anchors r1 has (optionally, plus some). Here Y 293 * means the row masks the column, N means it doesn't. 294 * 295 * ab ^ab ab$ ^ab$ 296 * ab Y Y Y Y 297 * ^ab N Y N Y 298 * ab$ N N Y Y 299 * ^ab$ N N N Y 300 * 301 * Post context: {a}b masks ab, but not vice versa, since {a}b 302 * matches everything ab matches, and {a}b matches {|a|}b but ab 303 * does not. Pre context is different (a{b} does not align with 304 * ab). 305 */ 306 307 /* LIMITATION of the current mask algorithm: Some rule 308 * maskings are currently not detected. For example, 309 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 310 */ 311 312 int len = pattern.length(); 313 int left = anteContextLength; 314 int left2 = r2.anteContextLength; 315 int right = pattern.length() - left; 316 int right2 = r2.pattern.length() - left2; 317 318 // TODO Clean this up -- some logic might be combinable with the 319 // next statement. 320 321 // Test for anchor masking 322 if (left == left2 && right == right2 && 323 keyLength <= r2.keyLength && 324 r2.pattern.regionMatches(0, pattern, 0, len)) { 325 // The following boolean logic implements the table above 326 return (flags == r2.flags) || 327 (!((flags & ANCHOR_START) != 0) && !((flags & ANCHOR_END) != 0)) || 328 (((r2.flags & ANCHOR_START) != 0) && ((r2.flags & ANCHOR_END) != 0)); 329 } 330 331 return left <= left2 && 332 (right < right2 || 333 (right == right2 && keyLength <= r2.keyLength)) && 334 r2.pattern.regionMatches(left2 - left, pattern, 0, len); 335 } 336 posBefore(Replaceable str, int pos)337 static final int posBefore(Replaceable str, int pos) { 338 return (pos > 0) ? 339 pos - UTF16.getCharCount(str.char32At(pos-1)) : 340 pos - 1; 341 } 342 posAfter(Replaceable str, int pos)343 static final int posAfter(Replaceable str, int pos) { 344 return (pos >= 0 && pos < str.length()) ? 345 pos + UTF16.getCharCount(str.char32At(pos)) : 346 pos + 1; 347 } 348 349 /** 350 * Attempt a match and replacement at the given position. Return 351 * the degree of match between this rule and the given text. The 352 * degree of match may be mismatch, a partial match, or a full 353 * match. A mismatch means at least one character of the text 354 * does not match the context or key. A partial match means some 355 * context and key characters match, but the text is not long 356 * enough to match all of them. A full match means all context 357 * and key characters match. 358 * 359 * If a full match is obtained, perform a replacement, update pos, 360 * and return U_MATCH. Otherwise both text and pos are unchanged. 361 * 362 * @param text the text 363 * @param pos the position indices 364 * @param incremental if true, test for partial matches that may 365 * be completed by additional text inserted at pos.limit. 366 * @return one of <code>U_MISMATCH</code>, 367 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 368 * incremental is false then U_PARTIAL_MATCH will not be returned. 369 */ matchAndReplace(Replaceable text, Transliterator.Position pos, boolean incremental)370 public int matchAndReplace(Replaceable text, 371 Transliterator.Position pos, 372 boolean incremental) { 373 // Matching and replacing are done in one method because the 374 // replacement operation needs information obtained during the 375 // match. Another way to do this is to have the match method 376 // create a match result struct with relevant offsets, and to pass 377 // this into the replace method. 378 379 // ============================ MATCH =========================== 380 381 // Reset segment match data 382 if (segments != null) { 383 for (int i=0; i<segments.length; ++i) { 384 ((StringMatcher) segments[i]).resetMatch(); 385 } 386 } 387 388 int keyLimit; 389 int[] intRef = new int[1]; 390 391 // ------------------------ Ante Context ------------------------ 392 393 // A mismatch in the ante context, or with the start anchor, 394 // is an outright U_MISMATCH regardless of whether we are 395 // incremental or not. 396 int oText; // offset into 'text' 397 int minOText; 398 399 // Note (1): We process text in 16-bit code units, rather than 400 // 32-bit code points. This works because stand-ins are 401 // always in the BMP and because we are doing a literal match 402 // operation, which can be done 16-bits at a time. 403 404 int anteLimit = posBefore(text, pos.contextStart); 405 406 int match; 407 408 // Start reverse match at char before pos.start 409 intRef[0] = posBefore(text, pos.start); 410 411 if (anteContext != null) { 412 match = anteContext.matches(text, intRef, anteLimit, false); 413 if (match != UnicodeMatcher.U_MATCH) { 414 return UnicodeMatcher.U_MISMATCH; 415 } 416 } 417 418 oText = intRef[0]; 419 420 minOText = posAfter(text, oText); 421 422 // ------------------------ Start Anchor ------------------------ 423 424 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 425 return UnicodeMatcher.U_MISMATCH; 426 } 427 428 // -------------------- Key and Post Context -------------------- 429 430 intRef[0] = pos.start; 431 432 if (key != null) { 433 match = key.matches(text, intRef, pos.limit, incremental); 434 if (match != UnicodeMatcher.U_MATCH) { 435 return match; 436 } 437 } 438 439 keyLimit = intRef[0]; 440 441 if (postContext != null) { 442 if (incremental && keyLimit == pos.limit) { 443 // The key matches just before pos.limit, and there is 444 // a postContext. Since we are in incremental mode, 445 // we must assume more characters may be inserted at 446 // pos.limit -- this is a partial match. 447 return UnicodeMatcher.U_PARTIAL_MATCH; 448 } 449 450 match = postContext.matches(text, intRef, pos.contextLimit, incremental); 451 if (match != UnicodeMatcher.U_MATCH) { 452 return match; 453 } 454 } 455 456 oText = intRef[0]; 457 458 // ------------------------- Stop Anchor ------------------------ 459 460 if (((flags & ANCHOR_END)) != 0) { 461 if (oText != pos.contextLimit) { 462 return UnicodeMatcher.U_MISMATCH; 463 } 464 if (incremental) { 465 return UnicodeMatcher.U_PARTIAL_MATCH; 466 } 467 } 468 469 // =========================== REPLACE ========================== 470 471 // We have a full match. The key is between pos.start and 472 // keyLimit. 473 474 int newLength = output.replace(text, pos.start, keyLimit, intRef); 475 int lenDelta = newLength - (keyLimit - pos.start); 476 int newStart = intRef[0]; 477 478 oText += lenDelta; 479 pos.limit += lenDelta; 480 pos.contextLimit += lenDelta; 481 // Restrict new value of start to [minOText, min(oText, pos.limit)]. 482 pos.start = Math.max(minOText, Math.min(Math.min(oText, pos.limit), newStart)); 483 return UnicodeMatcher.U_MATCH; 484 } 485 486 /** 487 * Create a source string that represents this rule. Append it to the 488 * given string. 489 */ toRule(boolean escapeUnprintable)490 public String toRule(boolean escapeUnprintable) { 491 // int i; 492 493 StringBuffer rule = new StringBuffer(); 494 495 // Accumulate special characters (and non-specials following them) 496 // into quoteBuf. Append quoteBuf, within single quotes, when 497 // a non-quoted element must be inserted. 498 StringBuffer quoteBuf = new StringBuffer(); 499 500 // Do not emit the braces '{' '}' around the pattern if there 501 // is neither anteContext nor postContext. 502 boolean emitBraces = 503 (anteContext != null) || (postContext != null); 504 505 // Emit start anchor 506 if ((flags & ANCHOR_START) != 0) { 507 rule.append('^'); 508 } 509 510 // Emit the input pattern 511 Utility.appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 512 513 if (emitBraces) { 514 Utility.appendToRule(rule, '{', true, escapeUnprintable, quoteBuf); 515 } 516 517 Utility.appendToRule(rule, key, escapeUnprintable, quoteBuf); 518 519 if (emitBraces) { 520 Utility.appendToRule(rule, '}', true, escapeUnprintable, quoteBuf); 521 } 522 523 Utility.appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 524 525 // Emit end anchor 526 if ((flags & ANCHOR_END) != 0) { 527 rule.append('$'); 528 } 529 530 Utility.appendToRule(rule, " > ", true, escapeUnprintable, quoteBuf); 531 532 // Emit the output pattern 533 534 Utility.appendToRule(rule, output.toReplacerPattern(escapeUnprintable), 535 true, escapeUnprintable, quoteBuf); 536 537 Utility.appendToRule(rule, ';', true, escapeUnprintable, quoteBuf); 538 539 return rule.toString(); 540 } 541 542 /** 543 * Return a string representation of this object. 544 * @return string representation of this object 545 */ 546 @Override toString()547 public String toString() { 548 return '{' + toRule(true) + '}'; 549 } 550 551 /** 552 * Find the source and target sets, subject to the input filter. 553 * There is a known issue with filters containing multiple characters. 554 */ 555 // TODO: Problem: the rule is [{ab}]c > x 556 // The filter is [a{bc}]. 557 // If the input is abc, then the rule will work. 558 // However, following code applying the filter won't catch that case. 559 addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)560 void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting) { 561 int limit = anteContextLength + keyLength; 562 UnicodeSet tempSource = new UnicodeSet(); 563 UnicodeSet temp = new UnicodeSet(); 564 565 // We need to walk through the pattern. 566 // Iff some of the characters at ALL of the the positions are matched by the filter, then we add temp to toUnionTo 567 for (int i=anteContextLength; i<limit; ) { 568 int ch = UTF16.charAt(pattern, i); 569 i += UTF16.getCharCount(ch); 570 UnicodeMatcher matcher = data.lookupMatcher(ch); 571 if (matcher == null) { 572 if (!filter.contains(ch)) { 573 return; 574 } 575 tempSource.add(ch); 576 } else { 577 try { 578 if (!filter.containsSome((UnicodeSet) matcher)) { 579 return; 580 } 581 matcher.addMatchSetTo(tempSource); 582 } catch (ClassCastException e) { // if the matcher is not a UnicodeSet 583 temp.clear(); 584 matcher.addMatchSetTo(temp); 585 if (!filter.containsSome(temp)) { 586 return; 587 } 588 tempSource.addAll(temp); 589 } 590 } 591 } 592 // if we made our way through the gauntlet, add to source/target 593 sourceSet.addAll(tempSource); 594 output.addReplacementSetTo(targetSet); 595 } 596 } 597