1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package com.ibm.icu.impl; 4 5 import java.io.IOException; 6 import java.text.CharacterIterator; 7 import java.util.Locale; 8 9 import com.ibm.icu.lang.UCharacter; 10 import com.ibm.icu.lang.UCharacterCategory; 11 import com.ibm.icu.text.BreakIterator; 12 import com.ibm.icu.text.Edits; 13 import com.ibm.icu.util.ICUUncheckedIOException; 14 import com.ibm.icu.util.ULocale; 15 16 public final class CaseMapImpl { 17 /** 18 * Implementation of UCaseProps.ContextIterator, iterates over a String. 19 * See ustrcase.c/utf16_caseContextIterator(). 20 */ 21 public static final class StringContextIterator implements UCaseProps.ContextIterator { 22 /** 23 * Constructor. 24 * @param src String to iterate over. 25 */ StringContextIterator(CharSequence src)26 public StringContextIterator(CharSequence src) { 27 this.s=src; 28 limit=src.length(); 29 cpStart=cpLimit=index=0; 30 dir=0; 31 } 32 33 /** 34 * Constructor. 35 * @param src String to iterate over. 36 * @param cpStart Start index of the current code point. 37 * @param cpLimit Limit index of the current code point. 38 */ StringContextIterator(CharSequence src, int cpStart, int cpLimit)39 public StringContextIterator(CharSequence src, int cpStart, int cpLimit) { 40 s = src; 41 index = 0; 42 limit = src.length(); 43 this.cpStart = cpStart; 44 this.cpLimit = cpLimit; 45 dir = 0; 46 } 47 48 /** 49 * Set the iteration limit for nextCaseMapCP() to an index within the string. 50 * If the limit parameter is negative or past the string, then the 51 * string length is restored as the iteration limit. 52 * 53 * <p>This limit does not affect the next() function which always 54 * iterates to the very end of the string. 55 * 56 * @param lim The iteration limit. 
57 */ setLimit(int lim)58 public void setLimit(int lim) { 59 if(0<=lim && lim<=s.length()) { 60 limit=lim; 61 } else { 62 limit=s.length(); 63 } 64 } 65 66 /** 67 * Move to the iteration limit without fetching code points up to there. 68 */ moveToLimit()69 public void moveToLimit() { 70 cpStart=cpLimit=limit; 71 } 72 73 /** 74 * Iterate forward through the string to fetch the next code point 75 * to be case-mapped, and set the context indexes for it. 76 * 77 * <p>When the iteration limit is reached (and -1 is returned), 78 * getCPStart() will be at the iteration limit. 79 * 80 * <p>Iteration with next() does not affect the position for nextCaseMapCP(). 81 * 82 * @return The next code point to be case-mapped, or <0 when the iteration is done. 83 */ nextCaseMapCP()84 public int nextCaseMapCP() { 85 cpStart=cpLimit; 86 if(cpLimit<limit) { 87 int c=Character.codePointAt(s, cpLimit); 88 cpLimit+=Character.charCount(c); 89 return c; 90 } else { 91 return -1; 92 } 93 } 94 setCPStartAndLimit(int s, int l)95 public void setCPStartAndLimit(int s, int l) { 96 cpStart = s; 97 cpLimit = l; 98 dir = 0; 99 } 100 /** 101 * Returns the start of the code point that was last returned 102 * by nextCaseMapCP(). 103 */ getCPStart()104 public int getCPStart() { 105 return cpStart; 106 } 107 108 /** 109 * Returns the limit of the code point that was last returned 110 * by nextCaseMapCP(). 
111 */ getCPLimit()112 public int getCPLimit() { 113 return cpLimit; 114 } 115 getCPLength()116 public int getCPLength() { 117 return cpLimit-cpStart; 118 } 119 120 // implement UCaseProps.ContextIterator 121 // The following code is not used anywhere in this private class 122 @Override reset(int direction)123 public void reset(int direction) { 124 if(direction>0) { 125 /* reset for forward iteration */ 126 dir=1; 127 index=cpLimit; 128 } else if(direction<0) { 129 /* reset for backward iteration */ 130 dir=-1; 131 index=cpStart; 132 } else { 133 // not a valid direction 134 dir=0; 135 index=0; 136 } 137 } 138 139 @Override next()140 public int next() { 141 int c; 142 143 if(dir>0 && index<s.length()) { 144 c=Character.codePointAt(s, index); 145 index+=Character.charCount(c); 146 return c; 147 } else if(dir<0 && index>0) { 148 c=Character.codePointBefore(s, index); 149 index-=Character.charCount(c); 150 return c; 151 } 152 return -1; 153 } 154 155 // variables 156 protected CharSequence s; 157 protected int index, limit, cpStart, cpLimit; 158 protected int dir; // 0=initial state >0=forward <0=backward 159 } 160 161 public static final int TITLECASE_WHOLE_STRING = 0x20; 162 public static final int TITLECASE_SENTENCES = 0x40; 163 164 /** 165 * Bit mask for the titlecasing iterator options bit field. 166 * Currently only 3 out of 8 values are used: 167 * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES. 168 * See stringoptions.h. 169 * @internal 170 */ 171 private static final int TITLECASE_ITERATOR_MASK = 0xe0; 172 173 public static final int TITLECASE_ADJUST_TO_CASED = 0x400; 174 175 /** 176 * Bit mask for the titlecasing index adjustment options bit set. 177 * Currently two bits are defined: 178 * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED. 179 * See stringoptions.h. 
180 * @internal 181 */ 182 private static final int TITLECASE_ADJUSTMENT_MASK = 0x600; 183 addTitleAdjustmentOption(int options, int newOption)184 public static int addTitleAdjustmentOption(int options, int newOption) { 185 int adjOptions = options & TITLECASE_ADJUSTMENT_MASK; 186 if (adjOptions !=0 && adjOptions != newOption) { 187 throw new IllegalArgumentException("multiple titlecasing index adjustment options"); 188 } 189 return options | newOption; 190 } 191 192 private static final int LNS = 193 (1 << UCharacterCategory.UPPERCASE_LETTER) | 194 (1 << UCharacterCategory.LOWERCASE_LETTER) | 195 (1 << UCharacterCategory.TITLECASE_LETTER) | 196 // Not MODIFIER_LETTER: We count only cased modifier letters. 197 (1 << UCharacterCategory.OTHER_LETTER) | 198 199 (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | 200 (1 << UCharacterCategory.LETTER_NUMBER) | 201 (1 << UCharacterCategory.OTHER_NUMBER) | 202 203 (1 << UCharacterCategory.MATH_SYMBOL) | 204 (1 << UCharacterCategory.CURRENCY_SYMBOL) | 205 (1 << UCharacterCategory.MODIFIER_SYMBOL) | 206 (1 << UCharacterCategory.OTHER_SYMBOL) | 207 208 (1 << UCharacterCategory.PRIVATE_USE); 209 isLNS(int c)210 private static boolean isLNS(int c) { 211 // Letter, number, symbol, 212 // or a private use code point because those are typically used as letters or numbers. 213 // Consider modifier letters only if they are cased. 
214 int gc = UCharacterProperty.INSTANCE.getType(c); 215 return ((1 << gc) & LNS) != 0 || 216 (gc == UCharacterCategory.MODIFIER_LETTER && 217 UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE); 218 } 219 addTitleIteratorOption(int options, int newOption)220 public static int addTitleIteratorOption(int options, int newOption) { 221 int iterOptions = options & TITLECASE_ITERATOR_MASK; 222 if (iterOptions !=0 && iterOptions != newOption) { 223 throw new IllegalArgumentException("multiple titlecasing iterator options"); 224 } 225 return options | newOption; 226 } 227 getTitleBreakIterator( Locale locale, int options, BreakIterator iter)228 public static BreakIterator getTitleBreakIterator( 229 Locale locale, int options, BreakIterator iter) { 230 options &= TITLECASE_ITERATOR_MASK; 231 if (options != 0 && iter != null) { 232 throw new IllegalArgumentException( 233 "titlecasing iterator option together with an explicit iterator"); 234 } 235 if (iter == null) { 236 switch (options) { 237 case 0: 238 iter = BreakIterator.getWordInstance(locale); 239 break; 240 case TITLECASE_WHOLE_STRING: 241 iter = new WholeStringBreakIterator(); 242 break; 243 case TITLECASE_SENTENCES: 244 iter = BreakIterator.getSentenceInstance(locale); 245 break; 246 default: 247 throw new IllegalArgumentException("unknown titlecasing iterator option"); 248 } 249 } 250 return iter; 251 } 252 getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)253 public static BreakIterator getTitleBreakIterator( 254 ULocale locale, int options, BreakIterator iter) { 255 options &= TITLECASE_ITERATOR_MASK; 256 if (options != 0 && iter != null) { 257 throw new IllegalArgumentException( 258 "titlecasing iterator option together with an explicit iterator"); 259 } 260 if (iter == null) { 261 switch (options) { 262 case 0: 263 iter = BreakIterator.getWordInstance(locale); 264 break; 265 case TITLECASE_WHOLE_STRING: 266 iter = new WholeStringBreakIterator(); 267 break; 268 case TITLECASE_SENTENCES: 269 
iter = BreakIterator.getSentenceInstance(locale); 270 break; 271 default: 272 throw new IllegalArgumentException("unknown titlecasing iterator option"); 273 } 274 } 275 return iter; 276 } 277 278 /** 279 * Omit unchanged text when case-mapping with Edits. 280 */ 281 public static final int OMIT_UNCHANGED_TEXT = 0x4000; 282 283 private static final class WholeStringBreakIterator extends BreakIterator { 284 private int length; 285 notImplemented()286 private static void notImplemented() { 287 throw new UnsupportedOperationException("should not occur"); 288 } 289 290 @Override first()291 public int first() { 292 return 0; 293 } 294 295 @Override last()296 public int last() { 297 notImplemented(); 298 return 0; 299 } 300 301 @Override next(int n)302 public int next(int n) { 303 notImplemented(); 304 return 0; 305 } 306 307 @Override next()308 public int next() { 309 return length; 310 } 311 312 @Override previous()313 public int previous() { 314 notImplemented(); 315 return 0; 316 } 317 318 @Override following(int offset)319 public int following(int offset) { 320 notImplemented(); 321 return 0; 322 } 323 324 @Override current()325 public int current() { 326 notImplemented(); 327 return 0; 328 } 329 330 @Override getText()331 public CharacterIterator getText() { 332 notImplemented(); 333 return null; 334 } 335 336 @Override setText(CharacterIterator newText)337 public void setText(CharacterIterator newText) { 338 length = newText.getEndIndex(); 339 } 340 341 @Override setText(CharSequence newText)342 public void setText(CharSequence newText) { 343 length = newText.length(); 344 } 345 346 @Override setText(String newText)347 public void setText(String newText) { 348 length = newText.length(); 349 } 350 } 351 appendCodePoint(Appendable a, int c)352 private static int appendCodePoint(Appendable a, int c) throws IOException { 353 if (c <= Character.MAX_VALUE) { 354 a.append((char)c); 355 return 1; 356 } else { 357 a.append((char)(0xd7c0 + (c >> 10))); 358 
a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); 359 return 2; 360 } 361 } 362 363 /** 364 * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. 365 * @throws IOException 366 */ appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)367 private static void appendResult(int result, Appendable dest, 368 int cpLength, int options, Edits edits) throws IOException { 369 // Decode the result. 370 if (result < 0) { 371 // (not) original code point 372 if (edits != null) { 373 edits.addUnchanged(cpLength); 374 } 375 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 376 return; 377 } 378 appendCodePoint(dest, ~result); 379 } else if (result <= UCaseProps.MAX_STRING_LENGTH) { 380 // The mapping has already been appended to result. 381 if (edits != null) { 382 edits.addReplace(cpLength, result); 383 } 384 } else { 385 // Append the single-code point mapping. 386 int length = appendCodePoint(dest, result); 387 if (edits != null) { 388 edits.addReplace(cpLength, length); 389 } 390 } 391 } 392 appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)393 private static final void appendUnchanged(CharSequence src, int start, int length, 394 Appendable dest, int options, Edits edits) throws IOException { 395 if (length > 0) { 396 if (edits != null) { 397 edits.addUnchanged(length); 398 } 399 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 400 return; 401 } 402 dest.append(src, start, start + length); 403 } 404 } 405 applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)406 private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) { 407 if (!edits.hasChanges()) { 408 return src.toString(); 409 } 410 StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta()); 411 for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { 412 if (ei.hasChange()) { 413 int i = ei.replacementIndex(); 414 
result.append(replacementChars, i, i + ei.newLength()); 415 } else { 416 int i = ei.sourceIndex(); 417 result.append(src, i, i + ei.oldLength()); 418 } 419 } 420 return result.toString(); 421 } 422 423 private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie(); 424 425 /** 426 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. 427 * caseLocale < 0: Case-folds [srcStart..srcLimit[. 428 */ internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)429 private static void internalToLower(int caseLocale, int options, 430 CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, 431 Appendable dest, Edits edits) throws IOException { 432 byte[] latinToLower; 433 if (caseLocale == UCaseProps.LOC_ROOT || 434 (caseLocale >= 0 ? 435 !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) : 436 (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) { 437 latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL; 438 } else { 439 latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT; 440 } 441 int prev = srcStart; 442 int srcIndex = srcStart; 443 outerLoop: 444 for (;;) { 445 // fast path for simple cases 446 char lead; 447 for (;;) { 448 if (srcIndex >= srcLimit) { 449 break outerLoop; 450 } 451 lead = src.charAt(srcIndex); 452 int delta; 453 if (lead < UCaseProps.LatinCase.LONG_S) { 454 byte d = latinToLower[lead]; 455 if (d == UCaseProps.LatinCase.EXC) { break; } 456 ++srcIndex; 457 if (d == 0) { continue; } 458 delta = d; 459 } else if (lead >= 0xd800) { 460 break; // surrogate or higher 461 } else { 462 int props = CASE_TRIE.getFromU16SingleLead(lead); 463 if (UCaseProps.propsHasException(props)) { break; } 464 ++srcIndex; 465 if (!UCaseProps.isUpperOrTitleFromProps(props) || 466 (delta = UCaseProps.getDelta(props)) == 0) { 467 continue; 468 } 469 } 470 lead += delta; 471 
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 472 dest.append(lead); 473 if (edits != null) { 474 edits.addReplace(1, 1); 475 } 476 prev = srcIndex; 477 } 478 // slow path 479 int cpStart = srcIndex++; 480 char trail; 481 int c; 482 if (Character.isHighSurrogate(lead) && srcIndex < srcLimit && 483 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 484 c = Character.toCodePoint(lead, trail); 485 ++srcIndex; 486 } else { 487 c = lead; 488 } 489 // We need to append unchanged text before calling the UCaseProps.toFullXyz() methods 490 // because they will sometimes append their mapping to dest, 491 // and that must be after copying the previous text. 492 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 493 prev = cpStart; 494 if (caseLocale >= 0) { 495 if (iter == null) { 496 iter = new StringContextIterator(src, cpStart, srcIndex); 497 } else { 498 iter.setCPStartAndLimit(cpStart, srcIndex); 499 } 500 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); 501 } else { 502 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); 503 } 504 if (c >= 0) { 505 appendResult(c, dest, srcIndex - cpStart, options, edits); 506 prev = srcIndex; 507 } 508 } 509 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 510 } 511 internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)512 private static void internalToUpper(int caseLocale, int options, 513 CharSequence src, Appendable dest, Edits edits) throws IOException { 514 StringContextIterator iter = null; 515 byte[] latinToUpper; 516 if (caseLocale == UCaseProps.LOC_TURKISH) { 517 latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR; 518 } else { 519 latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL; 520 } 521 int prev = 0; 522 int srcIndex = 0; 523 int srcLength = src.length(); 524 outerLoop: 525 for (;;) { 526 // fast path for simple cases 527 char lead; 528 for (;;) { 529 if (srcIndex >= srcLength) { 530 break outerLoop; 
531 } 532 lead = src.charAt(srcIndex); 533 int delta; 534 if (lead < UCaseProps.LatinCase.LONG_S) { 535 byte d = latinToUpper[lead]; 536 if (d == UCaseProps.LatinCase.EXC) { break; } 537 ++srcIndex; 538 if (d == 0) { continue; } 539 delta = d; 540 } else if (lead >= 0xd800) { 541 break; // surrogate or higher 542 } else { 543 int props = CASE_TRIE.getFromU16SingleLead(lead); 544 if (UCaseProps.propsHasException(props)) { break; } 545 ++srcIndex; 546 if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER || 547 (delta = UCaseProps.getDelta(props)) == 0) { 548 continue; 549 } 550 } 551 lead += delta; 552 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 553 dest.append(lead); 554 if (edits != null) { 555 edits.addReplace(1, 1); 556 } 557 prev = srcIndex; 558 } 559 // slow path 560 int cpStart = srcIndex++; 561 char trail; 562 int c; 563 if (Character.isHighSurrogate(lead) && srcIndex < srcLength && 564 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 565 c = Character.toCodePoint(lead, trail); 566 ++srcIndex; 567 } else { 568 c = lead; 569 } 570 if (iter == null) { 571 iter = new StringContextIterator(src, cpStart, srcIndex); 572 } else { 573 iter.setCPStartAndLimit(cpStart, srcIndex); 574 } 575 // We need to append unchanged text before calling UCaseProps.toFullUpper() 576 // because it will sometimes append its mapping to dest, 577 // and that must be after copying the previous text. 
578 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 579 prev = cpStart; 580 c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); 581 if (c >= 0) { 582 appendResult(c, dest, srcIndex - cpStart, options, edits); 583 prev = srcIndex; 584 } 585 } 586 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 587 } 588 toLower(int caseLocale, int options, CharSequence src)589 public static String toLower(int caseLocale, int options, CharSequence src) { 590 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 591 if (src.length() == 0) { 592 return src.toString(); 593 } 594 // Collect and apply only changes. 595 // Good if no or few changes. Bad (slow) if many changes. 596 Edits edits = new Edits(); 597 StringBuilder replacementChars = toLower( 598 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 599 return applyEdits(src, replacementChars, edits); 600 } else { 601 return toLower(caseLocale, options, src, 602 new StringBuilder(src.length()), null).toString(); 603 } 604 } 605 toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)606 public static <A extends Appendable> A toLower(int caseLocale, int options, 607 CharSequence src, A dest, Edits edits) { 608 try { 609 if (edits != null) { 610 edits.reset(); 611 } 612 internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits); 613 return dest; 614 } catch (IOException e) { 615 throw new ICUUncheckedIOException(e); 616 } 617 } 618 toUpper(int caseLocale, int options, CharSequence src)619 public static String toUpper(int caseLocale, int options, CharSequence src) { 620 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 621 if (src.length() == 0) { 622 return src.toString(); 623 } 624 // Collect and apply only changes. 625 // Good if no or few changes. Bad (slow) if many changes. 
626 Edits edits = new Edits(); 627 StringBuilder replacementChars = toUpper( 628 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 629 return applyEdits(src, replacementChars, edits); 630 } else { 631 return toUpper(caseLocale, options, src, 632 new StringBuilder(src.length()), null).toString(); 633 } 634 } 635 toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)636 public static <A extends Appendable> A toUpper(int caseLocale, int options, 637 CharSequence src, A dest, Edits edits) { 638 try { 639 if (edits != null) { 640 edits.reset(); 641 } 642 if (caseLocale == UCaseProps.LOC_GREEK) { 643 return GreekUpper.toUpper(options, src, dest, edits); 644 } 645 internalToUpper(caseLocale, options, src, dest, edits); 646 return dest; 647 } catch (IOException e) { 648 throw new ICUUncheckedIOException(e); 649 } 650 } 651 toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)652 public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) { 653 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 654 if (src.length() == 0) { 655 return src.toString(); 656 } 657 // Collect and apply only changes. 658 // Good if no or few changes. Bad (slow) if many changes. 
659 Edits edits = new Edits(); 660 StringBuilder replacementChars = toTitle( 661 caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src, 662 new StringBuilder(), edits); 663 return applyEdits(src, replacementChars, edits); 664 } else { 665 return toTitle(caseLocale, options, iter, src, 666 new StringBuilder(src.length()), null).toString(); 667 } 668 } 669 toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)670 public static <A extends Appendable> A toTitle( 671 int caseLocale, int options, BreakIterator titleIter, 672 CharSequence src, A dest, Edits edits) { 673 try { 674 if (edits != null) { 675 edits.reset(); 676 } 677 678 /* set up local variables */ 679 StringContextIterator iter = new StringContextIterator(src); 680 int srcLength = src.length(); 681 int prev=0; 682 boolean isFirstIndex=true; 683 684 /* titlecasing loop */ 685 while(prev<srcLength) { 686 /* find next index where to titlecase */ 687 int index; 688 if(isFirstIndex) { 689 isFirstIndex=false; 690 index=titleIter.first(); 691 } else { 692 index=titleIter.next(); 693 } 694 if(index==BreakIterator.DONE || index>srcLength) { 695 index=srcLength; 696 } 697 698 /* 699 * Segment [prev..index[ into 3 parts: 700 * a) skipped characters (copy as-is) [prev..titleStart[ 701 * b) first letter (titlecase) [titleStart..titleLimit[ 702 * c) subsequent characters (lowercase) [titleLimit..index[ 703 */ 704 if(prev<index) { 705 // Find and copy skipped characters [prev..titleStart[ 706 int titleStart=prev; 707 iter.setLimit(index); 708 int c=iter.nextCaseMapCP(); 709 if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) { 710 // Adjust the titlecasing index to the next cased character, 711 // or to the next letter/number/symbol/private use. 712 // Stop with titleStart<titleLimit<=index 713 // if there is a character to be titlecased, 714 // or else stop with titleStart==titleLimit==index. 
715 boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0; 716 while ((toCased ? 717 UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) : 718 !CaseMapImpl.isLNS(c)) && 719 (c=iter.nextCaseMapCP())>=0) {} 720 // If c<0 then we have only uncased characters in [prev..index[ 721 // and stopped with titleStart==titleLimit==index. 722 titleStart=iter.getCPStart(); 723 if (prev < titleStart) { 724 appendUnchanged(src, prev, titleStart-prev, dest, options, edits); 725 } 726 } 727 728 if(titleStart<index) { 729 int titleLimit=iter.getCPLimit(); 730 // titlecase c which is from [titleStart..titleLimit[ 731 c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale); 732 appendResult(c, dest, iter.getCPLength(), options, edits); 733 734 // Special case Dutch IJ titlecasing 735 if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) { 736 char c1 = src.charAt(titleStart); 737 if ((c1 == 'i' || c1 == 'I')) { 738 char c2 = src.charAt(titleStart+1); 739 if (c2 == 'j') { 740 dest.append('J'); 741 if (edits != null) { 742 edits.addReplace(1, 1); 743 } 744 c = iter.nextCaseMapCP(); 745 titleLimit++; 746 assert c == c2; 747 assert titleLimit == iter.getCPLimit(); 748 } else if (c2 == 'J') { 749 // Keep the capital J from getting lowercased. 750 appendUnchanged(src, titleStart + 1, 1, dest, options, edits); 751 c = iter.nextCaseMapCP(); 752 titleLimit++; 753 assert c == c2; 754 assert titleLimit == iter.getCPLimit(); 755 } 756 } 757 } 758 759 // lowercase [titleLimit..index[ 760 if(titleLimit<index) { 761 if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) { 762 // Normal operation: Lowercase the rest of the word. 763 internalToLower(caseLocale, options, 764 src, titleLimit, index, iter, dest, edits); 765 } else { 766 // Optionally just copy the rest of the word unchanged. 
767 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits); 768 } 769 iter.moveToLimit(); 770 } 771 } 772 } 773 774 prev=index; 775 } 776 return dest; 777 } catch (IOException e) { 778 throw new ICUUncheckedIOException(e); 779 } 780 } 781 fold(int options, CharSequence src)782 public static String fold(int options, CharSequence src) { 783 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 784 if (src.length() == 0) { 785 return src.toString(); 786 } 787 // Collect and apply only changes. 788 // Good if no or few changes. Bad (slow) if many changes. 789 Edits edits = new Edits(); 790 StringBuilder replacementChars = fold( 791 options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 792 return applyEdits(src, replacementChars, edits); 793 } else { 794 return fold(options, src, new StringBuilder(src.length()), null).toString(); 795 } 796 } 797 fold(int options, CharSequence src, A dest, Edits edits)798 public static <A extends Appendable> A fold(int options, 799 CharSequence src, A dest, Edits edits) { 800 try { 801 if (edits != null) { 802 edits.reset(); 803 } 804 internalToLower(-1, options, src, 0, src.length(), null, dest, edits); 805 return dest; 806 } catch (IOException e) { 807 throw new ICUUncheckedIOException(e); 808 } 809 } 810 811 private static final class GreekUpper { 812 // Data bits. 813 private static final int UPPER_MASK = 0x3ff; 814 private static final int HAS_VOWEL = 0x1000; 815 private static final int HAS_YPOGEGRAMMENI = 0x2000; 816 private static final int HAS_ACCENT = 0x4000; 817 private static final int HAS_DIALYTIKA = 0x8000; 818 // Further bits during data building and processing, not stored in the data map. 
819 private static final int HAS_COMBINING_DIALYTIKA = 0x10000; 820 private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000; 821 822 private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT; 823 private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA = 824 HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA; 825 private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA; 826 827 // State bits. 828 private static final int AFTER_CASED = 1; 829 private static final int AFTER_VOWEL_WITH_ACCENT = 2; 830 831 // Data generated by prototype code, see 832 // http://site.icu-project.org/design/case/greek-upper 833 // TODO: Move this data into ucase.icu. 834 private static final char[] data0370 = { 835 // U+0370..03FF 836 0x0370, // Ͱ 837 0x0370, // ͱ 838 0x0372, // Ͳ 839 0x0372, // ͳ 840 0, 841 0, 842 0x0376, // Ͷ 843 0x0376, // ͷ 844 0, 845 0, 846 0x037A, // ͺ 847 0x03FD, // ͻ 848 0x03FE, // ͼ 849 0x03FF, // ͽ 850 0, 851 0x037F, // Ϳ 852 0, 853 0, 854 0, 855 0, 856 0, 857 0, 858 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 859 0, 860 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 861 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 862 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 863 0, 864 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 865 0, 866 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 867 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 868 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 869 0x0391 | HAS_VOWEL, // Α 870 0x0392, // Β 871 0x0393, // Γ 872 0x0394, // Δ 873 0x0395 | HAS_VOWEL, // Ε 874 0x0396, // Ζ 875 0x0397 | HAS_VOWEL, // Η 876 0x0398, // Θ 877 0x0399 | HAS_VOWEL, // Ι 878 0x039A, // Κ 879 0x039B, // Λ 880 0x039C, // Μ 881 0x039D, // Ν 882 0x039E, // Ξ 883 0x039F | HAS_VOWEL, // Ο 884 0x03A0, // Π 885 0x03A1, // Ρ 886 0, 887 0x03A3, // Σ 888 0x03A4, // Τ 889 0x03A5 | HAS_VOWEL, // Υ 890 0x03A6, // Φ 891 0x03A7, // Χ 892 0x03A8, // Ψ 893 0x03A9 | HAS_VOWEL, // Ω 894 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ 895 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ 896 0x0391 | 
HAS_VOWEL | HAS_ACCENT, // ά 897 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 898 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 899 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 900 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 901 0x0391 | HAS_VOWEL, // α 902 0x0392, // β 903 0x0393, // γ 904 0x0394, // δ 905 0x0395 | HAS_VOWEL, // ε 906 0x0396, // ζ 907 0x0397 | HAS_VOWEL, // η 908 0x0398, // θ 909 0x0399 | HAS_VOWEL, // ι 910 0x039A, // κ 911 0x039B, // λ 912 0x039C, // μ 913 0x039D, // ν 914 0x039E, // ξ 915 0x039F | HAS_VOWEL, // ο 916 0x03A0, // π 917 0x03A1, // ρ 918 0x03A3, // ς 919 0x03A3, // σ 920 0x03A4, // τ 921 0x03A5 | HAS_VOWEL, // υ 922 0x03A6, // φ 923 0x03A7, // χ 924 0x03A8, // ψ 925 0x03A9 | HAS_VOWEL, // ω 926 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ 927 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ 928 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 929 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 930 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 931 0x03CF, // Ϗ 932 0x0392, // ϐ 933 0x0398, // ϑ 934 0x03D2, // ϒ 935 0x03D2 | HAS_ACCENT, // ϓ 936 0x03D2 | HAS_DIALYTIKA, // ϔ 937 0x03A6, // ϕ 938 0x03A0, // ϖ 939 0x03CF, // ϗ 940 0x03D8, // Ϙ 941 0x03D8, // ϙ 942 0x03DA, // Ϛ 943 0x03DA, // ϛ 944 0x03DC, // Ϝ 945 0x03DC, // ϝ 946 0x03DE, // Ϟ 947 0x03DE, // ϟ 948 0x03E0, // Ϡ 949 0x03E0, // ϡ 950 0, 951 0, 952 0, 953 0, 954 0, 955 0, 956 0, 957 0, 958 0, 959 0, 960 0, 961 0, 962 0, 963 0, 964 0x039A, // ϰ 965 0x03A1, // ϱ 966 0x03F9, // ϲ 967 0x037F, // ϳ 968 0x03F4, // ϴ 969 0x0395 | HAS_VOWEL, // ϵ 970 0, 971 0x03F7, // Ϸ 972 0x03F7, // ϸ 973 0x03F9, // Ϲ 974 0x03FA, // Ϻ 975 0x03FA, // ϻ 976 0x03FC, // ϼ 977 0x03FD, // Ͻ 978 0x03FE, // Ͼ 979 0x03FF, // Ͽ 980 }; 981 982 private static final char[] data1F00 = { 983 // U+1F00..1FFF 984 0x0391 | HAS_VOWEL, // ἀ 985 0x0391 | HAS_VOWEL, // ἁ 986 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ 987 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ 988 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ 989 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ 990 0x0391 | HAS_VOWEL | 
HAS_ACCENT, // ἆ 991 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ 992 0x0391 | HAS_VOWEL, // Ἀ 993 0x0391 | HAS_VOWEL, // Ἁ 994 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ 995 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ 996 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ 997 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ 998 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ 999 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ 1000 0x0395 | HAS_VOWEL, // ἐ 1001 0x0395 | HAS_VOWEL, // ἑ 1002 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ 1003 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ 1004 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ 1005 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ 1006 0, 1007 0, 1008 0x0395 | HAS_VOWEL, // Ἐ 1009 0x0395 | HAS_VOWEL, // Ἑ 1010 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ 1011 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ 1012 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἔ 1013 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ 1014 0, 1015 0, 1016 0x0397 | HAS_VOWEL, // ἠ 1017 0x0397 | HAS_VOWEL, // ἡ 1018 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ 1019 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ 1020 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ 1021 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ 1022 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ 1023 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ 1024 0x0397 | HAS_VOWEL, // Ἠ 1025 0x0397 | HAS_VOWEL, // Ἡ 1026 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ 1027 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ 1028 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ 1029 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ 1030 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ 1031 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ 1032 0x0399 | HAS_VOWEL, // ἰ 1033 0x0399 | HAS_VOWEL, // ἱ 1034 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ 1035 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ 1036 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ 1037 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ 1038 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ 1039 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ 1040 0x0399 | HAS_VOWEL, // Ἰ 1041 0x0399 | HAS_VOWEL, // Ἱ 1042 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ 1043 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ 1044 0x0399 | HAS_VOWEL | 
HAS_ACCENT, // Ἴ 1045 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ 1046 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ 1047 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ 1048 0x039F | HAS_VOWEL, // ὀ 1049 0x039F | HAS_VOWEL, // ὁ 1050 0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ 1051 0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ 1052 0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ 1053 0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ 1054 0, 1055 0, 1056 0x039F | HAS_VOWEL, // Ὀ 1057 0x039F | HAS_VOWEL, // Ὁ 1058 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ 1059 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ 1060 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ 1061 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ 1062 0, 1063 0, 1064 0x03A5 | HAS_VOWEL, // ὐ 1065 0x03A5 | HAS_VOWEL, // ὑ 1066 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ 1067 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὓ 1068 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ 1069 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ 1070 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ 1071 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ 1072 0, 1073 0x03A5 | HAS_VOWEL, // Ὑ 1074 0, 1075 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ 1076 0, 1077 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ 1078 0, 1079 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὗ 1080 0x03A9 | HAS_VOWEL, // ὠ 1081 0x03A9 | HAS_VOWEL, // ὡ 1082 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ 1083 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ 1084 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ 1085 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ 1086 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ 1087 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ 1088 0x03A9 | HAS_VOWEL, // Ὠ 1089 0x03A9 | HAS_VOWEL, // Ὡ 1090 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ 1091 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ 1092 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ 1093 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ 1094 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ 1095 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ 1096 0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ 1097 0x0391 | HAS_VOWEL | HAS_ACCENT, // ά 1098 0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ 1099 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 1100 0x0397 | HAS_VOWEL | HAS_ACCENT, // 
ὴ 1101 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 1102 0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ 1103 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 1104 0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ 1105 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 1106 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ 1107 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 1108 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ 1109 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 1110 0, 1111 0, 1112 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ 1113 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ 1114 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ 1115 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ 1116 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ 1117 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ 1118 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ 1119 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ 1120 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ 1121 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ 1122 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾊ 1123 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ 1124 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ 1125 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ 1126 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ 1127 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ 1128 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ 1129 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ 1130 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ 1131 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ 1132 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ 1133 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ 1134 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ 1135 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ 1136 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ 1137 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ 1138 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // 
ᾚ 1139 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ 1140 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ 1141 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ 1142 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ 1143 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ 1144 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ 1145 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ 1146 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ 1147 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ 1148 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ 1149 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ 1150 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ 1151 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ 1152 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ 1153 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ 1154 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ 1155 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ 1156 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ 1157 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ 1158 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾮ 1159 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ 1160 0x0391 | HAS_VOWEL, // ᾰ 1161 0x0391 | HAS_VOWEL, // ᾱ 1162 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ 1163 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ 1164 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ 1165 0, 1166 0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ 1167 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ 1168 0x0391 | HAS_VOWEL, // Ᾰ 1169 0x0391 | HAS_VOWEL, // Ᾱ 1170 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ 1171 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 1172 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ 1173 0, 1174 0x0399 | HAS_VOWEL, // ι 1175 0, 1176 0, 1177 0, 1178 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ 1179 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ 1180 
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ 1181 0, 1182 0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ 1183 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ 1184 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ 1185 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 1186 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ 1187 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 1188 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ 1189 0, 1190 0, 1191 0, 1192 0x0399 | HAS_VOWEL, // ῐ 1193 0x0399 | HAS_VOWEL, // ῑ 1194 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ 1195 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 1196 0, 1197 0, 1198 0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ 1199 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ 1200 0x0399 | HAS_VOWEL, // Ῐ 1201 0x0399 | HAS_VOWEL, // Ῑ 1202 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ 1203 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 1204 0, 1205 0, 1206 0, 1207 0, 1208 0x03A5 | HAS_VOWEL, // ῠ 1209 0x03A5 | HAS_VOWEL, // ῡ 1210 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ 1211 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 1212 0x03A1, // ῤ 1213 0x03A1, // ῥ 1214 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ 1215 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ 1216 0x03A5 | HAS_VOWEL, // Ῠ 1217 0x03A5 | HAS_VOWEL, // Ῡ 1218 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ 1219 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 1220 0x03A1, // Ῥ 1221 0, 1222 0, 1223 0, 1224 0, 1225 0, 1226 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ 1227 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ 1228 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ 1229 0, 1230 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ 1231 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ 1232 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ 1233 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 1234 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ 1235 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 1236 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῼ 1237 0, 1238 0, 1239 0, 1240 }; 1241 1242 // U+2126 Ohm sign 1243 private 
static final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω

        /**
         * Looks up the Greek letter data for a code point.
         *
         * <p>Covers U+0370..03FF (data0370), U+1F00..1FFF (data1F00) and U+2126 (Ohm sign).
         * The uppercase base letter is extracted from the returned value with UPPER_MASK;
         * the remaining bits are HAS_* flags.
         *
         * @param c the code point to look up
         * @return the table entry for c, or 0 if c is outside the covered ranges
         *         (table entries themselves may also be 0)
         */
        private static final int getLetterData(int c) {
            if (0x370 <= c && c <= 0x3ff) {
                return data0370[c - 0x370];
            } else if (0x1f00 <= c && c <= 0x1fff) {
                return data1F00[c - 0x1f00];
            } else if (c == 0x2126) {
                return data2126;
            } else {
                return 0;
            }
        }

        /**
         * Returns a non-zero value for each of the Greek combining diacritics
         * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
         * plus some perispomeni look-alikes.
         *
         * @param c the code point (or UTF-16 code unit) to classify
         * @return a combination of HAS_* flags describing the diacritic, or 0 if c is
         *         not one of the recognized combining marks
         */
        private static final int getDiacriticData(int c) {
            switch (c) {
            case '\u0300':  // varia
            case '\u0301':  // tonos = oxia
            case '\u0342':  // perispomeni
            case '\u0302':  // circumflex can look like perispomeni
            case '\u0303':  // tilde can look like perispomeni
            case '\u0311':  // inverted breve can look like perispomeni
                return HAS_ACCENT;
            case '\u0308':  // dialytika = diaeresis
                return HAS_COMBINING_DIALYTIKA;
            case '\u0344':  // dialytika tonos
                return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
            case '\u0345':  // ypogegrammeni = iota subscript
                return HAS_YPOGEGRAMMENI;
            case '\u0304':  // macron
            case '\u0306':  // breve
            case '\u0313':  // comma above
            case '\u0314':  // reversed comma above
            case '\u0343':  // koronis
                return HAS_OTHER_GREEK_DIACRITIC;
            default:
                return 0;
            }
        }

        /**
         * Tests whether the text starting at index i reaches a cased letter after
         * skipping any case-ignorable code points.
         *
         * @param s the text to scan
         * @param i the index at which to start scanning
         * @return true if the first code point at or after i that is not case-ignorable
         *         is cased; false if it is uncased or the end of s is reached first
         */
        private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
            while (i < s.length()) {
                int c = Character.codePointAt(s, i);
                int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
                if ((type & UCaseProps.IGNORABLE) != 0) {
                    // Case-ignorable, continue with the loop.
                    i += Character.charCount(c);
                } else if (type != UCaseProps.NONE) {
                    return true;  // Followed by cased letter.
                } else {
                    return false;  // Uncased and not case-ignorable.
                }
            }
            return false;  // Not followed by cased letter.
        }

        /**
         * Greek string uppercasing with a state machine.
         * Probably simpler than a stateless function that has to figure out complex context-before
         * for each character.
         * TODO: Try to re-consolidate one way or another with the non-Greek function.
         *
         * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
         *
         * <p>NOTE(review): the "preserve the precomposed form" branch for the disjunctive eta
         * used to test {@code i == nextIndex}, which can never be true because nextIndex always
         * starts past the base letter. It now tests whether any combining diacritics were
         * consumed after the base letter. Confirm whether the C++ versions need the same change.
         *
         * @param options option bits; only OMIT_UNCHANGED_TEXT is examined directly here,
         *                the value is otherwise passed through to appendResult()
         * @param src the text to uppercase
         * @param dest receives the uppercased text
         * @param edits if not null, records each replaced or unchanged span
         * @return dest
         * @throws IOException if appending to dest fails
         */
        private static <A extends Appendable> A toUpper(int options,
                CharSequence src, A dest, Edits edits) throws IOException {
            int state = 0;
            for (int i = 0; i < src.length();) {
                int c = Character.codePointAt(src, i);
                int nextIndex = i + Character.charCount(c);
                int nextState = 0;
                int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
                if ((type & UCaseProps.IGNORABLE) != 0) {
                    // c is case-ignorable: carry the "after cased letter" state across it.
                    nextState |= (state & AFTER_CASED);
                } else if (type != UCaseProps.NONE) {
                    // c is cased
                    nextState |= AFTER_CASED;
                }
                int data = getLetterData(c);
                if (data > 0) {
                    int upper = data & UPPER_MASK;
                    // Add a dialytika to this iota or ypsilon vowel
                    // if we removed a tonos from the previous vowel,
                    // and that previous vowel did not also have (or gain) a dialytika.
                    // Adding one only to the final vowel in a longer sequence
                    // (which does not occur in normal writing) would require lookahead.
                    // Set the same flag as for preserving an existing dialytika.
                    if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                            (upper == '\u0399' /* Ι */ || upper == '\u03A5' /* Υ */)) {
                        data |= HAS_DIALYTIKA;
                    }
                    int numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
                    if ((data & HAS_YPOGEGRAMMENI) != 0) {
                        numYpogegrammeni = 1;
                    }
                    // Remember where the base letter ends so that we can later tell
                    // whether any combining diacritics followed it.
                    final int baseLimit = nextIndex;
                    // Skip combining diacritics after this Greek letter.
                    while (nextIndex < src.length()) {
                        int diacriticData = getDiacriticData(src.charAt(nextIndex));
                        if (diacriticData != 0) {
                            data |= diacriticData;
                            if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                                ++numYpogegrammeni;
                            }
                            ++nextIndex;
                        } else {
                            break;  // not a Greek diacritic
                        }
                    }
                    if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                        nextState |= AFTER_VOWEL_WITH_ACCENT;
                    }
                    // Map according to Greek rules.
                    boolean addTonos = false;
                    if (upper == '\u0397' /* Η */ &&
                            (data & HAS_ACCENT) != 0 &&
                            numYpogegrammeni == 0 &&
                            (state & AFTER_CASED) == 0 &&
                            !isFollowedByCasedLetter(src, nextIndex)) {
                        // Keep disjunctive "or" with (only) a tonos.
                        // We use the same "word boundary" conditions as for the Final_Sigma test.
                        if (nextIndex == baseLimit) {
                            // No combining marks followed, so the accent came from a
                            // precomposed letter: preserve the precomposed form.
                            upper = '\u0389';  // Ή
                        } else {
                            addTonos = true;
                        }
                    } else if ((data & HAS_DIALYTIKA) != 0) {
                        // Preserve a vowel with dialytika in precomposed form if it exists.
                        if (upper == '\u0399' /* Ι */) {
                            upper = '\u03AA';  // Ϊ
                            data &= ~HAS_EITHER_DIALYTIKA;
                        } else if (upper == '\u03A5' /* Υ */) {
                            upper = '\u03AB';  // Ϋ
                            data &= ~HAS_EITHER_DIALYTIKA;
                        }
                    }

                    boolean change;
                    if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
                        change = true;  // common, simple usage
                    } else {
                        // Find out first whether we are changing the text.
                        // i2 walks over the source positions that the output code units
                        // (base letter, optional dialytika, optional tonos) would occupy.
                        change = src.charAt(i) != upper || numYpogegrammeni > 0;
                        int i2 = i + 1;
                        if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                            change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
                            ++i2;
                        }
                        if (addTonos) {
                            change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
                            ++i2;
                        }
                        int oldLength = nextIndex - i;
                        int newLength = (i2 - i) + numYpogegrammeni;
                        change |= oldLength != newLength;
                        if (change) {
                            if (edits != null) {
                                edits.addReplace(oldLength, newLength);
                            }
                        } else {
                            if (edits != null) {
                                edits.addUnchanged(oldLength);
                            }
                            // Write unchanged text?
                            change = (options & OMIT_UNCHANGED_TEXT) == 0;
                        }
                    }

                    if (change) {
                        dest.append((char)upper);
                        if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                            dest.append('\u0308');  // restore or add a dialytika
                        }
                        if (addTonos) {
                            dest.append('\u0301');
                        }
                        while (numYpogegrammeni > 0) {
                            dest.append('\u0399');  // Ι
                            --numYpogegrammeni;
                        }
                    }
                } else {
                    // Not a Greek letter from the tables: use the generic full uppercase
                    // mapping with the Greek locale.
                    c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
                    appendResult(c, dest, nextIndex - i, options, edits);
                }
                i = nextIndex;
                state = nextState;
            }
            return dest;
        }
    }
}