1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package com.ibm.icu.impl; 4 5 import java.io.IOException; 6 import java.text.CharacterIterator; 7 import java.util.Locale; 8 9 import com.ibm.icu.lang.UCharacter; 10 import com.ibm.icu.lang.UCharacterCategory; 11 import com.ibm.icu.text.BreakIterator; 12 import com.ibm.icu.text.Edits; 13 import com.ibm.icu.util.ICUUncheckedIOException; 14 import com.ibm.icu.util.ULocale; 15 16 public final class CaseMapImpl { 17 /** 18 * Implementation of UCaseProps.ContextIterator, iterates over a String. 19 * See ustrcase.c/utf16_caseContextIterator(). 20 */ 21 public static final class StringContextIterator implements UCaseProps.ContextIterator { 22 /** 23 * Constructor. 24 * @param src String to iterate over. 25 */ StringContextIterator(CharSequence src)26 public StringContextIterator(CharSequence src) { 27 this.s=src; 28 limit=src.length(); 29 cpStart=cpLimit=index=0; 30 dir=0; 31 } 32 33 /** 34 * Constructor. 35 * @param src String to iterate over. 36 * @param cpStart Start index of the current code point. 37 * @param cpLimit Limit index of the current code point. 38 */ StringContextIterator(CharSequence src, int cpStart, int cpLimit)39 public StringContextIterator(CharSequence src, int cpStart, int cpLimit) { 40 s = src; 41 index = 0; 42 limit = src.length(); 43 this.cpStart = cpStart; 44 this.cpLimit = cpLimit; 45 dir = 0; 46 } 47 48 /** 49 * Set the iteration limit for nextCaseMapCP() to an index within the string. 50 * If the limit parameter is negative or past the string, then the 51 * string length is restored as the iteration limit. 52 * 53 * <p>This limit does not affect the next() function which always 54 * iterates to the very end of the string. 55 * 56 * @param lim The iteration limit. 
57 */ setLimit(int lim)58 public void setLimit(int lim) { 59 if(0<=lim && lim<=s.length()) { 60 limit=lim; 61 } else { 62 limit=s.length(); 63 } 64 } 65 66 /** 67 * Move to the iteration limit without fetching code points up to there. 68 */ moveToLimit()69 public void moveToLimit() { 70 cpStart=cpLimit=limit; 71 } 72 moveTo(int i)73 public void moveTo(int i) { 74 cpStart=cpLimit=i; 75 } 76 77 /** 78 * Iterate forward through the string to fetch the next code point 79 * to be case-mapped, and set the context indexes for it. 80 * 81 * <p>When the iteration limit is reached (and -1 is returned), 82 * getCPStart() will be at the iteration limit. 83 * 84 * <p>Iteration with next() does not affect the position for nextCaseMapCP(). 85 * 86 * @return The next code point to be case-mapped, or <0 when the iteration is done. 87 */ nextCaseMapCP()88 public int nextCaseMapCP() { 89 cpStart=cpLimit; 90 if(cpLimit<limit) { 91 int c=Character.codePointAt(s, cpLimit); 92 cpLimit+=Character.charCount(c); 93 return c; 94 } else { 95 return -1; 96 } 97 } 98 setCPStartAndLimit(int s, int l)99 public void setCPStartAndLimit(int s, int l) { 100 cpStart = s; 101 cpLimit = l; 102 dir = 0; 103 } 104 /** 105 * Returns the start of the code point that was last returned 106 * by nextCaseMapCP(). 107 */ getCPStart()108 public int getCPStart() { 109 return cpStart; 110 } 111 112 /** 113 * Returns the limit of the code point that was last returned 114 * by nextCaseMapCP(). 
115 */ getCPLimit()116 public int getCPLimit() { 117 return cpLimit; 118 } 119 getCPLength()120 public int getCPLength() { 121 return cpLimit-cpStart; 122 } 123 124 // implement UCaseProps.ContextIterator 125 // The following code is not used anywhere in this private class 126 @Override reset(int direction)127 public void reset(int direction) { 128 if(direction>0) { 129 /* reset for forward iteration */ 130 dir=1; 131 index=cpLimit; 132 } else if(direction<0) { 133 /* reset for backward iteration */ 134 dir=-1; 135 index=cpStart; 136 } else { 137 // not a valid direction 138 dir=0; 139 index=0; 140 } 141 } 142 143 @Override next()144 public int next() { 145 int c; 146 147 if(dir>0 && index<s.length()) { 148 c=Character.codePointAt(s, index); 149 index+=Character.charCount(c); 150 return c; 151 } else if(dir<0 && index>0) { 152 c=Character.codePointBefore(s, index); 153 index-=Character.charCount(c); 154 return c; 155 } 156 return -1; 157 } 158 159 // variables 160 protected CharSequence s; 161 protected int index, limit, cpStart, cpLimit; 162 protected int dir; // 0=initial state >0=forward <0=backward 163 } 164 165 public static final int TITLECASE_WHOLE_STRING = 0x20; 166 public static final int TITLECASE_SENTENCES = 0x40; 167 168 /** 169 * Bit mask for the titlecasing iterator options bit field. 170 * Currently only 3 out of 8 values are used: 171 * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES. 172 * See stringoptions.h. 173 * @internal 174 */ 175 private static final int TITLECASE_ITERATOR_MASK = 0xe0; 176 177 public static final int TITLECASE_ADJUST_TO_CASED = 0x400; 178 179 /** 180 * Bit mask for the titlecasing index adjustment options bit set. 181 * Currently two bits are defined: 182 * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED. 183 * See stringoptions.h. 
184 * @internal 185 */ 186 private static final int TITLECASE_ADJUSTMENT_MASK = 0x600; 187 addTitleAdjustmentOption(int options, int newOption)188 public static int addTitleAdjustmentOption(int options, int newOption) { 189 int adjOptions = options & TITLECASE_ADJUSTMENT_MASK; 190 if (adjOptions !=0 && adjOptions != newOption) { 191 throw new IllegalArgumentException("multiple titlecasing index adjustment options"); 192 } 193 return options | newOption; 194 } 195 196 private static final char ACUTE = '\u0301'; 197 198 private static final int U_GC_M_MASK = 199 (1 << UCharacterCategory.NON_SPACING_MARK) | 200 (1 << UCharacterCategory.COMBINING_SPACING_MARK) | 201 (1 << UCharacterCategory.ENCLOSING_MARK); 202 203 private static final int LNS = 204 (1 << UCharacterCategory.UPPERCASE_LETTER) | 205 (1 << UCharacterCategory.LOWERCASE_LETTER) | 206 (1 << UCharacterCategory.TITLECASE_LETTER) | 207 // Not MODIFIER_LETTER: We count only cased modifier letters. 208 (1 << UCharacterCategory.OTHER_LETTER) | 209 210 (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | 211 (1 << UCharacterCategory.LETTER_NUMBER) | 212 (1 << UCharacterCategory.OTHER_NUMBER) | 213 214 (1 << UCharacterCategory.MATH_SYMBOL) | 215 (1 << UCharacterCategory.CURRENCY_SYMBOL) | 216 (1 << UCharacterCategory.MODIFIER_SYMBOL) | 217 (1 << UCharacterCategory.OTHER_SYMBOL) | 218 219 (1 << UCharacterCategory.PRIVATE_USE); 220 isLNS(int c)221 private static boolean isLNS(int c) { 222 // Letter, number, symbol, 223 // or a private use code point because those are typically used as letters or numbers. 224 // Consider modifier letters only if they are cased. 
225 int gc = UCharacterProperty.INSTANCE.getType(c); 226 return ((1 << gc) & LNS) != 0 || 227 (gc == UCharacterCategory.MODIFIER_LETTER && 228 UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE); 229 } 230 addTitleIteratorOption(int options, int newOption)231 public static int addTitleIteratorOption(int options, int newOption) { 232 int iterOptions = options & TITLECASE_ITERATOR_MASK; 233 if (iterOptions !=0 && iterOptions != newOption) { 234 throw new IllegalArgumentException("multiple titlecasing iterator options"); 235 } 236 return options | newOption; 237 } 238 getTitleBreakIterator( Locale locale, int options, BreakIterator iter)239 public static BreakIterator getTitleBreakIterator( 240 Locale locale, int options, BreakIterator iter) { 241 options &= TITLECASE_ITERATOR_MASK; 242 if (options != 0 && iter != null) { 243 throw new IllegalArgumentException( 244 "titlecasing iterator option together with an explicit iterator"); 245 } 246 if (iter == null) { 247 switch (options) { 248 case 0: 249 iter = BreakIterator.getWordInstance(locale); 250 break; 251 case TITLECASE_WHOLE_STRING: 252 iter = new WholeStringBreakIterator(); 253 break; 254 case TITLECASE_SENTENCES: 255 iter = BreakIterator.getSentenceInstance(locale); 256 break; 257 default: 258 throw new IllegalArgumentException("unknown titlecasing iterator option"); 259 } 260 } 261 return iter; 262 } 263 getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)264 public static BreakIterator getTitleBreakIterator( 265 ULocale locale, int options, BreakIterator iter) { 266 options &= TITLECASE_ITERATOR_MASK; 267 if (options != 0 && iter != null) { 268 throw new IllegalArgumentException( 269 "titlecasing iterator option together with an explicit iterator"); 270 } 271 if (iter == null) { 272 switch (options) { 273 case 0: 274 iter = BreakIterator.getWordInstance(locale); 275 break; 276 case TITLECASE_WHOLE_STRING: 277 iter = new WholeStringBreakIterator(); 278 break; 279 case TITLECASE_SENTENCES: 280 
iter = BreakIterator.getSentenceInstance(locale); 281 break; 282 default: 283 throw new IllegalArgumentException("unknown titlecasing iterator option"); 284 } 285 } 286 return iter; 287 } 288 289 /** 290 * Omit unchanged text when case-mapping with Edits. 291 */ 292 public static final int OMIT_UNCHANGED_TEXT = 0x4000; 293 294 private static final class WholeStringBreakIterator extends BreakIterator { 295 private int length; 296 notImplemented()297 private static void notImplemented() { 298 throw new UnsupportedOperationException("should not occur"); 299 } 300 301 @Override first()302 public int first() { 303 return 0; 304 } 305 306 @Override last()307 public int last() { 308 notImplemented(); 309 return 0; 310 } 311 312 @Override next(int n)313 public int next(int n) { 314 notImplemented(); 315 return 0; 316 } 317 318 @Override next()319 public int next() { 320 return length; 321 } 322 323 @Override previous()324 public int previous() { 325 notImplemented(); 326 return 0; 327 } 328 329 @Override following(int offset)330 public int following(int offset) { 331 notImplemented(); 332 return 0; 333 } 334 335 @Override current()336 public int current() { 337 notImplemented(); 338 return 0; 339 } 340 341 @Override getText()342 public CharacterIterator getText() { 343 notImplemented(); 344 return null; 345 } 346 347 @Override setText(CharacterIterator newText)348 public void setText(CharacterIterator newText) { 349 length = newText.getEndIndex(); 350 } 351 352 @Override setText(CharSequence newText)353 public void setText(CharSequence newText) { 354 length = newText.length(); 355 } 356 357 @Override setText(String newText)358 public void setText(String newText) { 359 length = newText.length(); 360 } 361 } 362 appendCodePoint(Appendable a, int c)363 private static int appendCodePoint(Appendable a, int c) throws IOException { 364 if (c <= Character.MAX_VALUE) { 365 a.append((char)c); 366 return 1; 367 } else { 368 a.append((char)(0xd7c0 + (c >> 10))); 369 
a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); 370 return 2; 371 } 372 } 373 374 /** 375 * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. 376 * @throws IOException 377 */ appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)378 private static void appendResult(int result, Appendable dest, 379 int cpLength, int options, Edits edits) throws IOException { 380 // Decode the result. 381 if (result < 0) { 382 // (not) original code point 383 if (edits != null) { 384 edits.addUnchanged(cpLength); 385 } 386 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 387 return; 388 } 389 appendCodePoint(dest, ~result); 390 } else if (result <= UCaseProps.MAX_STRING_LENGTH) { 391 // The mapping has already been appended to result. 392 if (edits != null) { 393 edits.addReplace(cpLength, result); 394 } 395 } else { 396 // Append the single-code point mapping. 397 int length = appendCodePoint(dest, result); 398 if (edits != null) { 399 edits.addReplace(cpLength, length); 400 } 401 } 402 } 403 appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)404 private static final void appendUnchanged(CharSequence src, int start, int length, 405 Appendable dest, int options, Edits edits) throws IOException { 406 if (length > 0) { 407 if (edits != null) { 408 edits.addUnchanged(length); 409 } 410 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 411 return; 412 } 413 dest.append(src, start, start + length); 414 } 415 } 416 applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)417 private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) { 418 if (!edits.hasChanges()) { 419 return src.toString(); 420 } 421 StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta()); 422 for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { 423 if (ei.hasChange()) { 424 int i = ei.replacementIndex(); 425 
result.append(replacementChars, i, i + ei.newLength()); 426 } else { 427 int i = ei.sourceIndex(); 428 result.append(src, i, i + ei.oldLength()); 429 } 430 } 431 return result.toString(); 432 } 433 434 private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie(); 435 436 /** 437 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. 438 * caseLocale < 0: Case-folds [srcStart..srcLimit[. 439 */ internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)440 private static void internalToLower(int caseLocale, int options, 441 CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, 442 Appendable dest, Edits edits) throws IOException { 443 byte[] latinToLower; 444 if (caseLocale == UCaseProps.LOC_ROOT || 445 (caseLocale >= 0 ? 446 !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) : 447 (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) { 448 latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL; 449 } else { 450 latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT; 451 } 452 int prev = srcStart; 453 int srcIndex = srcStart; 454 outerLoop: 455 for (;;) { 456 // fast path for simple cases 457 char lead; 458 for (;;) { 459 if (srcIndex >= srcLimit) { 460 break outerLoop; 461 } 462 lead = src.charAt(srcIndex); 463 int delta; 464 if (lead < UCaseProps.LatinCase.LONG_S) { 465 byte d = latinToLower[lead]; 466 if (d == UCaseProps.LatinCase.EXC) { break; } 467 ++srcIndex; 468 if (d == 0) { continue; } 469 delta = d; 470 } else if (lead >= 0xd800) { 471 break; // surrogate or higher 472 } else { 473 int props = CASE_TRIE.getFromU16SingleLead(lead); 474 if (UCaseProps.propsHasException(props)) { break; } 475 ++srcIndex; 476 if (!UCaseProps.isUpperOrTitleFromProps(props) || 477 (delta = UCaseProps.getDelta(props)) == 0) { 478 continue; 479 } 480 } 481 lead += delta; 482 
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 483 dest.append(lead); 484 if (edits != null) { 485 edits.addReplace(1, 1); 486 } 487 prev = srcIndex; 488 } 489 // slow path 490 int cpStart = srcIndex++; 491 char trail; 492 int c; 493 if (Character.isHighSurrogate(lead) && srcIndex < srcLimit && 494 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 495 c = Character.toCodePoint(lead, trail); 496 ++srcIndex; 497 } else { 498 c = lead; 499 } 500 // We need to append unchanged text before calling the UCaseProps.toFullXyz() methods 501 // because they will sometimes append their mapping to dest, 502 // and that must be after copying the previous text. 503 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 504 prev = cpStart; 505 if (caseLocale >= 0) { 506 if (iter == null) { 507 iter = new StringContextIterator(src, cpStart, srcIndex); 508 } else { 509 iter.setCPStartAndLimit(cpStart, srcIndex); 510 } 511 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); 512 } else { 513 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); 514 } 515 if (c >= 0) { 516 appendResult(c, dest, srcIndex - cpStart, options, edits); 517 prev = srcIndex; 518 } 519 } 520 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 521 } 522 internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)523 private static void internalToUpper(int caseLocale, int options, 524 CharSequence src, Appendable dest, Edits edits) throws IOException { 525 StringContextIterator iter = null; 526 byte[] latinToUpper; 527 if (caseLocale == UCaseProps.LOC_TURKISH) { 528 latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR; 529 } else { 530 latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL; 531 } 532 int prev = 0; 533 int srcIndex = 0; 534 int srcLength = src.length(); 535 outerLoop: 536 for (;;) { 537 // fast path for simple cases 538 char lead; 539 for (;;) { 540 if (srcIndex >= srcLength) { 541 break outerLoop; 
542 } 543 lead = src.charAt(srcIndex); 544 int delta; 545 if (lead < UCaseProps.LatinCase.LONG_S) { 546 byte d = latinToUpper[lead]; 547 if (d == UCaseProps.LatinCase.EXC) { break; } 548 ++srcIndex; 549 if (d == 0) { continue; } 550 delta = d; 551 } else if (lead >= 0xd800) { 552 break; // surrogate or higher 553 } else { 554 int props = CASE_TRIE.getFromU16SingleLead(lead); 555 if (UCaseProps.propsHasException(props)) { break; } 556 ++srcIndex; 557 if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER || 558 (delta = UCaseProps.getDelta(props)) == 0) { 559 continue; 560 } 561 } 562 lead += delta; 563 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 564 dest.append(lead); 565 if (edits != null) { 566 edits.addReplace(1, 1); 567 } 568 prev = srcIndex; 569 } 570 // slow path 571 int cpStart = srcIndex++; 572 char trail; 573 int c; 574 if (Character.isHighSurrogate(lead) && srcIndex < srcLength && 575 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 576 c = Character.toCodePoint(lead, trail); 577 ++srcIndex; 578 } else { 579 c = lead; 580 } 581 if (iter == null) { 582 iter = new StringContextIterator(src, cpStart, srcIndex); 583 } else { 584 iter.setCPStartAndLimit(cpStart, srcIndex); 585 } 586 // We need to append unchanged text before calling UCaseProps.toFullUpper() 587 // because it will sometimes append its mapping to dest, 588 // and that must be after copying the previous text. 
589 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 590 prev = cpStart; 591 c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); 592 if (c >= 0) { 593 appendResult(c, dest, srcIndex - cpStart, options, edits); 594 prev = srcIndex; 595 } 596 } 597 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 598 } 599 toLower(int caseLocale, int options, CharSequence src)600 public static String toLower(int caseLocale, int options, CharSequence src) { 601 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 602 if (src.length() == 0) { 603 return src.toString(); 604 } 605 // Collect and apply only changes. 606 // Good if no or few changes. Bad (slow) if many changes. 607 Edits edits = new Edits(); 608 StringBuilder replacementChars = toLower( 609 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 610 return applyEdits(src, replacementChars, edits); 611 } else { 612 return toLower(caseLocale, options, src, 613 new StringBuilder(src.length()), null).toString(); 614 } 615 } 616 toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)617 public static <A extends Appendable> A toLower(int caseLocale, int options, 618 CharSequence src, A dest, Edits edits) { 619 try { 620 if (edits != null) { 621 edits.reset(); 622 } 623 internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits); 624 return dest; 625 } catch (IOException e) { 626 throw new ICUUncheckedIOException(e); 627 } 628 } 629 toUpper(int caseLocale, int options, CharSequence src)630 public static String toUpper(int caseLocale, int options, CharSequence src) { 631 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 632 if (src.length() == 0) { 633 return src.toString(); 634 } 635 // Collect and apply only changes. 636 // Good if no or few changes. Bad (slow) if many changes. 
637 Edits edits = new Edits(); 638 StringBuilder replacementChars = toUpper( 639 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 640 return applyEdits(src, replacementChars, edits); 641 } else { 642 return toUpper(caseLocale, options, src, 643 new StringBuilder(src.length()), null).toString(); 644 } 645 } 646 toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)647 public static <A extends Appendable> A toUpper(int caseLocale, int options, 648 CharSequence src, A dest, Edits edits) { 649 try { 650 if (edits != null) { 651 edits.reset(); 652 } 653 if (caseLocale == UCaseProps.LOC_GREEK) { 654 return GreekUpper.toUpper(options, src, dest, edits); 655 } 656 internalToUpper(caseLocale, options, src, dest, edits); 657 return dest; 658 } catch (IOException e) { 659 throw new ICUUncheckedIOException(e); 660 } 661 } 662 toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)663 public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) { 664 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 665 if (src.length() == 0) { 666 return src.toString(); 667 } 668 // Collect and apply only changes. 669 // Good if no or few changes. Bad (slow) if many changes. 
670 Edits edits = new Edits(); 671 StringBuilder replacementChars = toTitle( 672 caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src, 673 new StringBuilder(), edits); 674 return applyEdits(src, replacementChars, edits); 675 } else { 676 return toTitle(caseLocale, options, iter, src, 677 new StringBuilder(src.length()), null).toString(); 678 } 679 } 680 toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)681 public static <A extends Appendable> A toTitle( 682 int caseLocale, int options, BreakIterator titleIter, 683 CharSequence src, A dest, Edits edits) { 684 try { 685 if (edits != null) { 686 edits.reset(); 687 } 688 689 /* set up local variables */ 690 StringContextIterator iter = new StringContextIterator(src); 691 int srcLength = src.length(); 692 int prev=0; 693 boolean isFirstIndex=true; 694 695 /* titlecasing loop */ 696 while(prev<srcLength) { 697 /* find next index where to titlecase */ 698 int index; 699 if(isFirstIndex) { 700 isFirstIndex=false; 701 index=titleIter.first(); 702 } else { 703 index=titleIter.next(); 704 } 705 if(index==BreakIterator.DONE || index>srcLength) { 706 index=srcLength; 707 } 708 709 /* 710 * Segment [prev..index[ into 3 parts: 711 * a) skipped characters (copy as-is) [prev..titleStart[ 712 * b) first letter (titlecase) [titleStart..titleLimit[ 713 * c) subsequent characters (lowercase) [titleLimit..index[ 714 */ 715 if(prev<index) { 716 // Find and copy skipped characters [prev..titleStart[ 717 int titleStart=prev; 718 iter.setLimit(index); 719 int c=iter.nextCaseMapCP(); 720 if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) { 721 // Adjust the titlecasing index to the next cased character, 722 // or to the next letter/number/symbol/private use. 723 // Stop with titleStart<titleLimit<=index 724 // if there is a character to be titlecased, 725 // or else stop with titleStart==titleLimit==index. 
726 boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0; 727 while ((toCased ? 728 UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) : 729 !CaseMapImpl.isLNS(c)) && 730 (c=iter.nextCaseMapCP())>=0) {} 731 // If c<0 then we have only uncased characters in [prev..index[ 732 // and stopped with titleStart==titleLimit==index. 733 titleStart=iter.getCPStart(); 734 if (prev < titleStart) { 735 appendUnchanged(src, prev, titleStart-prev, dest, options, edits); 736 } 737 } 738 739 if(titleStart<index) { 740 // titlecase c which is from [titleStart..titleLimit[ 741 c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale); 742 appendResult(c, dest, iter.getCPLength(), options, edits); 743 744 // Special case Dutch IJ titlecasing 745 int titleLimit; 746 if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) { 747 if (c < 0) { 748 c = ~c; 749 } 750 if (c == 'I' || c == 'Í') { 751 titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits); 752 iter.moveTo(titleLimit); 753 } 754 else { 755 titleLimit = iter.getCPLimit(); 756 } 757 } else { 758 titleLimit = iter.getCPLimit(); 759 } 760 761 // lowercase [titleLimit..index[ 762 if(titleLimit<index) { 763 if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) { 764 // Normal operation: Lowercase the rest of the word. 765 internalToLower(caseLocale, options, 766 src, titleLimit, index, iter, dest, edits); 767 } else { 768 // Optionally just copy the rest of the word unchanged. 769 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits); 770 } 771 iter.moveToLimit(); 772 } 773 } 774 } 775 776 prev=index; 777 } 778 return dest; 779 } catch (IOException e) { 780 throw new ICUUncheckedIOException(e); 781 } 782 } 783 784 /** 785 * Input: c is a letter I with or without acute accent. 786 * start is the index in src after c, and is less than segmentLimit. 
787 * If a plain i/I is followed by a plain j/J, 788 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, 789 * then we output accordingly. 790 * 791 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ 792 * @throws IOException 793 */ maybeTitleDutchIJ( CharSequence src, int c, int start, int segmentLimit, A dest, int options, Edits edits)794 private static <A extends Appendable> int maybeTitleDutchIJ( 795 CharSequence src, int c, int start, int segmentLimit, 796 A dest, int options, Edits edits) throws IOException { 797 assert start < segmentLimit; 798 799 int index = start; 800 boolean withAcute = false; 801 802 // If the conditions are met, then the following variables tell us what to output. 803 int unchanged1 = 0; // code units before the j, or the whole sequence (0..3) 804 boolean doTitleJ = false; // true if the j needs to be titlecased 805 int unchanged2 = 0; // after the j (0 or 1) 806 807 // next character after the first letter 808 char c2 = src.charAt(index++); 809 810 // Is the first letter an i/I with accent? 811 if (c == 'I') { 812 if (c2 == ACUTE) { 813 withAcute = true; 814 unchanged1 = 1; 815 if (index == segmentLimit) { return start; } 816 c2 = src.charAt(index++); 817 } 818 } else { // Í 819 withAcute = true; 820 } 821 // Is the next character a j/J? 822 if (c2 == 'j') { 823 doTitleJ = true; 824 } else if (c2 == 'J') { 825 ++unchanged1; 826 } else { 827 return start; 828 } 829 // A plain i/I must be followed by a plain j/J. 830 // An i/I with acute must be followed by a j/J with acute. 831 if (withAcute) { 832 if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; } 833 if (doTitleJ) { 834 unchanged2 = 1; 835 } else { 836 ++unchanged1; 837 } 838 } 839 // There must not be another combining mark. 
840 if (index < segmentLimit) { 841 int cp = Character.codePointAt(src, index); 842 int bit = 1 << UCharacter.getType(cp); 843 if ((bit & U_GC_M_MASK) != 0) { 844 return start; 845 } 846 } 847 // Output the rest of the Dutch IJ. 848 appendUnchanged(src, start, unchanged1, dest, options, edits); 849 start += unchanged1; 850 if (doTitleJ) { 851 dest.append('J'); 852 if (edits != null) { 853 edits.addReplace(1, 1); 854 } 855 ++start; 856 } 857 appendUnchanged(src, start, unchanged2, dest, options, edits); 858 assert start + unchanged2 == index; 859 return index; 860 } 861 862 public static String fold(int options, CharSequence src) { 863 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 864 if (src.length() == 0) { 865 return src.toString(); 866 } 867 // Collect and apply only changes. 868 // Good if no or few changes. Bad (slow) if many changes. 869 Edits edits = new Edits(); 870 StringBuilder replacementChars = fold( 871 options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 872 return applyEdits(src, replacementChars, edits); 873 } else { 874 return fold(options, src, new StringBuilder(src.length()), null).toString(); 875 } 876 } 877 878 public static <A extends Appendable> A fold(int options, 879 CharSequence src, A dest, Edits edits) { 880 try { 881 if (edits != null) { 882 edits.reset(); 883 } 884 internalToLower(-1, options, src, 0, src.length(), null, dest, edits); 885 return dest; 886 } catch (IOException e) { 887 throw new ICUUncheckedIOException(e); 888 } 889 } 890 891 private static final class GreekUpper { 892 // Data bits. 893 private static final int UPPER_MASK = 0x3ff; 894 private static final int HAS_VOWEL = 0x1000; 895 private static final int HAS_YPOGEGRAMMENI = 0x2000; 896 private static final int HAS_ACCENT = 0x4000; 897 private static final int HAS_DIALYTIKA = 0x8000; 898 // Further bits during data building and processing, not stored in the data map. 
private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
        private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;
        // Note: the two flags above lie beyond 16 bits, so they cannot be stored in the
        // char-typed tables below. In this section they are produced only by
        // getDiacriticData() and by toUpper() itself.

        // Convenience combinations of the per-letter flag bits.
        private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
        private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
                HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
        private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;

        // State bits.
        // These are carried by toUpper() from one processed character to the next.
        // AFTER_CASED: the previous character was cased; the bit is carried across
        //     case-ignorable characters and cleared by anything else.
        // AFTER_VOWEL_WITH_COMBINING_ACCENT / AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT:
        //     the previous Greek letter had HAS_VOWEL and HAS_ACCENT but not HAS_DIALYTIKA;
        //     which of the two bits is set depends on whether the table entry itself
        //     (precomposed) already carried HAS_ACCENT.
        private static final int AFTER_CASED = 1;
        private static final int AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
        private static final int AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;

        // Data generated by prototype code, see
        // https://icu.unicode.org/design/case/greek-upper
        // TODO: Move this data into ucase.icu.
        //
        // Layout of each table entry, as consumed by toUpper():
        //   - the bits selected by UPPER_MASK hold the base uppercase letter,
        //   - HAS_* flag bits are OR-ed on top,
        //   - 0 means "no Greek data"; toUpper() then falls back to the generic
        //     UCaseProps full uppercase mapping.
        // The tables are indexed by (code point - first code point of the range),
        // see getLetterData().
        private static final char[] data0370 = {
            // U+0370..03FF
            0x0370,  // Ͱ
            0x0370,  // ͱ
            0x0372,  // Ͳ
            0x0372,  // ͳ
            0,
            0,
            0x0376,  // Ͷ
            0x0376,  // ͷ
            0,
            0,
            0x037A,  // ͺ
            0x03FD,  // ͻ
            0x03FE,  // ͼ
            0x03FF,  // ͽ
            0,
            0x037F,  // Ϳ
            0,
            0,
            0,
            0,
            0,
            0,
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
            0,
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
            0,
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
            0x0391 | HAS_VOWEL,  // Α
            0x0392,  // Β
            0x0393,  // Γ
            0x0394,  // Δ
            0x0395 | HAS_VOWEL,  // Ε
            0x0396,  // Ζ
            0x0397 | HAS_VOWEL,  // Η
            0x0398,  // Θ
            0x0399 | HAS_VOWEL,  // Ι
            0x039A,  // Κ
            0x039B,  // Λ
            0x039C,  // Μ
            0x039D,  // Ν
            0x039E,  // Ξ
            0x039F | HAS_VOWEL,  // Ο
            0x03A0,  // Π
            0x03A1,  // Ρ
            0,
            0x03A3,  // Σ
            0x03A4,  // Τ
            0x03A5 | HAS_VOWEL,  // Υ
            0x03A6,  // Φ
            0x03A7,  // Χ
            0x03A8,  // Ψ
            0x03A9 | HAS_VOWEL,  // Ω
            0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϊ
            0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϋ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
            0x0391 | HAS_VOWEL,  // α
            0x0392,  // β
            0x0393,  // γ
            0x0394,  // δ
            0x0395 | HAS_VOWEL,  // ε
            0x0396,  // ζ
            0x0397 | HAS_VOWEL,  // η
            0x0398,  // θ
            0x0399 | HAS_VOWEL,  // ι
            0x039A,  // κ
            0x039B,  // λ
            0x039C,  // μ
            0x039D,  // ν
            0x039E,  // ξ
            0x039F | HAS_VOWEL,  // ο
            0x03A0,  // π
            0x03A1,  // ρ
            0x03A3,  // ς
            0x03A3,  // σ
            0x03A4,  // τ
            0x03A5 | HAS_VOWEL,  // υ
            0x03A6,  // φ
            0x03A7,  // χ
            0x03A8,  // ψ
            0x03A9 | HAS_VOWEL,  // ω
            0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // ϊ
            0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // ϋ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
            0x03CF,  // Ϗ
            0x0392,  // ϐ
            0x0398,  // ϑ
            0x03D2,  // ϒ
            0x03D2 | HAS_ACCENT,  // ϓ
            0x03D2 | HAS_DIALYTIKA,  // ϔ
            0x03A6,  // ϕ
            0x03A0,  // ϖ
            0x03CF,  // ϗ
            0x03D8,  // Ϙ
            0x03D8,  // ϙ
            0x03DA,  // Ϛ
            0x03DA,  // ϛ
            0x03DC,  // Ϝ
            0x03DC,  // ϝ
            0x03DE,  // Ϟ
            0x03DE,  // ϟ
            0x03E0,  // Ϡ
            0x03E0,  // ϡ
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0x039A,  // ϰ
            0x03A1,  // ϱ
            0x03F9,  // ϲ
            0x037F,  // ϳ
            0x03F4,  // ϴ
            0x0395 | HAS_VOWEL,  // ϵ
            0,
            0x03F7,  // Ϸ
            0x03F7,  // ϸ
            0x03F9,  // Ϲ
            0x03FA,  // Ϻ
            0x03FA,  // ϻ
            0x03FC,  // ϼ
            0x03FD,  // Ͻ
            0x03FE,  // Ͼ
            0x03FF,  // Ͽ
        };

        // Same entry layout as data0370, for the U+1F00..1FFF range.
        private static final char[] data1F00 = {
            // U+1F00..1FFF
            0x0391 | HAS_VOWEL,  // ἀ
            0x0391 | HAS_VOWEL,  // ἁ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἂ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἃ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἄ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἅ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἆ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἇ
            0x0391 | HAS_VOWEL,  // Ἀ
            0x0391 | HAS_VOWEL,  // Ἁ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἂ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἃ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἄ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἅ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἆ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἇ
            0x0395 | HAS_VOWEL,  // ἐ
            0x0395 | HAS_VOWEL,  // ἑ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἒ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἓ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἔ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἕ
            0,
            0,
            0x0395 | HAS_VOWEL,  // Ἐ
            0x0395 | HAS_VOWEL,  // Ἑ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἒ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἓ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἔ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἕ
            0,
            0,
            0x0397 | HAS_VOWEL,  // ἠ
            0x0397 | HAS_VOWEL,  // ἡ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἢ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἣ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἤ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἥ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἦ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἧ
            0x0397 | HAS_VOWEL,  // Ἠ
            0x0397 | HAS_VOWEL,  // Ἡ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἢ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἣ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἤ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἥ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἦ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἧ
            0x0399 | HAS_VOWEL,  // ἰ
            0x0399 | HAS_VOWEL,  // ἱ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἲ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἳ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἴ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἵ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἷ
            0x0399 | HAS_VOWEL,  // Ἰ
            0x0399 | HAS_VOWEL,  // Ἱ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἲ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἳ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἴ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἵ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἷ
            0x039F | HAS_VOWEL,  // ὀ
            0x039F | HAS_VOWEL,  // ὁ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὂ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὃ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὄ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὅ
            0,
            0,
            0x039F | HAS_VOWEL,  // Ὀ
            0x039F | HAS_VOWEL,  // Ὁ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὂ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὃ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὄ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὅ
            0,
            0,
            0x03A5 | HAS_VOWEL,  // ὐ
            0x03A5 | HAS_VOWEL,  // ὑ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὒ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὓ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὔ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὕ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὖ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὗ
            0,
            0x03A5 | HAS_VOWEL,  // Ὑ
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὓ
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὕ
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὗ
            0x03A9 | HAS_VOWEL,  // ὠ
            0x03A9 | HAS_VOWEL,  // ὡ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὢ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὣ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὤ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὥ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὦ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὧ
            0x03A9 | HAS_VOWEL,  // Ὠ
            0x03A9 | HAS_VOWEL,  // Ὡ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὢ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὣ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὤ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὥ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὦ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὧ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ὰ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ὲ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ὴ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ὶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὸ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὺ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὼ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
            0,
            0,
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾀ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾁ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾂ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾃ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾄ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾅ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾆ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾇ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾈ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾉ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾊ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾋ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾌ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾍ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾎ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾏ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾐ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾑ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾒ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾓ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾔ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾕ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾖ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾗ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾘ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾙ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾚ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾛ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾜ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾝ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾞ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾟ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾠ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾡ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾢ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾣ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾤ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾥ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾦ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾧ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾨ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾩ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾪ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾫ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾬ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾭ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾮ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾯ
            0x0391 | HAS_VOWEL,  // ᾰ
            0x0391 | HAS_VOWEL,  // ᾱ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾲ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾳ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾴ
            0,
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ᾶ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾷ
            0x0391 | HAS_VOWEL,  // Ᾰ
            0x0391 | HAS_VOWEL,  // Ᾱ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ὰ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾼ
            0,
            0x0399 | HAS_VOWEL,  // ι
            0,
            0,
            0,
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῂ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῃ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῄ
            0,
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ῆ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῇ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ὲ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ὴ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῌ
            0,
            0,
            0,
            0x0399 | HAS_VOWEL,  // ῐ
            0x0399 | HAS_VOWEL,  // ῑ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῒ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
            0,
            0,
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ῖ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῗ
            0x0399 | HAS_VOWEL,  // Ῐ
            0x0399 | HAS_VOWEL,  // Ῑ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ὶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
            0,
            0,
            0,
            0,
            0x03A5 | HAS_VOWEL,  // ῠ
            0x03A5 | HAS_VOWEL,  // ῡ
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῢ
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
            0x03A1,  // ῤ
            0x03A1,  // ῥ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ῦ
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῧ
            0x03A5 | HAS_VOWEL,  // Ῠ
            0x03A5 | HAS_VOWEL,  // Ῡ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὺ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
            0x03A1,  // Ῥ
            0,
            0,
            0,
            0,
            0,
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῲ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῳ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῴ
            0,
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ῶ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῷ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὸ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὼ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῼ
            0,
            0,
            0,
        };

        // U+2126 Ohm sign
        // Single entry with the same layout as the table entries above.
        private static final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω

        /**
         * Looks up the Greek uppercasing data for one code point.
         *
         * <p>Covers U+0370..03FF (data0370), U+1F00..1FFF (data1F00) and U+2126 (data2126).
         *
         * @param c the code point to look up
         * @return the table entry (base uppercase letter plus HAS_* flags),
         *         or 0 if c is outside the covered ranges or has no entry
         */
        private static final int getLetterData(int c) {
            // Quick rejection of everything below, above, or in the gap between the tables.
            if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
                return 0;
            } else if (c <= 0x3ff) {
                return data0370[c - 0x370];
            } else if (c <= 0x1fff) {
                return data1F00[c - 0x1f00];
            } else if (c == 0x2126) {
                return data2126;
            } else {
                // U+2000..2125: no data.
                return 0;
            }
        }

        /**
         * Returns a non-zero value for each of the Greek combining diacritics
         * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
         * plus some perispomeni look-alikes.
         *
         * @param c the code point (or UTF-16 code unit) to classify
         * @return a combination of HAS_ACCENT, HAS_COMBINING_DIALYTIKA, HAS_YPOGEGRAMMENI
         *         and HAS_OTHER_GREEK_DIACRITIC, or 0 if c is not one of the listed diacritics
         */
        private static final int getDiacriticData(int c) {
            switch (c) {
            case '\u0300':  // varia
            case '\u0301':  // tonos = oxia
            case '\u0342':  // perispomeni
            case '\u0302':  // circumflex can look like perispomeni
            case '\u0303':  // tilde can look like perispomeni
            case '\u0311':  // inverted breve can look like perispomeni
                return HAS_ACCENT;
            case '\u0308':  // dialytika = diaeresis
                return HAS_COMBINING_DIALYTIKA;
            case '\u0344':  // dialytika tonos
                return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
            case '\u0345':  // ypogegrammeni = iota subscript
                return HAS_YPOGEGRAMMENI;
            case '\u0304':  // macron
            case '\u0306':  // breve
            case '\u0313':  // comma above
            case '\u0314':  // reversed comma above
            case '\u0343':  // koronis
                return HAS_OTHER_GREEK_DIACRITIC;
            default:
                return 0;
            }
        }

        /**
         * Tests whether the text starting at index i continues with a cased letter,
         * looking past any case-ignorable code points.
         *
         * @param s the text to scan
         * @param i the index at which to start scanning
         * @return true if the first code point at or after i that is not case-ignorable
         *         is cased; false if it is uncased or if the end of s is reached first
         */
        private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
            while (i < s.length()) {
                int c = Character.codePointAt(s, i);
                int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
                if ((type & UCaseProps.IGNORABLE) != 0) {
                    // Case-ignorable, continue with the loop.
                    i += Character.charCount(c);
                } else if (type != UCaseProps.NONE) {
                    return true;  // Followed by cased letter.
                } else {
                    return false;  // Uncased and not case-ignorable.
                }
            }
            return false;  // Not followed by cased letter.
        }

        /**
         * Greek string uppercasing with a state machine.
         * Probably simpler than a stateless function that has to figure out complex context-before
         * for each character.
         * TODO: Try to re-consolidate one way or another with the non-Greek function.
         *
         * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
         *
         * <p>Each code point with Greek letter data is processed together with the combining
         * diacritics that directly follow it; all other code points go through the generic
         * full uppercase mapping with the Greek locale.
         *
         * @param options option bits; this method tests OMIT_UNCHANGED_TEXT and passes the
         *        value on to appendResult()
         * @param src the text to uppercase
         * @param dest receives the output
         * @param edits if not null, receives a replace/unchanged record for each processed unit
         * @return dest
         * @throws IOException if appending to dest fails
         */
        private static <A extends Appendable> A toUpper(int options,
                CharSequence src, A dest, Edits edits) throws IOException {
            int state = 0;
            for (int i = 0; i < src.length();) {
                int c = Character.codePointAt(src, i);
                int nextIndex = i + Character.charCount(c);
                // The state for the next iteration is rebuilt from scratch each time.
                int nextState = 0;
                int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
                if ((type & UCaseProps.IGNORABLE) != 0) {
                    // c is case-ignorable
                    nextState |= (state & AFTER_CASED);
                } else if (type != UCaseProps.NONE) {
                    // c is cased
                    nextState |= AFTER_CASED;
                }
                int data = getLetterData(c);
                if (data > 0) {
                    int upper = data & UPPER_MASK;
                    // Add a dialytika to this iota or ypsilon vowel
                    // if we removed a tonos from the previous vowel,
                    // and that previous vowel did not also have (or gain) a dialytika.
                    // Adding one only to the final vowel in a longer sequence
                    // (which does not occur in normal writing) would require lookahead.
                    // Set the same flag as for preserving an existing dialytika.
                    if ((data & HAS_VOWEL) != 0
                            && (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) != 0
                            && (upper == 'Ι' || upper == 'Υ')) {
                        data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
                                : HAS_COMBINING_DIALYTIKA;
                    }
                    int numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
                    if ((data & HAS_YPOGEGRAMMENI) != 0) {
                        numYpogegrammeni = 1;
                    }
                    // Captured before the combining marks below are merged into data,
                    // so this reflects only the accent recorded in the letter's table entry.
                    final boolean hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
                    // Skip combining diacritics after this Greek letter.
                    while (nextIndex < src.length()) {
                        int diacriticData = getDiacriticData(src.charAt(nextIndex));
                        if (diacriticData != 0) {
                            data |= diacriticData;
                            if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                                ++numYpogegrammeni;
                            }
                            ++nextIndex;
                        } else {
                            break;  // not a Greek diacritic
                        }
                    }
                    // Remember an accented vowel without HAS_DIALYTIKA, so that a following
                    // iota or ypsilon can gain a dialytika (see above).
                    if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                        nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
                                : AFTER_VOWEL_WITH_COMBINING_ACCENT;
                    }
                    // Map according to Greek rules.
                    boolean addTonos = false;
                    if (upper == 'Η' &&
                            (data & HAS_ACCENT) != 0 &&
                            numYpogegrammeni == 0 &&
                            (state & AFTER_CASED) == 0 &&
                            !isFollowedByCasedLetter(src, nextIndex)) {
                        // Keep disjunctive "or" with (only) a tonos.
                        // We use the same "word boundary" conditions as for the Final_Sigma test.
                        if (hasPrecomposedAccent) {
                            upper = 'Ή';  // Preserve the precomposed form.
                        } else {
                            addTonos = true;
                        }
                    } else if ((data & HAS_DIALYTIKA) != 0) {
                        // Preserve a vowel with dialytika in precomposed form if it exists.
                        // Clearing both dialytika flags prevents a separate U+0308
                        // from being written after the precomposed letter.
                        if (upper == 'Ι') {
                            upper = 'Ϊ';
                            data &= ~HAS_EITHER_DIALYTIKA;
                        } else if (upper == 'Υ') {
                            upper = 'Ϋ';
                            data &= ~HAS_EITHER_DIALYTIKA;
                        }
                    }

                    boolean change;
                    if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
                        change = true;  // common, simple usage
                    } else {
                        // Find out first whether we are changing the text.
                        change = src.charAt(i) != upper || numYpogegrammeni > 0;
                        // i2 walks the expected output layout over the source:
                        // base letter, then optional U+0308, then optional U+0301.
                        int i2 = i + 1;
                        if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                            change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
                            ++i2;
                        }
                        if (addTonos) {
                            change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
                            ++i2;
                        }
                        int oldLength = nextIndex - i;
                        int newLength = (i2 - i) + numYpogegrammeni;
                        change |= oldLength != newLength;
                        if (change) {
                            if (edits != null) {
                                edits.addReplace(oldLength, newLength);
                            }
                        } else {
                            if (edits != null) {
                                edits.addUnchanged(oldLength);
                            }
                            // Write unchanged text?
                            change = (options & OMIT_UNCHANGED_TEXT) == 0;
                        }
                    }

                    // From here on, "change" means "write this unit to dest".
                    if (change) {
                        dest.append((char)upper);
                        if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                            dest.append('\u0308');  // restore or add a dialytika
                        }
                        if (addTonos) {
                            dest.append('\u0301');
                        }
                        while (numYpogegrammeni > 0) {
                            dest.append('Ι');
                            --numYpogegrammeni;
                        }
                    }
                } else {
                    // No Greek letter data: use the generic full uppercase mapping.
                    c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
                    appendResult(c, dest, nextIndex - i, options, edits);
                }
                i = nextIndex;
                state = nextState;
            }
            return dest;
        }
    }
}