1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package com.ibm.icu.impl; 4 5 import java.io.IOException; 6 import java.text.CharacterIterator; 7 import java.util.Locale; 8 9 import com.ibm.icu.lang.UCharacter; 10 import com.ibm.icu.lang.UCharacterCategory; 11 import com.ibm.icu.text.BreakIterator; 12 import com.ibm.icu.text.Edits; 13 import com.ibm.icu.util.ICUUncheckedIOException; 14 import com.ibm.icu.util.ULocale; 15 16 public final class CaseMapImpl { 17 /** 18 * Implementation of UCaseProps.ContextIterator, iterates over a String. 19 * See ustrcase.c/utf16_caseContextIterator(). 20 */ 21 public static final class StringContextIterator implements UCaseProps.ContextIterator { 22 /** 23 * Constructor. 24 * @param src String to iterate over. 25 */ StringContextIterator(CharSequence src)26 public StringContextIterator(CharSequence src) { 27 this.s=src; 28 limit=src.length(); 29 cpStart=cpLimit=index=0; 30 dir=0; 31 } 32 33 /** 34 * Constructor. 35 * @param src String to iterate over. 36 * @param cpStart Start index of the current code point. 37 * @param cpLimit Limit index of the current code point. 38 */ StringContextIterator(CharSequence src, int cpStart, int cpLimit)39 public StringContextIterator(CharSequence src, int cpStart, int cpLimit) { 40 s = src; 41 index = 0; 42 limit = src.length(); 43 this.cpStart = cpStart; 44 this.cpLimit = cpLimit; 45 dir = 0; 46 } 47 48 /** 49 * Set the iteration limit for nextCaseMapCP() to an index within the string. 50 * If the limit parameter is negative or past the string, then the 51 * string length is restored as the iteration limit. 52 * 53 * <p>This limit does not affect the next() function which always 54 * iterates to the very end of the string. 55 * 56 * @param lim The iteration limit. 
57 */ setLimit(int lim)58 public void setLimit(int lim) { 59 if(0<=lim && lim<=s.length()) { 60 limit=lim; 61 } else { 62 limit=s.length(); 63 } 64 } 65 66 /** 67 * Move to the iteration limit without fetching code points up to there. 68 */ moveToLimit()69 public void moveToLimit() { 70 cpStart=cpLimit=limit; 71 } 72 moveTo(int i)73 public void moveTo(int i) { 74 cpStart=cpLimit=i; 75 } 76 77 /** 78 * Iterate forward through the string to fetch the next code point 79 * to be case-mapped, and set the context indexes for it. 80 * 81 * <p>When the iteration limit is reached (and -1 is returned), 82 * getCPStart() will be at the iteration limit. 83 * 84 * <p>Iteration with next() does not affect the position for nextCaseMapCP(). 85 * 86 * @return The next code point to be case-mapped, or <0 when the iteration is done. 87 */ nextCaseMapCP()88 public int nextCaseMapCP() { 89 cpStart=cpLimit; 90 if(cpLimit<limit) { 91 int c=Character.codePointAt(s, cpLimit); 92 cpLimit+=Character.charCount(c); 93 return c; 94 } else { 95 return -1; 96 } 97 } 98 setCPStartAndLimit(int s, int l)99 public void setCPStartAndLimit(int s, int l) { 100 cpStart = s; 101 cpLimit = l; 102 dir = 0; 103 } 104 /** 105 * Returns the start of the code point that was last returned 106 * by nextCaseMapCP(). 107 */ getCPStart()108 public int getCPStart() { 109 return cpStart; 110 } 111 112 /** 113 * Returns the limit of the code point that was last returned 114 * by nextCaseMapCP(). 
115 */ getCPLimit()116 public int getCPLimit() { 117 return cpLimit; 118 } 119 getCPLength()120 public int getCPLength() { 121 return cpLimit-cpStart; 122 } 123 124 // implement UCaseProps.ContextIterator 125 // The following code is not used anywhere in this private class 126 @Override reset(int direction)127 public void reset(int direction) { 128 if(direction>0) { 129 /* reset for forward iteration */ 130 dir=1; 131 index=cpLimit; 132 } else if(direction<0) { 133 /* reset for backward iteration */ 134 dir=-1; 135 index=cpStart; 136 } else { 137 // not a valid direction 138 dir=0; 139 index=0; 140 } 141 } 142 143 @Override next()144 public int next() { 145 int c; 146 147 if(dir>0 && index<s.length()) { 148 c=Character.codePointAt(s, index); 149 index+=Character.charCount(c); 150 return c; 151 } else if(dir<0 && index>0) { 152 c=Character.codePointBefore(s, index); 153 index-=Character.charCount(c); 154 return c; 155 } 156 return -1; 157 } 158 159 // variables 160 protected CharSequence s; 161 protected int index, limit, cpStart, cpLimit; 162 protected int dir; // 0=initial state >0=forward <0=backward 163 } 164 165 public static final int TITLECASE_WHOLE_STRING = 0x20; 166 public static final int TITLECASE_SENTENCES = 0x40; 167 168 /** 169 * Bit mask for the titlecasing iterator options bit field. 170 * Currently only 3 out of 8 values are used: 171 * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES. 172 * See stringoptions.h. 173 * @internal 174 */ 175 private static final int TITLECASE_ITERATOR_MASK = 0xe0; 176 177 public static final int TITLECASE_ADJUST_TO_CASED = 0x400; 178 179 /** 180 * Bit mask for the titlecasing index adjustment options bit set. 181 * Currently two bits are defined: 182 * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED. 183 * See stringoptions.h. 
184 * @internal 185 */ 186 private static final int TITLECASE_ADJUSTMENT_MASK = 0x600; 187 addTitleAdjustmentOption(int options, int newOption)188 public static int addTitleAdjustmentOption(int options, int newOption) { 189 int adjOptions = options & TITLECASE_ADJUSTMENT_MASK; 190 if (adjOptions !=0 && adjOptions != newOption) { 191 throw new IllegalArgumentException("multiple titlecasing index adjustment options"); 192 } 193 return options | newOption; 194 } 195 196 private static final char ACUTE = '\u0301'; 197 198 private static final int U_GC_M_MASK = 199 (1 << UCharacterCategory.NON_SPACING_MARK) | 200 (1 << UCharacterCategory.COMBINING_SPACING_MARK) | 201 (1 << UCharacterCategory.ENCLOSING_MARK); 202 203 private static final int LNS = 204 (1 << UCharacterCategory.UPPERCASE_LETTER) | 205 (1 << UCharacterCategory.LOWERCASE_LETTER) | 206 (1 << UCharacterCategory.TITLECASE_LETTER) | 207 // Not MODIFIER_LETTER: We count only cased modifier letters. 208 (1 << UCharacterCategory.OTHER_LETTER) | 209 210 (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | 211 (1 << UCharacterCategory.LETTER_NUMBER) | 212 (1 << UCharacterCategory.OTHER_NUMBER) | 213 214 (1 << UCharacterCategory.MATH_SYMBOL) | 215 (1 << UCharacterCategory.CURRENCY_SYMBOL) | 216 (1 << UCharacterCategory.MODIFIER_SYMBOL) | 217 (1 << UCharacterCategory.OTHER_SYMBOL) | 218 219 (1 << UCharacterCategory.PRIVATE_USE); 220 isLNS(int c)221 private static boolean isLNS(int c) { 222 // Letter, number, symbol, 223 // or a private use code point because those are typically used as letters or numbers. 224 // Consider modifier letters only if they are cased. 
225 int gc = UCharacterProperty.INSTANCE.getType(c); 226 return ((1 << gc) & LNS) != 0 || 227 (gc == UCharacterCategory.MODIFIER_LETTER && 228 UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE); 229 } 230 addTitleIteratorOption(int options, int newOption)231 public static int addTitleIteratorOption(int options, int newOption) { 232 int iterOptions = options & TITLECASE_ITERATOR_MASK; 233 if (iterOptions !=0 && iterOptions != newOption) { 234 throw new IllegalArgumentException("multiple titlecasing iterator options"); 235 } 236 return options | newOption; 237 } 238 getTitleBreakIterator( Locale locale, int options, BreakIterator iter)239 public static BreakIterator getTitleBreakIterator( 240 Locale locale, int options, BreakIterator iter) { 241 options &= TITLECASE_ITERATOR_MASK; 242 if (options != 0 && iter != null) { 243 throw new IllegalArgumentException( 244 "titlecasing iterator option together with an explicit iterator"); 245 } 246 if (iter == null) { 247 switch (options) { 248 case 0: 249 iter = BreakIterator.getWordInstance(locale); 250 break; 251 case TITLECASE_WHOLE_STRING: 252 iter = new WholeStringBreakIterator(); 253 break; 254 case TITLECASE_SENTENCES: 255 iter = BreakIterator.getSentenceInstance(locale); 256 break; 257 default: 258 throw new IllegalArgumentException("unknown titlecasing iterator option"); 259 } 260 } 261 return iter; 262 } 263 getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)264 public static BreakIterator getTitleBreakIterator( 265 ULocale locale, int options, BreakIterator iter) { 266 options &= TITLECASE_ITERATOR_MASK; 267 if (options != 0 && iter != null) { 268 throw new IllegalArgumentException( 269 "titlecasing iterator option together with an explicit iterator"); 270 } 271 if (iter == null) { 272 switch (options) { 273 case 0: 274 iter = BreakIterator.getWordInstance(locale); 275 break; 276 case TITLECASE_WHOLE_STRING: 277 iter = new WholeStringBreakIterator(); 278 break; 279 case TITLECASE_SENTENCES: 280 
iter = BreakIterator.getSentenceInstance(locale); 281 break; 282 default: 283 throw new IllegalArgumentException("unknown titlecasing iterator option"); 284 } 285 } 286 return iter; 287 } 288 289 /** 290 * Omit unchanged text when case-mapping with Edits. 291 */ 292 public static final int OMIT_UNCHANGED_TEXT = 0x4000; 293 294 private static final class WholeStringBreakIterator extends BreakIterator { 295 private int length; 296 notImplemented()297 private static void notImplemented() { 298 throw new UnsupportedOperationException("should not occur"); 299 } 300 301 @Override first()302 public int first() { 303 return 0; 304 } 305 306 @Override last()307 public int last() { 308 notImplemented(); 309 return 0; 310 } 311 312 @Override next(int n)313 public int next(int n) { 314 notImplemented(); 315 return 0; 316 } 317 318 @Override next()319 public int next() { 320 return length; 321 } 322 323 @Override previous()324 public int previous() { 325 notImplemented(); 326 return 0; 327 } 328 329 @Override following(int offset)330 public int following(int offset) { 331 notImplemented(); 332 return 0; 333 } 334 335 @Override current()336 public int current() { 337 notImplemented(); 338 return 0; 339 } 340 341 @Override getText()342 public CharacterIterator getText() { 343 notImplemented(); 344 return null; 345 } 346 347 @Override setText(CharacterIterator newText)348 public void setText(CharacterIterator newText) { 349 length = newText.getEndIndex(); 350 } 351 352 @Override setText(CharSequence newText)353 public void setText(CharSequence newText) { 354 length = newText.length(); 355 } 356 357 @Override setText(String newText)358 public void setText(String newText) { 359 length = newText.length(); 360 } 361 } 362 appendCodePoint(Appendable a, int c)363 private static int appendCodePoint(Appendable a, int c) throws IOException { 364 if (c <= Character.MAX_VALUE) { 365 a.append((char)c); 366 return 1; 367 } else { 368 a.append((char)(0xd7c0 + (c >> 10))); 369 
a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); 370 return 2; 371 } 372 } 373 374 /** 375 * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. 376 * @throws IOException 377 */ appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)378 private static void appendResult(int result, Appendable dest, 379 int cpLength, int options, Edits edits) throws IOException { 380 // Decode the result. 381 if (result < 0) { 382 // (not) original code point 383 if (edits != null) { 384 edits.addUnchanged(cpLength); 385 } 386 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 387 return; 388 } 389 appendCodePoint(dest, ~result); 390 } else if (result <= UCaseProps.MAX_STRING_LENGTH) { 391 // The mapping has already been appended to result. 392 if (edits != null) { 393 edits.addReplace(cpLength, result); 394 } 395 } else { 396 // Append the single-code point mapping. 397 int length = appendCodePoint(dest, result); 398 if (edits != null) { 399 edits.addReplace(cpLength, length); 400 } 401 } 402 } 403 appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)404 private static final void appendUnchanged(CharSequence src, int start, int length, 405 Appendable dest, int options, Edits edits) throws IOException { 406 if (length > 0) { 407 if (edits != null) { 408 edits.addUnchanged(length); 409 } 410 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 411 return; 412 } 413 dest.append(src, start, start + length); 414 } 415 } 416 applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)417 private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) { 418 if (!edits.hasChanges()) { 419 return src.toString(); 420 } 421 StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta()); 422 for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { 423 if (ei.hasChange()) { 424 int i = ei.replacementIndex(); 425 
result.append(replacementChars, i, i + ei.newLength()); 426 } else { 427 int i = ei.sourceIndex(); 428 result.append(src, i, i + ei.oldLength()); 429 } 430 } 431 return result.toString(); 432 } 433 434 private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie(); 435 436 /** 437 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. 438 * caseLocale < 0: Case-folds [srcStart..srcLimit[. 439 */ internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)440 private static void internalToLower(int caseLocale, int options, 441 CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, 442 Appendable dest, Edits edits) throws IOException { 443 byte[] latinToLower; 444 if (caseLocale == UCaseProps.LOC_ROOT || 445 (caseLocale >= 0 ? 446 !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) : 447 (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) { 448 latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL; 449 } else { 450 latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT; 451 } 452 int prev = srcStart; 453 int srcIndex = srcStart; 454 outerLoop: 455 for (;;) { 456 // fast path for simple cases 457 char lead; 458 for (;;) { 459 if (srcIndex >= srcLimit) { 460 break outerLoop; 461 } 462 lead = src.charAt(srcIndex); 463 int delta; 464 if (lead < UCaseProps.LatinCase.LONG_S) { 465 byte d = latinToLower[lead]; 466 if (d == UCaseProps.LatinCase.EXC) { break; } 467 ++srcIndex; 468 if (d == 0) { continue; } 469 delta = d; 470 } else if (lead >= 0xd800) { 471 break; // surrogate or higher 472 } else { 473 int props = CASE_TRIE.getFromU16SingleLead(lead); 474 if (UCaseProps.propsHasException(props)) { break; } 475 ++srcIndex; 476 if (!UCaseProps.isUpperOrTitleFromProps(props) || 477 (delta = UCaseProps.getDelta(props)) == 0) { 478 continue; 479 } 480 } 481 lead += delta; 482 
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 483 dest.append(lead); 484 if (edits != null) { 485 edits.addReplace(1, 1); 486 } 487 prev = srcIndex; 488 } 489 // slow path 490 int cpStart = srcIndex++; 491 char trail; 492 int c; 493 if (Character.isHighSurrogate(lead) && srcIndex < srcLimit && 494 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 495 c = Character.toCodePoint(lead, trail); 496 ++srcIndex; 497 } else { 498 c = lead; 499 } 500 // We need to append unchanged text before calling the UCaseProps.toFullXyz() methods 501 // because they will sometimes append their mapping to dest, 502 // and that must be after copying the previous text. 503 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 504 prev = cpStart; 505 if (caseLocale >= 0) { 506 if (iter == null) { 507 iter = new StringContextIterator(src, cpStart, srcIndex); 508 } else { 509 iter.setCPStartAndLimit(cpStart, srcIndex); 510 } 511 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); 512 } else { 513 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); 514 } 515 if (c >= 0) { 516 appendResult(c, dest, srcIndex - cpStart, options, edits); 517 prev = srcIndex; 518 } 519 } 520 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 521 } 522 internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)523 private static void internalToUpper(int caseLocale, int options, 524 CharSequence src, Appendable dest, Edits edits) throws IOException { 525 StringContextIterator iter = null; 526 byte[] latinToUpper; 527 if (caseLocale == UCaseProps.LOC_TURKISH) { 528 latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR; 529 } else { 530 latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL; 531 } 532 int prev = 0; 533 int srcIndex = 0; 534 int srcLength = src.length(); 535 outerLoop: 536 for (;;) { 537 // fast path for simple cases 538 char lead; 539 for (;;) { 540 if (srcIndex >= srcLength) { 541 break outerLoop; 
542 } 543 lead = src.charAt(srcIndex); 544 int delta; 545 if (lead < UCaseProps.LatinCase.LONG_S) { 546 byte d = latinToUpper[lead]; 547 if (d == UCaseProps.LatinCase.EXC) { break; } 548 ++srcIndex; 549 if (d == 0) { continue; } 550 delta = d; 551 } else if (lead >= 0xd800) { 552 break; // surrogate or higher 553 } else { 554 int props = CASE_TRIE.getFromU16SingleLead(lead); 555 if (UCaseProps.propsHasException(props)) { break; } 556 ++srcIndex; 557 if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER || 558 (delta = UCaseProps.getDelta(props)) == 0) { 559 continue; 560 } 561 } 562 lead += delta; 563 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 564 dest.append(lead); 565 if (edits != null) { 566 edits.addReplace(1, 1); 567 } 568 prev = srcIndex; 569 } 570 // slow path 571 int cpStart = srcIndex++; 572 char trail; 573 int c; 574 if (Character.isHighSurrogate(lead) && srcIndex < srcLength && 575 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 576 c = Character.toCodePoint(lead, trail); 577 ++srcIndex; 578 } else { 579 c = lead; 580 } 581 if (iter == null) { 582 iter = new StringContextIterator(src, cpStart, srcIndex); 583 } else { 584 iter.setCPStartAndLimit(cpStart, srcIndex); 585 } 586 // We need to append unchanged text before calling UCaseProps.toFullUpper() 587 // because it will sometimes append its mapping to dest, 588 // and that must be after copying the previous text. 
589 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 590 prev = cpStart; 591 c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); 592 if (c >= 0) { 593 appendResult(c, dest, srcIndex - cpStart, options, edits); 594 prev = srcIndex; 595 } 596 } 597 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 598 } 599 toLower(int caseLocale, int options, CharSequence src)600 public static String toLower(int caseLocale, int options, CharSequence src) { 601 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 602 if (src.length() == 0) { 603 return src.toString(); 604 } 605 // Collect and apply only changes. 606 // Good if no or few changes. Bad (slow) if many changes. 607 Edits edits = new Edits(); 608 StringBuilder replacementChars = toLower( 609 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 610 return applyEdits(src, replacementChars, edits); 611 } else { 612 return toLower(caseLocale, options, src, 613 new StringBuilder(src.length()), null).toString(); 614 } 615 } 616 toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)617 public static <A extends Appendable> A toLower(int caseLocale, int options, 618 CharSequence src, A dest, Edits edits) { 619 try { 620 if (edits != null) { 621 edits.reset(); 622 } 623 internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits); 624 return dest; 625 } catch (IOException e) { 626 throw new ICUUncheckedIOException(e); 627 } 628 } 629 toUpper(int caseLocale, int options, CharSequence src)630 public static String toUpper(int caseLocale, int options, CharSequence src) { 631 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 632 if (src.length() == 0) { 633 return src.toString(); 634 } 635 // Collect and apply only changes. 636 // Good if no or few changes. Bad (slow) if many changes. 
637 Edits edits = new Edits(); 638 StringBuilder replacementChars = toUpper( 639 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 640 return applyEdits(src, replacementChars, edits); 641 } else { 642 return toUpper(caseLocale, options, src, 643 new StringBuilder(src.length()), null).toString(); 644 } 645 } 646 toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)647 public static <A extends Appendable> A toUpper(int caseLocale, int options, 648 CharSequence src, A dest, Edits edits) { 649 try { 650 if (edits != null) { 651 edits.reset(); 652 } 653 if (caseLocale == UCaseProps.LOC_GREEK) { 654 return GreekUpper.toUpper(options, src, dest, edits); 655 } 656 internalToUpper(caseLocale, options, src, dest, edits); 657 return dest; 658 } catch (IOException e) { 659 throw new ICUUncheckedIOException(e); 660 } 661 } 662 toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)663 public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) { 664 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 665 if (src.length() == 0) { 666 return src.toString(); 667 } 668 // Collect and apply only changes. 669 // Good if no or few changes. Bad (slow) if many changes. 
670 Edits edits = new Edits(); 671 StringBuilder replacementChars = toTitle( 672 caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src, 673 new StringBuilder(), edits); 674 return applyEdits(src, replacementChars, edits); 675 } else { 676 return toTitle(caseLocale, options, iter, src, 677 new StringBuilder(src.length()), null).toString(); 678 } 679 } 680 toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)681 public static <A extends Appendable> A toTitle( 682 int caseLocale, int options, BreakIterator titleIter, 683 CharSequence src, A dest, Edits edits) { 684 try { 685 if (edits != null) { 686 edits.reset(); 687 } 688 689 /* set up local variables */ 690 StringContextIterator iter = new StringContextIterator(src); 691 int srcLength = src.length(); 692 int prev=0; 693 boolean isFirstIndex=true; 694 695 /* titlecasing loop */ 696 while(prev<srcLength) { 697 /* find next index where to titlecase */ 698 int index; 699 if(isFirstIndex) { 700 isFirstIndex=false; 701 index=titleIter.first(); 702 } else { 703 index=titleIter.next(); 704 } 705 if(index==BreakIterator.DONE || index>srcLength) { 706 index=srcLength; 707 } 708 709 /* 710 * Segment [prev..index[ into 3 parts: 711 * a) skipped characters (copy as-is) [prev..titleStart[ 712 * b) first letter (titlecase) [titleStart..titleLimit[ 713 * c) subsequent characters (lowercase) [titleLimit..index[ 714 */ 715 if(prev<index) { 716 // Find and copy skipped characters [prev..titleStart[ 717 int titleStart=prev; 718 iter.setLimit(index); 719 int c=iter.nextCaseMapCP(); 720 if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) { 721 // Adjust the titlecasing index to the next cased character, 722 // or to the next letter/number/symbol/private use. 723 // Stop with titleStart<titleLimit<=index 724 // if there is a character to be titlecased, 725 // or else stop with titleStart==titleLimit==index. 
726 boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0; 727 while ((toCased ? 728 UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) : 729 !CaseMapImpl.isLNS(c)) && 730 (c=iter.nextCaseMapCP())>=0) {} 731 // If c<0 then we have only uncased characters in [prev..index[ 732 // and stopped with titleStart==titleLimit==index. 733 titleStart=iter.getCPStart(); 734 if (prev < titleStart) { 735 appendUnchanged(src, prev, titleStart-prev, dest, options, edits); 736 } 737 } 738 739 if(titleStart<index) { 740 // titlecase c which is from [titleStart..titleLimit[ 741 c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale); 742 appendResult(c, dest, iter.getCPLength(), options, edits); 743 744 // Special case Dutch IJ titlecasing 745 int titleLimit; 746 if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) { 747 if (c < 0) { 748 c = ~c; 749 } 750 if (c == 'I' || c == 'Í') { 751 titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits); 752 iter.moveTo(titleLimit); 753 } 754 else { 755 titleLimit = iter.getCPLimit(); 756 } 757 } else { 758 titleLimit = iter.getCPLimit(); 759 } 760 761 // lowercase [titleLimit..index[ 762 if(titleLimit<index) { 763 if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) { 764 // Normal operation: Lowercase the rest of the word. 765 internalToLower(caseLocale, options, 766 src, titleLimit, index, iter, dest, edits); 767 } else { 768 // Optionally just copy the rest of the word unchanged. 769 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits); 770 } 771 iter.moveToLimit(); 772 } 773 } 774 } 775 776 prev=index; 777 } 778 return dest; 779 } catch (IOException e) { 780 throw new ICUUncheckedIOException(e); 781 } 782 } 783 784 /** 785 * Input: c is a letter I with or without acute accent. 786 * start is the index in src after c, and is less than segmentLimit. 
787 * If a plain i/I is followed by a plain j/J, 788 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, 789 * then we output accordingly. 790 * 791 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ 792 * @throws IOException 793 */ maybeTitleDutchIJ( CharSequence src, int c, int start, int segmentLimit, A dest, int options, Edits edits)794 private static <A extends Appendable> int maybeTitleDutchIJ( 795 CharSequence src, int c, int start, int segmentLimit, 796 A dest, int options, Edits edits) throws IOException { 797 assert start < segmentLimit; 798 799 int index = start; 800 boolean withAcute = false; 801 802 // If the conditions are met, then the following variables tell us what to output. 803 int unchanged1 = 0; // code units before the j, or the whole sequence (0..3) 804 boolean doTitleJ = false; // true if the j needs to be titlecased 805 int unchanged2 = 0; // after the j (0 or 1) 806 807 // next character after the first letter 808 char c2 = src.charAt(index++); 809 810 // Is the first letter an i/I with accent? 811 if (c == 'I') { 812 if (c2 == ACUTE) { 813 withAcute = true; 814 unchanged1 = 1; 815 if (index == segmentLimit) { return start; } 816 c2 = src.charAt(index++); 817 } 818 } else { // Í 819 withAcute = true; 820 } 821 // Is the next character a j/J? 822 if (c2 == 'j') { 823 doTitleJ = true; 824 } else if (c2 == 'J') { 825 ++unchanged1; 826 } else { 827 return start; 828 } 829 // A plain i/I must be followed by a plain j/J. 830 // An i/I with acute must be followed by a j/J with acute. 831 if (withAcute) { 832 if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; } 833 if (doTitleJ) { 834 unchanged2 = 1; 835 } else { 836 ++unchanged1; 837 } 838 } 839 // There must not be another combining mark. 
840 if (index < segmentLimit) { 841 int cp = Character.codePointAt(src, index); 842 int bit = 1 << UCharacter.getType(cp); 843 if ((bit & U_GC_M_MASK) != 0) { 844 return start; 845 } 846 } 847 // Output the rest of the Dutch IJ. 848 appendUnchanged(src, start, unchanged1, dest, options, edits); 849 start += unchanged1; 850 if (doTitleJ) { 851 dest.append('J'); 852 if (edits != null) { 853 edits.addReplace(1, 1); 854 } 855 ++start; 856 } 857 appendUnchanged(src, start, unchanged2, dest, options, edits); 858 assert start + unchanged2 == index; 859 return index; 860 } 861 862 public static String fold(int options, CharSequence src) { 863 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 864 if (src.length() == 0) { 865 return src.toString(); 866 } 867 // Collect and apply only changes. 868 // Good if no or few changes. Bad (slow) if many changes. 869 Edits edits = new Edits(); 870 StringBuilder replacementChars = fold( 871 options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 872 return applyEdits(src, replacementChars, edits); 873 } else { 874 return fold(options, src, new StringBuilder(src.length()), null).toString(); 875 } 876 } 877 878 public static <A extends Appendable> A fold(int options, 879 CharSequence src, A dest, Edits edits) { 880 try { 881 if (edits != null) { 882 edits.reset(); 883 } 884 internalToLower(-1, options, src, 0, src.length(), null, dest, edits); 885 return dest; 886 } catch (IOException e) { 887 throw new ICUUncheckedIOException(e); 888 } 889 } 890 891 private static final class GreekUpper { 892 // Data bits. 893 private static final int UPPER_MASK = 0x3ff; 894 private static final int HAS_VOWEL = 0x1000; 895 private static final int HAS_YPOGEGRAMMENI = 0x2000; 896 private static final int HAS_ACCENT = 0x4000; 897 private static final int HAS_DIALYTIKA = 0x8000; 898 // Further bits during data building and processing, not stored in the data map. 
// Flags that are only ever produced for combining marks (see getDiacriticData()).
// Both values lie above bit 15, so they cannot be stored in the char-typed tables below.
private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;

// Convenience combinations of the per-letter flags.
private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
        HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;

// State bits carried from one code point to the next by toUpper().
private static final int AFTER_CASED = 1;
private static final int AFTER_VOWEL_WITH_ACCENT = 2;

// Data generated by prototype code, see
// https://icu.unicode.org/design/case/greek-upper
// TODO: Move this data into ucase.icu.
// Each entry is the uppercase base letter combined with HAS_* flags; 0 means "no data".
private static final char[] data0370 = {
    // U+0370..03FF
    0x0370,  // Ͱ
    0x0370,  // ͱ
    0x0372,  // Ͳ
    0x0372,  // ͳ
    0,
    0,
    0x0376,  // Ͷ
    0x0376,  // ͷ
    0,
    0,
    0x037A,  // ͺ
    0x03FD,  // ͻ
    0x03FE,  // ͼ
    0x03FF,  // ͽ
    0,
    0x037F,  // Ϳ
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
    0x0391 | HAS_VOWEL,  // Α
    0x0392,  // Β
    0x0393,  // Γ
    0x0394,  // Δ
    0x0395 | HAS_VOWEL,  // Ε
    0x0396,  // Ζ
    0x0397 | HAS_VOWEL,  // Η
    0x0398,  // Θ
    0x0399 | HAS_VOWEL,  // Ι
    0x039A,  // Κ
    0x039B,  // Λ
    0x039C,  // Μ
    0x039D,  // Ν
    0x039E,  // Ξ
    0x039F | HAS_VOWEL,  // Ο
    0x03A0,  // Π
    0x03A1,  // Ρ
    0,
    0x03A3,  // Σ
    0x03A4,  // Τ
    0x03A5 | HAS_VOWEL,  // Υ
    0x03A6,  // Φ
    0x03A7,  // Χ
    0x03A8,  // Ψ
    0x03A9 | HAS_VOWEL,  // Ω
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϊ
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϋ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
    0x0391 | HAS_VOWEL,  // α
    0x0392,  // β
    0x0393,  // γ
    0x0394,  // δ
    0x0395 | HAS_VOWEL,  // ε
    0x0396,  // ζ
    0x0397 | HAS_VOWEL,  // η
    0x0398,  // θ
    0x0399 | HAS_VOWEL,  // ι
    0x039A,  // κ
    0x039B,  // λ
    0x039C,  // μ
    0x039D,  // ν
    0x039E,  // ξ
    0x039F | HAS_VOWEL,  // ο
    0x03A0,  // π
    0x03A1,  // ρ
    0x03A3,  // ς
    0x03A3,  // σ
    0x03A4,  // τ
    0x03A5 | HAS_VOWEL,  // υ
    0x03A6,  // φ
    0x03A7,  // χ
    0x03A8,  // ψ
    0x03A9 | HAS_VOWEL,  // ω
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // ϊ
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // ϋ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
    0x03CF,  // Ϗ
    0x0392,  // ϐ
    0x0398,  // ϑ
    0x03D2,  // ϒ
    0x03D2 | HAS_ACCENT,  // ϓ
    0x03D2 | HAS_DIALYTIKA,  // ϔ
    0x03A6,  // ϕ
    0x03A0,  // ϖ
    0x03CF,  // ϗ
    0x03D8,  // Ϙ
    0x03D8,  // ϙ
    0x03DA,  // Ϛ
    0x03DA,  // ϛ
    0x03DC,  // Ϝ
    0x03DC,  // ϝ
    0x03DE,  // Ϟ
    0x03DE,  // ϟ
    0x03E0,  // Ϡ
    0x03E0,  // ϡ
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0x039A,  // ϰ
    0x03A1,  // ϱ
    0x03F9,  // ϲ
    0x037F,  // ϳ
    0x03F4,  // ϴ
    0x0395 | HAS_VOWEL,  // ϵ
    0,
    0x03F7,  // Ϸ
    0x03F7,  // ϸ
    0x03F9,  // Ϲ
    0x03FA,  // Ϻ
    0x03FA,  // ϻ
    0x03FC,  // ϼ
    0x03FD,  // Ͻ
    0x03FE,  // Ͼ
    0x03FF,  // Ͽ
};

private static final char[] data1F00 = {
    // U+1F00..1FFF
    0x0391 | HAS_VOWEL,  // ἀ
    0x0391 | HAS_VOWEL,  // ἁ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἂ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἃ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἄ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἅ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἆ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἇ
    0x0391 | HAS_VOWEL,  // Ἀ
    0x0391 | HAS_VOWEL,  // Ἁ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἂ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἃ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἄ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἅ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἆ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἇ
    0x0395 | HAS_VOWEL,  // ἐ
    0x0395 | HAS_VOWEL,  // ἑ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἒ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἓ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἔ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἕ
    0,
    0,
    0x0395 | HAS_VOWEL,  // Ἐ
    0x0395 | HAS_VOWEL,  // Ἑ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἒ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἓ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἔ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἕ
    0,
    0,
    0x0397 | HAS_VOWEL,  // ἠ
    0x0397 | HAS_VOWEL,  // ἡ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἢ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἣ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἤ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἥ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἦ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἧ
    0x0397 | HAS_VOWEL,  // Ἠ
    0x0397 | HAS_VOWEL,  // Ἡ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἢ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἣ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἤ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἥ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἦ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἧ
    0x0399 | HAS_VOWEL,  // ἰ
    0x0399 | HAS_VOWEL,  // ἱ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἲ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἳ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἴ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἵ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἶ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἷ
    0x0399 | HAS_VOWEL,  // Ἰ
    0x0399 | HAS_VOWEL,  // Ἱ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἲ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἳ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἴ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἵ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἶ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἷ
    0x039F | HAS_VOWEL,  // ὀ
    0x039F | HAS_VOWEL,  // ὁ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ὂ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ὃ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ὄ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ὅ
    0,
    0,
    0x039F | HAS_VOWEL,  // Ὀ
    0x039F | HAS_VOWEL,  // Ὁ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὂ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὃ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὄ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὅ
    0,
    0,
    0x03A5 | HAS_VOWEL,  // ὐ
    0x03A5 | HAS_VOWEL,  // ὑ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὒ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὓ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὔ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὕ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὖ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὗ
    0,
    0x03A5 | HAS_VOWEL,  // Ὑ
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὓ
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὕ
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὗ
    0x03A9 | HAS_VOWEL,  // ὠ
    0x03A9 | HAS_VOWEL,  // ὡ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὢ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὣ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὤ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὥ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὦ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὧ
    0x03A9 | HAS_VOWEL,  // Ὠ
    0x03A9 | HAS_VOWEL,  // Ὡ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὢ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὣ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὤ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὥ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὦ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὧ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ὰ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // ὲ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ὴ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ὶ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ὸ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὺ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὼ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾀ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾁ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾂ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾃ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾄ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾅ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾆ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾇ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾈ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾉ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾊ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾋ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾌ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾍ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾎ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾏ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾐ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾑ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾒ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾓ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾔ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾕ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾖ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾗ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾘ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾙ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾚ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾛ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾜ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾝ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾞ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾟ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾠ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾡ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾢ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾣ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾤ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾥ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾦ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾧ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾨ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾩ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾪ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾫ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾬ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾭ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾮ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾯ
    0x0391 | HAS_VOWEL,  // ᾰ
    0x0391 | HAS_VOWEL,  // ᾱ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾲ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾳ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾴ
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // ᾶ
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾷ
    0x0391 | HAS_VOWEL,  // Ᾰ
    0x0391 | HAS_VOWEL,  // Ᾱ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ὰ
    0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾼ
    0,
    0x0399 | HAS_VOWEL,  // ι
    0,
    0,
    0,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῂ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῃ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῄ
    0,
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // ῆ
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῇ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ὲ
    0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ὴ
    0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῌ
    0,
    0,
    0,
    0x0399 | HAS_VOWEL,  // ῐ
    0x0399 | HAS_VOWEL,  // ῑ
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῒ
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
    0,
    0,
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // ῖ
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῗ
    0x0399 | HAS_VOWEL,  // Ῐ
    0x0399 | HAS_VOWEL,  // Ῑ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ὶ
    0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
    0,
    0,
    0,
    0,
    0x03A5 | HAS_VOWEL,  // ῠ
    0x03A5 | HAS_VOWEL,  // ῡ
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῢ
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
    0x03A1,  // ῤ
    0x03A1,  // ῥ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ῦ
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῧ
    0x03A5 | HAS_VOWEL,  // Ῠ
    0x03A5 | HAS_VOWEL,  // Ῡ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὺ
    0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
    0x03A1,  // Ῥ
    0,
    0,
    0,
    0,
    0,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῲ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῳ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῴ
    0,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ῶ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῷ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὸ
    0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὼ
    0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῼ
    0,
    0,
    0,
};

// U+2126 Ohm sign
private static final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω

/**
 * Looks up the Greek letter data for a code point.
 *
 * @param c the code point to look up
 * @return the table entry for U+0370..03FF, U+1F00..1FFF or U+2126;
 *         0 for every other code point
 */
private static final int getLetterData(int c) {
    if (0x370 <= c && c <= 0x3ff) {
        return data0370[c - 0x370];
    }
    if (0x1f00 <= c && c <= 0x1fff) {
        return data1F00[c - 0x1f00];
    }
    return (c == 0x2126) ? data2126 : 0;
}

/**
 * Classifies a combining mark that may follow a Greek letter.
 * Covers the Greek combining diacritics listed in The Unicode Standard,
 * version 8, chapter 7.2 Greek, plus some perispomeni look-alikes.
 *
 * @param c the code unit or code point to classify
 * @return a non-zero combination of HAS_* flags for a recognized diacritic, otherwise 0
 */
private static final int getDiacriticData(int c) {
    switch (c) {
    case 0x0300:  // varia
    case 0x0301:  // tonos = oxia
    case 0x0302:  // circumflex can look like perispomeni
    case 0x0303:  // tilde can look like perispomeni
    case 0x0311:  // inverted breve can look like perispomeni
    case 0x0342:  // perispomeni
        return HAS_ACCENT;
    case 0x0304:  // macron
    case 0x0306:  // breve
    case 0x0313:  // comma above
    case 0x0314:  // reversed comma above
    case 0x0343:  // koronis
        return HAS_OTHER_GREEK_DIACRITIC;
    case 0x0308:  // dialytika = diaeresis
        return HAS_COMBINING_DIALYTIKA;
    case 0x0344:  // dialytika tonos
        return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
    case 0x0345:  // ypogegrammeni = iota subscript
        return HAS_YPOGEGRAMMENI;
    default:
        return 0;
    }
}

/**
 * Tests whether the text starting at index i continues with a cased letter,
 * skipping over any case-ignorable code points first.
 *
 * @param s the text to inspect
 * @param i the index at which to start looking
 * @return true if the first code point that is not case-ignorable is cased;
 *         false if it is uncased or the end of the text is reached
 */
private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
    for (int pos = i; pos < s.length();) {
        int cp = Character.codePointAt(s, pos);
        int type = UCaseProps.INSTANCE.getTypeOrIgnorable(cp);
        if ((type & UCaseProps.IGNORABLE) == 0) {
            // The first code point that is not case-ignorable decides the answer.
            return type != UCaseProps.NONE;
        }
        pos += Character.charCount(cp);
    }
    return false;  // Ran off the end without finding a cased letter.
}

/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
 *
 * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
 *
 * @param options option bits; OMIT_UNCHANGED_TEXT suppresses writing of unchanged text
 * @param src the text to uppercase
 * @param dest receives the output
 * @param edits if not null, records a replace or unchanged span for each Greek letter
 *              sequence handled here
 * @return dest
 * @throws IOException if appending to dest fails
 */
private static <A extends Appendable> A toUpper(int options,
        CharSequence src, A dest, Edits edits) throws IOException {
    int state = 0;
    int start = 0;
    while (start < src.length()) {
        final int cp = Character.codePointAt(src, start);
        int end = start + Character.charCount(cp);

        // Work out the "after cased letter" context that the next code point will see.
        int nextState = 0;
        final int caseType = UCaseProps.INSTANCE.getTypeOrIgnorable(cp);
        if ((caseType & UCaseProps.IGNORABLE) != 0) {
            // Case-ignorable: carry the previous context through.
            nextState |= state & AFTER_CASED;
        } else if (caseType != UCaseProps.NONE) {
            // Cased.
            nextState |= AFTER_CASED;
        }

        int flags = getLetterData(cp);
        if (flags <= 0) {
            // No Greek letter data: use the generic full uppercase mapping.
            int result = UCaseProps.INSTANCE.toFullUpper(cp, null, dest, UCaseProps.LOC_GREEK);
            appendResult(result, dest, end - start, options, edits);
            start = end;
            state = nextState;
            continue;
        }

        int upper = flags & UPPER_MASK;
        // Add a dialytika to this iota or ypsilon vowel
        // if we removed a tonos from the previous vowel,
        // and that previous vowel did not also have (or gain) a dialytika.
        // Adding one only to the final vowel in a longer sequence
        // (which does not occur in normal writing) would require lookahead.
        // Set the same flag as for preserving an existing dialytika.
        if ((flags & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                (upper == 'Ι' || upper == 'Υ')) {
            flags |= HAS_DIALYTIKA;
        }

        // Each ypogegrammeni is mapped to a trailing, spacing, capital iota.
        int numYpogegrammeni = ((flags & HAS_YPOGEGRAMMENI) != 0) ? 1 : 0;

        // Absorb the combining diacritics that follow this Greek letter.
        while (end < src.length()) {
            int mark = getDiacriticData(src.charAt(end));
            if (mark == 0) {
                break;  // not a Greek diacritic
            }
            flags |= mark;
            if ((mark & HAS_YPOGEGRAMMENI) != 0) {
                ++numYpogegrammeni;
            }
            ++end;
        }
        if ((flags & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
            nextState |= AFTER_VOWEL_WITH_ACCENT;
        }

        // Map according to Greek rules.
        boolean addTonos = false;
        final boolean isStandaloneAccentedEta =
                upper == 'Η' &&
                (flags & HAS_ACCENT) != 0 &&
                numYpogegrammeni == 0 &&
                (state & AFTER_CASED) == 0 &&
                !isFollowedByCasedLetter(src, end);
        if (isStandaloneAccentedEta) {
            // Keep disjunctive "or" with (only) a tonos.
            // We use the same "word boundary" conditions as for the Final_Sigma test.
            // NOTE(review): end starts at start + charCount(cp) and only grows, so
            // start == end looks unreachable here -- confirm against the C++ versions
            // before changing anything.
            if (start == end) {
                upper = 'Ή';  // Preserve the precomposed form.
            } else {
                addTonos = true;
            }
        } else if ((flags & HAS_DIALYTIKA) != 0) {
            // Preserve a vowel with dialytika in precomposed form if it exists.
            if (upper == 'Ι' || upper == 'Υ') {
                upper = (upper == 'Ι') ? 'Ϊ' : 'Ϋ';
                flags &= ~HAS_EITHER_DIALYTIKA;
            }
        }

        boolean write;
        if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
            write = true;  // common, simple usage
        } else {
            // Find out first whether we are changing the text.
            boolean changed = src.charAt(start) != upper || numYpogegrammeni > 0;
            int expected = start + 1;  // index where the next output mark would sit in src
            if ((flags & HAS_EITHER_DIALYTIKA) != 0) {
                changed |= expected >= end || src.charAt(expected) != 0x308;
                ++expected;
            }
            if (addTonos) {
                changed |= expected >= end || src.charAt(expected) != 0x301;
                ++expected;
            }
            final int oldLength = end - start;
            final int newLength = (expected - start) + numYpogegrammeni;
            changed |= oldLength != newLength;
            if (edits != null) {
                if (changed) {
                    edits.addReplace(oldLength, newLength);
                } else {
                    edits.addUnchanged(oldLength);
                }
            }
            // Unchanged text is still written unless the caller asked to omit it.
            write = changed || (options & OMIT_UNCHANGED_TEXT) == 0;
        }

        if (write) {
            dest.append((char)upper);
            if ((flags & HAS_EITHER_DIALYTIKA) != 0) {
                dest.append('\u0308');  // restore or add a dialytika
            }
            if (addTonos) {
                dest.append('\u0301');
            }
            for (int k = 0; k < numYpogegrammeni; ++k) {
                dest.append('Ι');
            }
        }
        start = end;
        state = nextState;
    }
    return dest;
}
}
}