• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package com.ibm.icu.impl;
4 
5 import java.io.IOException;
6 import java.text.CharacterIterator;
7 import java.util.Locale;
8 
9 import com.ibm.icu.lang.UCharacter;
10 import com.ibm.icu.lang.UCharacterCategory;
11 import com.ibm.icu.text.BreakIterator;
12 import com.ibm.icu.text.Edits;
13 import com.ibm.icu.util.ICUUncheckedIOException;
14 import com.ibm.icu.util.ULocale;
15 
16 public final class CaseMapImpl {
17     /**
18      * Implementation of UCaseProps.ContextIterator, iterates over a String.
19      * See ustrcase.c/utf16_caseContextIterator().
20      */
21     public static final class StringContextIterator implements UCaseProps.ContextIterator {
22         /**
23          * Constructor.
24          * @param src String to iterate over.
25          */
StringContextIterator(CharSequence src)26         public StringContextIterator(CharSequence src) {
27             this.s=src;
28             limit=src.length();
29             cpStart=cpLimit=index=0;
30             dir=0;
31         }
32 
33         /**
34          * Constructor.
35          * @param src String to iterate over.
36          * @param cpStart Start index of the current code point.
37          * @param cpLimit Limit index of the current code point.
38          */
StringContextIterator(CharSequence src, int cpStart, int cpLimit)39         public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
40             s = src;
41             index = 0;
42             limit = src.length();
43             this.cpStart = cpStart;
44             this.cpLimit = cpLimit;
45             dir = 0;
46         }
47 
48         /**
49          * Set the iteration limit for nextCaseMapCP() to an index within the string.
50          * If the limit parameter is negative or past the string, then the
51          * string length is restored as the iteration limit.
52          *
53          * <p>This limit does not affect the next() function which always
54          * iterates to the very end of the string.
55          *
56          * @param lim The iteration limit.
57          */
setLimit(int lim)58         public void setLimit(int lim) {
59             if(0<=lim && lim<=s.length()) {
60                 limit=lim;
61             } else {
62                 limit=s.length();
63             }
64         }
65 
66         /**
67          * Move to the iteration limit without fetching code points up to there.
68          */
moveToLimit()69         public void moveToLimit() {
70             cpStart=cpLimit=limit;
71         }
72 
moveTo(int i)73         public void moveTo(int i) {
74             cpStart=cpLimit=i;
75         }
76 
77         /**
78          * Iterate forward through the string to fetch the next code point
79          * to be case-mapped, and set the context indexes for it.
80          *
81          * <p>When the iteration limit is reached (and -1 is returned),
82          * getCPStart() will be at the iteration limit.
83          *
84          * <p>Iteration with next() does not affect the position for nextCaseMapCP().
85          *
86          * @return The next code point to be case-mapped, or <0 when the iteration is done.
87          */
nextCaseMapCP()88         public int nextCaseMapCP() {
89             cpStart=cpLimit;
90             if(cpLimit<limit) {
91                 int c=Character.codePointAt(s, cpLimit);
92                 cpLimit+=Character.charCount(c);
93                 return c;
94             } else {
95                 return -1;
96             }
97         }
98 
setCPStartAndLimit(int s, int l)99         public void setCPStartAndLimit(int s, int l) {
100             cpStart = s;
101             cpLimit = l;
102             dir = 0;
103         }
104         /**
105          * Returns the start of the code point that was last returned
106          * by nextCaseMapCP().
107          */
getCPStart()108         public int getCPStart() {
109             return cpStart;
110         }
111 
112         /**
113          * Returns the limit of the code point that was last returned
114          * by nextCaseMapCP().
115          */
getCPLimit()116         public int getCPLimit() {
117             return cpLimit;
118         }
119 
getCPLength()120         public int getCPLength() {
121             return cpLimit-cpStart;
122         }
123 
124         // implement UCaseProps.ContextIterator
125         // The following code is not used anywhere in this private class
126         @Override
reset(int direction)127         public void reset(int direction) {
128             if(direction>0) {
129                 /* reset for forward iteration */
130                 dir=1;
131                 index=cpLimit;
132             } else if(direction<0) {
133                 /* reset for backward iteration */
134                 dir=-1;
135                 index=cpStart;
136             } else {
137                 // not a valid direction
138                 dir=0;
139                 index=0;
140             }
141         }
142 
143         @Override
next()144         public int next() {
145             int c;
146 
147             if(dir>0 && index<s.length()) {
148                 c=Character.codePointAt(s, index);
149                 index+=Character.charCount(c);
150                 return c;
151             } else if(dir<0 && index>0) {
152                 c=Character.codePointBefore(s, index);
153                 index-=Character.charCount(c);
154                 return c;
155             }
156             return -1;
157         }
158 
159         // variables
160         protected CharSequence s;
161         protected int index, limit, cpStart, cpLimit;
162         protected int dir; // 0=initial state  >0=forward  <0=backward
163     }
164 
165     public static final int TITLECASE_WHOLE_STRING = 0x20;
166     public static final int TITLECASE_SENTENCES = 0x40;
167 
168     /**
169      * Bit mask for the titlecasing iterator options bit field.
170      * Currently only 3 out of 8 values are used:
171      * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
172      * See stringoptions.h.
173      * @internal
174      */
175     private static final int TITLECASE_ITERATOR_MASK = 0xe0;
176 
177     public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
178 
179     /**
180      * Bit mask for the titlecasing index adjustment options bit set.
181      * Currently two bits are defined:
182      * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
183      * See stringoptions.h.
184      * @internal
185      */
186     private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
187 
addTitleAdjustmentOption(int options, int newOption)188     public static int addTitleAdjustmentOption(int options, int newOption) {
189         int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
190         if (adjOptions !=0 && adjOptions != newOption) {
191             throw new IllegalArgumentException("multiple titlecasing index adjustment options");
192         }
193         return options | newOption;
194     }
195 
    /** U+0301, the combining acute accent. */
    private static final char ACUTE = '\u0301';

    /** Bit set (one bit per general category value) covering the three mark categories. */
    private static final int U_GC_M_MASK =
            (1 << UCharacterCategory.NON_SPACING_MARK) |
            (1 << UCharacterCategory.COMBINING_SPACING_MARK) |
            (1 << UCharacterCategory.ENCLOSING_MARK);

    /**
     * Bit set (one bit per general category value) of the letter, number, symbol
     * and private use categories that isLNS() accepts unconditionally.
     */
    private static final int LNS =
            (1 << UCharacterCategory.UPPERCASE_LETTER) |
            (1 << UCharacterCategory.LOWERCASE_LETTER) |
            (1 << UCharacterCategory.TITLECASE_LETTER) |
            // Not MODIFIER_LETTER: We count only cased modifier letters.
            (1 << UCharacterCategory.OTHER_LETTER) |

            (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
            (1 << UCharacterCategory.LETTER_NUMBER) |
            (1 << UCharacterCategory.OTHER_NUMBER) |

            (1 << UCharacterCategory.MATH_SYMBOL) |
            (1 << UCharacterCategory.CURRENCY_SYMBOL) |
            (1 << UCharacterCategory.MODIFIER_SYMBOL) |
            (1 << UCharacterCategory.OTHER_SYMBOL) |

            (1 << UCharacterCategory.PRIVATE_USE);
220 
isLNS(int c)221     private static boolean isLNS(int c) {
222         // Letter, number, symbol,
223         // or a private use code point because those are typically used as letters or numbers.
224         // Consider modifier letters only if they are cased.
225         int gc = UCharacterProperty.INSTANCE.getType(c);
226         return ((1 << gc) & LNS) != 0 ||
227                 (gc == UCharacterCategory.MODIFIER_LETTER &&
228                     UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
229     }
230 
addTitleIteratorOption(int options, int newOption)231     public static int addTitleIteratorOption(int options, int newOption) {
232         int iterOptions = options & TITLECASE_ITERATOR_MASK;
233         if (iterOptions !=0 && iterOptions != newOption) {
234             throw new IllegalArgumentException("multiple titlecasing iterator options");
235         }
236         return options | newOption;
237     }
238 
getTitleBreakIterator( Locale locale, int options, BreakIterator iter)239     public static BreakIterator getTitleBreakIterator(
240             Locale locale, int options, BreakIterator iter) {
241         options &= TITLECASE_ITERATOR_MASK;
242         if (options != 0 && iter != null) {
243             throw new IllegalArgumentException(
244                     "titlecasing iterator option together with an explicit iterator");
245         }
246         if (iter == null) {
247             switch (options) {
248             case 0:
249                 iter = BreakIterator.getWordInstance(locale);
250                 break;
251             case TITLECASE_WHOLE_STRING:
252                 iter = new WholeStringBreakIterator();
253                 break;
254             case TITLECASE_SENTENCES:
255                 iter = BreakIterator.getSentenceInstance(locale);
256                 break;
257             default:
258                 throw new IllegalArgumentException("unknown titlecasing iterator option");
259             }
260         }
261         return iter;
262     }
263 
getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)264     public static BreakIterator getTitleBreakIterator(
265             ULocale locale, int options, BreakIterator iter) {
266         options &= TITLECASE_ITERATOR_MASK;
267         if (options != 0 && iter != null) {
268             throw new IllegalArgumentException(
269                     "titlecasing iterator option together with an explicit iterator");
270         }
271         if (iter == null) {
272             switch (options) {
273             case 0:
274                 iter = BreakIterator.getWordInstance(locale);
275                 break;
276             case TITLECASE_WHOLE_STRING:
277                 iter = new WholeStringBreakIterator();
278                 break;
279             case TITLECASE_SENTENCES:
280                 iter = BreakIterator.getSentenceInstance(locale);
281                 break;
282             default:
283                 throw new IllegalArgumentException("unknown titlecasing iterator option");
284             }
285         }
286         return iter;
287     }
288 
    /**
     * Omit unchanged text when case-mapping with Edits.
     * When this bit is set in the options, the append helpers in this class still record
     * unchanged spans in the Edits (if any) but do not copy them to the destination.
     */
    public static final int OMIT_UNCHANGED_TEXT = 0x4000;
293 
294     private static final class WholeStringBreakIterator extends BreakIterator {
295         private int length;
296 
notImplemented()297         private static void notImplemented() {
298             throw new UnsupportedOperationException("should not occur");
299         }
300 
301         @Override
first()302         public int first() {
303             return 0;
304         }
305 
306         @Override
last()307         public int last() {
308             notImplemented();
309             return 0;
310         }
311 
312         @Override
next(int n)313         public int next(int n) {
314             notImplemented();
315             return 0;
316         }
317 
318         @Override
next()319         public int next() {
320             return length;
321         }
322 
323         @Override
previous()324         public int previous() {
325             notImplemented();
326             return 0;
327         }
328 
329         @Override
following(int offset)330         public int following(int offset) {
331             notImplemented();
332             return 0;
333         }
334 
335         @Override
current()336         public int current() {
337             notImplemented();
338             return 0;
339         }
340 
341         @Override
getText()342         public CharacterIterator getText() {
343             notImplemented();
344             return null;
345         }
346 
347         @Override
setText(CharacterIterator newText)348         public void setText(CharacterIterator newText) {
349             length = newText.getEndIndex();
350         }
351 
352         @Override
setText(CharSequence newText)353         public void setText(CharSequence newText) {
354             length = newText.length();
355         }
356 
357         @Override
setText(String newText)358         public void setText(String newText) {
359             length = newText.length();
360         }
361     }
362 
appendCodePoint(Appendable a, int c)363     private static int appendCodePoint(Appendable a, int c) throws IOException {
364         if (c <= Character.MAX_VALUE) {
365             a.append((char)c);
366             return 1;
367         } else {
368             a.append((char)(0xd7c0 + (c >> 10)));
369             a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff)));
370             return 2;
371         }
372     }
373 
374     /**
375      * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
376      * @throws IOException
377      */
appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)378     private static void appendResult(int result, Appendable dest,
379             int cpLength, int options, Edits edits) throws IOException {
380         // Decode the result.
381         if (result < 0) {
382             // (not) original code point
383             if (edits != null) {
384                 edits.addUnchanged(cpLength);
385             }
386             if ((options & OMIT_UNCHANGED_TEXT) != 0) {
387                 return;
388             }
389             appendCodePoint(dest, ~result);
390         } else if (result <= UCaseProps.MAX_STRING_LENGTH) {
391             // The mapping has already been appended to result.
392             if (edits != null) {
393                 edits.addReplace(cpLength, result);
394             }
395         } else {
396             // Append the single-code point mapping.
397             int length = appendCodePoint(dest, result);
398             if (edits != null) {
399                 edits.addReplace(cpLength, length);
400             }
401         }
402     }
403 
appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)404     private static final void appendUnchanged(CharSequence src, int start, int length,
405             Appendable dest, int options, Edits edits) throws IOException {
406         if (length > 0) {
407             if (edits != null) {
408                 edits.addUnchanged(length);
409             }
410             if ((options & OMIT_UNCHANGED_TEXT) != 0) {
411                 return;
412             }
413             dest.append(src, start, start + length);
414         }
415     }
416 
applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)417     private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) {
418         if (!edits.hasChanges()) {
419             return src.toString();
420         }
421         StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta());
422         for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) {
423             if (ei.hasChange()) {
424                 int i = ei.replacementIndex();
425                 result.append(replacementChars, i, i + ei.newLength());
426             } else {
427                 int i = ei.sourceIndex();
428                 result.append(src, i, i + ei.oldLength());
429             }
430         }
431         return result.toString();
432     }
433 
    // Trie obtained from UCaseProps; the fast paths below look up per-char case properties in it.
    private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
435 
436     /**
437      * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
438      * caseLocale < 0: Case-folds [srcStart..srcLimit[.
439      */
internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)440     private static void internalToLower(int caseLocale, int options,
441             CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
442             Appendable dest, Edits edits) throws IOException {
443         byte[] latinToLower;
444         if (caseLocale == UCaseProps.LOC_ROOT ||
445                 (caseLocale >= 0 ?
446                     !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
447                     (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
448             latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
449         } else {
450             latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
451         }
452         int prev = srcStart;
453         int srcIndex = srcStart;
454         outerLoop:
455         for (;;) {
456             // fast path for simple cases
457             char lead;
458             for (;;) {
459                 if (srcIndex >= srcLimit) {
460                     break outerLoop;
461                 }
462                 lead = src.charAt(srcIndex);
463                 int delta;
464                 if (lead < UCaseProps.LatinCase.LONG_S) {
465                     byte d = latinToLower[lead];
466                     if (d == UCaseProps.LatinCase.EXC) { break; }
467                     ++srcIndex;
468                     if (d == 0) { continue; }
469                     delta = d;
470                 } else if (lead >= 0xd800) {
471                     break;  // surrogate or higher
472                 } else {
473                     int props = CASE_TRIE.getFromU16SingleLead(lead);
474                     if (UCaseProps.propsHasException(props)) { break; }
475                     ++srcIndex;
476                     if (!UCaseProps.isUpperOrTitleFromProps(props) ||
477                             (delta = UCaseProps.getDelta(props)) == 0) {
478                         continue;
479                     }
480                 }
481                 lead += delta;
482                 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
483                 dest.append(lead);
484                 if (edits != null) {
485                     edits.addReplace(1, 1);
486                 }
487                 prev = srcIndex;
488             }
489             // slow path
490             int cpStart = srcIndex++;
491             char trail;
492             int c;
493             if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
494                     Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
495                 c = Character.toCodePoint(lead, trail);
496                 ++srcIndex;
497             } else {
498                 c = lead;
499             }
500             // We need to append unchanged text before calling the UCaseProps.toFullXyz() methods
501             // because they will sometimes append their mapping to dest,
502             // and that must be after copying the previous text.
503             appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
504             prev = cpStart;
505             if (caseLocale >= 0) {
506                 if (iter == null) {
507                     iter = new StringContextIterator(src, cpStart, srcIndex);
508                 } else {
509                     iter.setCPStartAndLimit(cpStart, srcIndex);
510                 }
511                 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
512             } else {
513                 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
514             }
515             if (c >= 0) {
516                 appendResult(c, dest, srcIndex - cpStart, options, edits);
517                 prev = srcIndex;
518             }
519         }
520         appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
521     }
522 
internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)523     private static void internalToUpper(int caseLocale, int options,
524             CharSequence src, Appendable dest, Edits edits) throws IOException {
525         StringContextIterator iter = null;
526         byte[] latinToUpper;
527         if (caseLocale == UCaseProps.LOC_TURKISH) {
528             latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
529         } else {
530             latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
531         }
532         int prev = 0;
533         int srcIndex = 0;
534         int srcLength = src.length();
535         outerLoop:
536         for (;;) {
537             // fast path for simple cases
538             char lead;
539             for (;;) {
540                 if (srcIndex >= srcLength) {
541                     break outerLoop;
542                 }
543                 lead = src.charAt(srcIndex);
544                 int delta;
545                 if (lead < UCaseProps.LatinCase.LONG_S) {
546                     byte d = latinToUpper[lead];
547                     if (d == UCaseProps.LatinCase.EXC) { break; }
548                     ++srcIndex;
549                     if (d == 0) { continue; }
550                     delta = d;
551                 } else if (lead >= 0xd800) {
552                     break;  // surrogate or higher
553                 } else {
554                     int props = CASE_TRIE.getFromU16SingleLead(lead);
555                     if (UCaseProps.propsHasException(props)) { break; }
556                     ++srcIndex;
557                     if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
558                             (delta = UCaseProps.getDelta(props)) == 0) {
559                         continue;
560                     }
561                 }
562                 lead += delta;
563                 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
564                 dest.append(lead);
565                 if (edits != null) {
566                     edits.addReplace(1, 1);
567                 }
568                 prev = srcIndex;
569             }
570             // slow path
571             int cpStart = srcIndex++;
572             char trail;
573             int c;
574             if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
575                     Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
576                 c = Character.toCodePoint(lead, trail);
577                 ++srcIndex;
578             } else {
579                 c = lead;
580             }
581             if (iter == null) {
582                 iter = new StringContextIterator(src, cpStart, srcIndex);
583             } else {
584                 iter.setCPStartAndLimit(cpStart, srcIndex);
585             }
586             // We need to append unchanged text before calling UCaseProps.toFullUpper()
587             // because it will sometimes append its mapping to dest,
588             // and that must be after copying the previous text.
589             appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
590             prev = cpStart;
591             c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
592             if (c >= 0) {
593                 appendResult(c, dest, srcIndex - cpStart, options, edits);
594                 prev = srcIndex;
595             }
596         }
597         appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
598     }
599 
toLower(int caseLocale, int options, CharSequence src)600     public static String toLower(int caseLocale, int options, CharSequence src) {
601         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
602             if (src.length() == 0) {
603                 return src.toString();
604             }
605             // Collect and apply only changes.
606             // Good if no or few changes. Bad (slow) if many changes.
607             Edits edits = new Edits();
608             StringBuilder replacementChars = toLower(
609                     caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
610             return applyEdits(src, replacementChars, edits);
611         } else {
612             return toLower(caseLocale, options, src,
613                     new StringBuilder(src.length()), null).toString();
614         }
615     }
616 
toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)617     public static <A extends Appendable> A toLower(int caseLocale, int options,
618             CharSequence src, A dest, Edits edits) {
619         try {
620             if (edits != null) {
621                 edits.reset();
622             }
623             internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
624             return dest;
625         } catch (IOException e) {
626             throw new ICUUncheckedIOException(e);
627         }
628     }
629 
toUpper(int caseLocale, int options, CharSequence src)630     public static String toUpper(int caseLocale, int options, CharSequence src) {
631         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
632             if (src.length() == 0) {
633                 return src.toString();
634             }
635             // Collect and apply only changes.
636             // Good if no or few changes. Bad (slow) if many changes.
637             Edits edits = new Edits();
638             StringBuilder replacementChars = toUpper(
639                     caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
640             return applyEdits(src, replacementChars, edits);
641         } else {
642             return toUpper(caseLocale, options, src,
643                     new StringBuilder(src.length()), null).toString();
644         }
645     }
646 
toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)647     public static <A extends Appendable> A toUpper(int caseLocale, int options,
648             CharSequence src, A dest, Edits edits) {
649         try {
650             if (edits != null) {
651                 edits.reset();
652             }
653             if (caseLocale == UCaseProps.LOC_GREEK) {
654                 return GreekUpper.toUpper(options, src, dest, edits);
655             }
656             internalToUpper(caseLocale, options, src, dest, edits);
657             return dest;
658         } catch (IOException e) {
659             throw new ICUUncheckedIOException(e);
660         }
661     }
662 
toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)663     public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) {
664         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
665             if (src.length() == 0) {
666                 return src.toString();
667             }
668             // Collect and apply only changes.
669             // Good if no or few changes. Bad (slow) if many changes.
670             Edits edits = new Edits();
671             StringBuilder replacementChars = toTitle(
672                     caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src,
673                     new StringBuilder(), edits);
674             return applyEdits(src, replacementChars, edits);
675         } else {
676             return toTitle(caseLocale, options, iter, src,
677                     new StringBuilder(src.length()), null).toString();
678         }
679     }
680 
toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)681     public static <A extends Appendable> A toTitle(
682             int caseLocale, int options, BreakIterator titleIter,
683             CharSequence src, A dest, Edits edits) {
684         try {
685             if (edits != null) {
686                 edits.reset();
687             }
688 
689             /* set up local variables */
690             StringContextIterator iter = new StringContextIterator(src);
691             int srcLength = src.length();
692             int prev=0;
693             boolean isFirstIndex=true;
694 
695             /* titlecasing loop */
696             while(prev<srcLength) {
697                 /* find next index where to titlecase */
698                 int index;
699                 if(isFirstIndex) {
700                     isFirstIndex=false;
701                     index=titleIter.first();
702                 } else {
703                     index=titleIter.next();
704                 }
705                 if(index==BreakIterator.DONE || index>srcLength) {
706                     index=srcLength;
707                 }
708 
709                 /*
710                  * Segment [prev..index[ into 3 parts:
711                  * a) skipped characters (copy as-is) [prev..titleStart[
712                  * b) first letter (titlecase)              [titleStart..titleLimit[
713                  * c) subsequent characters (lowercase)                 [titleLimit..index[
714                  */
715                 if(prev<index) {
716                     // Find and copy skipped characters [prev..titleStart[
717                     int titleStart=prev;
718                     iter.setLimit(index);
719                     int c=iter.nextCaseMapCP();
720                     if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
721                         // Adjust the titlecasing index to the next cased character,
722                         // or to the next letter/number/symbol/private use.
723                         // Stop with titleStart<titleLimit<=index
724                         // if there is a character to be titlecased,
725                         // or else stop with titleStart==titleLimit==index.
726                         boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
727                         while ((toCased ?
728                                     UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
729                                         !CaseMapImpl.isLNS(c)) &&
730                                 (c=iter.nextCaseMapCP())>=0) {}
731                         // If c<0 then we have only uncased characters in [prev..index[
732                         // and stopped with titleStart==titleLimit==index.
733                         titleStart=iter.getCPStart();
734                         if (prev < titleStart) {
735                             appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
736                         }
737                     }
738 
739                     if(titleStart<index) {
740                         // titlecase c which is from [titleStart..titleLimit[
741                         c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
742                         appendResult(c, dest, iter.getCPLength(), options, edits);
743 
744                         // Special case Dutch IJ titlecasing
745                         int titleLimit;
746                         if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
747                             if (c < 0) {
748                                 c = ~c;
749                             }
750                             if (c == 'I' || c == 'Í') {
751                                 titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits);
752                                 iter.moveTo(titleLimit);
753                             }
754                             else {
755                                 titleLimit = iter.getCPLimit();
756                             }
757                         } else {
758                             titleLimit = iter.getCPLimit();
759                         }
760 
761                         // lowercase [titleLimit..index[
762                         if(titleLimit<index) {
763                             if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
764                                 // Normal operation: Lowercase the rest of the word.
765                                 internalToLower(caseLocale, options,
766                                         src, titleLimit, index, iter, dest, edits);
767                             } else {
768                                 // Optionally just copy the rest of the word unchanged.
769                                 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
770                             }
771                             iter.moveToLimit();
772                         }
773                     }
774                 }
775 
776                 prev=index;
777             }
778             return dest;
779         } catch (IOException e) {
780             throw new ICUUncheckedIOException(e);
781         }
782     }
783 
784     /**
785      * Input: c is a letter I with or without acute accent.
786      * start is the index in src after c, and is less than segmentLimit.
787      * If a plain i/I is followed by a plain j/J,
788      * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
789      * then we output accordingly.
790      *
791      * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
792      * @throws IOException
793      */
maybeTitleDutchIJ( CharSequence src, int c, int start, int segmentLimit, A dest, int options, Edits edits)794     private static <A extends Appendable> int maybeTitleDutchIJ(
795             CharSequence src, int c, int start, int segmentLimit,
796             A dest, int options, Edits edits) throws IOException {
797         assert start < segmentLimit;
798 
799         int index = start;
800         boolean withAcute = false;
801 
802         // If the conditions are met, then the following variables tell us what to output.
803         int unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
804         boolean doTitleJ = false;  // true if the j needs to be titlecased
805         int unchanged2 = 0;  // after the j (0 or 1)
806 
807         // next character after the first letter
808         char c2 = src.charAt(index++);
809 
810         // Is the first letter an i/I with accent?
811         if (c == 'I') {
812             if (c2 == ACUTE) {
813                 withAcute = true;
814                 unchanged1 = 1;
815                 if (index == segmentLimit) { return start; }
816                 c2 = src.charAt(index++);
817             }
818         } else {  // Í
819             withAcute = true;
820         }
821         // Is the next character a j/J?
822         if (c2 == 'j') {
823             doTitleJ = true;
824         } else if (c2 == 'J') {
825             ++unchanged1;
826         } else {
827             return start;
828         }
829         // A plain i/I must be followed by a plain j/J.
830         // An i/I with acute must be followed by a j/J with acute.
831         if (withAcute) {
832             if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; }
833             if (doTitleJ) {
834                 unchanged2 = 1;
835             } else {
836                 ++unchanged1;
837             }
838         }
839         // There must not be another combining mark.
840         if (index < segmentLimit) {
841             int cp = Character.codePointAt(src, index);
842             int bit = 1 << UCharacter.getType(cp);
843             if ((bit & U_GC_M_MASK) != 0) {
844                 return start;
845             }
846         }
847         // Output the rest of the Dutch IJ.
848         appendUnchanged(src, start, unchanged1, dest, options, edits);
849         start += unchanged1;
850         if (doTitleJ) {
851             dest.append('J');
852             if (edits != null) {
853                 edits.addReplace(1, 1);
854             }
855             ++start;
856         }
857         appendUnchanged(src, start, unchanged2, dest, options, edits);
858         assert start + unchanged2 == index;
859         return index;
860     }
861 
862     public static String fold(int options, CharSequence src) {
863         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
864             if (src.length() == 0) {
865                 return src.toString();
866             }
867             // Collect and apply only changes.
868             // Good if no or few changes. Bad (slow) if many changes.
869             Edits edits = new Edits();
870             StringBuilder replacementChars = fold(
871                     options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
872             return applyEdits(src, replacementChars, edits);
873         } else {
874             return fold(options, src, new StringBuilder(src.length()), null).toString();
875         }
876     }
877 
878     public static <A extends Appendable> A fold(int options,
879             CharSequence src, A dest, Edits edits) {
880         try {
881             if (edits != null) {
882                 edits.reset();
883             }
884             internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
885             return dest;
886         } catch (IOException e) {
887             throw new ICUUncheckedIOException(e);
888         }
889     }
890 
891     private static final class GreekUpper {
892         // Data bits.
893         private static final int UPPER_MASK = 0x3ff;
894         private static final int HAS_VOWEL = 0x1000;
895         private static final int HAS_YPOGEGRAMMENI = 0x2000;
896         private static final int HAS_ACCENT = 0x4000;
897         private static final int HAS_DIALYTIKA = 0x8000;
898         // Further bits during data building and processing, not stored in the data map.
899         private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
900         private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;
901 
902         private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
903         private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
904                 HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
905         private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
906 
907         // State bits.
908         private static final int AFTER_CASED = 1;
909         private static final int AFTER_VOWEL_WITH_ACCENT = 2;
910 
911         // Data generated by prototype code, see
912         // https://icu.unicode.org/design/case/greek-upper
913         // TODO: Move this data into ucase.icu.
914         private static final char[] data0370 = {
915             // U+0370..03FF
916             0x0370,  // Ͱ
917             0x0370,  // ͱ
918             0x0372,  // Ͳ
919             0x0372,  // ͳ
920             0,
921             0,
922             0x0376,  // Ͷ
923             0x0376,  // ͷ
924             0,
925             0,
926             0x037A,  // ͺ
927             0x03FD,  // ͻ
928             0x03FE,  // ͼ
929             0x03FF,  // ͽ
930             0,
931             0x037F,  // Ϳ
932             0,
933             0,
934             0,
935             0,
936             0,
937             0,
938             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
939             0,
940             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
941             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
942             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
943             0,
944             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
945             0,
946             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
947             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
948             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
949             0x0391 | HAS_VOWEL,  // Α
950             0x0392,  // Β
951             0x0393,  // Γ
952             0x0394,  // Δ
953             0x0395 | HAS_VOWEL,  // Ε
954             0x0396,  // Ζ
955             0x0397 | HAS_VOWEL,  // Η
956             0x0398,  // Θ
957             0x0399 | HAS_VOWEL,  // Ι
958             0x039A,  // Κ
959             0x039B,  // Λ
960             0x039C,  // Μ
961             0x039D,  // Ν
962             0x039E,  // Ξ
963             0x039F | HAS_VOWEL,  // Ο
964             0x03A0,  // Π
965             0x03A1,  // Ρ
966             0,
967             0x03A3,  // Σ
968             0x03A4,  // Τ
969             0x03A5 | HAS_VOWEL,  // Υ
970             0x03A6,  // Φ
971             0x03A7,  // Χ
972             0x03A8,  // Ψ
973             0x03A9 | HAS_VOWEL,  // Ω
974             0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϊ
975             0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϋ
976             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
977             0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
978             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
979             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
980             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
981             0x0391 | HAS_VOWEL,  // α
982             0x0392,  // β
983             0x0393,  // γ
984             0x0394,  // δ
985             0x0395 | HAS_VOWEL,  // ε
986             0x0396,  // ζ
987             0x0397 | HAS_VOWEL,  // η
988             0x0398,  // θ
989             0x0399 | HAS_VOWEL,  // ι
990             0x039A,  // κ
991             0x039B,  // λ
992             0x039C,  // μ
993             0x039D,  // ν
994             0x039E,  // ξ
995             0x039F | HAS_VOWEL,  // ο
996             0x03A0,  // π
997             0x03A1,  // ρ
998             0x03A3,  // ς
999             0x03A3,  // σ
1000             0x03A4,  // τ
1001             0x03A5 | HAS_VOWEL,  // υ
1002             0x03A6,  // φ
1003             0x03A7,  // χ
1004             0x03A8,  // ψ
1005             0x03A9 | HAS_VOWEL,  // ω
1006             0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // ϊ
1007             0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // ϋ
1008             0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
1009             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
1010             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
1011             0x03CF,  // Ϗ
1012             0x0392,  // ϐ
1013             0x0398,  // ϑ
1014             0x03D2,  // ϒ
1015             0x03D2 | HAS_ACCENT,  // ϓ
1016             0x03D2 | HAS_DIALYTIKA,  // ϔ
1017             0x03A6,  // ϕ
1018             0x03A0,  // ϖ
1019             0x03CF,  // ϗ
1020             0x03D8,  // Ϙ
1021             0x03D8,  // ϙ
1022             0x03DA,  // Ϛ
1023             0x03DA,  // ϛ
1024             0x03DC,  // Ϝ
1025             0x03DC,  // ϝ
1026             0x03DE,  // Ϟ
1027             0x03DE,  // ϟ
1028             0x03E0,  // Ϡ
1029             0x03E0,  // ϡ
1030             0,
1031             0,
1032             0,
1033             0,
1034             0,
1035             0,
1036             0,
1037             0,
1038             0,
1039             0,
1040             0,
1041             0,
1042             0,
1043             0,
1044             0x039A,  // ϰ
1045             0x03A1,  // ϱ
1046             0x03F9,  // ϲ
1047             0x037F,  // ϳ
1048             0x03F4,  // ϴ
1049             0x0395 | HAS_VOWEL,  // ϵ
1050             0,
1051             0x03F7,  // Ϸ
1052             0x03F7,  // ϸ
1053             0x03F9,  // Ϲ
1054             0x03FA,  // Ϻ
1055             0x03FA,  // ϻ
1056             0x03FC,  // ϼ
1057             0x03FD,  // Ͻ
1058             0x03FE,  // Ͼ
1059             0x03FF,  // Ͽ
1060         };
1061 
1062         private static final char[] data1F00 = {
1063             // U+1F00..1FFF
1064             0x0391 | HAS_VOWEL,  // ἀ
1065             0x0391 | HAS_VOWEL,  // ἁ
1066             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἂ
1067             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἃ
1068             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἄ
1069             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἅ
1070             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἆ
1071             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἇ
1072             0x0391 | HAS_VOWEL,  // Ἀ
1073             0x0391 | HAS_VOWEL,  // Ἁ
1074             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἂ
1075             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἃ
1076             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἄ
1077             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἅ
1078             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἆ
1079             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἇ
1080             0x0395 | HAS_VOWEL,  // ἐ
1081             0x0395 | HAS_VOWEL,  // ἑ
1082             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἒ
1083             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἓ
1084             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἔ
1085             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἕ
1086             0,
1087             0,
1088             0x0395 | HAS_VOWEL,  // Ἐ
1089             0x0395 | HAS_VOWEL,  // Ἑ
1090             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἒ
1091             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἓ
1092             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἔ
1093             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἕ
1094             0,
1095             0,
1096             0x0397 | HAS_VOWEL,  // ἠ
1097             0x0397 | HAS_VOWEL,  // ἡ
1098             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἢ
1099             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἣ
1100             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἤ
1101             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἥ
1102             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἦ
1103             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἧ
1104             0x0397 | HAS_VOWEL,  // Ἠ
1105             0x0397 | HAS_VOWEL,  // Ἡ
1106             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἢ
1107             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἣ
1108             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἤ
1109             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἥ
1110             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἦ
1111             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἧ
1112             0x0399 | HAS_VOWEL,  // ἰ
1113             0x0399 | HAS_VOWEL,  // ἱ
1114             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἲ
1115             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἳ
1116             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἴ
1117             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἵ
1118             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἶ
1119             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἷ
1120             0x0399 | HAS_VOWEL,  // Ἰ
1121             0x0399 | HAS_VOWEL,  // Ἱ
1122             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἲ
1123             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἳ
1124             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἴ
1125             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἵ
1126             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἶ
1127             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἷ
1128             0x039F | HAS_VOWEL,  // ὀ
1129             0x039F | HAS_VOWEL,  // ὁ
1130             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὂ
1131             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὃ
1132             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὄ
1133             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὅ
1134             0,
1135             0,
1136             0x039F | HAS_VOWEL,  // Ὀ
1137             0x039F | HAS_VOWEL,  // Ὁ
1138             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὂ
1139             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὃ
1140             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὄ
1141             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὅ
1142             0,
1143             0,
1144             0x03A5 | HAS_VOWEL,  // ὐ
1145             0x03A5 | HAS_VOWEL,  // ὑ
1146             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὒ
1147             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὓ
1148             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὔ
1149             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὕ
1150             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὖ
1151             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὗ
1152             0,
1153             0x03A5 | HAS_VOWEL,  // Ὑ
1154             0,
1155             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὓ
1156             0,
1157             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὕ
1158             0,
1159             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὗ
1160             0x03A9 | HAS_VOWEL,  // ὠ
1161             0x03A9 | HAS_VOWEL,  // ὡ
1162             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὢ
1163             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὣ
1164             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὤ
1165             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὥ
1166             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὦ
1167             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὧ
1168             0x03A9 | HAS_VOWEL,  // Ὠ
1169             0x03A9 | HAS_VOWEL,  // Ὡ
1170             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὢ
1171             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὣ
1172             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὤ
1173             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὥ
1174             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὦ
1175             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὧ
1176             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ὰ
1177             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
1178             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ὲ
1179             0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
1180             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ὴ
1181             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
1182             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ὶ
1183             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
1184             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὸ
1185             0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
1186             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὺ
1187             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
1188             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὼ
1189             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
1190             0,
1191             0,
1192             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾀ
1193             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾁ
1194             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾂ
1195             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾃ
1196             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾄ
1197             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾅ
1198             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾆ
1199             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾇ
1200             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾈ
1201             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾉ
1202             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾊ
1203             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾋ
1204             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾌ
1205             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾍ
1206             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾎ
1207             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾏ
1208             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾐ
1209             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾑ
1210             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾒ
1211             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾓ
1212             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾔ
1213             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾕ
1214             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾖ
1215             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾗ
1216             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾘ
1217             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾙ
1218             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾚ
1219             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾛ
1220             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾜ
1221             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾝ
1222             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾞ
1223             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾟ
1224             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾠ
1225             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾡ
1226             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾢ
1227             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾣ
1228             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾤ
1229             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾥ
1230             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾦ
1231             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾧ
1232             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾨ
1233             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾩ
1234             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾪ
1235             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾫ
1236             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾬ
1237             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾭ
1238             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾮ
1239             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾯ
1240             0x0391 | HAS_VOWEL,  // ᾰ
1241             0x0391 | HAS_VOWEL,  // ᾱ
1242             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾲ
1243             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾳ
1244             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾴ
1245             0,
1246             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ᾶ
1247             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾷ
1248             0x0391 | HAS_VOWEL,  // Ᾰ
1249             0x0391 | HAS_VOWEL,  // Ᾱ
1250             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ὰ
1251             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
1252             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾼ
1253             0,
1254             0x0399 | HAS_VOWEL,  // ι
1255             0,
1256             0,
1257             0,
1258             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῂ
1259             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῃ
1260             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῄ
1261             0,
1262             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ῆ
1263             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῇ
1264             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ὲ
1265             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
1266             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ὴ
1267             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
1268             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῌ
1269             0,
1270             0,
1271             0,
1272             0x0399 | HAS_VOWEL,  // ῐ
1273             0x0399 | HAS_VOWEL,  // ῑ
1274             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῒ
1275             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
1276             0,
1277             0,
1278             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ῖ
1279             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῗ
1280             0x0399 | HAS_VOWEL,  // Ῐ
1281             0x0399 | HAS_VOWEL,  // Ῑ
1282             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ὶ
1283             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
1284             0,
1285             0,
1286             0,
1287             0,
1288             0x03A5 | HAS_VOWEL,  // ῠ
1289             0x03A5 | HAS_VOWEL,  // ῡ
1290             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῢ
1291             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
1292             0x03A1,  // ῤ
1293             0x03A1,  // ῥ
1294             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ῦ
1295             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῧ
1296             0x03A5 | HAS_VOWEL,  // Ῠ
1297             0x03A5 | HAS_VOWEL,  // Ῡ
1298             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὺ
1299             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
1300             0x03A1,  // Ῥ
1301             0,
1302             0,
1303             0,
1304             0,
1305             0,
1306             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῲ
1307             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῳ
1308             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῴ
1309             0,
1310             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ῶ
1311             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῷ
1312             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὸ
1313             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
1314             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὼ
1315             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
1316             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῼ
1317             0,
1318             0,
1319             0,
1320         };
1321 
1322         // U+2126 Ohm sign
1323         private static final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω
1324 
1325         private static final int getLetterData(int c) {
1326             if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
1327                 return 0;
1328             } else if (c <= 0x3ff) {
1329                 return data0370[c - 0x370];
1330             } else if (c <= 0x1fff) {
1331                 return data1F00[c - 0x1f00];
1332             } else if (c == 0x2126) {
1333                 return data2126;
1334             } else {
1335                 return 0;
1336             }
1337         }
1338 
1339         /**
1340          * Returns a non-zero value for each of the Greek combining diacritics
1341          * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
1342          * plus some perispomeni look-alikes.
1343          */
1344         private static final int getDiacriticData(int c) {
1345             switch (c) {
1346             case '\u0300':  // varia
1347             case '\u0301':  // tonos = oxia
1348             case '\u0342':  // perispomeni
1349             case '\u0302':  // circumflex can look like perispomeni
1350             case '\u0303':  // tilde can look like perispomeni
1351             case '\u0311':  // inverted breve can look like perispomeni
1352                 return HAS_ACCENT;
1353             case '\u0308':  // dialytika = diaeresis
1354                 return HAS_COMBINING_DIALYTIKA;
1355             case '\u0344':  // dialytika tonos
1356                 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
1357             case '\u0345':  // ypogegrammeni = iota subscript
1358                 return HAS_YPOGEGRAMMENI;
1359             case '\u0304':  // macron
1360             case '\u0306':  // breve
1361             case '\u0313':  // comma above
1362             case '\u0314':  // reversed comma above
1363             case '\u0343':  // koronis
1364                 return HAS_OTHER_GREEK_DIACRITIC;
1365             default:
1366                 return 0;
1367             }
1368         }
1369 
1370         private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
1371             while (i < s.length()) {
1372                 int c = Character.codePointAt(s, i);
1373                 int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
1374                 if ((type & UCaseProps.IGNORABLE) != 0) {
1375                     // Case-ignorable, continue with the loop.
1376                     i += Character.charCount(c);
1377                 } else if (type != UCaseProps.NONE) {
1378                     return true;  // Followed by cased letter.
1379                 } else {
1380                     return false;  // Uncased and not case-ignorable.
1381                 }
1382             }
1383             return false;  // Not followed by cased letter.
1384         }
1385 
1386         /**
1387          * Greek string uppercasing with a state machine.
1388          * Probably simpler than a stateless function that has to figure out complex context-before
1389          * for each character.
1390          * TODO: Try to re-consolidate one way or another with the non-Greek function.
1391          *
1392          * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
1393          * @throws IOException
1394          */
1395         private static <A extends Appendable> A toUpper(int options,
1396                 CharSequence src, A dest, Edits edits) throws IOException {
1397             int state = 0;
1398             for (int i = 0; i < src.length();) {
1399                 int c = Character.codePointAt(src, i);
1400                 int nextIndex = i + Character.charCount(c);
1401                 int nextState = 0;
1402                 int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
1403                 if ((type & UCaseProps.IGNORABLE) != 0) {
1404                     // c is case-ignorable
1405                     nextState |= (state & AFTER_CASED);
1406                 } else if (type != UCaseProps.NONE) {
1407                     // c is cased
1408                     nextState |= AFTER_CASED;
1409                 }
1410                 int data = getLetterData(c);
1411                 if (data > 0) {
1412                     int upper = data & UPPER_MASK;
1413                     // Add a dialytika to this iota or ypsilon vowel
1414                     // if we removed a tonos from the previous vowel,
1415                     // and that previous vowel did not also have (or gain) a dialytika.
1416                     // Adding one only to the final vowel in a longer sequence
1417                     // (which does not occur in normal writing) would require lookahead.
1418                     // Set the same flag as for preserving an existing dialytika.
1419                     if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
1420                             (upper == 'Ι' || upper == 'Υ')) {
1421                         data |= HAS_DIALYTIKA;
1422                     }
1423                     int numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
1424                     if ((data & HAS_YPOGEGRAMMENI) != 0) {
1425                         numYpogegrammeni = 1;
1426                     }
1427                     // Skip combining diacritics after this Greek letter.
1428                     while (nextIndex < src.length()) {
1429                         int diacriticData = getDiacriticData(src.charAt(nextIndex));
1430                         if (diacriticData != 0) {
1431                             data |= diacriticData;
1432                             if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1433                                 ++numYpogegrammeni;
1434                             }
1435                             ++nextIndex;
1436                         } else {
1437                             break;  // not a Greek diacritic
1438                         }
1439                     }
1440                     if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1441                         nextState |= AFTER_VOWEL_WITH_ACCENT;
1442                     }
1443                     // Map according to Greek rules.
1444                     boolean addTonos = false;
1445                     if (upper == 'Η' &&
1446                             (data & HAS_ACCENT) != 0 &&
1447                             numYpogegrammeni == 0 &&
1448                             (state & AFTER_CASED) == 0 &&
1449                             !isFollowedByCasedLetter(src, nextIndex)) {
1450                         // Keep disjunctive "or" with (only) a tonos.
1451                         // We use the same "word boundary" conditions as for the Final_Sigma test.
1452                         if (i == nextIndex) {
1453                             upper = 'Ή';  // Preserve the precomposed form.
1454                         } else {
1455                             addTonos = true;
1456                         }
1457                     } else if ((data & HAS_DIALYTIKA) != 0) {
1458                         // Preserve a vowel with dialytika in precomposed form if it exists.
1459                         if (upper == 'Ι') {
1460                             upper = 'Ϊ';
1461                             data &= ~HAS_EITHER_DIALYTIKA;
1462                         } else if (upper == 'Υ') {
1463                             upper = 'Ϋ';
1464                             data &= ~HAS_EITHER_DIALYTIKA;
1465                         }
1466                     }
1467 
1468                     boolean change;
1469                     if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
1470                         change = true;  // common, simple usage
1471                     } else {
1472                         // Find out first whether we are changing the text.
1473                         change = src.charAt(i) != upper || numYpogegrammeni > 0;
1474                         int i2 = i + 1;
1475                         if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1476                             change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
1477                             ++i2;
1478                         }
1479                         if (addTonos) {
1480                             change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
1481                             ++i2;
1482                         }
1483                         int oldLength = nextIndex - i;
1484                         int newLength = (i2 - i) + numYpogegrammeni;
1485                         change |= oldLength != newLength;
1486                         if (change) {
1487                             if (edits != null) {
1488                                 edits.addReplace(oldLength, newLength);
1489                             }
1490                         } else {
1491                             if (edits != null) {
1492                                 edits.addUnchanged(oldLength);
1493                             }
1494                             // Write unchanged text?
1495                             change = (options & OMIT_UNCHANGED_TEXT) == 0;
1496                         }
1497                     }
1498 
1499                     if (change) {
1500                         dest.append((char)upper);
1501                         if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1502                             dest.append('\u0308');  // restore or add a dialytika
1503                         }
1504                         if (addTonos) {
1505                             dest.append('\u0301');
1506                         }
1507                         while (numYpogegrammeni > 0) {
1508                             dest.append('Ι');
1509                             --numYpogegrammeni;
1510                         }
1511                     }
1512                 } else {
1513                     c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
1514                     appendResult(c, dest, nextIndex - i, options, edits);
1515                 }
1516                 i = nextIndex;
1517                 state = nextState;
1518             }
1519             return dest;
1520         }
1521     }
1522 }
1523