• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package com.ibm.icu.impl;
4 
5 import java.io.IOException;
6 import java.text.CharacterIterator;
7 import java.util.Locale;
8 
9 import com.ibm.icu.lang.UCharacter;
10 import com.ibm.icu.lang.UCharacterCategory;
11 import com.ibm.icu.text.BreakIterator;
12 import com.ibm.icu.text.Edits;
13 import com.ibm.icu.util.ICUUncheckedIOException;
14 import com.ibm.icu.util.ULocale;
15 
16 public final class CaseMapImpl {
17     /**
18      * Implementation of UCaseProps.ContextIterator, iterates over a String.
19      * See ustrcase.c/utf16_caseContextIterator().
20      */
21     public static final class StringContextIterator implements UCaseProps.ContextIterator {
22         /**
23          * Constructor.
24          * @param src String to iterate over.
25          */
StringContextIterator(CharSequence src)26         public StringContextIterator(CharSequence src) {
27             this.s=src;
28             limit=src.length();
29             cpStart=cpLimit=index=0;
30             dir=0;
31         }
32 
33         /**
34          * Constructor.
35          * @param src String to iterate over.
36          * @param cpStart Start index of the current code point.
37          * @param cpLimit Limit index of the current code point.
38          */
StringContextIterator(CharSequence src, int cpStart, int cpLimit)39         public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
40             s = src;
41             index = 0;
42             limit = src.length();
43             this.cpStart = cpStart;
44             this.cpLimit = cpLimit;
45             dir = 0;
46         }
47 
48         /**
49          * Set the iteration limit for nextCaseMapCP() to an index within the string.
50          * If the limit parameter is negative or past the string, then the
51          * string length is restored as the iteration limit.
52          *
53          * <p>This limit does not affect the next() function which always
54          * iterates to the very end of the string.
55          *
56          * @param lim The iteration limit.
57          */
setLimit(int lim)58         public void setLimit(int lim) {
59             if(0<=lim && lim<=s.length()) {
60                 limit=lim;
61             } else {
62                 limit=s.length();
63             }
64         }
65 
66         /**
67          * Move to the iteration limit without fetching code points up to there.
68          */
moveToLimit()69         public void moveToLimit() {
70             cpStart=cpLimit=limit;
71         }
72 
73         /**
74          * Iterate forward through the string to fetch the next code point
75          * to be case-mapped, and set the context indexes for it.
76          *
77          * <p>When the iteration limit is reached (and -1 is returned),
78          * getCPStart() will be at the iteration limit.
79          *
80          * <p>Iteration with next() does not affect the position for nextCaseMapCP().
81          *
82          * @return The next code point to be case-mapped, or <0 when the iteration is done.
83          */
nextCaseMapCP()84         public int nextCaseMapCP() {
85             cpStart=cpLimit;
86             if(cpLimit<limit) {
87                 int c=Character.codePointAt(s, cpLimit);
88                 cpLimit+=Character.charCount(c);
89                 return c;
90             } else {
91                 return -1;
92             }
93         }
94 
setCPStartAndLimit(int s, int l)95         public void setCPStartAndLimit(int s, int l) {
96             cpStart = s;
97             cpLimit = l;
98             dir = 0;
99         }
100         /**
101          * Returns the start of the code point that was last returned
102          * by nextCaseMapCP().
103          */
getCPStart()104         public int getCPStart() {
105             return cpStart;
106         }
107 
108         /**
109          * Returns the limit of the code point that was last returned
110          * by nextCaseMapCP().
111          */
getCPLimit()112         public int getCPLimit() {
113             return cpLimit;
114         }
115 
getCPLength()116         public int getCPLength() {
117             return cpLimit-cpStart;
118         }
119 
120         // implement UCaseProps.ContextIterator
121         // The following code is not used anywhere in this private class
122         @Override
reset(int direction)123         public void reset(int direction) {
124             if(direction>0) {
125                 /* reset for forward iteration */
126                 dir=1;
127                 index=cpLimit;
128             } else if(direction<0) {
129                 /* reset for backward iteration */
130                 dir=-1;
131                 index=cpStart;
132             } else {
133                 // not a valid direction
134                 dir=0;
135                 index=0;
136             }
137         }
138 
139         @Override
next()140         public int next() {
141             int c;
142 
143             if(dir>0 && index<s.length()) {
144                 c=Character.codePointAt(s, index);
145                 index+=Character.charCount(c);
146                 return c;
147             } else if(dir<0 && index>0) {
148                 c=Character.codePointBefore(s, index);
149                 index-=Character.charCount(c);
150                 return c;
151             }
152             return -1;
153         }
154 
155         // variables
156         protected CharSequence s;
157         protected int index, limit, cpStart, cpLimit;
158         protected int dir; // 0=initial state  >0=forward  <0=backward
159     }
160 
161     public static final int TITLECASE_WHOLE_STRING = 0x20;
162     public static final int TITLECASE_SENTENCES = 0x40;
163 
164     /**
165      * Bit mask for the titlecasing iterator options bit field.
166      * Currently only 3 out of 8 values are used:
167      * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
168      * See stringoptions.h.
169      * @internal
170      */
171     private static final int TITLECASE_ITERATOR_MASK = 0xe0;
172 
173     public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
174 
175     /**
176      * Bit mask for the titlecasing index adjustment options bit set.
177      * Currently two bits are defined:
178      * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
179      * See stringoptions.h.
180      * @internal
181      */
182     private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
183 
addTitleAdjustmentOption(int options, int newOption)184     public static int addTitleAdjustmentOption(int options, int newOption) {
185         int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
186         if (adjOptions !=0 && adjOptions != newOption) {
187             throw new IllegalArgumentException("multiple titlecasing index adjustment options");
188         }
189         return options | newOption;
190     }
191 
192     private static final int LNS =
193             (1 << UCharacterCategory.UPPERCASE_LETTER) |
194             (1 << UCharacterCategory.LOWERCASE_LETTER) |
195             (1 << UCharacterCategory.TITLECASE_LETTER) |
196             // Not MODIFIER_LETTER: We count only cased modifier letters.
197             (1 << UCharacterCategory.OTHER_LETTER) |
198 
199             (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
200             (1 << UCharacterCategory.LETTER_NUMBER) |
201             (1 << UCharacterCategory.OTHER_NUMBER) |
202 
203             (1 << UCharacterCategory.MATH_SYMBOL) |
204             (1 << UCharacterCategory.CURRENCY_SYMBOL) |
205             (1 << UCharacterCategory.MODIFIER_SYMBOL) |
206             (1 << UCharacterCategory.OTHER_SYMBOL) |
207 
208             (1 << UCharacterCategory.PRIVATE_USE);
209 
isLNS(int c)210     private static boolean isLNS(int c) {
211         // Letter, number, symbol,
212         // or a private use code point because those are typically used as letters or numbers.
213         // Consider modifier letters only if they are cased.
214         int gc = UCharacterProperty.INSTANCE.getType(c);
215         return ((1 << gc) & LNS) != 0 ||
216                 (gc == UCharacterCategory.MODIFIER_LETTER &&
217                     UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
218     }
219 
addTitleIteratorOption(int options, int newOption)220     public static int addTitleIteratorOption(int options, int newOption) {
221         int iterOptions = options & TITLECASE_ITERATOR_MASK;
222         if (iterOptions !=0 && iterOptions != newOption) {
223             throw new IllegalArgumentException("multiple titlecasing iterator options");
224         }
225         return options | newOption;
226     }
227 
getTitleBreakIterator( Locale locale, int options, BreakIterator iter)228     public static BreakIterator getTitleBreakIterator(
229             Locale locale, int options, BreakIterator iter) {
230         options &= TITLECASE_ITERATOR_MASK;
231         if (options != 0 && iter != null) {
232             throw new IllegalArgumentException(
233                     "titlecasing iterator option together with an explicit iterator");
234         }
235         if (iter == null) {
236             switch (options) {
237             case 0:
238                 iter = BreakIterator.getWordInstance(locale);
239                 break;
240             case TITLECASE_WHOLE_STRING:
241                 iter = new WholeStringBreakIterator();
242                 break;
243             case TITLECASE_SENTENCES:
244                 iter = BreakIterator.getSentenceInstance(locale);
245                 break;
246             default:
247                 throw new IllegalArgumentException("unknown titlecasing iterator option");
248             }
249         }
250         return iter;
251     }
252 
getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)253     public static BreakIterator getTitleBreakIterator(
254             ULocale locale, int options, BreakIterator iter) {
255         options &= TITLECASE_ITERATOR_MASK;
256         if (options != 0 && iter != null) {
257             throw new IllegalArgumentException(
258                     "titlecasing iterator option together with an explicit iterator");
259         }
260         if (iter == null) {
261             switch (options) {
262             case 0:
263                 iter = BreakIterator.getWordInstance(locale);
264                 break;
265             case TITLECASE_WHOLE_STRING:
266                 iter = new WholeStringBreakIterator();
267                 break;
268             case TITLECASE_SENTENCES:
269                 iter = BreakIterator.getSentenceInstance(locale);
270                 break;
271             default:
272                 throw new IllegalArgumentException("unknown titlecasing iterator option");
273             }
274         }
275         return iter;
276     }
277 
278     /**
279      * Omit unchanged text when case-mapping with Edits.
280      */
281     public static final int OMIT_UNCHANGED_TEXT = 0x4000;
282 
283     private static final class WholeStringBreakIterator extends BreakIterator {
284         private int length;
285 
notImplemented()286         private static void notImplemented() {
287             throw new UnsupportedOperationException("should not occur");
288         }
289 
290         @Override
first()291         public int first() {
292             return 0;
293         }
294 
295         @Override
last()296         public int last() {
297             notImplemented();
298             return 0;
299         }
300 
301         @Override
next(int n)302         public int next(int n) {
303             notImplemented();
304             return 0;
305         }
306 
307         @Override
next()308         public int next() {
309             return length;
310         }
311 
312         @Override
previous()313         public int previous() {
314             notImplemented();
315             return 0;
316         }
317 
318         @Override
following(int offset)319         public int following(int offset) {
320             notImplemented();
321             return 0;
322         }
323 
324         @Override
current()325         public int current() {
326             notImplemented();
327             return 0;
328         }
329 
330         @Override
getText()331         public CharacterIterator getText() {
332             notImplemented();
333             return null;
334         }
335 
336         @Override
setText(CharacterIterator newText)337         public void setText(CharacterIterator newText) {
338             length = newText.getEndIndex();
339         }
340 
341         @Override
setText(CharSequence newText)342         public void setText(CharSequence newText) {
343             length = newText.length();
344         }
345 
346         @Override
setText(String newText)347         public void setText(String newText) {
348             length = newText.length();
349         }
350     }
351 
appendCodePoint(Appendable a, int c)352     private static int appendCodePoint(Appendable a, int c) throws IOException {
353         if (c <= Character.MAX_VALUE) {
354             a.append((char)c);
355             return 1;
356         } else {
357             a.append((char)(0xd7c0 + (c >> 10)));
358             a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff)));
359             return 2;
360         }
361     }
362 
363     /**
364      * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
365      * @throws IOException
366      */
appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)367     private static void appendResult(int result, Appendable dest,
368             int cpLength, int options, Edits edits) throws IOException {
369         // Decode the result.
370         if (result < 0) {
371             // (not) original code point
372             if (edits != null) {
373                 edits.addUnchanged(cpLength);
374             }
375             if ((options & OMIT_UNCHANGED_TEXT) != 0) {
376                 return;
377             }
378             appendCodePoint(dest, ~result);
379         } else if (result <= UCaseProps.MAX_STRING_LENGTH) {
380             // The mapping has already been appended to result.
381             if (edits != null) {
382                 edits.addReplace(cpLength, result);
383             }
384         } else {
385             // Append the single-code point mapping.
386             int length = appendCodePoint(dest, result);
387             if (edits != null) {
388                 edits.addReplace(cpLength, length);
389             }
390         }
391     }
392 
appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)393     private static final void appendUnchanged(CharSequence src, int start, int length,
394             Appendable dest, int options, Edits edits) throws IOException {
395         if (length > 0) {
396             if (edits != null) {
397                 edits.addUnchanged(length);
398             }
399             if ((options & OMIT_UNCHANGED_TEXT) != 0) {
400                 return;
401             }
402             dest.append(src, start, start + length);
403         }
404     }
405 
applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)406     private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) {
407         if (!edits.hasChanges()) {
408             return src.toString();
409         }
410         StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta());
411         for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) {
412             if (ei.hasChange()) {
413                 int i = ei.replacementIndex();
414                 result.append(replacementChars, i, i + ei.newLength());
415             } else {
416                 int i = ei.sourceIndex();
417                 result.append(src, i, i + ei.oldLength());
418             }
419         }
420         return result.toString();
421     }
422 
423     private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
424 
425     /**
426      * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
427      * caseLocale < 0: Case-folds [srcStart..srcLimit[.
428      */
internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)429     private static void internalToLower(int caseLocale, int options,
430             CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
431             Appendable dest, Edits edits) throws IOException {
432         byte[] latinToLower;
433         if (caseLocale == UCaseProps.LOC_ROOT ||
434                 (caseLocale >= 0 ?
435                     !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
436                     (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
437             latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
438         } else {
439             latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
440         }
441         int prev = srcStart;
442         int srcIndex = srcStart;
443         outerLoop:
444         for (;;) {
445             // fast path for simple cases
446             char lead;
447             for (;;) {
448                 if (srcIndex >= srcLimit) {
449                     break outerLoop;
450                 }
451                 lead = src.charAt(srcIndex);
452                 int delta;
453                 if (lead < UCaseProps.LatinCase.LONG_S) {
454                     byte d = latinToLower[lead];
455                     if (d == UCaseProps.LatinCase.EXC) { break; }
456                     ++srcIndex;
457                     if (d == 0) { continue; }
458                     delta = d;
459                 } else if (lead >= 0xd800) {
460                     break;  // surrogate or higher
461                 } else {
462                     int props = CASE_TRIE.getFromU16SingleLead(lead);
463                     if (UCaseProps.propsHasException(props)) { break; }
464                     ++srcIndex;
465                     if (!UCaseProps.isUpperOrTitleFromProps(props) ||
466                             (delta = UCaseProps.getDelta(props)) == 0) {
467                         continue;
468                     }
469                 }
470                 lead += delta;
471                 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
472                 dest.append(lead);
473                 if (edits != null) {
474                     edits.addReplace(1, 1);
475                 }
476                 prev = srcIndex;
477             }
478             // slow path
479             int cpStart = srcIndex++;
480             char trail;
481             int c;
482             if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
483                     Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
484                 c = Character.toCodePoint(lead, trail);
485                 ++srcIndex;
486             } else {
487                 c = lead;
488             }
489             // We need to append unchanged text before calling the UCaseProps.toFullXyz() methods
490             // because they will sometimes append their mapping to dest,
491             // and that must be after copying the previous text.
492             appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
493             prev = cpStart;
494             if (caseLocale >= 0) {
495                 if (iter == null) {
496                     iter = new StringContextIterator(src, cpStart, srcIndex);
497                 } else {
498                     iter.setCPStartAndLimit(cpStart, srcIndex);
499                 }
500                 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
501             } else {
502                 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
503             }
504             if (c >= 0) {
505                 appendResult(c, dest, srcIndex - cpStart, options, edits);
506                 prev = srcIndex;
507             }
508         }
509         appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
510     }
511 
internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)512     private static void internalToUpper(int caseLocale, int options,
513             CharSequence src, Appendable dest, Edits edits) throws IOException {
514         StringContextIterator iter = null;
515         byte[] latinToUpper;
516         if (caseLocale == UCaseProps.LOC_TURKISH) {
517             latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
518         } else {
519             latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
520         }
521         int prev = 0;
522         int srcIndex = 0;
523         int srcLength = src.length();
524         outerLoop:
525         for (;;) {
526             // fast path for simple cases
527             char lead;
528             for (;;) {
529                 if (srcIndex >= srcLength) {
530                     break outerLoop;
531                 }
532                 lead = src.charAt(srcIndex);
533                 int delta;
534                 if (lead < UCaseProps.LatinCase.LONG_S) {
535                     byte d = latinToUpper[lead];
536                     if (d == UCaseProps.LatinCase.EXC) { break; }
537                     ++srcIndex;
538                     if (d == 0) { continue; }
539                     delta = d;
540                 } else if (lead >= 0xd800) {
541                     break;  // surrogate or higher
542                 } else {
543                     int props = CASE_TRIE.getFromU16SingleLead(lead);
544                     if (UCaseProps.propsHasException(props)) { break; }
545                     ++srcIndex;
546                     if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
547                             (delta = UCaseProps.getDelta(props)) == 0) {
548                         continue;
549                     }
550                 }
551                 lead += delta;
552                 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
553                 dest.append(lead);
554                 if (edits != null) {
555                     edits.addReplace(1, 1);
556                 }
557                 prev = srcIndex;
558             }
559             // slow path
560             int cpStart = srcIndex++;
561             char trail;
562             int c;
563             if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
564                     Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
565                 c = Character.toCodePoint(lead, trail);
566                 ++srcIndex;
567             } else {
568                 c = lead;
569             }
570             if (iter == null) {
571                 iter = new StringContextIterator(src, cpStart, srcIndex);
572             } else {
573                 iter.setCPStartAndLimit(cpStart, srcIndex);
574             }
575             // We need to append unchanged text before calling UCaseProps.toFullUpper()
576             // because it will sometimes append its mapping to dest,
577             // and that must be after copying the previous text.
578             appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
579             prev = cpStart;
580             c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
581             if (c >= 0) {
582                 appendResult(c, dest, srcIndex - cpStart, options, edits);
583                 prev = srcIndex;
584             }
585         }
586         appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
587     }
588 
toLower(int caseLocale, int options, CharSequence src)589     public static String toLower(int caseLocale, int options, CharSequence src) {
590         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
591             if (src.length() == 0) {
592                 return src.toString();
593             }
594             // Collect and apply only changes.
595             // Good if no or few changes. Bad (slow) if many changes.
596             Edits edits = new Edits();
597             StringBuilder replacementChars = toLower(
598                     caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
599             return applyEdits(src, replacementChars, edits);
600         } else {
601             return toLower(caseLocale, options, src,
602                     new StringBuilder(src.length()), null).toString();
603         }
604     }
605 
toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)606     public static <A extends Appendable> A toLower(int caseLocale, int options,
607             CharSequence src, A dest, Edits edits) {
608         try {
609             if (edits != null) {
610                 edits.reset();
611             }
612             internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
613             return dest;
614         } catch (IOException e) {
615             throw new ICUUncheckedIOException(e);
616         }
617     }
618 
toUpper(int caseLocale, int options, CharSequence src)619     public static String toUpper(int caseLocale, int options, CharSequence src) {
620         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
621             if (src.length() == 0) {
622                 return src.toString();
623             }
624             // Collect and apply only changes.
625             // Good if no or few changes. Bad (slow) if many changes.
626             Edits edits = new Edits();
627             StringBuilder replacementChars = toUpper(
628                     caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
629             return applyEdits(src, replacementChars, edits);
630         } else {
631             return toUpper(caseLocale, options, src,
632                     new StringBuilder(src.length()), null).toString();
633         }
634     }
635 
toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)636     public static <A extends Appendable> A toUpper(int caseLocale, int options,
637             CharSequence src, A dest, Edits edits) {
638         try {
639             if (edits != null) {
640                 edits.reset();
641             }
642             if (caseLocale == UCaseProps.LOC_GREEK) {
643                 return GreekUpper.toUpper(options, src, dest, edits);
644             }
645             internalToUpper(caseLocale, options, src, dest, edits);
646             return dest;
647         } catch (IOException e) {
648             throw new ICUUncheckedIOException(e);
649         }
650     }
651 
toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)652     public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) {
653         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
654             if (src.length() == 0) {
655                 return src.toString();
656             }
657             // Collect and apply only changes.
658             // Good if no or few changes. Bad (slow) if many changes.
659             Edits edits = new Edits();
660             StringBuilder replacementChars = toTitle(
661                     caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src,
662                     new StringBuilder(), edits);
663             return applyEdits(src, replacementChars, edits);
664         } else {
665             return toTitle(caseLocale, options, iter, src,
666                     new StringBuilder(src.length()), null).toString();
667         }
668     }
669 
toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)670     public static <A extends Appendable> A toTitle(
671             int caseLocale, int options, BreakIterator titleIter,
672             CharSequence src, A dest, Edits edits) {
673         try {
674             if (edits != null) {
675                 edits.reset();
676             }
677 
678             /* set up local variables */
679             StringContextIterator iter = new StringContextIterator(src);
680             int srcLength = src.length();
681             int prev=0;
682             boolean isFirstIndex=true;
683 
684             /* titlecasing loop */
685             while(prev<srcLength) {
686                 /* find next index where to titlecase */
687                 int index;
688                 if(isFirstIndex) {
689                     isFirstIndex=false;
690                     index=titleIter.first();
691                 } else {
692                     index=titleIter.next();
693                 }
694                 if(index==BreakIterator.DONE || index>srcLength) {
695                     index=srcLength;
696                 }
697 
698                 /*
699                  * Segment [prev..index[ into 3 parts:
700                  * a) skipped characters (copy as-is) [prev..titleStart[
701                  * b) first letter (titlecase)              [titleStart..titleLimit[
702                  * c) subsequent characters (lowercase)                 [titleLimit..index[
703                  */
704                 if(prev<index) {
705                     // Find and copy skipped characters [prev..titleStart[
706                     int titleStart=prev;
707                     iter.setLimit(index);
708                     int c=iter.nextCaseMapCP();
709                     if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
710                         // Adjust the titlecasing index to the next cased character,
711                         // or to the next letter/number/symbol/private use.
712                         // Stop with titleStart<titleLimit<=index
713                         // if there is a character to be titlecased,
714                         // or else stop with titleStart==titleLimit==index.
715                         boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
716                         while ((toCased ?
717                                     UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
718                                         !CaseMapImpl.isLNS(c)) &&
719                                 (c=iter.nextCaseMapCP())>=0) {}
720                         // If c<0 then we have only uncased characters in [prev..index[
721                         // and stopped with titleStart==titleLimit==index.
722                         titleStart=iter.getCPStart();
723                         if (prev < titleStart) {
724                             appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
725                         }
726                     }
727 
728                     if(titleStart<index) {
729                         int titleLimit=iter.getCPLimit();
730                         // titlecase c which is from [titleStart..titleLimit[
731                         c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
732                         appendResult(c, dest, iter.getCPLength(), options, edits);
733 
734                         // Special case Dutch IJ titlecasing
735                         if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
736                             char c1 = src.charAt(titleStart);
737                             if ((c1 == 'i' || c1 == 'I')) {
738                                 char c2 = src.charAt(titleStart+1);
739                                 if (c2 == 'j') {
740                                     dest.append('J');
741                                     if (edits != null) {
742                                         edits.addReplace(1, 1);
743                                     }
744                                     c = iter.nextCaseMapCP();
745                                     titleLimit++;
746                                     assert c == c2;
747                                     assert titleLimit == iter.getCPLimit();
748                                 } else if (c2 == 'J') {
749                                     // Keep the capital J from getting lowercased.
750                                     appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
751                                     c = iter.nextCaseMapCP();
752                                     titleLimit++;
753                                     assert c == c2;
754                                     assert titleLimit == iter.getCPLimit();
755                                 }
756                             }
757                         }
758 
759                         // lowercase [titleLimit..index[
760                         if(titleLimit<index) {
761                             if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
762                                 // Normal operation: Lowercase the rest of the word.
763                                 internalToLower(caseLocale, options,
764                                         src, titleLimit, index, iter, dest, edits);
765                             } else {
766                                 // Optionally just copy the rest of the word unchanged.
767                                 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
768                             }
769                             iter.moveToLimit();
770                         }
771                     }
772                 }
773 
774                 prev=index;
775             }
776             return dest;
777         } catch (IOException e) {
778             throw new ICUUncheckedIOException(e);
779         }
780     }
781 
fold(int options, CharSequence src)782     public static String fold(int options, CharSequence src) {
783         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
784             if (src.length() == 0) {
785                 return src.toString();
786             }
787             // Collect and apply only changes.
788             // Good if no or few changes. Bad (slow) if many changes.
789             Edits edits = new Edits();
790             StringBuilder replacementChars = fold(
791                     options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
792             return applyEdits(src, replacementChars, edits);
793         } else {
794             return fold(options, src, new StringBuilder(src.length()), null).toString();
795         }
796     }
797 
fold(int options, CharSequence src, A dest, Edits edits)798     public static <A extends Appendable> A fold(int options,
799             CharSequence src, A dest, Edits edits) {
800         try {
801             if (edits != null) {
802                 edits.reset();
803             }
804             internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
805             return dest;
806         } catch (IOException e) {
807             throw new ICUUncheckedIOException(e);
808         }
809     }
810 
811     private static final class GreekUpper {
812         // Data bits.
813         private static final int UPPER_MASK = 0x3ff;
814         private static final int HAS_VOWEL = 0x1000;
815         private static final int HAS_YPOGEGRAMMENI = 0x2000;
816         private static final int HAS_ACCENT = 0x4000;
817         private static final int HAS_DIALYTIKA = 0x8000;
818         // Further bits during data building and processing, not stored in the data map.
819         private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
820         private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;
821 
822         private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
823         private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
824                 HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
825         private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
826 
827         // State bits.
828         private static final int AFTER_CASED = 1;
829         private static final int AFTER_VOWEL_WITH_ACCENT = 2;
830 
831         // Data generated by prototype code, see
832         // http://site.icu-project.org/design/case/greek-upper
833         // TODO: Move this data into ucase.icu.
834         private static final char[] data0370 = {
835             // U+0370..03FF
836             0x0370,  // Ͱ
837             0x0370,  // ͱ
838             0x0372,  // Ͳ
839             0x0372,  // ͳ
840             0,
841             0,
842             0x0376,  // Ͷ
843             0x0376,  // ͷ
844             0,
845             0,
846             0x037A,  // ͺ
847             0x03FD,  // ͻ
848             0x03FE,  // ͼ
849             0x03FF,  // ͽ
850             0,
851             0x037F,  // Ϳ
852             0,
853             0,
854             0,
855             0,
856             0,
857             0,
858             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
859             0,
860             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
861             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
862             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
863             0,
864             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
865             0,
866             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
867             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
868             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
869             0x0391 | HAS_VOWEL,  // Α
870             0x0392,  // Β
871             0x0393,  // Γ
872             0x0394,  // Δ
873             0x0395 | HAS_VOWEL,  // Ε
874             0x0396,  // Ζ
875             0x0397 | HAS_VOWEL,  // Η
876             0x0398,  // Θ
877             0x0399 | HAS_VOWEL,  // Ι
878             0x039A,  // Κ
879             0x039B,  // Λ
880             0x039C,  // Μ
881             0x039D,  // Ν
882             0x039E,  // Ξ
883             0x039F | HAS_VOWEL,  // Ο
884             0x03A0,  // Π
885             0x03A1,  // Ρ
886             0,
887             0x03A3,  // Σ
888             0x03A4,  // Τ
889             0x03A5 | HAS_VOWEL,  // Υ
890             0x03A6,  // Φ
891             0x03A7,  // Χ
892             0x03A8,  // Ψ
893             0x03A9 | HAS_VOWEL,  // Ω
894             0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϊ
895             0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϋ
896             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
897             0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
898             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
899             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
900             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
901             0x0391 | HAS_VOWEL,  // α
902             0x0392,  // β
903             0x0393,  // γ
904             0x0394,  // δ
905             0x0395 | HAS_VOWEL,  // ε
906             0x0396,  // ζ
907             0x0397 | HAS_VOWEL,  // η
908             0x0398,  // θ
909             0x0399 | HAS_VOWEL,  // ι
910             0x039A,  // κ
911             0x039B,  // λ
912             0x039C,  // μ
913             0x039D,  // ν
914             0x039E,  // ξ
915             0x039F | HAS_VOWEL,  // ο
916             0x03A0,  // π
917             0x03A1,  // ρ
918             0x03A3,  // ς
919             0x03A3,  // σ
920             0x03A4,  // τ
921             0x03A5 | HAS_VOWEL,  // υ
922             0x03A6,  // φ
923             0x03A7,  // χ
924             0x03A8,  // ψ
925             0x03A9 | HAS_VOWEL,  // ω
926             0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // ϊ
927             0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // ϋ
928             0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
929             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
930             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
931             0x03CF,  // Ϗ
932             0x0392,  // ϐ
933             0x0398,  // ϑ
934             0x03D2,  // ϒ
935             0x03D2 | HAS_ACCENT,  // ϓ
936             0x03D2 | HAS_DIALYTIKA,  // ϔ
937             0x03A6,  // ϕ
938             0x03A0,  // ϖ
939             0x03CF,  // ϗ
940             0x03D8,  // Ϙ
941             0x03D8,  // ϙ
942             0x03DA,  // Ϛ
943             0x03DA,  // ϛ
944             0x03DC,  // Ϝ
945             0x03DC,  // ϝ
946             0x03DE,  // Ϟ
947             0x03DE,  // ϟ
948             0x03E0,  // Ϡ
949             0x03E0,  // ϡ
950             0,
951             0,
952             0,
953             0,
954             0,
955             0,
956             0,
957             0,
958             0,
959             0,
960             0,
961             0,
962             0,
963             0,
964             0x039A,  // ϰ
965             0x03A1,  // ϱ
966             0x03F9,  // ϲ
967             0x037F,  // ϳ
968             0x03F4,  // ϴ
969             0x0395 | HAS_VOWEL,  // ϵ
970             0,
971             0x03F7,  // Ϸ
972             0x03F7,  // ϸ
973             0x03F9,  // Ϲ
974             0x03FA,  // Ϻ
975             0x03FA,  // ϻ
976             0x03FC,  // ϼ
977             0x03FD,  // Ͻ
978             0x03FE,  // Ͼ
979             0x03FF,  // Ͽ
980         };
981 
982         private static final char[] data1F00 = {
983             // U+1F00..1FFF
984             0x0391 | HAS_VOWEL,  // ἀ
985             0x0391 | HAS_VOWEL,  // ἁ
986             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἂ
987             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἃ
988             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἄ
989             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἅ
990             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἆ
991             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἇ
992             0x0391 | HAS_VOWEL,  // Ἀ
993             0x0391 | HAS_VOWEL,  // Ἁ
994             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἂ
995             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἃ
996             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἄ
997             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἅ
998             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἆ
999             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἇ
1000             0x0395 | HAS_VOWEL,  // ἐ
1001             0x0395 | HAS_VOWEL,  // ἑ
1002             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἒ
1003             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἓ
1004             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἔ
1005             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἕ
1006             0,
1007             0,
1008             0x0395 | HAS_VOWEL,  // Ἐ
1009             0x0395 | HAS_VOWEL,  // Ἑ
1010             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἒ
1011             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἓ
1012             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἔ
1013             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἕ
1014             0,
1015             0,
1016             0x0397 | HAS_VOWEL,  // ἠ
1017             0x0397 | HAS_VOWEL,  // ἡ
1018             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἢ
1019             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἣ
1020             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἤ
1021             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἥ
1022             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἦ
1023             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἧ
1024             0x0397 | HAS_VOWEL,  // Ἠ
1025             0x0397 | HAS_VOWEL,  // Ἡ
1026             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἢ
1027             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἣ
1028             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἤ
1029             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἥ
1030             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἦ
1031             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἧ
1032             0x0399 | HAS_VOWEL,  // ἰ
1033             0x0399 | HAS_VOWEL,  // ἱ
1034             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἲ
1035             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἳ
1036             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἴ
1037             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἵ
1038             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἶ
1039             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἷ
1040             0x0399 | HAS_VOWEL,  // Ἰ
1041             0x0399 | HAS_VOWEL,  // Ἱ
1042             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἲ
1043             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἳ
1044             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἴ
1045             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἵ
1046             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἶ
1047             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἷ
1048             0x039F | HAS_VOWEL,  // ὀ
1049             0x039F | HAS_VOWEL,  // ὁ
1050             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὂ
1051             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὃ
1052             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὄ
1053             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὅ
1054             0,
1055             0,
1056             0x039F | HAS_VOWEL,  // Ὀ
1057             0x039F | HAS_VOWEL,  // Ὁ
1058             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὂ
1059             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὃ
1060             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὄ
1061             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὅ
1062             0,
1063             0,
1064             0x03A5 | HAS_VOWEL,  // ὐ
1065             0x03A5 | HAS_VOWEL,  // ὑ
1066             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὒ
1067             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὓ
1068             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὔ
1069             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὕ
1070             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὖ
1071             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὗ
1072             0,
1073             0x03A5 | HAS_VOWEL,  // Ὑ
1074             0,
1075             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὓ
1076             0,
1077             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὕ
1078             0,
1079             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὗ
1080             0x03A9 | HAS_VOWEL,  // ὠ
1081             0x03A9 | HAS_VOWEL,  // ὡ
1082             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὢ
1083             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὣ
1084             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὤ
1085             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὥ
1086             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὦ
1087             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὧ
1088             0x03A9 | HAS_VOWEL,  // Ὠ
1089             0x03A9 | HAS_VOWEL,  // Ὡ
1090             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὢ
1091             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὣ
1092             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὤ
1093             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὥ
1094             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὦ
1095             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὧ
1096             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ὰ
1097             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
1098             0x0395 | HAS_VOWEL | HAS_ACCENT,  // ὲ
1099             0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
1100             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ὴ
1101             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
1102             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ὶ
1103             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
1104             0x039F | HAS_VOWEL | HAS_ACCENT,  // ὸ
1105             0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
1106             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὺ
1107             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
1108             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὼ
1109             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
1110             0,
1111             0,
1112             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾀ
1113             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾁ
1114             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾂ
1115             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾃ
1116             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾄ
1117             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾅ
1118             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾆ
1119             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾇ
1120             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾈ
1121             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾉ
1122             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾊ
1123             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾋ
1124             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾌ
1125             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾍ
1126             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾎ
1127             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾏ
1128             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾐ
1129             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾑ
1130             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾒ
1131             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾓ
1132             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾔ
1133             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾕ
1134             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾖ
1135             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾗ
1136             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾘ
1137             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾙ
1138             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾚ
1139             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾛ
1140             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾜ
1141             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾝ
1142             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾞ
1143             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾟ
1144             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾠ
1145             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾡ
1146             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾢ
1147             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾣ
1148             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾤ
1149             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾥ
1150             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾦ
1151             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾧ
1152             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾨ
1153             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾩ
1154             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾪ
1155             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾫ
1156             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾬ
1157             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾭ
1158             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾮ
1159             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾯ
1160             0x0391 | HAS_VOWEL,  // ᾰ
1161             0x0391 | HAS_VOWEL,  // ᾱ
1162             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾲ
1163             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾳ
1164             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾴ
1165             0,
1166             0x0391 | HAS_VOWEL | HAS_ACCENT,  // ᾶ
1167             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾷ
1168             0x0391 | HAS_VOWEL,  // Ᾰ
1169             0x0391 | HAS_VOWEL,  // Ᾱ
1170             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ὰ
1171             0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
1172             0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾼ
1173             0,
1174             0x0399 | HAS_VOWEL,  // ι
1175             0,
1176             0,
1177             0,
1178             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῂ
1179             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῃ
1180             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῄ
1181             0,
1182             0x0397 | HAS_VOWEL | HAS_ACCENT,  // ῆ
1183             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῇ
1184             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ὲ
1185             0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
1186             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ὴ
1187             0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
1188             0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῌ
1189             0,
1190             0,
1191             0,
1192             0x0399 | HAS_VOWEL,  // ῐ
1193             0x0399 | HAS_VOWEL,  // ῑ
1194             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῒ
1195             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
1196             0,
1197             0,
1198             0x0399 | HAS_VOWEL | HAS_ACCENT,  // ῖ
1199             0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῗ
1200             0x0399 | HAS_VOWEL,  // Ῐ
1201             0x0399 | HAS_VOWEL,  // Ῑ
1202             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ὶ
1203             0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
1204             0,
1205             0,
1206             0,
1207             0,
1208             0x03A5 | HAS_VOWEL,  // ῠ
1209             0x03A5 | HAS_VOWEL,  // ῡ
1210             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῢ
1211             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
1212             0x03A1,  // ῤ
1213             0x03A1,  // ῥ
1214             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ῦ
1215             0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῧ
1216             0x03A5 | HAS_VOWEL,  // Ῠ
1217             0x03A5 | HAS_VOWEL,  // Ῡ
1218             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὺ
1219             0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
1220             0x03A1,  // Ῥ
1221             0,
1222             0,
1223             0,
1224             0,
1225             0,
1226             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῲ
1227             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῳ
1228             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῴ
1229             0,
1230             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ῶ
1231             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῷ
1232             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὸ
1233             0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
1234             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὼ
1235             0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
1236             0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῼ
1237             0,
1238             0,
1239             0,
1240         };
1241 
1242         // U+2126 Ohm sign
1243         private static final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω
1244 
getLetterData(int c)1245         private static final int getLetterData(int c) {
1246             if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
1247                 return 0;
1248             } else if (c <= 0x3ff) {
1249                 return data0370[c - 0x370];
1250             } else if (c <= 0x1fff) {
1251                 return data1F00[c - 0x1f00];
1252             } else if (c == 0x2126) {
1253                 return data2126;
1254             } else {
1255                 return 0;
1256             }
1257         }
1258 
1259         /**
1260          * Returns a non-zero value for each of the Greek combining diacritics
1261          * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
1262          * plus some perispomeni look-alikes.
1263          */
getDiacriticData(int c)1264         private static final int getDiacriticData(int c) {
1265             switch (c) {
1266             case '\u0300':  // varia
1267             case '\u0301':  // tonos = oxia
1268             case '\u0342':  // perispomeni
1269             case '\u0302':  // circumflex can look like perispomeni
1270             case '\u0303':  // tilde can look like perispomeni
1271             case '\u0311':  // inverted breve can look like perispomeni
1272                 return HAS_ACCENT;
1273             case '\u0308':  // dialytika = diaeresis
1274                 return HAS_COMBINING_DIALYTIKA;
1275             case '\u0344':  // dialytika tonos
1276                 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
1277             case '\u0345':  // ypogegrammeni = iota subscript
1278                 return HAS_YPOGEGRAMMENI;
1279             case '\u0304':  // macron
1280             case '\u0306':  // breve
1281             case '\u0313':  // comma above
1282             case '\u0314':  // reversed comma above
1283             case '\u0343':  // koronis
1284                 return HAS_OTHER_GREEK_DIACRITIC;
1285             default:
1286                 return 0;
1287             }
1288         }
1289 
isFollowedByCasedLetter(CharSequence s, int i)1290         private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
1291             while (i < s.length()) {
1292                 int c = Character.codePointAt(s, i);
1293                 int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
1294                 if ((type & UCaseProps.IGNORABLE) != 0) {
1295                     // Case-ignorable, continue with the loop.
1296                     i += Character.charCount(c);
1297                 } else if (type != UCaseProps.NONE) {
1298                     return true;  // Followed by cased letter.
1299                 } else {
1300                     return false;  // Uncased and not case-ignorable.
1301                 }
1302             }
1303             return false;  // Not followed by cased letter.
1304         }
1305 
1306         /**
1307          * Greek string uppercasing with a state machine.
1308          * Probably simpler than a stateless function that has to figure out complex context-before
1309          * for each character.
1310          * TODO: Try to re-consolidate one way or another with the non-Greek function.
1311          *
1312          * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
1313          * @throws IOException
1314          */
toUpper(int options, CharSequence src, A dest, Edits edits)1315         private static <A extends Appendable> A toUpper(int options,
1316                 CharSequence src, A dest, Edits edits) throws IOException {
1317             int state = 0;
1318             for (int i = 0; i < src.length();) {
1319                 int c = Character.codePointAt(src, i);
1320                 int nextIndex = i + Character.charCount(c);
1321                 int nextState = 0;
1322                 int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
1323                 if ((type & UCaseProps.IGNORABLE) != 0) {
1324                     // c is case-ignorable
1325                     nextState |= (state & AFTER_CASED);
1326                 } else if (type != UCaseProps.NONE) {
1327                     // c is cased
1328                     nextState |= AFTER_CASED;
1329                 }
1330                 int data = getLetterData(c);
1331                 if (data > 0) {
1332                     int upper = data & UPPER_MASK;
1333                     // Add a dialytika to this iota or ypsilon vowel
1334                     // if we removed a tonos from the previous vowel,
1335                     // and that previous vowel did not also have (or gain) a dialytika.
1336                     // Adding one only to the final vowel in a longer sequence
1337                     // (which does not occur in normal writing) would require lookahead.
1338                     // Set the same flag as for preserving an existing dialytika.
1339                     if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
1340                             (upper == 'Ι' || upper == 'Υ')) {
1341                         data |= HAS_DIALYTIKA;
1342                     }
1343                     int numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
1344                     if ((data & HAS_YPOGEGRAMMENI) != 0) {
1345                         numYpogegrammeni = 1;
1346                     }
1347                     // Skip combining diacritics after this Greek letter.
1348                     while (nextIndex < src.length()) {
1349                         int diacriticData = getDiacriticData(src.charAt(nextIndex));
1350                         if (diacriticData != 0) {
1351                             data |= diacriticData;
1352                             if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1353                                 ++numYpogegrammeni;
1354                             }
1355                             ++nextIndex;
1356                         } else {
1357                             break;  // not a Greek diacritic
1358                         }
1359                     }
1360                     if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1361                         nextState |= AFTER_VOWEL_WITH_ACCENT;
1362                     }
1363                     // Map according to Greek rules.
1364                     boolean addTonos = false;
1365                     if (upper == 'Η' &&
1366                             (data & HAS_ACCENT) != 0 &&
1367                             numYpogegrammeni == 0 &&
1368                             (state & AFTER_CASED) == 0 &&
1369                             !isFollowedByCasedLetter(src, nextIndex)) {
1370                         // Keep disjunctive "or" with (only) a tonos.
1371                         // We use the same "word boundary" conditions as for the Final_Sigma test.
1372                         if (i == nextIndex) {
1373                             upper = 'Ή';  // Preserve the precomposed form.
1374                         } else {
1375                             addTonos = true;
1376                         }
1377                     } else if ((data & HAS_DIALYTIKA) != 0) {
1378                         // Preserve a vowel with dialytika in precomposed form if it exists.
1379                         if (upper == 'Ι') {
1380                             upper = 'Ϊ';
1381                             data &= ~HAS_EITHER_DIALYTIKA;
1382                         } else if (upper == 'Υ') {
1383                             upper = 'Ϋ';
1384                             data &= ~HAS_EITHER_DIALYTIKA;
1385                         }
1386                     }
1387 
1388                     boolean change;
1389                     if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
1390                         change = true;  // common, simple usage
1391                     } else {
1392                         // Find out first whether we are changing the text.
1393                         change = src.charAt(i) != upper || numYpogegrammeni > 0;
1394                         int i2 = i + 1;
1395                         if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1396                             change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
1397                             ++i2;
1398                         }
1399                         if (addTonos) {
1400                             change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
1401                             ++i2;
1402                         }
1403                         int oldLength = nextIndex - i;
1404                         int newLength = (i2 - i) + numYpogegrammeni;
1405                         change |= oldLength != newLength;
1406                         if (change) {
1407                             if (edits != null) {
1408                                 edits.addReplace(oldLength, newLength);
1409                             }
1410                         } else {
1411                             if (edits != null) {
1412                                 edits.addUnchanged(oldLength);
1413                             }
1414                             // Write unchanged text?
1415                             change = (options & OMIT_UNCHANGED_TEXT) == 0;
1416                         }
1417                     }
1418 
1419                     if (change) {
1420                         dest.append((char)upper);
1421                         if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1422                             dest.append('\u0308');  // restore or add a dialytika
1423                         }
1424                         if (addTonos) {
1425                             dest.append('\u0301');
1426                         }
1427                         while (numYpogegrammeni > 0) {
1428                             dest.append('Ι');
1429                             --numYpogegrammeni;
1430                         }
1431                     }
1432                 } else {
1433                     c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
1434                     appendResult(c, dest, nextIndex - i, options, edits);
1435                 }
1436                 i = nextIndex;
1437                 state = nextState;
1438             }
1439             return dest;
1440         }
1441     }
1442 }
1443