• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2000-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 import java.nio.CharBuffer;
11 import java.text.CharacterIterator;
12 
13 import com.ibm.icu.impl.Norm2AllModes;
14 import com.ibm.icu.impl.Normalizer2Impl;
15 import com.ibm.icu.impl.UCaseProps;
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.util.ICUCloneNotSupportedException;
18 
19 /**
20  * Old Unicode normalization API.
21  *
22  * <p>This API has been replaced by the {@link Normalizer2} class and is only available
23  * for backward compatibility. This class simply delegates to the Normalizer2 class.
24  * There are two exceptions: The new API does not provide a replacement for
25  * <code>QuickCheckResult</code> and <code>compare()</code>.
26  *
27  * <p><code>normalize</code> transforms Unicode text into an equivalent composed or
28  * decomposed form, allowing for easier sorting and searching of text.
29  * <code>normalize</code> supports the standard normalization forms described in
30  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
31  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
32  *
33  * <p>Characters with accents or other adornments can be encoded in
34  * several different ways in Unicode.  For example, take the character A-acute.
35  * In Unicode, this can be encoded as a single character (the
36  * "composed" form):
37  *
38  * <pre>
39  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
40  * </pre>
41  *
42  * or as two separate characters (the "decomposed" form):
43  *
44  * <pre>
45  *      0041    LATIN CAPITAL LETTER A
46  *      0301    COMBINING ACUTE ACCENT
47  * </pre>
48  *
49  * <p>To a user of your program, however, both of these sequences should be
50  * treated as the same "user-level" character "A with acute accent".  When you
51  * are searching or comparing text, you must ensure that these two sequences are
52  * treated equivalently.  In addition, you must handle characters with more than
53  * one accent.  Sometimes the order of a character's combining accents is
54  * significant, while in other cases accent sequences in different orders are
55  * really equivalent.
56  *
57  * <p>Similarly, the string "ffi" can be encoded as three separate letters:
58  *
59  * <pre>
60  *      0066    LATIN SMALL LETTER F
61  *      0066    LATIN SMALL LETTER F
62  *      0069    LATIN SMALL LETTER I
63  * </pre>
64  *
65  * or as the single character
66  *
67  * <pre>
68  *      FB03    LATIN SMALL LIGATURE FFI
69  * </pre>
70  *
71  * <p>The ffi ligature is not a distinct semantic character, and strictly speaking
72  * it shouldn't be in Unicode at all, but it was included for compatibility
73  * with existing character sets that already provided it.  The Unicode standard
74  * identifies such characters by giving them "compatibility" decompositions
75  * into the corresponding semantic characters.  When sorting and searching, you
76  * will often want to use these mappings.
77  *
78  * <p><code>normalize</code> helps solve these problems by transforming text into
79  * the canonical composed and decomposed forms as shown in the first example
80  * above. In addition, you can have it perform compatibility decompositions so
81  * that you can treat compatibility characters the same as their equivalents.
82  * Finally, <code>normalize</code> rearranges accents into the proper canonical
83  * order, so that you do not have to worry about accent rearrangement on your
84  * own.
85  *
86  * <p>Form FCD, "Fast C or D", is also designed for collation.
87  * It allows to work on strings that are not necessarily normalized
88  * with an algorithm (like in collation) that works under "canonical closure",
89  * i.e., it treats precomposed characters and their decomposed equivalents the
90  * same.
91  *
92  * <p>It is not a normalization form because it does not provide for uniqueness of
93  * representation. Multiple strings may be canonically equivalent (their NFDs
94  * are identical) and may all conform to FCD without being identical themselves.
95  *
96  * <p>The form is defined such that the "raw decomposition", the recursive
97  * canonical decomposition of each character, results in a string that is
98  * canonically ordered. This means that precomposed characters are allowed for
99  * as long as their decompositions do not need canonical reordering.
100  *
101  * <p>Its advantage for a process like collation is that all NFD and most NFC texts
102  * - and many unnormalized texts - already conform to FCD and do not need to be
103  * normalized (NFD) for such a process. The FCD quick check will return YES for
104  * most strings in practice.
105  *
106  * <p>normalize(FCD) may be implemented with NFD.
107  *
108  * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
109  * http://www.unicode.org/notes/tn5/#FCD
110  *
111  * <p>ICU collation performs either NFD or FCD normalization automatically if
112  * normalization is turned on for the collator object. Beyond collation and
113  * string search, normalized strings may be useful for string equivalence
114  * comparisons, transliteration/transcription, unique representations, etc.
115  *
116  * <p>The W3C generally recommends to exchange texts in NFC.
117  * Note also that most legacy character encodings use only precomposed forms and
118  * often do not encode any combining marks by themselves. For conversion to such
119  * character encodings the Unicode text needs to be normalized to NFC.
120  * For more usage examples, see the Unicode Standard Annex.
121  *
122  * <p>Note: The Normalizer class also provides API for iterative normalization.
123  * While the setIndex() and getIndex() refer to indices in the
124  * underlying Unicode input text, the next() and previous() methods
125  * iterate through characters in the normalized output.
126  * This means that there is not necessarily a one-to-one correspondence
127  * between characters returned by next() and previous() and the indices
128  * passed to and returned from setIndex() and getIndex().
129  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
130  *
131  * @stable ICU 2.8
132  */
133 public final class Normalizer implements Cloneable {
134     // The input text and our position in it
135     private UCharacterIterator  text;
136     private Normalizer2         norm2;
137     private Mode                mode;
138     private int                 options;
139 
140     // The normalization buffer is the result of normalization
141     // of the source in [currentIndex..nextIndex[ .
142     private int                 currentIndex;
143     private int                 nextIndex;
144 
145     // A buffer for holding intermediate results
146     private StringBuilder       buffer;
147     private int                 bufferPos;
148 
149     // Helper classes to defer loading of normalization data.
150     private static final class ModeImpl {
ModeImpl(Normalizer2 n2)151         private ModeImpl(Normalizer2 n2) {
152             normalizer2 = n2;
153         }
154         private final Normalizer2 normalizer2;
155     }
156     private static final class NFDModeImpl {
157         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
158     }
159     private static final class NFKDModeImpl {
160         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
161     }
162     private static final class NFCModeImpl {
163         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
164     }
165     private static final class NFKCModeImpl {
166         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
167     }
168     private static final class FCDModeImpl {
169         private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
170     }
171 
172     private static final class Unicode32 {
173         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
174     }
175     private static final class NFD32ModeImpl {
176         private static final ModeImpl INSTANCE =
177             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
178                                                  Unicode32.INSTANCE));
179     }
180     private static final class NFKD32ModeImpl {
181         private static final ModeImpl INSTANCE =
182             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
183                                                  Unicode32.INSTANCE));
184     }
185     private static final class NFC32ModeImpl {
186         private static final ModeImpl INSTANCE =
187             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
188                                                  Unicode32.INSTANCE));
189     }
190     private static final class NFKC32ModeImpl {
191         private static final ModeImpl INSTANCE =
192             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
193                                                  Unicode32.INSTANCE));
194     }
195     private static final class FCD32ModeImpl {
196         private static final ModeImpl INSTANCE =
197             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
198                                                  Unicode32.INSTANCE));
199     }
200 
201     /**
202      * Options bit set value to select Unicode 3.2 normalization
203      * (except NormalizationCorrections).
204      * At most one Unicode version can be selected at a time.
205      *
206      * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead.
207      */
208     @Deprecated
209     public static final int UNICODE_3_2=0x20;
210 
211     /**
212      * Constant indicating that the end of the iteration has been reached.
213      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
214      *
215      * @deprecated ICU 56
216      */
217     @Deprecated
218     public static final int DONE = UCharacterIterator.DONE;
219 
220     /**
221      * Constants for normalization modes.
222      * <p>
223      * The Mode class is not intended for public subclassing.
224      * Only the Mode constants provided by the Normalizer class should be used,
225      * and any fields or methods should not be called or overridden by users.
226      *
227      * @deprecated ICU 56 Use {@link Normalizer2} instead.
228      */
229     @Deprecated
230     public static abstract class Mode {
231         /**
232          * Sole constructor
233          * @internal
234          * @deprecated This API is ICU internal only.
235          */
236         @Deprecated
Mode()237         protected Mode() {
238         }
239 
240         /**
241          * @internal
242          * @deprecated This API is ICU internal only.
243          */
244         @Deprecated
getNormalizer2(int options)245         protected abstract Normalizer2 getNormalizer2(int options);
246     }
247 
248     private static final class NONEMode extends Mode {
249         @Override
getNormalizer2(int options)250         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
251     }
252     private static final class NFDMode extends Mode {
253         @Override
getNormalizer2(int options)254         protected Normalizer2 getNormalizer2(int options) {
255             return (options&UNICODE_3_2) != 0 ?
256                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
257         }
258     }
259     private static final class NFKDMode extends Mode {
260         @Override
getNormalizer2(int options)261         protected Normalizer2 getNormalizer2(int options) {
262             return (options&UNICODE_3_2) != 0 ?
263                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
264         }
265     }
266     private static final class NFCMode extends Mode {
267         @Override
getNormalizer2(int options)268         protected Normalizer2 getNormalizer2(int options) {
269             return (options&UNICODE_3_2) != 0 ?
270                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
271         }
272     }
273     private static final class NFKCMode extends Mode {
274         @Override
getNormalizer2(int options)275         protected Normalizer2 getNormalizer2(int options) {
276             return (options&UNICODE_3_2) != 0 ?
277                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
278         }
279     }
280     private static final class FCDMode extends Mode {
281         @Override
getNormalizer2(int options)282         protected Normalizer2 getNormalizer2(int options) {
283             return (options&UNICODE_3_2) != 0 ?
284                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
285         }
286     }
287 
288     /**
289      * No decomposition/composition.
290      *
291      * @deprecated ICU 56 Use {@link Normalizer2} instead.
292      */
293     @Deprecated
294     public static final Mode NONE = new NONEMode();
295 
296     /**
297      * Canonical decomposition.
298      *
299      * @deprecated ICU 56 Use {@link Normalizer2} instead.
300      */
301     @Deprecated
302     public static final Mode NFD = new NFDMode();
303 
304     /**
305      * Compatibility decomposition.
306      *
307      * @deprecated ICU 56 Use {@link Normalizer2} instead.
308      */
309     @Deprecated
310     public static final Mode NFKD = new NFKDMode();
311 
312     /**
313      * Canonical decomposition followed by canonical composition.
314      *
315      * @deprecated ICU 56 Use {@link Normalizer2} instead.
316      */
317     @Deprecated
318     public static final Mode NFC = new NFCMode();
319 
320     /**
321      * Default normalization.
322      *
323      * @deprecated ICU 56 Use {@link Normalizer2} instead.
324      */
325     @Deprecated
326     public static final Mode DEFAULT = NFC;
327 
328     /**
329      * Compatibility decomposition followed by canonical composition.
330      *
331      * @deprecated ICU 56 Use {@link Normalizer2} instead.
332      */
333     @Deprecated
334     public static final Mode NFKC =new NFKCMode();
335 
336     /**
337      * "Fast C or D" form.
338      *
339      * @deprecated ICU 56 Use {@link Normalizer2} instead.
340      */
341     @Deprecated
342     public static final Mode FCD = new FCDMode();
343 
344     /**
345      * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
346      * and the static {@link #normalize normalize} method.  This value tells
347      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
348      * from the underlying String or CharacterIterator.  If you have code which
349      * requires raw text at some times and normalized text at others, you can
350      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
351      * than having a separate code path that bypasses <tt>Normalizer</tt>
352      * altogether.
353      * <p>
354      * @see #setMode
355      * @deprecated ICU 2.8. Use Normalizer.NONE
356      * @see #NONE
357      */
358     @Deprecated
359     public static final Mode NO_OP = NONE;
360 
361     /**
362      * Canonical decomposition followed by canonical composition.  Used with the
363      * {@link com.ibm.icu.text.Normalizer constructors} and the static
364      * {@link #normalize normalize} method to determine the operation to be
365      * performed.
366      * <p>
367      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
368      * off, this operation produces output that is in
369      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
370      * Form</a>
371      * <b>C</b>.
372      * <p>
373      * @see #setMode
374      * @deprecated ICU 2.8. Use Normalizer.NFC
375      * @see #NFC
376      */
377     @Deprecated
378     public static final Mode COMPOSE = NFC;
379 
380     /**
381      * Compatibility decomposition followed by canonical composition.
382      * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
383      * {@link #normalize normalize} method to determine the operation to be
384      * performed.
385      * <p>
386      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
387      * off, this operation produces output that is in
388      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
389      * Form</a>
390      * <b>KC</b>.
391      * <p>
392      * @see #setMode
393      * @deprecated ICU 2.8. Use Normalizer.NFKC
394      * @see #NFKC
395      */
396     @Deprecated
397     public static final Mode COMPOSE_COMPAT = NFKC;
398 
399     /**
400      * Canonical decomposition.  This value is passed to the
401      * {@link com.ibm.icu.text.Normalizer constructors} and the static
402      * {@link #normalize normalize}
403      * method to determine the operation to be performed.
404      * <p>
405      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
406      * off, this operation produces output that is in
407      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
408      * Form</a>
409      * <b>D</b>.
410      * <p>
411      * @see #setMode
412      * @deprecated ICU 2.8. Use Normalizer.NFD
413      * @see #NFD
414      */
415     @Deprecated
416     public static final Mode DECOMP = NFD;
417 
418     /**
419      * Compatibility decomposition.  This value is passed to the
420      * {@link com.ibm.icu.text.Normalizer constructors} and the static
421      * {@link #normalize normalize}
422      * method to determine the operation to be performed.
423      * <p>
424      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
425      * off, this operation produces output that is in
426      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
427      * Form</a>
428      * <b>KD</b>.
429      * <p>
430      * @see #setMode
431      * @deprecated ICU 2.8. Use Normalizer.NFKD
432      * @see #NFKD
433      */
434     @Deprecated
435     public static final Mode DECOMP_COMPAT = NFKD;
436 
437     /**
438      * Option to disable Hangul/Jamo composition and decomposition.
439      * This option applies to Korean text,
440      * which can be represented either in the Jamo alphabet or in Hangul
441      * characters, which are really just two or three Jamo combined
442      * into one visual glyph.  Since Jamo takes up more storage space than
443      * Hangul, applications that process only Hangul text may wish to turn
444      * this option on when decomposing text.
445      * <p>
446      * The Unicode standard treats Hangul to Jamo conversion as a
447      * canonical decomposition, so this option must be turned <b>off</b> if you
448      * wish to transform strings into one of the standard
449      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
450      * Unicode Normalization Forms</a>.
451      * <p>
452      * @see #setOption
453      * @deprecated ICU 2.8. This option is no longer supported.
454      */
455     @Deprecated
456     public static final int IGNORE_HANGUL = 0x0001;
457 
458     /**
459      * Result values for quickCheck().
460      * For details see Unicode Technical Report 15.
461      * @stable ICU 2.8
462      */
463     public static final class QuickCheckResult{
464         //private int resultValue;
QuickCheckResult(int value)465         private QuickCheckResult(int value) {
466             //resultValue=value;
467         }
468     }
469     /**
470      * Indicates that string is not in the normalized format
471      * @stable ICU 2.8
472      */
473     public static final QuickCheckResult NO = new QuickCheckResult(0);
474 
475     /**
476      * Indicates that string is in the normalized format
477      * @stable ICU 2.8
478      */
479     public static final QuickCheckResult YES = new QuickCheckResult(1);
480 
481     /**
482      * Indicates it cannot be determined if string is in the normalized
483      * format without further thorough checks.
484      * @stable ICU 2.8
485      */
486     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
487 
488     /**
489      * Option bit for compare:
490      * Case sensitively compare the strings
491      * @stable ICU 2.8
492      */
493     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
494 
495     /**
496      * Option bit for compare:
497      * Both input strings are assumed to fulfill FCD conditions.
498      * @stable ICU 2.8
499      */
500     public static final int INPUT_IS_FCD    =      0x20000;
501 
502     /**
503      * Option bit for compare:
504      * Perform case-insensitive comparison.
505      * @stable ICU 2.8
506      */
507     public static final int COMPARE_IGNORE_CASE  =     0x10000;
508 
509     /**
510      * Option bit for compare:
511      * Compare strings in code point order instead of code unit order.
512      * @stable ICU 2.8
513      */
514     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
515 
516     /**
517      * Option value for case folding:
518      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
519      * and dotless i appropriately for Turkic languages (tr, az).
520      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
521      * @stable ICU 2.8
522      */
523     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
524 
525     /**
526      * Lowest-order bit number of compare() options bits corresponding to
527      * normalization options bits.
528      *
529      * The options parameter for compare() uses most bits for
530      * itself and for various comparison and folding flags.
531      * The most significant bits, however, are shifted down and passed on
532      * to the normalization implementation.
533      * (That is, from compare(..., options, ...),
534      * options&gt;&gt;COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
535      * internal normalization functions.)
536      *
537      * @see #compare
538      * @deprecated ICU 56 Use {@link Normalizer2} instead.
539      */
540     @Deprecated
541     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
542 
543     //-------------------------------------------------------------------------
544     // Iterator constructors
545     //-------------------------------------------------------------------------
546 
547     /**
548      * Creates a new <tt>Normalizer</tt> object for iterating over the
549      * normalized form of a given string.
550      * <p>
551      * The <tt>options</tt> parameter specifies which optional
552      * <tt>Normalizer</tt> features are to be enabled for this object.
553      * <p>
554      * @param str  The string to be normalized.  The normalization
555      *              will start at the beginning of the string.
556      *
557      * @param mode The normalization mode.
558      *
559      * @param opt Any optional features to be enabled.
560      *            Currently the only available option is {@link #UNICODE_3_2}.
561      *            If you want the default behavior corresponding to one of the
562      *            standard Unicode Normalization Forms, use 0 for this argument.
563      * @deprecated ICU 56 Use {@link Normalizer2} instead.
564      */
565     @Deprecated
Normalizer(String str, Mode mode, int opt)566     public Normalizer(String str, Mode mode, int opt) {
567         this.text = UCharacterIterator.getInstance(str);
568         this.mode = mode;
569         this.options=opt;
570         norm2 = mode.getNormalizer2(opt);
571         buffer = new StringBuilder();
572     }
573 
574     /**
575      * Creates a new <tt>Normalizer</tt> object for iterating over the
576      * normalized form of the given text.
577      * <p>
578      * @param iter  The input text to be normalized.  The normalization
579      *              will start at the beginning of the string.
580      *
581      * @param mode  The normalization mode.
582      *
583      * @param opt Any optional features to be enabled.
584      *            Currently the only available option is {@link #UNICODE_3_2}.
585      *            If you want the default behavior corresponding to one of the
586      *            standard Unicode Normalization Forms, use 0 for this argument.
587      * @deprecated ICU 56 Use {@link Normalizer2} instead.
588      */
589     @Deprecated
Normalizer(CharacterIterator iter, Mode mode, int opt)590     public Normalizer(CharacterIterator iter, Mode mode, int opt) {
591         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
592         this.mode = mode;
593         this.options = opt;
594         norm2 = mode.getNormalizer2(opt);
595         buffer = new StringBuilder();
596     }
597 
598     /**
599      * Creates a new <tt>Normalizer</tt> object for iterating over the
600      * normalized form of the given text.
601      * <p>
602      * @param iter  The input text to be normalized.  The normalization
603      *              will start at the beginning of the string.
604      *
605      * @param mode  The normalization mode.
606      * @param options The normalization options, ORed together (0 for no options).
607      * @deprecated ICU 56 Use {@link Normalizer2} instead.
608      */
609     @Deprecated
Normalizer(UCharacterIterator iter, Mode mode, int options)610     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
611         try {
612             this.text     = (UCharacterIterator)iter.clone();
613             this.mode     = mode;
614             this.options  = options;
615             norm2 = mode.getNormalizer2(options);
616             buffer = new StringBuilder();
617         } catch (CloneNotSupportedException e) {
618             throw new ICUCloneNotSupportedException(e);
619         }
620     }
621 
622     /**
623      * Clones this <tt>Normalizer</tt> object.  All properties of this
624      * object are duplicated in the new object, including the cloning of any
625      * {@link CharacterIterator} that was passed in to the constructor
626      * or to {@link #setText(CharacterIterator) setText}.
627      * However, the text storage underlying
628      * the <tt>CharacterIterator</tt> is not duplicated unless the
629      * iterator's <tt>clone</tt> method does so.
630      *
631      * @deprecated ICU 56 Use {@link Normalizer2} instead.
632      */
633     @Deprecated
634     @Override
clone()635     public Object clone() {
636         try {
637             Normalizer copy = (Normalizer) super.clone();
638             copy.text = (UCharacterIterator) text.clone();
639             copy.mode = mode;
640             copy.options = options;
641             copy.norm2 = norm2;
642             copy.buffer = new StringBuilder(buffer);
643             copy.bufferPos = bufferPos;
644             copy.currentIndex = currentIndex;
645             copy.nextIndex = nextIndex;
646             return copy;
647         }
648         catch (CloneNotSupportedException e) {
649             throw new ICUCloneNotSupportedException(e);
650         }
651     }
652 
653     //--------------------------------------------------------------------------
654     // Static Utility methods
655     //--------------------------------------------------------------------------
656 
getComposeNormalizer2(boolean compat, int options)657     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
658         return (compat ? NFKC : NFC).getNormalizer2(options);
659     }
getDecomposeNormalizer2(boolean compat, int options)660     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
661         return (compat ? NFKD : NFD).getNormalizer2(options);
662     }
663 
664     /**
665      * Compose a string.
666      * The string will be composed according to the specified mode.
667      * @param str        The string to compose.
668      * @param compat     If true the string will be composed according to
669      *                    NFKC rules and if false will be composed according to
670      *                    NFC rules.
671      * @return String    The composed string
672      * @deprecated ICU 56 Use {@link Normalizer2} instead.
673      */
674     @Deprecated
compose(String str, boolean compat)675     public static String compose(String str, boolean compat) {
676         return compose(str,compat,0);
677     }
678 
679     /**
680      * Compose a string.
681      * The string will be composed according to the specified mode.
682      * @param str        The string to compose.
683      * @param compat     If true the string will be composed according to
684      *                    NFKC rules and if false will be composed according to
685      *                    NFC rules.
686      * @param options    The only recognized option is UNICODE_3_2
687      * @return String    The composed string
688      * @deprecated ICU 56 Use {@link Normalizer2} instead.
689      */
690     @Deprecated
compose(String str, boolean compat, int options)691     public static String compose(String str, boolean compat, int options) {
692         return getComposeNormalizer2(compat, options).normalize(str);
693     }
694 
695     /**
696      * Compose a string.
697      * The string will be composed according to the specified mode.
698      * @param source The char array to compose.
699      * @param target A char buffer to receive the normalized text.
700      * @param compat If true the char array will be composed according to
701      *                NFKC rules and if false will be composed according to
702      *                NFC rules.
703      * @param options The normalization options, ORed together (0 for no options).
704      * @return int   The total buffer size needed;if greater than length of
705      *                result, the output was truncated.
706      * @exception IndexOutOfBoundsException if target.length is less than the
707      *             required length
708      * @deprecated ICU 56 Use {@link Normalizer2} instead.
709      */
710     @Deprecated
compose(char[] source,char[] target, boolean compat, int options)711     public static int compose(char[] source,char[] target, boolean compat, int options) {
712         return compose(source, 0, source.length, target, 0, target.length, compat, options);
713     }
714 
715     /**
716      * Compose a string.
717      * The string will be composed according to the specified mode.
718      * @param src       The char array to compose.
719      * @param srcStart  Start index of the source
720      * @param srcLimit  Limit index of the source
721      * @param dest      The char buffer to fill in
722      * @param destStart Start index of the destination buffer
723      * @param destLimit End index of the destination buffer
724      * @param compat If true the char array will be composed according to
725      *                NFKC rules and if false will be composed according to
726      *                NFC rules.
727      * @param options The normalization options, ORed together (0 for no options).
728      * @return int   The total buffer size needed;if greater than length of
729      *                result, the output was truncated.
730      * @exception IndexOutOfBoundsException if target.length is less than the
731      *             required length
732      * @deprecated ICU 56 Use {@link Normalizer2} instead.
733      */
734     @Deprecated
compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)735     public static int compose(char[] src,int srcStart, int srcLimit,
736                               char[] dest,int destStart, int destLimit,
737                               boolean compat, int options) {
738         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
739         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
740         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
741         return app.length();
742     }
743 
744     /**
745      * Decompose a string.
746      * The string will be decomposed to according to the specified mode.
747      * @param str       The string to decompose.
748      * @param compat    If true the string will be decomposed according to NFKD
749      *                   rules and if false will be decomposed according to NFD
750      *                   rules.
751      * @return String   The decomposed string
752      * @deprecated ICU 56 Use {@link Normalizer2} instead.
753      */
754     @Deprecated
decompose(String str, boolean compat)755     public static String decompose(String str, boolean compat) {
756         return decompose(str,compat,0);
757     }
758 
759     /**
760      * Decompose a string.
761      * The string will be decomposed to according to the specified mode.
762      * @param str     The string to decompose.
763      * @param compat  If true the string will be decomposed according to NFKD
764      *                 rules and if false will be decomposed according to NFD
765      *                 rules.
766      * @param options The normalization options, ORed together (0 for no options).
767      * @return String The decomposed string
768      * @deprecated ICU 56 Use {@link Normalizer2} instead.
769      */
770     @Deprecated
decompose(String str, boolean compat, int options)771     public static String decompose(String str, boolean compat, int options) {
772         return getDecomposeNormalizer2(compat, options).normalize(str);
773     }
774 
775     /**
776      * Decompose a string.
777      * The string will be decomposed to according to the specified mode.
778      * @param source The char array to decompose.
779      * @param target A char buffer to receive the normalized text.
780      * @param compat If true the char array will be decomposed according to NFKD
781      *                rules and if false will be decomposed according to
782      *                NFD rules.
783      * @return int   The total buffer size needed;if greater than length of
784      *                result,the output was truncated.
785      * @param options The normalization options, ORed together (0 for no options).
786      * @exception IndexOutOfBoundsException if the target capacity is less than
787      *             the required length
788      * @deprecated ICU 56 Use {@link Normalizer2} instead.
789      */
790     @Deprecated
decompose(char[] source,char[] target, boolean compat, int options)791     public static int decompose(char[] source,char[] target, boolean compat, int options) {
792         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
793     }
794 
795     /**
796      * Decompose a string.
797      * The string will be decomposed to according to the specified mode.
798      * @param src       The char array to compose.
799      * @param srcStart  Start index of the source
800      * @param srcLimit  Limit index of the source
801      * @param dest      The char buffer to fill in
802      * @param destStart Start index of the destination buffer
803      * @param destLimit End index of the destination buffer
804      * @param compat If true the char array will be decomposed according to NFKD
805      *                rules and if false will be decomposed according to
806      *                NFD rules.
807      * @param options The normalization options, ORed together (0 for no options).
808      * @return int   The total buffer size needed;if greater than length of
809      *                result,the output was truncated.
810      * @exception IndexOutOfBoundsException if the target capacity is less than
811      *             the required length
812      * @deprecated ICU 56 Use {@link Normalizer2} instead.
813      */
814     @Deprecated
decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)815     public static int decompose(char[] src,int srcStart, int srcLimit,
816                                 char[] dest,int destStart, int destLimit,
817                                 boolean compat, int options) {
818         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
819         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
820         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
821         return app.length();
822     }
823 
824     /**
825      * Normalizes a <tt>String</tt> using the given normalization operation.
826      * <p>
827      * The <tt>options</tt> parameter specifies which optional
828      * <tt>Normalizer</tt> features are to be enabled for this operation.
829      * Currently the only available option is {@link #UNICODE_3_2}.
830      * If you want the default behavior corresponding to one of the standard
831      * Unicode Normalization Forms, use 0 for this argument.
832      * <p>
833      * @param str       the input string to be normalized.
834      * @param mode      the normalization mode
835      * @param options   the optional features to be enabled.
836      * @return String   the normalized string
837      * @deprecated ICU 56 Use {@link Normalizer2} instead.
838      */
839     @Deprecated
normalize(String str, Mode mode, int options)840     public static String normalize(String str, Mode mode, int options) {
841         return mode.getNormalizer2(options).normalize(str);
842     }
843 
844     /**
845      * Normalize a string.
846      * The string will be normalized according to the specified normalization
847      * mode and options.
848      * @param src        The string to normalize.
849      * @param mode       The normalization mode; one of Normalizer.NONE,
850      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
851      *                    Normalizer.NFKD, Normalizer.DEFAULT
852      * @return the normalized string
853      * @deprecated ICU 56 Use {@link Normalizer2} instead.
854      */
855     @Deprecated
normalize(String src,Mode mode)856     public static String normalize(String src,Mode mode) {
857         return normalize(src, mode, 0);
858     }
859     /**
860      * Normalize a string.
861      * The string will be normalized according to the specified normalization
862      * mode and options.
863      * @param source The char array to normalize.
864      * @param target A char buffer to receive the normalized text.
865      * @param mode   The normalization mode; one of Normalizer.NONE,
866      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
867      *                Normalizer.NFKD, Normalizer.DEFAULT
868      * @param options The normalization options, ORed together (0 for no options).
869      * @return int   The total buffer size needed;if greater than length of
870      *                result, the output was truncated.
871      * @exception    IndexOutOfBoundsException if the target capacity is less
872      *                than the required length
873      * @deprecated ICU 56 Use {@link Normalizer2} instead.
874      */
875     @Deprecated
normalize(char[] source,char[] target, Mode mode, int options)876     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
877         return normalize(source,0,source.length,target,0,target.length,mode, options);
878     }
879 
880     /**
881      * Normalize a string.
882      * The string will be normalized according to the specified normalization
883      * mode and options.
884      * @param src       The char array to compose.
885      * @param srcStart  Start index of the source
886      * @param srcLimit  Limit index of the source
887      * @param dest      The char buffer to fill in
888      * @param destStart Start index of the destination buffer
889      * @param destLimit End index of the destination buffer
890      * @param mode      The normalization mode; one of Normalizer.NONE,
891      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
892      *                   Normalizer.NFKD, Normalizer.DEFAULT
893      * @param options The normalization options, ORed together (0 for no options).
894      * @return int      The total buffer size needed;if greater than length of
895      *                   result, the output was truncated.
896      * @exception       IndexOutOfBoundsException if the target capacity is
897      *                   less than the required length
898      * @deprecated ICU 56 Use {@link Normalizer2} instead.
899      */
900     @Deprecated
normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)901     public static int normalize(char[] src,int srcStart, int srcLimit,
902                                 char[] dest,int destStart, int destLimit,
903                                 Mode  mode, int options) {
904         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
905         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
906         mode.getNormalizer2(options).normalize(srcBuffer, app);
907         return app.length();
908     }
909 
910     /**
911      * Normalize a codepoint according to the given mode
912      * @param char32    The input string to be normalized.
913      * @param mode      The normalization mode
914      * @param options   Options for use with exclusion set and tailored Normalization
915      *                                   The only option that is currently recognized is UNICODE_3_2
916      * @return String   The normalized string
917      * @see #UNICODE_3_2
918      * @deprecated ICU 56 Use {@link Normalizer2} instead.
919      */
920     @Deprecated
normalize(int char32, Mode mode, int options)921     public static String normalize(int char32, Mode mode, int options) {
922         if(mode == NFD && options == 0) {
923             String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
924             if(decomposition == null) {
925                 decomposition = UTF16.valueOf(char32);
926             }
927             return decomposition;
928         }
929         return normalize(UTF16.valueOf(char32), mode, options);
930     }
931 
932     /**
933      * Convenience method to normalize a codepoint according to the given mode
934      * @param char32    The input string to be normalized.
935      * @param mode      The normalization mode
936      * @return String   The normalized string
937      * @deprecated ICU 56 Use {@link Normalizer2} instead.
938      */
939     @Deprecated
normalize(int char32, Mode mode)940     public static String normalize(int char32, Mode mode) {
941         return normalize(char32, mode, 0);
942     }
943 
944     /**
945      * Convenience method.
946      *
947      * @param source   string for determining if it is in a normalized format
948      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
949      *                  Normalizer.NFKC,Normalizer.NFKD)
950      * @return         Return code to specify if the text is normalized or not
951      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
952      * @deprecated ICU 56 Use {@link Normalizer2} instead.
953      */
954     @Deprecated
quickCheck(String source, Mode mode)955     public static QuickCheckResult quickCheck(String source, Mode mode) {
956         return quickCheck(source, mode, 0);
957     }
958 
959     /**
960      * Performing quick check on a string, to quickly determine if the string is
961      * in a particular normalization format.
962      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
963      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
964      * string is in the desired normalized format, Normalizer.NO determines that
965      * argument string is not in the desired normalized format. A
966      * Normalizer.MAYBE result indicates that a more thorough check is required,
967      * the user may have to put the string in its normalized form and compare
968      * the results.
969      *
970      * @param source   string for determining if it is in a normalized format
971      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
972      *                  Normalizer.NFKC,Normalizer.NFKD)
973      * @param options   Options for use with exclusion set and tailored Normalization
974      *                                   The only option that is currently recognized is UNICODE_3_2
975      * @return         Return code to specify if the text is normalized or not
976      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
977      * @deprecated ICU 56 Use {@link Normalizer2} instead.
978      */
979     @Deprecated
quickCheck(String source, Mode mode, int options)980     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
981         return mode.getNormalizer2(options).quickCheck(source);
982     }
983 
984     /**
985      * Convenience method.
986      *
987      * @param source Array of characters for determining if it is in a
988      *                normalized format
989      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
990      *                Normalizer.NFKC,Normalizer.NFKD)
991      * @param options   Options for use with exclusion set and tailored Normalization
992      *                                   The only option that is currently recognized is UNICODE_3_2
993      * @return       Return code to specify if the text is normalized or not
994      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
995      * @deprecated ICU 56 Use {@link Normalizer2} instead.
996      */
997     @Deprecated
quickCheck(char[] source, Mode mode, int options)998     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
999         return quickCheck(source, 0, source.length, mode, options);
1000     }
1001 
1002     /**
1003      * Performing quick check on a string, to quickly determine if the string is
1004      * in a particular normalization format.
1005      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
1006      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
1007      * string is in the desired normalized format, Normalizer.NO determines that
1008      * argument string is not in the desired normalized format. A
1009      * Normalizer.MAYBE result indicates that a more thorough check is required,
1010      * the user may have to put the string in its normalized form and compare
1011      * the results.
1012      *
1013      * @param source    string for determining if it is in a normalized format
1014      * @param start     the start index of the source
1015      * @param limit     the limit index of the source it is equal to the length
1016      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
1017      *                   Normalizer.NFKC,Normalizer.NFKD)
1018      * @param options   Options for use with exclusion set and tailored Normalization
1019      *                                   The only option that is currently recognized is UNICODE_3_2
1020      * @return          Return code to specify if the text is normalized or not
1021      *                   (Normalizer.YES, Normalizer.NO or
1022      *                   Normalizer.MAYBE)
1023      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1024      */
1025     @Deprecated
quickCheck(char[] source,int start, int limit, Mode mode,int options)1026     public static QuickCheckResult quickCheck(char[] source,int start,
1027                                               int limit, Mode mode,int options) {
1028         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
1029         return mode.getNormalizer2(options).quickCheck(srcBuffer);
1030     }
1031 
1032     /**
1033      * Test if a string is in a given normalization form.
1034      * This is semantically equivalent to source.equals(normalize(source, mode)).
1035      *
1036      * Unlike quickCheck(), this function returns a definitive result,
1037      * never a "maybe".
1038      * For NFD, NFKD, and FCD, both functions work exactly the same.
1039      * For NFC and NFKC where quickCheck may return "maybe", this function will
1040      * perform further tests to arrive at a true/false result.
1041      * @param src       The input array of characters to be checked to see if
1042      *                   it is normalized
1043      * @param start     The strart index in the source
1044      * @param limit     The limit index in the source
1045      * @param mode      the normalization mode
1046      * @param options   Options for use with exclusion set and tailored Normalization
1047      *                                   The only option that is currently recognized is UNICODE_3_2
1048      * @return Boolean value indicating whether the source string is in the
1049      *         "mode" normalization form
1050      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1051      */
1052     @Deprecated
isNormalized(char[] src,int start, int limit, Mode mode, int options)1053     public static boolean isNormalized(char[] src,int start,
1054                                        int limit, Mode mode,
1055                                        int options) {
1056         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
1057         return mode.getNormalizer2(options).isNormalized(srcBuffer);
1058     }
1059 
1060     /**
1061      * Test if a string is in a given normalization form.
1062      * This is semantically equivalent to source.equals(normalize(source, mode)).
1063      *
1064      * Unlike quickCheck(), this function returns a definitive result,
1065      * never a "maybe".
1066      * For NFD, NFKD, and FCD, both functions work exactly the same.
1067      * For NFC and NFKC where quickCheck may return "maybe", this function will
1068      * perform further tests to arrive at a true/false result.
1069      * @param str       the input string to be checked to see if it is
1070      *                   normalized
1071      * @param mode      the normalization mode
1072      * @param options   Options for use with exclusion set and tailored Normalization
1073      *                  The only option that is currently recognized is UNICODE_3_2
1074      * @see #isNormalized
1075      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1076      */
1077     @Deprecated
isNormalized(String str, Mode mode, int options)1078     public static boolean isNormalized(String str, Mode mode, int options) {
1079         return mode.getNormalizer2(options).isNormalized(str);
1080     }
1081 
1082     /**
1083      * Convenience Method
1084      * @param char32    the input code point to be checked to see if it is
1085      *                   normalized
1086      * @param mode      the normalization mode
1087      * @param options   Options for use with exclusion set and tailored Normalization
1088      *                  The only option that is currently recognized is UNICODE_3_2
1089      *
1090      * @see #isNormalized
1091      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1092      */
1093     @Deprecated
isNormalized(int char32, Mode mode,int options)1094     public static boolean isNormalized(int char32, Mode mode,int options) {
1095         return isNormalized(UTF16.valueOf(char32), mode, options);
1096     }
1097 
1098     /**
1099      * Compare two strings for canonical equivalence.
1100      * Further options include case-insensitive comparison and
1101      * code point order (as opposed to code unit order).
1102      *
1103      * Canonical equivalence between two strings is defined as their normalized
1104      * forms (NFD or NFC) being identical.
1105      * This function compares strings incrementally instead of normalizing
1106      * (and optionally case-folding) both strings entirely,
1107      * improving performance significantly.
1108      *
1109      * Bulk normalization is only necessary if the strings do not fulfill the
1110      * FCD conditions. Only in this case, and only if the strings are relatively
1111      * long, is memory allocated temporarily.
1112      * For FCD strings and short non-FCD strings there is no memory allocation.
1113      *
1114      * Semantically, this is equivalent to
1115      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1116      * where code point order and foldCase are all optional.
1117      *
1118      * @param s1        First source character array.
1119      * @param s1Start   start index of source
1120      * @param s1Limit   limit of the source
1121      *
1122      * @param s2        Second source character array.
1123      * @param s2Start   start index of the source
1124      * @param s2Limit   limit of the source
1125      *
1126      * @param options A bit set of options:
1127      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1128      *     Case-sensitive comparison in code unit order, and the input strings
1129      *     are quick-checked for FCD.
1130      *
1131      *   - INPUT_IS_FCD
1132      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1133      *     conditions.If not set, the function will quickCheck for FCD
1134      *     and normalize if necessary.
1135      *
1136      *   - COMPARE_CODE_POINT_ORDER
1137      *     Set to choose code point order instead of code unit order
1138      *
1139      *   - COMPARE_IGNORE_CASE
1140      *     Set to compare strings case-insensitively using case folding,
1141      *     instead of case-sensitively.
1142      *     If set, then the following case folding options are used.
1143      *
1144      *
1145      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1146      *
1147      * @see #normalize
1148      * @see #FCD
1149      * @stable ICU 2.8
1150      */
compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1151     public static int compare(char[] s1, int s1Start, int s1Limit,
1152                               char[] s2, int s2Start, int s2Limit,
1153                               int options) {
1154         if( s1==null || s1Start<0 || s1Limit<0 ||
1155             s2==null || s2Start<0 || s2Limit<0 ||
1156             s1Limit<s1Start || s2Limit<s2Start
1157         ) {
1158             throw new IllegalArgumentException();
1159         }
1160         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1161                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1162                                options);
1163     }
1164 
1165     /**
1166      * Compare two strings for canonical equivalence.
1167      * Further options include case-insensitive comparison and
1168      * code point order (as opposed to code unit order).
1169      *
1170      * Canonical equivalence between two strings is defined as their normalized
1171      * forms (NFD or NFC) being identical.
1172      * This function compares strings incrementally instead of normalizing
1173      * (and optionally case-folding) both strings entirely,
1174      * improving performance significantly.
1175      *
1176      * Bulk normalization is only necessary if the strings do not fulfill the
1177      * FCD conditions. Only in this case, and only if the strings are relatively
1178      * long, is memory allocated temporarily.
1179      * For FCD strings and short non-FCD strings there is no memory allocation.
1180      *
1181      * Semantically, this is equivalent to
1182      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1183      * where code point order and foldCase are all optional.
1184      *
1185      * @param s1 First source string.
1186      * @param s2 Second source string.
1187      *
1188      * @param options A bit set of options:
1189      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1190      *     Case-sensitive comparison in code unit order, and the input strings
1191      *     are quick-checked for FCD.
1192      *
1193      *   - INPUT_IS_FCD
1194      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1195      *     conditions. If not set, the function will quickCheck for FCD
1196      *     and normalize if necessary.
1197      *
1198      *   - COMPARE_CODE_POINT_ORDER
1199      *     Set to choose code point order instead of code unit order
1200      *
1201      *   - COMPARE_IGNORE_CASE
1202      *     Set to compare strings case-insensitively using case folding,
1203      *     instead of case-sensitively.
1204      *     If set, then the following case folding options are used.
1205      *
1206      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1207      *
1208      * @see #normalize
1209      * @see #FCD
1210      * @stable ICU 2.8
1211      */
compare(String s1, String s2, int options)1212     public static int compare(String s1, String s2, int options) {
1213         return internalCompare(s1, s2, options);
1214     }
1215 
1216     /**
1217      * Compare two strings for canonical equivalence.
1218      * Further options include case-insensitive comparison and
1219      * code point order (as opposed to code unit order).
1220      * Convenience method.
1221      *
1222      * @param s1 First source string.
1223      * @param s2 Second source string.
1224      *
1225      * @param options A bit set of options:
1226      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1227      *     Case-sensitive comparison in code unit order, and the input strings
1228      *     are quick-checked for FCD.
1229      *
1230      *   - INPUT_IS_FCD
1231      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1232      *     conditions. If not set, the function will quickCheck for FCD
1233      *     and normalize if necessary.
1234      *
1235      *   - COMPARE_CODE_POINT_ORDER
1236      *     Set to choose code point order instead of code unit order
1237      *
1238      *   - COMPARE_IGNORE_CASE
1239      *     Set to compare strings case-insensitively using case folding,
1240      *     instead of case-sensitively.
1241      *     If set, then the following case folding options are used.
1242      *
1243      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1244      *
1245      * @see #normalize
1246      * @see #FCD
1247      * @stable ICU 2.8
1248      */
compare(char[] s1, char[] s2, int options)1249     public static int compare(char[] s1, char[] s2, int options) {
1250         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1251     }
1252 
1253     /**
1254      * Convenience method that can have faster implementation
1255      * by not allocating buffers.
1256      * @param char32a    the first code point to be checked against the
1257      * @param char32b    the second code point
1258      * @param options    A bit set of options
1259      * @stable ICU 2.8
1260      */
compare(int char32a, int char32b, int options)1261     public static int compare(int char32a, int char32b, int options) {
1262         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1263     }
1264 
1265     /**
1266      * Convenience method that can have faster implementation
1267      * by not allocating buffers.
1268      * @param char32a   the first code point to be checked against
1269      * @param str2      the second string
1270      * @param options   A bit set of options
1271      * @stable ICU 2.8
1272      */
compare(int char32a, String str2, int options)1273     public static int compare(int char32a, String str2, int options) {
1274         return internalCompare(UTF16.valueOf(char32a), str2, options);
1275     }
1276 
1277     /* Concatenation of normalized strings --------------------------------- */
1278     /**
1279      * Concatenate normalized strings, making sure that the result is normalized
1280      * as well.
1281      *
1282      * If both the left and the right strings are in
1283      * the normalization form according to "mode",
1284      * then the result will be
1285      *
1286      * <code>
1287      *     dest=normalize(left+right, mode)
1288      * </code>
1289      *
1290      * With the input strings already being normalized,
1291      * this function will use next() and previous()
1292      * to find the adjacent end pieces of the input strings.
1293      * Only the concatenation of these end pieces will be normalized and
1294      * then concatenated with the remaining parts of the input strings.
1295      *
1296      * It is allowed to have dest==left to avoid copying the entire left string.
1297      *
1298      * @param left Left source array, may be same as dest.
1299      * @param leftStart start in the left array.
1300      * @param leftLimit limit in the left array (==length)
1301      * @param right Right source array.
1302      * @param rightStart start in the right array.
1303      * @param rightLimit limit in the right array (==length)
1304      * @param dest The output buffer; can be null if destStart==destLimit==0
1305      *              for pure preflighting.
1306      * @param destStart start in the destination array
1307      * @param destLimit limit in the destination array (==length)
1308      * @param mode The normalization mode.
1309      * @param options The normalization options, ORed together (0 for no options).
1310      * @return Length of output (number of chars) when successful or
1311      *          IndexOutOfBoundsException
1312      * @exception IndexOutOfBoundsException whose message has the string
1313      *             representation of destination capacity required.
1314      * @see #normalize
1315      * @see #next
1316      * @see #previous
1317      * @exception IndexOutOfBoundsException if target capacity is less than the
1318      *             required length
1319      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1320      */
1321     @Deprecated
concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1322     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1323                                   char[] right, int rightStart, int rightLimit,
1324                                   char[] dest,  int destStart,  int destLimit,
1325                                   Normalizer.Mode mode, int options) {
1326         if(dest == null) {
1327             throw new IllegalArgumentException();
1328         }
1329 
1330         /* check for overlapping right and destination */
1331         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1332             throw new IllegalArgumentException("overlapping right and dst ranges");
1333         }
1334 
1335         /* allow left==dest */
1336         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1337         destBuilder.append(left, leftStart, leftLimit-leftStart);
1338         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
1339         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1340         int destLength=destBuilder.length();
1341         if(destLength<=(destLimit-destStart)) {
1342             destBuilder.getChars(0, destLength, dest, destStart);
1343             return destLength;
1344         } else {
1345             throw new IndexOutOfBoundsException(Integer.toString(destLength));
1346         }
1347     }
1348 
1349     /**
1350      * Concatenate normalized strings, making sure that the result is normalized
1351      * as well.
1352      *
1353      * If both the left and the right strings are in
1354      * the normalization form according to "mode",
1355      * then the result will be
1356      *
1357      * <code>
1358      *     dest=normalize(left+right, mode)
1359      * </code>
1360      *
1361      * For details see concatenate
1362      *
1363      * @param left Left source string.
1364      * @param right Right source string.
1365      * @param mode The normalization mode.
1366      * @param options The normalization options, ORed together (0 for no options).
1367      * @return result
1368      *
1369      * @see #concatenate
1370      * @see #normalize
1371      * @see #next
1372      * @see #previous
1373      * @see #concatenate
1374      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1375      */
1376     @Deprecated
concatenate(char[] left, char[] right,Mode mode, int options)1377     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1378         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1379         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1380     }
1381 
1382     /**
1383      * Concatenate normalized strings, making sure that the result is normalized
1384      * as well.
1385      *
1386      * If both the left and the right strings are in
1387      * the normalization form according to "mode",
1388      * then the result will be
1389      *
1390      * <code>
1391      *     dest=normalize(left+right, mode)
1392      * </code>
1393      *
1394      * With the input strings already being normalized,
1395      * this function will use next() and previous()
1396      * to find the adjacent end pieces of the input strings.
1397      * Only the concatenation of these end pieces will be normalized and
1398      * then concatenated with the remaining parts of the input strings.
1399      *
1400      * @param left Left source string.
1401      * @param right Right source string.
1402      * @param mode The normalization mode.
1403      * @param options The normalization options, ORed together (0 for no options).
1404      * @return result
1405      *
1406      * @see #concatenate
1407      * @see #normalize
1408      * @see #next
1409      * @see #previous
1410      * @see #concatenate
1411      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1412      */
1413     @Deprecated
concatenate(String left, String right, Mode mode, int options)1414     public static String concatenate(String left, String right, Mode mode, int options) {
1415         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1416         return mode.getNormalizer2(options).append(dest, right).toString();
1417     }
1418 
1419     /**
1420      * Gets the FC_NFKC closure value.
1421      * @param c The code point whose closure value is to be retrieved
1422      * @param dest The char array to receive the closure value
1423      * @return the length of the closure value; 0 if there is none
1424      * @deprecated ICU 56
1425      */
1426     @Deprecated
getFC_NFKC_Closure(int c,char[] dest)1427     public static int getFC_NFKC_Closure(int c,char[] dest) {
1428         String closure=getFC_NFKC_Closure(c);
1429         int length=closure.length();
1430         if(length!=0 && dest!=null && length<=dest.length) {
1431             closure.getChars(0, length, dest, 0);
1432         }
1433         return length;
1434     }
1435     /**
1436      * Gets the FC_NFKC closure value.
1437      * @param c The code point whose closure value is to be retrieved
1438      * @return String representation of the closure value; "" if there is none
1439      * @deprecated ICU 56
1440      */
1441     @Deprecated
getFC_NFKC_Closure(int c)1442     public static String getFC_NFKC_Closure(int c) {
1443         // Compute the FC_NFKC_Closure on the fly:
1444         // We have the API for complete coverage of Unicode properties, although
1445         // this value by itself is not useful via API.
1446         // (What could be useful is a custom normalization table that combines
1447         // case folding and NFKC.)
1448         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1449         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1450         UCaseProps csp=UCaseProps.INSTANCE;
1451         // first: b = NFKC(Fold(a))
1452         StringBuilder folded=new StringBuilder();
1453         int folded1Length=csp.toFullFolding(c, folded, 0);
1454         if(folded1Length<0) {
1455             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1456             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1457                 return "";  // c does not change at all under CaseFolding+NFKC
1458             }
1459             folded.appendCodePoint(c);
1460         } else {
1461             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1462                 folded.appendCodePoint(folded1Length);
1463             }
1464         }
1465         String kc1=nfkc.normalize(folded);
1466         // second: c = NFKC(Fold(b))
1467         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1468         // if (c != b) add the mapping from a to c
1469         if(kc1.equals(kc2)) {
1470             return "";
1471         } else {
1472             return kc2;
1473         }
1474     }
1475 
1476     //-------------------------------------------------------------------------
1477     // Iteration API
1478     //-------------------------------------------------------------------------
1479 
1480     /**
1481      * Return the current character in the normalized text.
1482      * @return The codepoint as an int
1483      * @deprecated ICU 56
1484      */
1485     @Deprecated
current()1486     public int current() {
1487         if(bufferPos<buffer.length() || nextNormalize()) {
1488             return buffer.codePointAt(bufferPos);
1489         } else {
1490             return DONE;
1491         }
1492     }
1493 
1494     /**
1495      * Return the next character in the normalized text and advance
1496      * the iteration position by one.  If the end
1497      * of the text has already been reached, {@link #DONE} is returned.
1498      * @return The codepoint as an int
1499      * @deprecated ICU 56
1500      */
1501     @Deprecated
next()1502     public int next() {
1503         if(bufferPos<buffer.length() ||  nextNormalize()) {
1504             int c=buffer.codePointAt(bufferPos);
1505             bufferPos+=Character.charCount(c);
1506             return c;
1507         } else {
1508             return DONE;
1509         }
1510     }
1511 
1512 
1513     /**
1514      * Return the previous character in the normalized text and decrement
1515      * the iteration position by one.  If the beginning
1516      * of the text has already been reached, {@link #DONE} is returned.
1517      * @return The codepoint as an int
1518      * @deprecated ICU 56
1519      */
1520     @Deprecated
previous()1521     public int previous() {
1522         if(bufferPos>0 || previousNormalize()) {
1523             int c=buffer.codePointBefore(bufferPos);
1524             bufferPos-=Character.charCount(c);
1525             return c;
1526         } else {
1527             return DONE;
1528         }
1529     }
1530 
1531     /**
1532      * Reset the index to the beginning of the text.
1533      * This is equivalent to setIndexOnly(startIndex()).
1534      * @deprecated ICU 56
1535      */
1536     @Deprecated
reset()1537     public void reset() {
1538         text.setToStart();
1539         currentIndex=nextIndex=0;
1540         clearBuffer();
1541     }
1542 
1543     /**
1544      * Set the iteration position in the input text that is being normalized,
1545      * without any immediate normalization.
1546      * After setIndexOnly(), getIndex() will return the same index that is
1547      * specified here.
1548      *
1549      * @param index the desired index in the input text.
1550      * @deprecated ICU 56
1551      */
1552     @Deprecated
setIndexOnly(int index)1553     public void setIndexOnly(int index) {
1554         text.setIndex(index);  // validates index
1555         currentIndex=nextIndex=index;
1556         clearBuffer();
1557     }
1558 
1559     /**
1560      * Set the iteration position in the input text that is being normalized
1561      * and return the first normalized character at that position.
1562      * <p>
1563      * <b>Note:</b> This method sets the position in the <em>input</em> text,
1564      * while {@link #next} and {@link #previous} iterate through characters
1565      * in the normalized <em>output</em>.  This means that there is not
1566      * necessarily a one-to-one correspondence between characters returned
1567      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1568      * returned from <tt>setIndex</tt> and {@link #getIndex}.
1569      * <p>
1570      * @param index the desired index in the input text.
1571      *
1572      * @return   the first normalized character that is the result of iterating
1573      *            forward starting at the given index.
1574      *
1575      * @throws IllegalArgumentException if the given index is less than
1576      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1577      * @deprecated ICU 3.2
1578      * @obsolete ICU 3.2
1579      */
1580     @Deprecated
1581      ///CLOVER:OFF
setIndex(int index)1582      public int setIndex(int index) {
1583          setIndexOnly(index);
1584          return current();
1585      }
1586      ///CLOVER:ON
1587     /**
1588      * Retrieve the index of the start of the input text. This is the begin
1589      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1590      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1591      * @deprecated ICU 2.2. Use startIndex() instead.
1592      * @return The begin index of the input text, which is always 0
1593      * @see #startIndex
1594      */
1595     @Deprecated
getBeginIndex()1596     public int getBeginIndex() {
1597         return 0;
1598     }
1599 
1600     /**
1601      * Retrieve the index of the end of the input text.  This is the end index
1602      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1603      * over which this <tt>Normalizer</tt> is iterating
1604      * @deprecated ICU 2.2. Use endIndex() instead.
1605      * @return The end index of the input text
1606      * @see #endIndex
1607      */
1608     @Deprecated
getEndIndex()1609     public int getEndIndex() {
1610         return endIndex();
1611     }
1612     /**
1613      * Return the first character in the normalized text.  This resets
1614      * the <tt>Normalizer's</tt> position to the beginning of the text.
1615      * @return The codepoint as an int
1616      * @deprecated ICU 56
1617      */
1618     @Deprecated
first()1619     public int first() {
1620         reset();
1621         return next();
1622     }
1623 
1624     /**
1625      * Return the last character in the normalized text.  This resets
1626      * the <tt>Normalizer's</tt> position to be just before
1627      * the input text corresponding to that normalized character.
1628      * @return The codepoint as an int
1629      * @deprecated ICU 56
1630      */
1631     @Deprecated
last()1632     public int last() {
1633         text.setToLimit();
1634         currentIndex=nextIndex=text.getIndex();
1635         clearBuffer();
1636         return previous();
1637     }
1638 
1639     /**
1640      * Retrieve the current iteration position in the input text that is
1641      * being normalized.  This method is useful in applications such as
1642      * searching, where you need to be able to determine the position in
1643      * the input text that corresponds to a given normalized output character.
1644      * <p>
1645      * <b>Note:</b> This method returns the position in the <em>input</em>, while
1646      * {@link #next} and {@link #previous} iterate through characters in the
1647      * <em>output</em>.  This means that there is not necessarily a one-to-one
1648      * correspondence between characters returned by <tt>next</tt> and
1649      * <tt>previous</tt> and the indices passed to and returned from
1650      * <tt>setIndex</tt> and {@link #getIndex}.
1651      * @return The current iteration position
1652      * @deprecated ICU 56
1653      */
1654     @Deprecated
getIndex()1655     public int getIndex() {
1656         if(bufferPos<buffer.length()) {
1657             return currentIndex;
1658         } else {
1659             return nextIndex;
1660         }
1661     }
1662 
1663     /**
1664      * Retrieve the index of the start of the input text. This is the begin
1665      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1666      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1667      * @return The start index of the input text, which is always 0
1668      * @deprecated ICU 56
1669      */
1670     @Deprecated
startIndex()1671     public int startIndex() {
1672         return 0;
1673     }
1674 
1675     /**
1676      * Retrieve the index of the end of the input text.  This is the end index
1677      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1678      * over which this <tt>Normalizer</tt> is iterating
1679      * @return The end index of the input text
1680      * @deprecated ICU 56
1681      */
1682     @Deprecated
endIndex()1683     public int endIndex() {
1684         return text.getLength();
1685     }
1686 
1687     //-------------------------------------------------------------------------
1688     // Iterator attributes
1689     //-------------------------------------------------------------------------
1690     /**
1691      * Set the normalization mode for this object.
1692      * <p>
1693      * <b>Note:</b>If the normalization mode is changed while iterating
1694      * over a string, calls to {@link #next} and {@link #previous} may
1695      * return previously buffered characters in the old normalization mode
1696      * until the iteration is able to re-sync at the next base character.
1697      * It is safest to call {@link #setText setText()}, {@link #first},
1698      * {@link #last}, etc. after calling <tt>setMode</tt>.
1699      * <p>
1700      * @param newMode the new mode for this <tt>Normalizer</tt>.
1701      * The supported modes are:
1702      * <ul>
1703      *  <li>{@link #NFC}    - Unicode canonical decomposition
1704      *                        followed by canonical composition.
1705      *  <li>{@link #NFKC}   - Unicode compatibility decomposition
1706      *                        followed by canonical composition.
1707      *  <li>{@link #NFD}    - Unicode canonical decomposition
1708      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
1709      *  <li>{@link #NONE}   - Do nothing but return characters
1710      *                        from the underlying input text.
1711      * </ul>
1712      *
1713      * @see #getMode
1714      * @deprecated ICU 56
1715      */
1716     @Deprecated
setMode(Mode newMode)1717     public void setMode(Mode newMode) {
1718         mode = newMode;
1719         norm2 = mode.getNormalizer2(options);
1720     }
1721     /**
1722      * Return the basic operation performed by this <tt>Normalizer</tt>
1723      *
1724      * @see #setMode
1725      * @deprecated ICU 56
1726      */
1727     @Deprecated
getMode()1728     public Mode getMode() {
1729         return mode;
1730     }
1731     /**
1732      * Set options that affect this <tt>Normalizer</tt>'s operation.
1733      * Options do not change the basic composition or decomposition operation
1734      * that is being performed, but they control whether
1735      * certain optional portions of the operation are done.
1736      * Currently the only available option is:
1737      *
1738      * <ul>
1739      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1740      * </ul>
1741      *
1742      * @param   option  the option whose value is to be set.
1743      * @param   value   the new setting for the option.  Use <tt>true</tt> to
1744      *                  turn the option on and <tt>false</tt> to turn it off.
1745      *
1746      * @see #getOption
1747      * @deprecated ICU 56
1748      */
1749     @Deprecated
setOption(int option,boolean value)1750     public void setOption(int option,boolean value) {
1751         if (value) {
1752             options |= option;
1753         } else {
1754             options &= (~option);
1755         }
1756         norm2 = mode.getNormalizer2(options);
1757     }
1758 
1759     /**
1760      * Determine whether an option is turned on or off.
1761      * <p>
1762      * @see #setOption
1763      * @deprecated ICU 56
1764      */
1765     @Deprecated
getOption(int option)1766     public int getOption(int option) {
1767         if((options & option)!=0) {
1768             return 1 ;
1769         } else {
1770             return 0;
1771         }
1772     }
1773 
1774     /**
1775      * Gets the underlying text storage
1776      * @param fillIn the char buffer to fill the UTF-16 units.
1777      *         The length of the buffer should be equal to the length of the
1778      *         underlying text storage
1779      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1780      * @see   #getLength
1781      * @deprecated ICU 56
1782      */
1783     @Deprecated
getText(char[] fillIn)1784     public int getText(char[] fillIn) {
1785         return text.getText(fillIn);
1786     }
1787 
1788     /**
1789      * Gets the length of underlying text storage
1790      * @return the length
1791      * @deprecated ICU 56
1792      */
1793     @Deprecated
getLength()1794     public int getLength() {
1795         return text.getLength();
1796     }
1797 
1798     /**
1799      * Returns the text under iteration as a string
1800      * @return a copy of the text under iteration.
1801      * @deprecated ICU 56
1802      */
1803     @Deprecated
getText()1804     public String getText() {
1805         return text.getText();
1806     }
1807 
1808     /**
1809      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1810      * The iteration position is set to the beginning of the input text.
1811      * @param newText   The new string to be normalized.
1812      * @deprecated ICU 56
1813      */
1814     @Deprecated
setText(StringBuffer newText)1815     public void setText(StringBuffer newText) {
1816         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1817         if (newIter == null) {
1818             throw new IllegalStateException("Could not create a new UCharacterIterator");
1819         }
1820         text = newIter;
1821         reset();
1822     }
1823 
1824     /**
1825      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1826      * The iteration position is set to the beginning of the input text.
1827      * @param newText   The new string to be normalized.
1828      * @deprecated ICU 56
1829      */
1830     @Deprecated
setText(char[] newText)1831     public void setText(char[] newText) {
1832         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1833         if (newIter == null) {
1834             throw new IllegalStateException("Could not create a new UCharacterIterator");
1835         }
1836         text = newIter;
1837         reset();
1838     }
1839 
1840     /**
1841      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1842      * The iteration position is set to the beginning of the input text.
1843      * @param newText   The new string to be normalized.
1844      * @deprecated ICU 56
1845      */
1846     @Deprecated
setText(String newText)1847     public void setText(String newText) {
1848         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1849         if (newIter == null) {
1850             throw new IllegalStateException("Could not create a new UCharacterIterator");
1851         }
1852         text = newIter;
1853         reset();
1854     }
1855 
1856     /**
1857      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1858      * The iteration position is set to the beginning of the input text.
1859      * @param newText   The new string to be normalized.
1860      * @deprecated ICU 56
1861      */
1862     @Deprecated
setText(CharacterIterator newText)1863     public void setText(CharacterIterator newText) {
1864         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1865         if (newIter == null) {
1866             throw new IllegalStateException("Could not create a new UCharacterIterator");
1867         }
1868         text = newIter;
1869         reset();
1870     }
1871 
1872     /**
1873      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1874      * The iteration position is set to the beginning of the string.
1875      * @param newText   The new string to be normalized.
1876      * @deprecated ICU 56
1877      */
1878     @Deprecated
setText(UCharacterIterator newText)1879     public void setText(UCharacterIterator newText) {
1880         try{
1881             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1882             if (newIter == null) {
1883                 throw new IllegalStateException("Could not create a new UCharacterIterator");
1884             }
1885             text = newIter;
1886             reset();
1887         }catch(CloneNotSupportedException e) {
1888             throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
1889         }
1890     }
1891 
clearBuffer()1892     private void clearBuffer() {
1893         buffer.setLength(0);
1894         bufferPos=0;
1895     }
1896 
nextNormalize()1897     private boolean nextNormalize() {
1898         clearBuffer();
1899         currentIndex=nextIndex;
1900         text.setIndex(nextIndex);
1901         // Skip at least one character so we make progress.
1902         int c=text.nextCodePoint();
1903         if(c<0) {
1904             return false;
1905         }
1906         StringBuilder segment=new StringBuilder().appendCodePoint(c);
1907         while((c=text.nextCodePoint())>=0) {
1908             if(norm2.hasBoundaryBefore(c)) {
1909                 text.moveCodePointIndex(-1);
1910                 break;
1911             }
1912             segment.appendCodePoint(c);
1913         }
1914         nextIndex=text.getIndex();
1915         norm2.normalize(segment, buffer);
1916         return buffer.length()!=0;
1917     }
1918 
previousNormalize()1919     private boolean previousNormalize() {
1920         clearBuffer();
1921         nextIndex=currentIndex;
1922         text.setIndex(currentIndex);
1923         StringBuilder segment=new StringBuilder();
1924         int c;
1925         while((c=text.previousCodePoint())>=0) {
1926             if(c<=0xffff) {
1927                 segment.insert(0, (char)c);
1928             } else {
1929                 segment.insert(0, Character.toChars(c));
1930             }
1931             if(norm2.hasBoundaryBefore(c)) {
1932                 break;
1933             }
1934         }
1935         currentIndex=text.getIndex();
1936         norm2.normalize(segment, buffer);
1937         bufferPos=buffer.length();
1938         return buffer.length()!=0;
1939     }
1940 
1941     /* compare canonically equivalent ------------------------------------------- */
1942 
1943     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
internalCompare(CharSequence s1, CharSequence s2, int options)1944     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
1945         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
1946         options|= COMPARE_EQUIV;
1947 
1948         /*
1949          * UAX #21 Case Mappings, as fixed for Unicode version 4
1950          * (see Jitterbug 2021), defines a canonical caseless match as
1951          *
1952          * A string X is a canonical caseless match
1953          * for a string Y if and only if
1954          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1955          *
1956          * For better performance, we check for FCD (or let the caller tell us that
1957          * both strings are in FCD) for the inner normalization.
1958          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
1959          * case-folding preserves the FCD-ness of a string.
1960          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
1961          * when there is a difference.
1962          *
1963          * Exception: When using the Turkic case-folding option, we do perform
1964          * full NFD first. This is because in the Turkic case precomposed characters
1965          * with 0049 capital I or 0069 small i fold differently whether they
1966          * are first decomposed or not, so an FCD check - a check only for
1967          * canonical order - is not sufficient.
1968          */
1969         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1970             Normalizer2 n2;
1971             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1972                 n2=NFD.getNormalizer2(normOptions);
1973             } else {
1974                 n2=FCD.getNormalizer2(normOptions);
1975             }
1976 
1977             // check if s1 and/or s2 fulfill the FCD conditions
1978             int spanQCYes1=n2.spanQuickCheckYes(s1);
1979             int spanQCYes2=n2.spanQuickCheckYes(s2);
1980 
1981             /*
1982              * ICU 2.4 had a further optimization:
1983              * If both strings were not in FCD, then they were both NFD'ed,
1984              * and the COMPARE_EQUIV option was turned off.
1985              * It is not entirely clear that this is valid with the current
1986              * definition of the canonical caseless match.
1987              * Therefore, ICU 2.6 removes that optimization.
1988              */
1989 
1990             if(spanQCYes1<s1.length()) {
1991                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
1992                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
1993             }
1994             if(spanQCYes2<s2.length()) {
1995                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
1996                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
1997             }
1998         }
1999 
2000         return cmpEquivFold(s1, s2, options);
2001     }
2002 
2003     /*
2004      * Compare two strings for canonical equivalence.
2005      * Further options include case-insensitive comparison and
2006      * code point order (as opposed to code unit order).
2007      *
2008      * In this function, canonical equivalence is optional as well.
2009      * If canonical equivalence is tested, then both strings must fulfill
2010      * the FCD check.
2011      *
2012      * Semantically, this is equivalent to
2013      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
2014      * where code point order, NFD and foldCase are all optional.
2015      *
2016      * String comparisons almost always yield results before processing both strings
2017      * completely.
2018      * They are generally more efficient working incrementally instead of
2019      * performing the sub-processing (strlen, normalization, case-folding)
2020      * on the entire strings first.
2021      *
2022      * It is also unnecessary to normalize identical characters.
2023      *
2024      * This function works in principle as follows:
2025      *
2026      * loop {
2027      *   get one code unit c1 from s1 (-1 if end of source)
2028      *   get one code unit c2 from s2 (-1 if end of source)
2029      *
2030      *   if(either string finished) {
2031      *     return result;
2032      *   }
2033      *   if(c1==c2) {
2034      *     continue;
2035      *   }
2036      *
2037      *   // c1!=c2
2038      *   try to decompose/case-fold c1/c2, and continue if one does;
2039      *
2040      *   // still c1!=c2 and neither decomposes/case-folds, return result
2041      *   return c1-c2;
2042      * }
2043      *
2044      * When a character decomposes, then the pointer for that source changes to
2045      * the decomposition, pushing the previous pointer onto a stack.
2046      * When the end of the decomposition is reached, then the code unit reader
2047      * pops the previous source from the stack.
2048      * (Same for case-folding.)
2049      *
2050      * This is complicated further by operating on variable-width UTF-16.
2051      * The top part of the loop works on code units, while lookups for decomposition
2052      * and case-folding need code points.
2053      * Code points are assembled after the equality/end-of-source part.
2054      * The source pointer is only advanced beyond all code units when the code point
2055      * actually decomposes/case-folds.
2056      *
2057      * If we were on a trail surrogate unit when assembling a code point,
2058      * and the code point decomposes/case-folds, then the decomposition/folding
2059      * result must be compared with the part of the other string that corresponds to
2060      * this string's lead surrogate.
2061      * Since we only assemble a code point when hitting a trail unit when the
2062      * preceding lead units were identical, we back up the other string by one unit
2063      * in such a case.
2064      *
2065      * The optional code point order comparison at the end works with
2066      * the same fix-up as the other code point order comparison functions.
2067      * See ustring.c and the comment near the end of this function.
2068      *
2069      * Assumption: A decomposition or case-folding result string never contains
2070      * a single surrogate. This is a safe assumption in the Unicode Standard.
2071      * Therefore, we do not need to check for surrogate pairs across
2072      * decomposition/case-folding boundaries.
2073      *
2074      * Further assumptions (see verifications tstnorm.cpp):
2075      * The API function checks for FCD first, while the core function
2076      * first case-folds and then decomposes. This requires that case-folding does not
2077      * un-FCD any strings.
2078      *
2079      * The API function may also NFD the input and turn off decomposition.
2080      * This requires that case-folding does not un-NFD strings either.
2081      *
2082      * TODO If any of the above two assumptions is violated,
2083      * then this entire code must be re-thought.
2084      * If this happens, then a simple solution is to case-fold both strings up front
2085      * and to turn off UNORM_INPUT_IS_FCD.
2086      * We already do this when not both strings are in FCD because makeFCD
2087      * would be a partial NFD before the case folding, which does not work.
2088      * Note that all of this is only a problem when case-folding _and_
2089      * canonical equivalence come together.
2090      * (Comments in unorm_compare() are more up to date than this TODO.)
2091      */
2092 
2093     /* stack element for previous-level source/decomposition pointers */
2094     private static final class CmpEquivLevel {
2095         CharSequence cs;
2096         int s;
2097     };
createCmpEquivLevelStack()2098     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2099         return new CmpEquivLevel[] {
2100             new CmpEquivLevel(), new CmpEquivLevel()
2101         };
2102     }
2103 
2104     /**
2105      * Internal option bit for cmpEquivFold(): when set, cmpEquivFold() loads the
      * NFC normalization data and substitutes decompositions for differing code points.
2106      * If not set, no decomposition is applied and only the case folding
      * (if COMPARE_IGNORE_CASE is set) is compared, like strcasecmp().
2107      */
2108     private static final int COMPARE_EQUIV=0x80000;
2109 
2110     /* internal function; package visibility for use by UTF16.StringComparator */
         /*
          * Compares cs1 and cs2 code unit by code unit. Only where the current units
          * differ is the code point looked up: with COMPARE_IGNORE_CASE its case folding,
          * and with COMPARE_EQUIV its decomposition, is substituted for it and the
          * comparison continues inside the substituted text.
          *
          * Per string, level 0 is the input text, level 1 a case folding (fold1/fold2)
          * and level 2 a decomposition (decomp1/decomp2).
          *
          * options is tested for COMPARE_EQUIV, COMPARE_IGNORE_CASE and
          * COMPARE_CODE_POINT_ORDER and is also passed on to UCaseProps.toFullFolding().
          *
          * Returns 0 if both strings end together, -1 if string 1 ends first,
          * 1 if string 2 ends first, otherwise the difference c1-c2 of the code units
          * at the first difference (adjusted when COMPARE_CODE_POINT_ORDER is set).
          */
cmpEquivFold(CharSequence cs1, CharSequence cs2, int options)2111     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2112         Normalizer2Impl nfcImpl;
2113         UCaseProps csp;
2114 
2115         /* current-level start/limit - s1/s2 as current */
2116         int s1, s2, limit1, limit2;
2117 
2118         /* decomposition and case folding variables */
2119         int length;
2120 
2121         /* stacks of previous-level start/current/limit */
             /* allocated lazily, only when a string first enters a deeper level */
2122         CmpEquivLevel[] stack1=null, stack2=null;
2123 
2124         /* buffers for algorithmic decompositions */
2125         String decomp1, decomp2;
2126 
2127         /* case folding buffers, only use current-level start/limit */
2128         StringBuilder fold1, fold2;
2129 
2130         /* track which is the current level per string */
2131         int level1, level2;
2132 
2133         /* current code units, and code points for lookups */
2134         int c1, c2, cp1, cp2;
2135 
2136         /* no argument error checking because this itself is not an API */
2137 
2138         /*
2139          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2140          * otherwise this function must behave exactly as uprv_strCompare()
2141          * not checking for that here makes testing this function easier
2142          */
2143 
2144         /* normalization/properties data loaded? */
2145         if((options&COMPARE_EQUIV)!=0) {
2146             nfcImpl=Norm2AllModes.getNFCInstance().impl;
2147         } else {
2148             nfcImpl=null;
2149         }
2150         if((options&COMPARE_IGNORE_CASE)!=0) {
2151             csp=UCaseProps.INSTANCE;
2152             fold1=new StringBuilder();
2153             fold2=new StringBuilder();
2154         } else {
2155             csp=null;
2156             fold1=fold2=null;
2157         }
2158 
2159         /* initialize */
2160         s1=0;
2161         limit1=cs1.length();
2162         s2=0;
2163         limit2=cs2.length();
2164 
2165         level1=level2=0;
2166         c1=c2=-1;
2167 
2168         /* comparison loop */
2169         for(;;) {
2170             /*
2171              * here a code unit value of -1 means "get another code unit"
2172              * below it will mean "this source is finished"
2173              */
2174 
2175             if(c1<0) {
2176                 /* get next code unit from string 1, post-increment */
2177                 for(;;) {
2178                     if(s1==limit1) {
2179                         if(level1==0) {
2180                             c1=-1;
2181                             break;
2182                         }
2183                     } else {
2184                         c1=cs1.charAt(s1++);
2185                         break;
2186                     }
2187 
2188                     /* reached end of level buffer, pop one level */
                         /* entries with cs==null are skipped intermediate levels; keep popping past them */
2189                     do {
2190                         --level1;
2191                         cs1=stack1[level1].cs;
2192                     } while(cs1==null);
2193                     s1=stack1[level1].s;
2194                     limit1=cs1.length();
2195                 }
2196             }
2197 
2198             if(c2<0) {
2199                 /* get next code unit from string 2, post-increment */
2200                 for(;;) {
2201                     if(s2==limit2) {
2202                         if(level2==0) {
2203                             c2=-1;
2204                             break;
2205                         }
2206                     } else {
2207                         c2=cs2.charAt(s2++);
2208                         break;
2209                     }
2210 
2211                     /* reached end of level buffer, pop one level */
                         /* entries with cs==null are skipped intermediate levels; keep popping past them */
2212                     do {
2213                         --level2;
2214                         cs2=stack2[level2].cs;
2215                     } while(cs2==null);
2216                     s2=stack2[level2].s;
2217                     limit2=cs2.length();
2218                 }
2219             }
2220 
2221             /*
2222              * compare c1 and c2
2223              * either variable c1, c2 is -1 only if the corresponding string is finished
2224              */
2225             if(c1==c2) {
2226                 if(c1<0) {
2227                     return 0;   /* c1==c2==-1 indicating end of strings */
2228                 }
2229                 c1=c2=-1;       /* make us fetch new code units */
2230                 continue;
2231             } else if(c1<0) {
2232                 return -1;      /* string 1 ends before string 2 */
2233             } else if(c2<0) {
2234                 return 1;       /* string 2 ends before string 1 */
2235             }
2236             /* c1!=c2 && c1>=0 && c2>=0 */
2237 
2238             /* get complete code points for c1, c2 for lookups if either is a surrogate */
                 /* s1/s2 were post-incremented: c is at index s-1, its predecessor at s-2, its successor at s */
2239             cp1=c1;
2240             if(UTF16.isSurrogate((char)c1)) {
2241                 char c;
2242 
2243                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2244                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2245                         /* advance ++s1; only below if cp1 decomposes/case-folds */
2246                         cp1=Character.toCodePoint((char)c1, c);
2247                     }
2248                 } else /* isTrail(c1) */ {
2249                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2250                         cp1=Character.toCodePoint(c, (char)c1);
2251                     }
2252                 }
2253             }
2254 
2255             cp2=c2;
2256             if(UTF16.isSurrogate((char)c2)) {
2257                 char c;
2258 
2259                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2260                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2261                         /* advance ++s2; only below if cp2 decomposes/case-folds */
2262                         cp2=Character.toCodePoint((char)c2, c);
2263                     }
2264                 } else /* isTrail(c2) */ {
2265                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2266                         cp2=Character.toCodePoint(c, (char)c2);
2267                     }
2268                 }
2269             }
2270 
2271             /*
2272              * go down one level for each string
2273              * continue with the main loop as soon as there is a real change
2274              */
2275 
                 /* case folding is only applied to the input text itself (level 0) */
2276             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2277                 (length=csp.toFullFolding(cp1, fold1, options))>=0
2278             ) {
2279                 /* cp1 case-folds to the code point "length" or to a string of "length" chars in fold1 */
2280                 if(UTF16.isSurrogate((char)c1)) {
2281                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2282                         /* advance beyond source surrogate pair if it case-folds */
2283                         ++s1;
2284                     } else /* isTrail(c1) */ {
2285                         /*
2286                          * we got a supplementary code point when hitting its trail surrogate,
2287                          * therefore the lead surrogate must have been the same as in the other string;
2288                          * compare this case folding with the lead surrogate in the other string
2289                          * remember that this simulates bulk text replacement:
2290                          * the case folding would replace the entire code point
2291                          */
2292                         --s2;
2293                         c2=cs2.charAt(s2-1);
2294                     }
2295                 }
2296 
2297                 /* push current level pointers */
2298                 if(stack1==null) {
2299                     stack1=createCmpEquivLevelStack();
2300                 }
2301                 stack1[0].cs=cs1;
2302                 stack1[0].s=s1;
2303                 ++level1;
2304 
2305                 /* copy the folding result to fold1[] */
2306                 /* Java: the buffer was probably not empty, remove the old contents */
                     /* string result: keep only the trailing "length" chars; otherwise "length" is the folded code point itself */
2307                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2308                     fold1.delete(0, fold1.length()-length);
2309                 } else {
2310                     fold1.setLength(0);
2311                     fold1.appendCodePoint(length);
2312                 }
2313 
2314                 /* set next level pointers to case folding */
2315                 cs1=fold1;
2316                 s1=0;
2317                 limit1=fold1.length();
2318 
2319                 /* get ready to read from case folding, continue with loop */
2320                 c1=-1;
2321                 continue;
2322             }
2323 
                 /* case folding is only applied to the input text itself (level 0) */
2324             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2325                 (length=csp.toFullFolding(cp2, fold2, options))>=0
2326             ) {
2327                 /* cp2 case-folds to the code point "length" or to a string of "length" chars in fold2 */
2328                 if(UTF16.isSurrogate((char)c2)) {
2329                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2330                         /* advance beyond source surrogate pair if it case-folds */
2331                         ++s2;
2332                     } else /* isTrail(c2) */ {
2333                         /*
2334                          * we got a supplementary code point when hitting its trail surrogate,
2335                          * therefore the lead surrogate must have been the same as in the other string;
2336                          * compare this case folding with the lead surrogate in the other string
2337                          * remember that this simulates bulk text replacement:
2338                          * the case folding would replace the entire code point
2339                          */
2340                         --s1;
2341                         c1=cs1.charAt(s1-1);
2342                     }
2343                 }
2344 
2345                 /* push current level pointers */
2346                 if(stack2==null) {
2347                     stack2=createCmpEquivLevelStack();
2348                 }
2349                 stack2[0].cs=cs2;
2350                 stack2[0].s=s2;
2351                 ++level2;
2352 
2353                 /* copy the folding result to fold2[] */
2354                 /* Java: the buffer was probably not empty, remove the old contents */
                     /* string result: keep only the trailing "length" chars; otherwise "length" is the folded code point itself */
2355                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2356                     fold2.delete(0, fold2.length()-length);
2357                 } else {
2358                     fold2.setLength(0);
2359                     fold2.appendCodePoint(length);
2360                 }
2361 
2362                 /* set next level pointers to case folding */
2363                 cs2=fold2;
2364                 s2=0;
2365                 limit2=fold2.length();
2366 
2367                 /* get ready to read from case folding, continue with loop */
2368                 c2=-1;
2369                 continue;
2370             }
2371 
                 /* decomposition may start from level 0 (input text) or level 1 (case folding), never from level 2 */
2372             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2373                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2374             ) {
2375                 /* cp1 decomposes into decomp1 */
2376                 if(UTF16.isSurrogate((char)c1)) {
2377                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2378                         /* advance beyond source surrogate pair if it decomposes */
2379                         ++s1;
2380                     } else /* isTrail(c1) */ {
2381                         /*
2382                          * we got a supplementary code point when hitting its trail surrogate,
2383                          * therefore the lead surrogate must have been the same as in the other string;
2384                          * compare this decomposition with the lead surrogate in the other string
2385                          * remember that this simulates bulk text replacement:
2386                          * the decomposition would replace the entire code point
2387                          */
2388                         --s2;
2389                         c2=cs2.charAt(s2-1);
2390                     }
2391                 }
2392 
2393                 /* push current level pointers */
2394                 if(stack1==null) {
2395                     stack1=createCmpEquivLevelStack();
2396                 }
2397                 stack1[level1].cs=cs1;
2398                 stack1[level1].s=s1;
2399                 ++level1;
2400 
2401                 /* set empty intermediate level if skipped */
                     /* i.e. we came from level 0: mark level 1 with cs==null so that the decomposition is always level 2 */
2402                 if(level1<2) {
2403                     stack1[level1++].cs=null;
2404                 }
2405 
2406                 /* set next level pointers to decomposition */
2407                 cs1=decomp1;
2408                 s1=0;
2409                 limit1=decomp1.length();
2410 
2411                 /* get ready to read from decomposition, continue with loop */
2412                 c1=-1;
2413                 continue;
2414             }
2415 
                 /* decomposition may start from level 0 (input text) or level 1 (case folding), never from level 2 */
2416             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2417                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2418             ) {
2419                 /* cp2 decomposes into decomp2 */
2420                 if(UTF16.isSurrogate((char)c2)) {
2421                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2422                         /* advance beyond source surrogate pair if it decomposes */
2423                         ++s2;
2424                     } else /* isTrail(c2) */ {
2425                         /*
2426                          * we got a supplementary code point when hitting its trail surrogate,
2427                          * therefore the lead surrogate must have been the same as in the other string;
2428                          * compare this decomposition with the lead surrogate in the other string
2429                          * remember that this simulates bulk text replacement:
2430                          * the decomposition would replace the entire code point
2431                          */
2432                         --s1;
2433                         c1=cs1.charAt(s1-1);
2434                     }
2435                 }
2436 
2437                 /* push current level pointers */
2438                 if(stack2==null) {
2439                     stack2=createCmpEquivLevelStack();
2440                 }
2441                 stack2[level2].cs=cs2;
2442                 stack2[level2].s=s2;
2443                 ++level2;
2444 
2445                 /* set empty intermediate level if skipped */
                     /* i.e. we came from level 0: mark level 1 with cs==null so that the decomposition is always level 2 */
2446                 if(level2<2) {
2447                     stack2[level2++].cs=null;
2448                 }
2449 
2450                 /* set next level pointers to decomposition */
2451                 cs2=decomp2;
2452                 s2=0;
2453                 limit2=decomp2.length();
2454 
2455                 /* get ready to read from decomposition, continue with loop */
2456                 c2=-1;
2457                 continue;
2458             }
2459 
2460             /*
2461              * no decomposition/case folding, max level for both sides:
2462              * return difference result
2463              *
2464              * code point order comparison must not just return cp1-cp2
2465              * because when single surrogates are present then the surrogate pairs
2466              * that formed cp1 and cp2 may be from different string indexes
2467              *
2468              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2469              * c1=d800 cp1=10001 c2=dc00 cp2=10000
2470              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2471              *
2472              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2473              * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
2474              * so we have slightly different pointer/start/limit comparisons here
2475              */
2476 
2477             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2478                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
                     /* 0!=(s-1) guards the charAt(s-2) access: s>=1 here, so it means s>=2 */
2479                 if(
2480                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2481                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2482                 ) {
2483                     /* part of a surrogate pair, leave >=d800 */
2484                 } else {
2485                     /* BMP code point - may be surrogate code point - make <d800 */
2486                     c1-=0x2800;
2487                 }
2488 
2489                 if(
2490                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2491                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2492                 ) {
2493                     /* part of a surrogate pair, leave >=d800 */
2494                 } else {
2495                     /* BMP code point - may be surrogate code point - make <d800 */
2496                     c2-=0x2800;
2497                 }
2498             }
2499 
2500             return c1-c2;
2501         }
2502     }
2503 
2504     /**
2505      * An Appendable that writes into a char array with a capacity that may be
2506      * less than array.length.
2507      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2508      * <p>
2509      * An overflow is only reported at the end, for the old Normalizer API functions that write
2510      * to char arrays.
2511      */
2512     private static final class CharsAppendable implements Appendable {
CharsAppendable(char[] dest, int destStart, int destLimit)2513         public CharsAppendable(char[] dest, int destStart, int destLimit) {
2514             chars=dest;
2515             start=offset=destStart;
2516             limit=destLimit;
2517         }
length()2518         public int length() {
2519             int len=offset-start;
2520             if(offset<=limit) {
2521                 return len;
2522             } else {
2523                 throw new IndexOutOfBoundsException(Integer.toString(len));
2524             }
2525         }
2526         @Override
append(char c)2527         public Appendable append(char c) {
2528             if(offset<limit) {
2529                 chars[offset]=c;
2530             }
2531             ++offset;
2532             return this;
2533         }
2534         @Override
append(CharSequence s)2535         public Appendable append(CharSequence s) {
2536             return append(s, 0, s.length());
2537         }
2538         @Override
append(CharSequence s, int sStart, int sLimit)2539         public Appendable append(CharSequence s, int sStart, int sLimit) {
2540             int len=sLimit-sStart;
2541             if(len<=(limit-offset)) {
2542                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
2543                     chars[offset++]=s.charAt(sStart++);
2544                 }
2545             } else {
2546                 offset+=len;
2547             }
2548             return this;
2549         }
2550 
2551         private final char[] chars;
2552         private final int start, limit;
2553         private int offset;
2554     }
2555 }
2556