• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2000-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package android.icu.text;
11 import java.nio.CharBuffer;
12 import java.text.CharacterIterator;
13 
14 import android.icu.impl.Norm2AllModes;
15 import android.icu.impl.Normalizer2Impl;
16 import android.icu.impl.UCaseProps;
17 import android.icu.lang.UCharacter;
18 import android.icu.util.ICUCloneNotSupportedException;
19 
20 /**
21  * Old Unicode normalization API.
22  *
23  * <p>This API has been replaced by the {@link Normalizer2} class and is only available
24  * for backward compatibility. This class simply delegates to the Normalizer2 class.
25  * There are two exceptions: The new API does not provide a replacement for
26  * <code>QuickCheckResult</code> and <code>compare()</code>.
27  *
28  * <p><code>normalize</code> transforms Unicode text into an equivalent composed or
29  * decomposed form, allowing for easier sorting and searching of text.
30  * <code>normalize</code> supports the standard normalization forms described in
31  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
32  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
33  *
34  * <p>Characters with accents or other adornments can be encoded in
35  * several different ways in Unicode.  For example, take the character A-acute.
36  * In Unicode, this can be encoded as a single character (the
37  * "composed" form):
38  *
39  * <pre>
40  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
41  * </pre>
42  *
43  * or as two separate characters (the "decomposed" form):
44  *
45  * <pre>
46  *      0041    LATIN CAPITAL LETTER A
47  *      0301    COMBINING ACUTE ACCENT
48  * </pre>
49  *
50  * <p>To a user of your program, however, both of these sequences should be
51  * treated as the same "user-level" character "A with acute accent".  When you
52  * are searching or comparing text, you must ensure that these two sequences are
53  * treated equivalently.  In addition, you must handle characters with more than
54  * one accent.  Sometimes the order of a character's combining accents is
55  * significant, while in other cases accent sequences in different orders are
56  * really equivalent.
57  *
58  * <p>Similarly, the string "ffi" can be encoded as three separate letters:
59  *
60  * <pre>
61  *      0066    LATIN SMALL LETTER F
62  *      0066    LATIN SMALL LETTER F
63  *      0069    LATIN SMALL LETTER I
64  * </pre>
65  *
66  * or as the single character
67  *
68  * <pre>
69  *      FB03    LATIN SMALL LIGATURE FFI
70  * </pre>
71  *
72  * <p>The ffi ligature is not a distinct semantic character, and strictly speaking
73  * it shouldn't be in Unicode at all, but it was included for compatibility
74  * with existing character sets that already provided it.  The Unicode standard
75  * identifies such characters by giving them "compatibility" decompositions
76  * into the corresponding semantic characters.  When sorting and searching, you
77  * will often want to use these mappings.
78  *
79  * <p><code>normalize</code> helps solve these problems by transforming text into
80  * the canonical composed and decomposed forms as shown in the first example
81  * above. In addition, you can have it perform compatibility decompositions so
82  * that you can treat compatibility characters the same as their equivalents.
83  * Finally, <code>normalize</code> rearranges accents into the proper canonical
84  * order, so that you do not have to worry about accent rearrangement on your
85  * own.
86  *
87  * <p>Form FCD, "Fast C or D", is also designed for collation.
88  * It allows working on strings that are not necessarily normalized
89  * with an algorithm (like in collation) that works under "canonical closure",
90  * i.e., it treats precomposed characters and their decomposed equivalents the
91  * same.
92  *
93  * <p>It is not a normalization form because it does not provide for uniqueness of
94  * representation. Multiple strings may be canonically equivalent (their NFDs
95  * are identical) and may all conform to FCD without being identical themselves.
96  *
97  * <p>The form is defined such that the "raw decomposition", the recursive
98  * canonical decomposition of each character, results in a string that is
99  * canonically ordered. This means that precomposed characters are allowed for
100  * as long as their decompositions do not need canonical reordering.
101  *
102  * <p>Its advantage for a process like collation is that all NFD and most NFC texts
103  * - and many unnormalized texts - already conform to FCD and do not need to be
104  * normalized (NFD) for such a process. The FCD quick check will return YES for
105  * most strings in practice.
106  *
107  * <p>normalize(FCD) may be implemented with NFD.
108  *
109  * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
110  * http://www.unicode.org/notes/tn5/#FCD
111  *
112  * <p>ICU collation performs either NFD or FCD normalization automatically if
113  * normalization is turned on for the collator object. Beyond collation and
114  * string search, normalized strings may be useful for string equivalence
115  * comparisons, transliteration/transcription, unique representations, etc.
116  *
117  * <p>The W3C generally recommends exchanging text in NFC.
118  * Note also that most legacy character encodings use only precomposed forms and
119  * often do not encode any combining marks by themselves. For conversion to such
120  * character encodings the Unicode text needs to be normalized to NFC.
121  * For more usage examples, see the Unicode Standard Annex.
122  *
123  * <p>Note: The Normalizer class also provides API for iterative normalization.
124  * While the setIndex() and getIndex() refer to indices in the
125  * underlying Unicode input text, the next() and previous() methods
126  * iterate through characters in the normalized output.
127  * This means that there is not necessarily a one-to-one correspondence
128  * between characters returned by next() and previous() and the indices
129  * passed to and returned from setIndex() and getIndex().
130  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
131  */
132 public final class Normalizer implements Cloneable {
133     // The input text and our position in it
134     private UCharacterIterator  text;
135     private Normalizer2         norm2;
136     private Mode                mode;
137     private int                 options;
138 
139     // The normalization buffer is the result of normalization
140     // of the source in [currentIndex..nextIndex[ .
141     private int                 currentIndex;
142     private int                 nextIndex;
143 
144     // A buffer for holding intermediate results
145     private StringBuilder       buffer;
146     private int                 bufferPos;
147 
148     // Helper classes to defer loading of normalization data.
149     private static final class ModeImpl {
ModeImpl(Normalizer2 n2)150         private ModeImpl(Normalizer2 n2) {
151             normalizer2 = n2;
152         }
153         private final Normalizer2 normalizer2;
154     }
155     private static final class NFDModeImpl {
156         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
157     }
158     private static final class NFKDModeImpl {
159         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
160     }
161     private static final class NFCModeImpl {
162         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
163     }
164     private static final class NFKCModeImpl {
165         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
166     }
167     private static final class FCDModeImpl {
168         private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
169     }
170 
171     private static final class Unicode32 {
172         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
173     }
174     private static final class NFD32ModeImpl {
175         private static final ModeImpl INSTANCE =
176             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
177                                                  Unicode32.INSTANCE));
178     }
179     private static final class NFKD32ModeImpl {
180         private static final ModeImpl INSTANCE =
181             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
182                                                  Unicode32.INSTANCE));
183     }
184     private static final class NFC32ModeImpl {
185         private static final ModeImpl INSTANCE =
186             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
187                                                  Unicode32.INSTANCE));
188     }
189     private static final class NFKC32ModeImpl {
190         private static final ModeImpl INSTANCE =
191             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
192                                                  Unicode32.INSTANCE));
193     }
194     private static final class FCD32ModeImpl {
195         private static final ModeImpl INSTANCE =
196             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
197                                                  Unicode32.INSTANCE));
198     }
199 
200     /**
201      * Options bit set value to select Unicode 3.2 normalization
202      * (except NormalizationCorrections).
203      * At most one Unicode version can be selected at a time.
204      *
205      * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead.
206      * @hide original deprecated declaration
207      */
208     @Deprecated
209     public static final int UNICODE_3_2=0x20;
210 
211     /**
212      * Constant indicating that the end of the iteration has been reached.
213      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
214      *
215      * @deprecated ICU 56
216      * @hide original deprecated declaration
217      */
218     @Deprecated
219     public static final int DONE = UCharacterIterator.DONE;
220 
221     /**
222      * Constants for normalization modes.
223      * <p>
224      * The Mode class is not intended for public subclassing.
225      * Only the Mode constants provided by the Normalizer class should be used,
226      * and any fields or methods should not be called or overridden by users.
227      *
228      * @deprecated ICU 56 Use {@link Normalizer2} instead.
229      * @hide original deprecated declaration
230      */
231     @Deprecated
232     public static abstract class Mode {
233         /**
234          * Sole constructor
235          * @deprecated This API is ICU internal only.
236          * @hide original deprecated declaration
237          * @hide draft / provisional / internal are hidden on Android
238          */
239         @Deprecated
Mode()240         protected Mode() {
241         }
242 
243         /**
244          * @deprecated This API is ICU internal only.
245          * @hide original deprecated declaration
246          * @hide draft / provisional / internal are hidden on Android
247          */
248         @Deprecated
getNormalizer2(int options)249         protected abstract Normalizer2 getNormalizer2(int options);
250     }
251 
252     private static final class NONEMode extends Mode {
253         @Override
getNormalizer2(int options)254         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
255     }
256     private static final class NFDMode extends Mode {
257         @Override
getNormalizer2(int options)258         protected Normalizer2 getNormalizer2(int options) {
259             return (options&UNICODE_3_2) != 0 ?
260                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
261         }
262     }
263     private static final class NFKDMode extends Mode {
264         @Override
getNormalizer2(int options)265         protected Normalizer2 getNormalizer2(int options) {
266             return (options&UNICODE_3_2) != 0 ?
267                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
268         }
269     }
270     private static final class NFCMode extends Mode {
271         @Override
getNormalizer2(int options)272         protected Normalizer2 getNormalizer2(int options) {
273             return (options&UNICODE_3_2) != 0 ?
274                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
275         }
276     }
277     private static final class NFKCMode extends Mode {
278         @Override
getNormalizer2(int options)279         protected Normalizer2 getNormalizer2(int options) {
280             return (options&UNICODE_3_2) != 0 ?
281                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
282         }
283     }
284     private static final class FCDMode extends Mode {
285         @Override
getNormalizer2(int options)286         protected Normalizer2 getNormalizer2(int options) {
287             return (options&UNICODE_3_2) != 0 ?
288                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
289         }
290     }
291 
292     /**
293      * No decomposition/composition.
294      *
295      * @deprecated ICU 56 Use {@link Normalizer2} instead.
296      * @hide original deprecated declaration
297      */
298     @Deprecated
299     public static final Mode NONE = new NONEMode();
300 
301     /**
302      * Canonical decomposition.
303      *
304      * @deprecated ICU 56 Use {@link Normalizer2} instead.
305      * @hide original deprecated declaration
306      */
307     @Deprecated
308     public static final Mode NFD = new NFDMode();
309 
310     /**
311      * Compatibility decomposition.
312      *
313      * @deprecated ICU 56 Use {@link Normalizer2} instead.
314      * @hide original deprecated declaration
315      */
316     @Deprecated
317     public static final Mode NFKD = new NFKDMode();
318 
319     /**
320      * Canonical decomposition followed by canonical composition.
321      *
322      * @deprecated ICU 56 Use {@link Normalizer2} instead.
323      * @hide original deprecated declaration
324      */
325     @Deprecated
326     public static final Mode NFC = new NFCMode();
327 
328     /**
329      * Default normalization.
330      *
331      * @deprecated ICU 56 Use {@link Normalizer2} instead.
332      * @hide original deprecated declaration
333      */
334     @Deprecated
335     public static final Mode DEFAULT = NFC;
336 
337     /**
338      * Compatibility decomposition followed by canonical composition.
339      *
340      * @deprecated ICU 56 Use {@link Normalizer2} instead.
341      * @hide original deprecated declaration
342      */
343     @Deprecated
344     public static final Mode NFKC =new NFKCMode();
345 
346     /**
347      * "Fast C or D" form.
348      *
349      * @deprecated ICU 56 Use {@link Normalizer2} instead.
350      * @hide original deprecated declaration
351      */
352     @Deprecated
353     public static final Mode FCD = new FCDMode();
354 
355     /**
356      * Null operation for use with the {@link android.icu.text.Normalizer constructors}
357      * and the static {@link #normalize normalize} method.  This value tells
358      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
359      * from the underlying String or CharacterIterator.  If you have code which
360      * requires raw text at some times and normalized text at others, you can
361      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
362      * than having a separate code path that bypasses <tt>Normalizer</tt>
363      * altogether.
364      * <p>
365      * @see #setMode
366      * @deprecated ICU 2.8. Use Normalizer.NONE
367      * @see #NONE
368      * @hide original deprecated declaration
369      */
370     @Deprecated
371     public static final Mode NO_OP = NONE;
372 
373     /**
374      * Canonical decomposition followed by canonical composition.  Used with the
375      * {@link android.icu.text.Normalizer constructors} and the static
376      * {@link #normalize normalize} method to determine the operation to be
377      * performed.
378      * <p>
379      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
380      * off, this operation produces output that is in
381      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
382      * Form</a>
383      * <b>C</b>.
384      * <p>
385      * @see #setMode
386      * @deprecated ICU 2.8. Use Normalizer.NFC
387      * @see #NFC
388      * @hide original deprecated declaration
389      */
390     @Deprecated
391     public static final Mode COMPOSE = NFC;
392 
393     /**
394      * Compatibility decomposition followed by canonical composition.
395      * Used with the {@link android.icu.text.Normalizer constructors} and the static
396      * {@link #normalize normalize} method to determine the operation to be
397      * performed.
398      * <p>
399      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
400      * off, this operation produces output that is in
401      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
402      * Form</a>
403      * <b>KC</b>.
404      * <p>
405      * @see #setMode
406      * @deprecated ICU 2.8. Use Normalizer.NFKC
407      * @see #NFKC
408      * @hide original deprecated declaration
409      */
410     @Deprecated
411     public static final Mode COMPOSE_COMPAT = NFKC;
412 
413     /**
414      * Canonical decomposition.  This value is passed to the
415      * {@link android.icu.text.Normalizer constructors} and the static
416      * {@link #normalize normalize}
417      * method to determine the operation to be performed.
418      * <p>
419      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
420      * off, this operation produces output that is in
421      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
422      * Form</a>
423      * <b>D</b>.
424      * <p>
425      * @see #setMode
426      * @deprecated ICU 2.8. Use Normalizer.NFD
427      * @see #NFD
428      * @hide original deprecated declaration
429      */
430     @Deprecated
431     public static final Mode DECOMP = NFD;
432 
433     /**
434      * Compatibility decomposition.  This value is passed to the
435      * {@link android.icu.text.Normalizer constructors} and the static
436      * {@link #normalize normalize}
437      * method to determine the operation to be performed.
438      * <p>
439      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
440      * off, this operation produces output that is in
441      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
442      * Form</a>
443      * <b>KD</b>.
444      * <p>
445      * @see #setMode
446      * @deprecated ICU 2.8. Use Normalizer.NFKD
447      * @see #NFKD
448      * @hide original deprecated declaration
449      */
450     @Deprecated
451     public static final Mode DECOMP_COMPAT = NFKD;
452 
453     /**
454      * Option to disable Hangul/Jamo composition and decomposition.
455      * This option applies to Korean text,
456      * which can be represented either in the Jamo alphabet or in Hangul
457      * characters, which are really just two or three Jamo combined
458      * into one visual glyph.  Since Jamo takes up more storage space than
459      * Hangul, applications that process only Hangul text may wish to turn
460      * this option on when decomposing text.
461      * <p>
462      * The Unicode standard treats Hangul to Jamo conversion as a
463      * canonical decomposition, so this option must be turned <b>off</b> if you
464      * wish to transform strings into one of the standard
465      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
466      * Unicode Normalization Forms</a>.
467      * <p>
468      * @see #setOption
469      * @deprecated ICU 2.8. This option is no longer supported.
470      * @hide original deprecated declaration
471      */
472     @Deprecated
473     public static final int IGNORE_HANGUL = 0x0001;
474 
475     /**
476      * Result values for quickCheck().
477      * For details see Unicode Technical Report 15.
478      */
479     public static final class QuickCheckResult{
480         //private int resultValue;
QuickCheckResult(int value)481         private QuickCheckResult(int value) {
482             //resultValue=value;
483         }
484     }
485     /**
486      * Indicates that string is not in the normalized format
487      */
488     public static final QuickCheckResult NO = new QuickCheckResult(0);
489 
490     /**
491      * Indicates that string is in the normalized format
492      */
493     public static final QuickCheckResult YES = new QuickCheckResult(1);
494 
495     /**
496      * Indicates it cannot be determined if string is in the normalized
497      * format without further thorough checks.
498      */
499     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
500 
501     /**
502      * Option bit for compare:
503      * Case sensitively compare the strings
504      */
505     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
506 
507     /**
508      * Option bit for compare:
509      * Both input strings are assumed to fulfill FCD conditions.
510      */
511     public static final int INPUT_IS_FCD    =      0x20000;
512 
513     /**
514      * Option bit for compare:
515      * Perform case-insensitive comparison.
516      */
517     public static final int COMPARE_IGNORE_CASE  =     0x10000;
518 
519     /**
520      * Option bit for compare:
521      * Compare strings in code point order instead of code unit order.
522      */
523     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
524 
525     /**
526      * Option value for case folding:
527      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
528      * and dotless i appropriately for Turkic languages (tr, az).
529      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
530      */
531     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
532 
533     /**
534      * Lowest-order bit number of compare() options bits corresponding to
535      * normalization options bits.
536      *
537      * The options parameter for compare() uses most bits for
538      * itself and for various comparison and folding flags.
539      * The most significant bits, however, are shifted down and passed on
540      * to the normalization implementation.
541      * (That is, from compare(..., options, ...),
542      * options&gt;&gt;COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
543      * internal normalization functions.)
544      *
545      * @see #compare
546      * @deprecated ICU 56 Use {@link Normalizer2} instead.
547      * @hide original deprecated declaration
548      */
549     @Deprecated
550     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
551 
552     //-------------------------------------------------------------------------
553     // Iterator constructors
554     //-------------------------------------------------------------------------
555 
556     /**
557      * Creates a new <tt>Normalizer</tt> object for iterating over the
558      * normalized form of a given string.
559      * <p>
560      * The <tt>options</tt> parameter specifies which optional
561      * <tt>Normalizer</tt> features are to be enabled for this object.
562      * <p>
563      * @param str  The string to be normalized.  The normalization
564      *              will start at the beginning of the string.
565      *
566      * @param mode The normalization mode.
567      *
568      * @param opt Any optional features to be enabled.
569      *            Currently the only available option is {@link #UNICODE_3_2}.
570      *            If you want the default behavior corresponding to one of the
571      *            standard Unicode Normalization Forms, use 0 for this argument.
572      * @deprecated ICU 56 Use {@link Normalizer2} instead.
573      * @hide original deprecated declaration
574      */
575     @Deprecated
Normalizer(String str, Mode mode, int opt)576     public Normalizer(String str, Mode mode, int opt) {
577         this.text = UCharacterIterator.getInstance(str);
578         this.mode = mode;
579         this.options=opt;
580         norm2 = mode.getNormalizer2(opt);
581         buffer = new StringBuilder();
582     }
583 
584     /**
585      * Creates a new <tt>Normalizer</tt> object for iterating over the
586      * normalized form of the given text.
587      * <p>
588      * @param iter  The input text to be normalized.  The normalization
589      *              will start at the beginning of the string.
590      *
591      * @param mode  The normalization mode.
592      *
593      * @param opt Any optional features to be enabled.
594      *            Currently the only available option is {@link #UNICODE_3_2}.
595      *            If you want the default behavior corresponding to one of the
596      *            standard Unicode Normalization Forms, use 0 for this argument.
597      * @deprecated ICU 56 Use {@link Normalizer2} instead.
598      * @hide original deprecated declaration
599      */
600     @Deprecated
Normalizer(CharacterIterator iter, Mode mode, int opt)601     public Normalizer(CharacterIterator iter, Mode mode, int opt) {
602         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
603         this.mode = mode;
604         this.options = opt;
605         norm2 = mode.getNormalizer2(opt);
606         buffer = new StringBuilder();
607     }
608 
609     /**
610      * Creates a new <tt>Normalizer</tt> object for iterating over the
611      * normalized form of the given text.
612      * <p>
613      * @param iter  The input text to be normalized.  The normalization
614      *              will start at the beginning of the string.
615      *
616      * @param mode  The normalization mode.
617      * @param options The normalization options, ORed together (0 for no options).
618      * @deprecated ICU 56 Use {@link Normalizer2} instead.
619      * @hide original deprecated declaration
620      */
621     @Deprecated
Normalizer(UCharacterIterator iter, Mode mode, int options)622     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
623         try {
624             this.text     = (UCharacterIterator)iter.clone();
625             this.mode     = mode;
626             this.options  = options;
627             norm2 = mode.getNormalizer2(options);
628             buffer = new StringBuilder();
629         } catch (CloneNotSupportedException e) {
630             throw new ICUCloneNotSupportedException(e);
631         }
632     }
633 
634     /**
635      * Clones this <tt>Normalizer</tt> object.  All properties of this
636      * object are duplicated in the new object, including the cloning of any
637      * {@link CharacterIterator} that was passed in to the constructor
638      * or to {@link #setText(CharacterIterator) setText}.
639      * However, the text storage underlying
640      * the <tt>CharacterIterator</tt> is not duplicated unless the
641      * iterator's <tt>clone</tt> method does so.
642      *
643      * @deprecated ICU 56 Use {@link Normalizer2} instead.
644      * @hide original deprecated declaration
645      */
646     @Deprecated
647     @Override
clone()648     public Object clone() {
649         try {
650             Normalizer copy = (Normalizer) super.clone();
651             copy.text = (UCharacterIterator) text.clone();
652             copy.mode = mode;
653             copy.options = options;
654             copy.norm2 = norm2;
655             copy.buffer = new StringBuilder(buffer);
656             copy.bufferPos = bufferPos;
657             copy.currentIndex = currentIndex;
658             copy.nextIndex = nextIndex;
659             return copy;
660         }
661         catch (CloneNotSupportedException e) {
662             throw new ICUCloneNotSupportedException(e);
663         }
664     }
665 
666     //--------------------------------------------------------------------------
667     // Static Utility methods
668     //--------------------------------------------------------------------------
669 
getComposeNormalizer2(boolean compat, int options)670     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
671         return (compat ? NFKC : NFC).getNormalizer2(options);
672     }
getDecomposeNormalizer2(boolean compat, int options)673     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
674         return (compat ? NFKD : NFD).getNormalizer2(options);
675     }
676 
677     /**
678      * Compose a string.
679      * The string will be composed to according to the specified mode.
680      * @param str        The string to compose.
681      * @param compat     If true the string will be composed according to
682      *                    NFKC rules and if false will be composed according to
683      *                    NFC rules.
684      * @return String    The composed string
685      * @deprecated ICU 56 Use {@link Normalizer2} instead.
686      * @hide original deprecated declaration
687      */
688     @Deprecated
compose(String str, boolean compat)689     public static String compose(String str, boolean compat) {
690         return compose(str,compat,0);
691     }
692 
693     /**
694      * Compose a string.
695      * The string will be composed to according to the specified mode.
696      * @param str        The string to compose.
697      * @param compat     If true the string will be composed according to
698      *                    NFKC rules and if false will be composed according to
699      *                    NFC rules.
700      * @param options    The only recognized option is UNICODE_3_2
701      * @return String    The composed string
702      * @deprecated ICU 56 Use {@link Normalizer2} instead.
703      * @hide original deprecated declaration
704      */
705     @Deprecated
compose(String str, boolean compat, int options)706     public static String compose(String str, boolean compat, int options) {
707         return getComposeNormalizer2(compat, options).normalize(str);
708     }
709 
710     /**
711      * Compose a string.
712      * The string will be composed to according to the specified mode.
713      * @param source The char array to compose.
714      * @param target A char buffer to receive the normalized text.
715      * @param compat If true the char array will be composed according to
716      *                NFKC rules and if false will be composed according to
717      *                NFC rules.
718      * @param options The normalization options, ORed together (0 for no options).
719      * @return int   The total buffer size needed;if greater than length of
720      *                result, the output was truncated.
721      * @exception IndexOutOfBoundsException if target.length is less than the
722      *             required length
723      * @deprecated ICU 56 Use {@link Normalizer2} instead.
724      * @hide original deprecated declaration
725      */
726     @Deprecated
compose(char[] source,char[] target, boolean compat, int options)727     public static int compose(char[] source,char[] target, boolean compat, int options) {
728         return compose(source, 0, source.length, target, 0, target.length, compat, options);
729     }
730 
731     /**
732      * Compose a string.
733      * The string will be composed to according to the specified mode.
734      * @param src       The char array to compose.
735      * @param srcStart  Start index of the source
736      * @param srcLimit  Limit index of the source
737      * @param dest      The char buffer to fill in
738      * @param destStart Start index of the destination buffer
739      * @param destLimit End index of the destination buffer
740      * @param compat If true the char array will be composed according to
741      *                NFKC rules and if false will be composed according to
742      *                NFC rules.
743      * @param options The normalization options, ORed together (0 for no options).
744      * @return int   The total buffer size needed;if greater than length of
745      *                result, the output was truncated.
746      * @exception IndexOutOfBoundsException if target.length is less than the
747      *             required length
748      * @deprecated ICU 56 Use {@link Normalizer2} instead.
749      * @hide original deprecated declaration
750      */
751     @Deprecated
compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)752     public static int compose(char[] src,int srcStart, int srcLimit,
753                               char[] dest,int destStart, int destLimit,
754                               boolean compat, int options) {
755         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
756         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
757         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
758         return app.length();
759     }
760 
761     /**
762      * Decompose a string.
763      * The string will be decomposed to according to the specified mode.
764      * @param str       The string to decompose.
765      * @param compat    If true the string will be decomposed according to NFKD
766      *                   rules and if false will be decomposed according to NFD
767      *                   rules.
768      * @return String   The decomposed string
769      * @deprecated ICU 56 Use {@link Normalizer2} instead.
770      * @hide original deprecated declaration
771      */
772     @Deprecated
decompose(String str, boolean compat)773     public static String decompose(String str, boolean compat) {
774         return decompose(str,compat,0);
775     }
776 
777     /**
778      * Decompose a string.
779      * The string will be decomposed to according to the specified mode.
780      * @param str     The string to decompose.
781      * @param compat  If true the string will be decomposed according to NFKD
782      *                 rules and if false will be decomposed according to NFD
783      *                 rules.
784      * @param options The normalization options, ORed together (0 for no options).
785      * @return String The decomposed string
786      * @deprecated ICU 56 Use {@link Normalizer2} instead.
787      * @hide original deprecated declaration
788      */
789     @Deprecated
decompose(String str, boolean compat, int options)790     public static String decompose(String str, boolean compat, int options) {
791         return getDecomposeNormalizer2(compat, options).normalize(str);
792     }
793 
794     /**
795      * Decompose a string.
796      * The string will be decomposed to according to the specified mode.
797      * @param source The char array to decompose.
798      * @param target A char buffer to receive the normalized text.
799      * @param compat If true the char array will be decomposed according to NFKD
800      *                rules and if false will be decomposed according to
801      *                NFD rules.
802      * @return int   The total buffer size needed;if greater than length of
803      *                result,the output was truncated.
804      * @param options The normalization options, ORed together (0 for no options).
805      * @exception IndexOutOfBoundsException if the target capacity is less than
806      *             the required length
807      * @deprecated ICU 56 Use {@link Normalizer2} instead.
808      * @hide original deprecated declaration
809      */
810     @Deprecated
decompose(char[] source,char[] target, boolean compat, int options)811     public static int decompose(char[] source,char[] target, boolean compat, int options) {
812         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
813     }
814 
815     /**
816      * Decompose a string.
817      * The string will be decomposed to according to the specified mode.
818      * @param src       The char array to compose.
819      * @param srcStart  Start index of the source
820      * @param srcLimit  Limit index of the source
821      * @param dest      The char buffer to fill in
822      * @param destStart Start index of the destination buffer
823      * @param destLimit End index of the destination buffer
824      * @param compat If true the char array will be decomposed according to NFKD
825      *                rules and if false will be decomposed according to
826      *                NFD rules.
827      * @param options The normalization options, ORed together (0 for no options).
828      * @return int   The total buffer size needed;if greater than length of
829      *                result,the output was truncated.
830      * @exception IndexOutOfBoundsException if the target capacity is less than
831      *             the required length
832      * @deprecated ICU 56 Use {@link Normalizer2} instead.
833      * @hide original deprecated declaration
834      */
835     @Deprecated
decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)836     public static int decompose(char[] src,int srcStart, int srcLimit,
837                                 char[] dest,int destStart, int destLimit,
838                                 boolean compat, int options) {
839         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
840         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
841         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
842         return app.length();
843     }
844 
845     /**
846      * Normalizes a <tt>String</tt> using the given normalization operation.
847      * <p>
848      * The <tt>options</tt> parameter specifies which optional
849      * <tt>Normalizer</tt> features are to be enabled for this operation.
850      * Currently the only available option is {@link #UNICODE_3_2}.
851      * If you want the default behavior corresponding to one of the standard
852      * Unicode Normalization Forms, use 0 for this argument.
853      * <p>
854      * @param str       the input string to be normalized.
855      * @param mode      the normalization mode
856      * @param options   the optional features to be enabled.
857      * @return String   the normalized string
858      * @deprecated ICU 56 Use {@link Normalizer2} instead.
859      * @hide original deprecated declaration
860      */
861     @Deprecated
normalize(String str, Mode mode, int options)862     public static String normalize(String str, Mode mode, int options) {
863         return mode.getNormalizer2(options).normalize(str);
864     }
865 
866     /**
867      * Normalize a string.
868      * The string will be normalized according to the specified normalization
869      * mode and options.
870      * @param src        The string to normalize.
871      * @param mode       The normalization mode; one of Normalizer.NONE,
872      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
873      *                    Normalizer.NFKD, Normalizer.DEFAULT
874      * @return the normalized string
875      * @deprecated ICU 56 Use {@link Normalizer2} instead.
876      * @hide original deprecated declaration
877      */
878     @Deprecated
normalize(String src,Mode mode)879     public static String normalize(String src,Mode mode) {
880         return normalize(src, mode, 0);
881     }
882     /**
883      * Normalize a string.
884      * The string will be normalized according to the specified normalization
885      * mode and options.
886      * @param source The char array to normalize.
887      * @param target A char buffer to receive the normalized text.
888      * @param mode   The normalization mode; one of Normalizer.NONE,
889      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
890      *                Normalizer.NFKD, Normalizer.DEFAULT
891      * @param options The normalization options, ORed together (0 for no options).
892      * @return int   The total buffer size needed;if greater than length of
893      *                result, the output was truncated.
894      * @exception    IndexOutOfBoundsException if the target capacity is less
895      *                than the required length
896      * @deprecated ICU 56 Use {@link Normalizer2} instead.
897      * @hide original deprecated declaration
898      */
899     @Deprecated
normalize(char[] source,char[] target, Mode mode, int options)900     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
901         return normalize(source,0,source.length,target,0,target.length,mode, options);
902     }
903 
904     /**
905      * Normalize a string.
906      * The string will be normalized according to the specified normalization
907      * mode and options.
908      * @param src       The char array to compose.
909      * @param srcStart  Start index of the source
910      * @param srcLimit  Limit index of the source
911      * @param dest      The char buffer to fill in
912      * @param destStart Start index of the destination buffer
913      * @param destLimit End index of the destination buffer
914      * @param mode      The normalization mode; one of Normalizer.NONE,
915      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
916      *                   Normalizer.NFKD, Normalizer.DEFAULT
917      * @param options The normalization options, ORed together (0 for no options).
918      * @return int      The total buffer size needed;if greater than length of
919      *                   result, the output was truncated.
920      * @exception       IndexOutOfBoundsException if the target capacity is
921      *                   less than the required length
922      * @deprecated ICU 56 Use {@link Normalizer2} instead.
923      * @hide original deprecated declaration
924      */
925     @Deprecated
normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)926     public static int normalize(char[] src,int srcStart, int srcLimit,
927                                 char[] dest,int destStart, int destLimit,
928                                 Mode  mode, int options) {
929         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
930         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
931         mode.getNormalizer2(options).normalize(srcBuffer, app);
932         return app.length();
933     }
934 
935     /**
936      * Normalize a codepoint according to the given mode
937      * @param char32    The input string to be normalized.
938      * @param mode      The normalization mode
939      * @param options   Options for use with exclusion set and tailored Normalization
940      *                                   The only option that is currently recognized is UNICODE_3_2
941      * @return String   The normalized string
942      * @see #UNICODE_3_2
943      * @deprecated ICU 56 Use {@link Normalizer2} instead.
944      * @hide original deprecated declaration
945      */
946     @Deprecated
normalize(int char32, Mode mode, int options)947     public static String normalize(int char32, Mode mode, int options) {
948         if(mode == NFD && options == 0) {
949             String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
950             if(decomposition == null) {
951                 decomposition = UTF16.valueOf(char32);
952             }
953             return decomposition;
954         }
955         return normalize(UTF16.valueOf(char32), mode, options);
956     }
957 
958     /**
959      * Convenience method to normalize a codepoint according to the given mode
960      * @param char32    The input string to be normalized.
961      * @param mode      The normalization mode
962      * @return String   The normalized string
963      * @deprecated ICU 56 Use {@link Normalizer2} instead.
964      * @hide original deprecated declaration
965      */
966     @Deprecated
normalize(int char32, Mode mode)967     public static String normalize(int char32, Mode mode) {
968         return normalize(char32, mode, 0);
969     }
970 
971     /**
972      * Convenience method.
973      *
974      * @param source   string for determining if it is in a normalized format
975      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
976      *                  Normalizer.NFKC,Normalizer.NFKD)
977      * @return         Return code to specify if the text is normalized or not
978      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
979      * @deprecated ICU 56 Use {@link Normalizer2} instead.
980      * @hide original deprecated declaration
981      */
982     @Deprecated
quickCheck(String source, Mode mode)983     public static QuickCheckResult quickCheck(String source, Mode mode) {
984         return quickCheck(source, mode, 0);
985     }
986 
987     /**
988      * Performing quick check on a string, to quickly determine if the string is
989      * in a particular normalization format.
990      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
991      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
992      * string is in the desired normalized format, Normalizer.NO determines that
993      * argument string is not in the desired normalized format. A
994      * Normalizer.MAYBE result indicates that a more thorough check is required,
995      * the user may have to put the string in its normalized form and compare
996      * the results.
997      *
998      * @param source   string for determining if it is in a normalized format
999      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
1000      *                  Normalizer.NFKC,Normalizer.NFKD)
1001      * @param options   Options for use with exclusion set and tailored Normalization
1002      *                                   The only option that is currently recognized is UNICODE_3_2
1003      * @return         Return code to specify if the text is normalized or not
1004      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1005      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1006      * @hide original deprecated declaration
1007      */
1008     @Deprecated
quickCheck(String source, Mode mode, int options)1009     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
1010         return mode.getNormalizer2(options).quickCheck(source);
1011     }
1012 
1013     /**
1014      * Convenience method.
1015      *
1016      * @param source Array of characters for determining if it is in a
1017      *                normalized format
1018      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
1019      *                Normalizer.NFKC,Normalizer.NFKD)
1020      * @param options   Options for use with exclusion set and tailored Normalization
1021      *                                   The only option that is currently recognized is UNICODE_3_2
1022      * @return       Return code to specify if the text is normalized or not
1023      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1024      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1025      * @hide original deprecated declaration
1026      */
1027     @Deprecated
quickCheck(char[] source, Mode mode, int options)1028     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
1029         return quickCheck(source, 0, source.length, mode, options);
1030     }
1031 
1032     /**
1033      * Performing quick check on a string, to quickly determine if the string is
1034      * in a particular normalization format.
1035      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
1036      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
1037      * string is in the desired normalized format, Normalizer.NO determines that
1038      * argument string is not in the desired normalized format. A
1039      * Normalizer.MAYBE result indicates that a more thorough check is required,
1040      * the user may have to put the string in its normalized form and compare
1041      * the results.
1042      *
1043      * @param source    string for determining if it is in a normalized format
1044      * @param start     the start index of the source
1045      * @param limit     the limit index of the source it is equal to the length
1046      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
1047      *                   Normalizer.NFKC,Normalizer.NFKD)
1048      * @param options   Options for use with exclusion set and tailored Normalization
1049      *                                   The only option that is currently recognized is UNICODE_3_2
1050      * @return          Return code to specify if the text is normalized or not
1051      *                   (Normalizer.YES, Normalizer.NO or
1052      *                   Normalizer.MAYBE)
1053      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1054      * @hide original deprecated declaration
1055      */
1056     @Deprecated
quickCheck(char[] source,int start, int limit, Mode mode,int options)1057     public static QuickCheckResult quickCheck(char[] source,int start,
1058                                               int limit, Mode mode,int options) {
1059         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
1060         return mode.getNormalizer2(options).quickCheck(srcBuffer);
1061     }
1062 
1063     /**
1064      * Test if a string is in a given normalization form.
1065      * This is semantically equivalent to source.equals(normalize(source, mode)).
1066      *
1067      * Unlike quickCheck(), this function returns a definitive result,
1068      * never a "maybe".
1069      * For NFD, NFKD, and FCD, both functions work exactly the same.
1070      * For NFC and NFKC where quickCheck may return "maybe", this function will
1071      * perform further tests to arrive at a true/false result.
1072      * @param src       The input array of characters to be checked to see if
1073      *                   it is normalized
1074      * @param start     The strart index in the source
1075      * @param limit     The limit index in the source
1076      * @param mode      the normalization mode
1077      * @param options   Options for use with exclusion set and tailored Normalization
1078      *                                   The only option that is currently recognized is UNICODE_3_2
1079      * @return Boolean value indicating whether the source string is in the
1080      *         "mode" normalization form
1081      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1082      * @hide original deprecated declaration
1083      */
1084     @Deprecated
isNormalized(char[] src,int start, int limit, Mode mode, int options)1085     public static boolean isNormalized(char[] src,int start,
1086                                        int limit, Mode mode,
1087                                        int options) {
1088         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
1089         return mode.getNormalizer2(options).isNormalized(srcBuffer);
1090     }
1091 
1092     /**
1093      * Test if a string is in a given normalization form.
1094      * This is semantically equivalent to source.equals(normalize(source, mode)).
1095      *
1096      * Unlike quickCheck(), this function returns a definitive result,
1097      * never a "maybe".
1098      * For NFD, NFKD, and FCD, both functions work exactly the same.
1099      * For NFC and NFKC where quickCheck may return "maybe", this function will
1100      * perform further tests to arrive at a true/false result.
1101      * @param str       the input string to be checked to see if it is
1102      *                   normalized
1103      * @param mode      the normalization mode
1104      * @param options   Options for use with exclusion set and tailored Normalization
1105      *                  The only option that is currently recognized is UNICODE_3_2
1106      * @see #isNormalized
1107      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1108      * @hide original deprecated declaration
1109      */
1110     @Deprecated
isNormalized(String str, Mode mode, int options)1111     public static boolean isNormalized(String str, Mode mode, int options) {
1112         return mode.getNormalizer2(options).isNormalized(str);
1113     }
1114 
1115     /**
1116      * Convenience Method
1117      * @param char32    the input code point to be checked to see if it is
1118      *                   normalized
1119      * @param mode      the normalization mode
1120      * @param options   Options for use with exclusion set and tailored Normalization
1121      *                  The only option that is currently recognized is UNICODE_3_2
1122      *
1123      * @see #isNormalized
1124      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1125      * @hide original deprecated declaration
1126      */
1127     @Deprecated
isNormalized(int char32, Mode mode,int options)1128     public static boolean isNormalized(int char32, Mode mode,int options) {
1129         return isNormalized(UTF16.valueOf(char32), mode, options);
1130     }
1131 
1132     /**
1133      * Compare two strings for canonical equivalence.
1134      * Further options include case-insensitive comparison and
1135      * code point order (as opposed to code unit order).
1136      *
1137      * Canonical equivalence between two strings is defined as their normalized
1138      * forms (NFD or NFC) being identical.
1139      * This function compares strings incrementally instead of normalizing
1140      * (and optionally case-folding) both strings entirely,
1141      * improving performance significantly.
1142      *
1143      * Bulk normalization is only necessary if the strings do not fulfill the
1144      * FCD conditions. Only in this case, and only if the strings are relatively
1145      * long, is memory allocated temporarily.
1146      * For FCD strings and short non-FCD strings there is no memory allocation.
1147      *
1148      * Semantically, this is equivalent to
1149      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1150      * where code point order and foldCase are all optional.
1151      *
1152      * @param s1        First source character array.
1153      * @param s1Start   start index of source
1154      * @param s1Limit   limit of the source
1155      *
1156      * @param s2        Second source character array.
1157      * @param s2Start   start index of the source
1158      * @param s2Limit   limit of the source
1159      *
1160      * @param options A bit set of options:
1161      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1162      *     Case-sensitive comparison in code unit order, and the input strings
1163      *     are quick-checked for FCD.
1164      *
1165      *   - INPUT_IS_FCD
1166      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1167      *     conditions.If not set, the function will quickCheck for FCD
1168      *     and normalize if necessary.
1169      *
1170      *   - COMPARE_CODE_POINT_ORDER
1171      *     Set to choose code point order instead of code unit order
1172      *
1173      *   - COMPARE_IGNORE_CASE
1174      *     Set to compare strings case-insensitively using case folding,
1175      *     instead of case-sensitively.
1176      *     If set, then the following case folding options are used.
1177      *
1178      *
1179      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1180      */
compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1181     public static int compare(char[] s1, int s1Start, int s1Limit,
1182                               char[] s2, int s2Start, int s2Limit,
1183                               int options) {
1184         if( s1==null || s1Start<0 || s1Limit<0 ||
1185             s2==null || s2Start<0 || s2Limit<0 ||
1186             s1Limit<s1Start || s2Limit<s2Start
1187         ) {
1188             throw new IllegalArgumentException();
1189         }
1190         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1191                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1192                                options);
1193     }
1194 
1195     /**
1196      * Compare two strings for canonical equivalence.
1197      * Further options include case-insensitive comparison and
1198      * code point order (as opposed to code unit order).
1199      *
1200      * Canonical equivalence between two strings is defined as their normalized
1201      * forms (NFD or NFC) being identical.
1202      * This function compares strings incrementally instead of normalizing
1203      * (and optionally case-folding) both strings entirely,
1204      * improving performance significantly.
1205      *
1206      * Bulk normalization is only necessary if the strings do not fulfill the
1207      * FCD conditions. Only in this case, and only if the strings are relatively
1208      * long, is memory allocated temporarily.
1209      * For FCD strings and short non-FCD strings there is no memory allocation.
1210      *
1211      * Semantically, this is equivalent to
1212      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1213      * where code point order and foldCase are all optional.
1214      *
1215      * @param s1 First source string.
1216      * @param s2 Second source string.
1217      *
1218      * @param options A bit set of options:
1219      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1220      *     Case-sensitive comparison in code unit order, and the input strings
1221      *     are quick-checked for FCD.
1222      *
1223      *   - INPUT_IS_FCD
1224      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1225      *     conditions. If not set, the function will quickCheck for FCD
1226      *     and normalize if necessary.
1227      *
1228      *   - COMPARE_CODE_POINT_ORDER
1229      *     Set to choose code point order instead of code unit order
1230      *
1231      *   - COMPARE_IGNORE_CASE
1232      *     Set to compare strings case-insensitively using case folding,
1233      *     instead of case-sensitively.
1234      *     If set, then the following case folding options are used.
1235      *
1236      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1237      */
compare(String s1, String s2, int options)1238     public static int compare(String s1, String s2, int options) {
1239         return internalCompare(s1, s2, options);
1240     }
1241 
1242     /**
1243      * Compare two strings for canonical equivalence.
1244      * Further options include case-insensitive comparison and
1245      * code point order (as opposed to code unit order).
1246      * Convenience method.
1247      *
1248      * @param s1 First source string.
1249      * @param s2 Second source string.
1250      *
1251      * @param options A bit set of options:
1252      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1253      *     Case-sensitive comparison in code unit order, and the input strings
1254      *     are quick-checked for FCD.
1255      *
1256      *   - INPUT_IS_FCD
1257      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1258      *     conditions. If not set, the function will quickCheck for FCD
1259      *     and normalize if necessary.
1260      *
1261      *   - COMPARE_CODE_POINT_ORDER
1262      *     Set to choose code point order instead of code unit order
1263      *
1264      *   - COMPARE_IGNORE_CASE
1265      *     Set to compare strings case-insensitively using case folding,
1266      *     instead of case-sensitively.
1267      *     If set, then the following case folding options are used.
1268      *
1269      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1270      */
compare(char[] s1, char[] s2, int options)1271     public static int compare(char[] s1, char[] s2, int options) {
1272         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1273     }
1274 
1275     /**
1276      * Convenience method that can have faster implementation
1277      * by not allocating buffers.
1278      * @param char32a    the first code point to be checked against the
1279      * @param char32b    the second code point
1280      * @param options    A bit set of options
1281      */
compare(int char32a, int char32b, int options)1282     public static int compare(int char32a, int char32b, int options) {
1283         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1284     }
1285 
1286     /**
1287      * Convenience method that can have faster implementation
1288      * by not allocating buffers.
1289      * @param char32a   the first code point to be checked against
1290      * @param str2      the second string
1291      * @param options   A bit set of options
1292      */
compare(int char32a, String str2, int options)1293     public static int compare(int char32a, String str2, int options) {
1294         return internalCompare(UTF16.valueOf(char32a), str2, options);
1295     }
1296 
1297     /* Concatenation of normalized strings --------------------------------- */
1298     /**
1299      * Concatenate normalized strings, making sure that the result is normalized
1300      * as well.
1301      *
1302      * If both the left and the right strings are in
1303      * the normalization form according to "mode",
1304      * then the result will be
1305      *
1306      * <code>
1307      *     dest=normalize(left+right, mode)
1308      * </code>
1309      *
1310      * With the input strings already being normalized,
1311      * this function will use next() and previous()
1312      * to find the adjacent end pieces of the input strings.
1313      * Only the concatenation of these end pieces will be normalized and
1314      * then concatenated with the remaining parts of the input strings.
1315      *
1316      * It is allowed to have dest==left to avoid copying the entire left string.
1317      *
1318      * @param left Left source array, may be same as dest.
1319      * @param leftStart start in the left array.
1320      * @param leftLimit limit in the left array (==length)
1321      * @param right Right source array.
1322      * @param rightStart start in the right array.
1323      * @param rightLimit limit in the right array (==length)
1324      * @param dest The output buffer; can be null if destStart==destLimit==0
1325      *              for pure preflighting.
1326      * @param destStart start in the destination array
1327      * @param destLimit limit in the destination array (==length)
1328      * @param mode The normalization mode.
1329      * @param options The normalization options, ORed together (0 for no options).
1330      * @return Length of output (number of chars) when successful or
1331      *          IndexOutOfBoundsException
1332      * @exception IndexOutOfBoundsException whose message has the string
1333      *             representation of destination capacity required.
1334      * @see #normalize
1335      * @see #next
1336      * @see #previous
1337      * @exception IndexOutOfBoundsException if target capacity is less than the
1338      *             required length
1339      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1340      * @hide original deprecated declaration
1341      */
1342     @Deprecated
concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1343     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1344                                   char[] right, int rightStart, int rightLimit,
1345                                   char[] dest,  int destStart,  int destLimit,
1346                                   Normalizer.Mode mode, int options) {
1347         if(dest == null) {
1348             throw new IllegalArgumentException();
1349         }
1350 
1351         /* check for overlapping right and destination */
1352         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1353             throw new IllegalArgumentException("overlapping right and dst ranges");
1354         }
1355 
1356         /* allow left==dest */
1357         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1358         destBuilder.append(left, leftStart, leftLimit-leftStart);
1359         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
1360         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1361         int destLength=destBuilder.length();
1362         if(destLength<=(destLimit-destStart)) {
1363             destBuilder.getChars(0, destLength, dest, destStart);
1364             return destLength;
1365         } else {
1366             throw new IndexOutOfBoundsException(Integer.toString(destLength));
1367         }
1368     }
1369 
1370     /**
1371      * Concatenate normalized strings, making sure that the result is normalized
1372      * as well.
1373      *
1374      * If both the left and the right strings are in
1375      * the normalization form according to "mode",
1376      * then the result will be
1377      *
1378      * <code>
1379      *     dest=normalize(left+right, mode)
1380      * </code>
1381      *
1382      * For details see concatenate
1383      *
1384      * @param left Left source string.
1385      * @param right Right source string.
1386      * @param mode The normalization mode.
1387      * @param options The normalization options, ORed together (0 for no options).
1388      * @return result
1389      *
1390      * @see #concatenate
1391      * @see #normalize
1392      * @see #next
1393      * @see #previous
1394      * @see #concatenate
1395      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1396      * @hide original deprecated declaration
1397      */
1398     @Deprecated
concatenate(char[] left, char[] right,Mode mode, int options)1399     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1400         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1401         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1402     }
1403 
1404     /**
1405      * Concatenate normalized strings, making sure that the result is normalized
1406      * as well.
1407      *
1408      * If both the left and the right strings are in
1409      * the normalization form according to "mode",
1410      * then the result will be
1411      *
1412      * <code>
1413      *     dest=normalize(left+right, mode)
1414      * </code>
1415      *
1416      * With the input strings already being normalized,
1417      * this function will use next() and previous()
1418      * to find the adjacent end pieces of the input strings.
1419      * Only the concatenation of these end pieces will be normalized and
1420      * then concatenated with the remaining parts of the input strings.
1421      *
1422      * @param left Left source string.
1423      * @param right Right source string.
1424      * @param mode The normalization mode.
1425      * @param options The normalization options, ORed together (0 for no options).
1426      * @return result
1427      *
1428      * @see #concatenate
1429      * @see #normalize
1430      * @see #next
1431      * @see #previous
1432      * @see #concatenate
1433      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1434      * @hide original deprecated declaration
1435      */
1436     @Deprecated
concatenate(String left, String right, Mode mode, int options)1437     public static String concatenate(String left, String right, Mode mode, int options) {
1438         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1439         return mode.getNormalizer2(options).append(dest, right).toString();
1440     }
1441 
1442     /**
1443      * Gets the FC_NFKC closure value.
1444      * @param c The code point whose closure value is to be retrieved
1445      * @param dest The char array to receive the closure value
1446      * @return the length of the closure value; 0 if there is none
1447      * @deprecated ICU 56
1448      * @hide original deprecated declaration
1449      */
1450     @Deprecated
getFC_NFKC_Closure(int c,char[] dest)1451     public static int getFC_NFKC_Closure(int c,char[] dest) {
1452         String closure=getFC_NFKC_Closure(c);
1453         int length=closure.length();
1454         if(length!=0 && dest!=null && length<=dest.length) {
1455             closure.getChars(0, length, dest, 0);
1456         }
1457         return length;
1458     }
1459     /**
1460      * Gets the FC_NFKC closure value.
1461      * @param c The code point whose closure value is to be retrieved
1462      * @return String representation of the closure value; "" if there is none
1463      * @deprecated ICU 56
1464      * @hide original deprecated declaration
1465      */
1466     @Deprecated
getFC_NFKC_Closure(int c)1467     public static String getFC_NFKC_Closure(int c) {
1468         // Compute the FC_NFKC_Closure on the fly:
1469         // We have the API for complete coverage of Unicode properties, although
1470         // this value by itself is not useful via API.
1471         // (What could be useful is a custom normalization table that combines
1472         // case folding and NFKC.)
1473         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1474         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1475         UCaseProps csp=UCaseProps.INSTANCE;
1476         // first: b = NFKC(Fold(a))
1477         StringBuilder folded=new StringBuilder();
1478         int folded1Length=csp.toFullFolding(c, folded, 0);
1479         if(folded1Length<0) {
1480             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1481             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1482                 return "";  // c does not change at all under CaseFolding+NFKC
1483             }
1484             folded.appendCodePoint(c);
1485         } else {
1486             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1487                 folded.appendCodePoint(folded1Length);
1488             }
1489         }
1490         String kc1=nfkc.normalize(folded);
1491         // second: c = NFKC(Fold(b))
1492         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1493         // if (c != b) add the mapping from a to c
1494         if(kc1.equals(kc2)) {
1495             return "";
1496         } else {
1497             return kc2;
1498         }
1499     }
1500 
1501     //-------------------------------------------------------------------------
1502     // Iteration API
1503     //-------------------------------------------------------------------------
1504 
1505     /**
1506      * Return the current character in the normalized text.
1507      * @return The codepoint as an int
1508      * @deprecated ICU 56
1509      * @hide original deprecated declaration
1510      */
1511     @Deprecated
current()1512     public int current() {
1513         if(bufferPos<buffer.length() || nextNormalize()) {
1514             return buffer.codePointAt(bufferPos);
1515         } else {
1516             return DONE;
1517         }
1518     }
1519 
1520     /**
1521      * Return the next character in the normalized text and advance
1522      * the iteration position by one.  If the end
1523      * of the text has already been reached, {@link #DONE} is returned.
1524      * @return The codepoint as an int
1525      * @deprecated ICU 56
1526      * @hide original deprecated declaration
1527      */
1528     @Deprecated
next()1529     public int next() {
1530         if(bufferPos<buffer.length() ||  nextNormalize()) {
1531             int c=buffer.codePointAt(bufferPos);
1532             bufferPos+=Character.charCount(c);
1533             return c;
1534         } else {
1535             return DONE;
1536         }
1537     }
1538 
1539 
1540     /**
1541      * Return the previous character in the normalized text and decrement
1542      * the iteration position by one.  If the beginning
1543      * of the text has already been reached, {@link #DONE} is returned.
1544      * @return The codepoint as an int
1545      * @deprecated ICU 56
1546      * @hide original deprecated declaration
1547      */
1548     @Deprecated
previous()1549     public int previous() {
1550         if(bufferPos>0 || previousNormalize()) {
1551             int c=buffer.codePointBefore(bufferPos);
1552             bufferPos-=Character.charCount(c);
1553             return c;
1554         } else {
1555             return DONE;
1556         }
1557     }
1558 
1559     /**
1560      * Reset the index to the beginning of the text.
1561      * This is equivalent to setIndexOnly(startIndex)).
1562      * @deprecated ICU 56
1563      * @hide original deprecated declaration
1564      */
1565     @Deprecated
reset()1566     public void reset() {
1567         text.setToStart();
1568         currentIndex=nextIndex=0;
1569         clearBuffer();
1570     }
1571 
1572     /**
1573      * Set the iteration position in the input text that is being normalized,
1574      * without any immediate normalization.
1575      * After setIndexOnly(), getIndex() will return the same index that is
1576      * specified here.
1577      *
1578      * @param index the desired index in the input text.
1579      * @deprecated ICU 56
1580      * @hide original deprecated declaration
1581      */
1582     @Deprecated
setIndexOnly(int index)1583     public void setIndexOnly(int index) {
1584         text.setIndex(index);  // validates index
1585         currentIndex=nextIndex=index;
1586         clearBuffer();
1587     }
1588 
1589     /**
1590      * Set the iteration position in the input text that is being normalized
1591      * and return the first normalized character at that position.
1592      * <p>
1593      * <b>Note:</b> This method sets the position in the <em>input</em> text,
1594      * while {@link #next} and {@link #previous} iterate through characters
1595      * in the normalized <em>output</em>.  This means that there is not
1596      * necessarily a one-to-one correspondence between characters returned
1597      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1598      * returned from <tt>setIndex</tt> and {@link #getIndex}.
1599      * <p>
1600      * @param index the desired index in the input text.
1601      *
1602      * @return   the first normalized character that is the result of iterating
1603      *            forward starting at the given index.
1604      *
1605      * @throws IllegalArgumentException if the given index is less than
1606      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1607      * @deprecated ICU 3.2
1608      * @obsolete ICU 3.2
1609      * @hide original deprecated declaration
1610      */
1611     @Deprecated
1612      ///CLOVER:OFF
setIndex(int index)1613      public int setIndex(int index) {
1614          setIndexOnly(index);
1615          return current();
1616      }
1617      ///CLOVER:ON
1618     /**
1619      * Retrieve the index of the start of the input text. This is the begin
1620      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1621      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1622      * @deprecated ICU 2.2. Use startIndex() instead.
1623      * @return The codepoint as an int
1624      * @see #startIndex
1625      * @hide original deprecated declaration
1626      */
1627     @Deprecated
getBeginIndex()1628     public int getBeginIndex() {
1629         return 0;
1630     }
1631 
1632     /**
1633      * Retrieve the index of the end of the input text.  This is the end index
1634      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1635      * over which this <tt>Normalizer</tt> is iterating
1636      * @deprecated ICU 2.2. Use endIndex() instead.
1637      * @return The codepoint as an int
1638      * @see #endIndex
1639      * @hide original deprecated declaration
1640      */
1641     @Deprecated
getEndIndex()1642     public int getEndIndex() {
1643         return endIndex();
1644     }
1645     /**
1646      * Return the first character in the normalized text.  This resets
1647      * the <tt>Normalizer's</tt> position to the beginning of the text.
1648      * @return The codepoint as an int
1649      * @deprecated ICU 56
1650      * @hide original deprecated declaration
1651      */
1652     @Deprecated
first()1653     public int first() {
1654         reset();
1655         return next();
1656     }
1657 
1658     /**
1659      * Return the last character in the normalized text.  This resets
1660      * the <tt>Normalizer's</tt> position to be just before the
1661      * the input text corresponding to that normalized character.
1662      * @return The codepoint as an int
1663      * @deprecated ICU 56
1664      * @hide original deprecated declaration
1665      */
1666     @Deprecated
last()1667     public int last() {
1668         text.setToLimit();
1669         currentIndex=nextIndex=text.getIndex();
1670         clearBuffer();
1671         return previous();
1672     }
1673 
1674     /**
1675      * Retrieve the current iteration position in the input text that is
1676      * being normalized.  This method is useful in applications such as
1677      * searching, where you need to be able to determine the position in
1678      * the input text that corresponds to a given normalized output character.
1679      * <p>
1680      * <b>Note:</b> This method sets the position in the <em>input</em>, while
1681      * {@link #next} and {@link #previous} iterate through characters in the
1682      * <em>output</em>.  This means that there is not necessarily a one-to-one
1683      * correspondence between characters returned by <tt>next</tt> and
1684      * <tt>previous</tt> and the indices passed to and returned from
1685      * <tt>setIndex</tt> and {@link #getIndex}.
1686      * @return The current iteration position
1687      * @deprecated ICU 56
1688      * @hide original deprecated declaration
1689      */
1690     @Deprecated
getIndex()1691     public int getIndex() {
1692         if(bufferPos<buffer.length()) {
1693             return currentIndex;
1694         } else {
1695             return nextIndex;
1696         }
1697     }
1698 
1699     /**
1700      * Retrieve the index of the start of the input text. This is the begin
1701      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1702      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1703      * @return The current iteration position
1704      * @deprecated ICU 56
1705      * @hide original deprecated declaration
1706      */
1707     @Deprecated
startIndex()1708     public int startIndex() {
1709         return 0;
1710     }
1711 
1712     /**
1713      * Retrieve the index of the end of the input text.  This is the end index
1714      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1715      * over which this <tt>Normalizer</tt> is iterating
1716      * @return The current iteration position
1717      * @deprecated ICU 56
1718      * @hide original deprecated declaration
1719      */
1720     @Deprecated
endIndex()1721     public int endIndex() {
1722         return text.getLength();
1723     }
1724 
1725     //-------------------------------------------------------------------------
1726     // Iterator attributes
1727     //-------------------------------------------------------------------------
1728     /**
1729      * Set the normalization mode for this object.
1730      * <p>
1731      * <b>Note:</b>If the normalization mode is changed while iterating
1732      * over a string, calls to {@link #next} and {@link #previous} may
1733      * return previously buffers characters in the old normalization mode
1734      * until the iteration is able to re-sync at the next base character.
1735      * It is safest to call {@link #setText setText()}, {@link #first},
1736      * {@link #last}, etc. after calling <tt>setMode</tt>.
1737      * <p>
1738      * @param newMode the new mode for this <tt>Normalizer</tt>.
1739      * The supported modes are:
1740      * <ul>
1741      *  <li>{@link #NFC}    - Unicode canonical decompositiion
1742      *                        followed by canonical composition.
1743      *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
1744      *                        follwed by canonical composition.
1745      *  <li>{@link #NFD}    - Unicode canonical decomposition
1746      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
1747      *  <li>{@link #NONE}   - Do nothing but return characters
1748      *                        from the underlying input text.
1749      * </ul>
1750      *
1751      * @see #getMode
1752      * @deprecated ICU 56
1753      * @hide original deprecated declaration
1754      */
1755     @Deprecated
setMode(Mode newMode)1756     public void setMode(Mode newMode) {
1757         mode = newMode;
1758         norm2 = mode.getNormalizer2(options);
1759     }
1760     /**
1761      * Return the basic operation performed by this <tt>Normalizer</tt>
1762      *
1763      * @see #setMode
1764      * @deprecated ICU 56
1765      * @hide original deprecated declaration
1766      */
1767     @Deprecated
getMode()1768     public Mode getMode() {
1769         return mode;
1770     }
1771     /**
1772      * Set options that affect this <tt>Normalizer</tt>'s operation.
1773      * Options do not change the basic composition or decomposition operation
1774      * that is being performed , but they control whether
1775      * certain optional portions of the operation are done.
1776      * Currently the only available option is:
1777      *
1778      * <ul>
1779      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1780      * </ul>
1781      *
1782      * @param   option  the option whose value is to be set.
1783      * @param   value   the new setting for the option.  Use <tt>true</tt> to
1784      *                  turn the option on and <tt>false</tt> to turn it off.
1785      *
1786      * @see #getOption
1787      * @deprecated ICU 56
1788      * @hide original deprecated declaration
1789      */
1790     @Deprecated
setOption(int option,boolean value)1791     public void setOption(int option,boolean value) {
1792         if (value) {
1793             options |= option;
1794         } else {
1795             options &= (~option);
1796         }
1797         norm2 = mode.getNormalizer2(options);
1798     }
1799 
1800     /**
1801      * Determine whether an option is turned on or off.
1802      * <p>
1803      * @see #setOption
1804      * @deprecated ICU 56
1805      * @hide original deprecated declaration
1806      */
1807     @Deprecated
getOption(int option)1808     public int getOption(int option) {
1809         if((options & option)!=0) {
1810             return 1 ;
1811         } else {
1812             return 0;
1813         }
1814     }
1815 
1816     /**
1817      * Gets the underlying text storage
1818      * @param fillIn the char buffer to fill the UTF-16 units.
1819      *         The length of the buffer should be equal to the length of the
1820      *         underlying text storage
1821      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1822      * @see   #getLength
1823      * @deprecated ICU 56
1824      * @hide original deprecated declaration
1825      */
1826     @Deprecated
getText(char[] fillIn)1827     public int getText(char[] fillIn) {
1828         return text.getText(fillIn);
1829     }
1830 
1831     /**
1832      * Gets the length of underlying text storage
1833      * @return the length
1834      * @deprecated ICU 56
1835      * @hide original deprecated declaration
1836      */
1837     @Deprecated
getLength()1838     public int getLength() {
1839         return text.getLength();
1840     }
1841 
1842     /**
1843      * Returns the text under iteration as a string
1844      * @return a copy of the text under iteration.
1845      * @deprecated ICU 56
1846      * @hide original deprecated declaration
1847      */
1848     @Deprecated
getText()1849     public String getText() {
1850         return text.getText();
1851     }
1852 
1853     /**
1854      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1855      * The iteration position is set to the beginning of the input text.
1856      * @param newText   The new string to be normalized.
1857      * @deprecated ICU 56
1858      * @hide original deprecated declaration
1859      */
1860     @Deprecated
setText(StringBuffer newText)1861     public void setText(StringBuffer newText) {
1862         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1863         if (newIter == null) {
1864             throw new IllegalStateException("Could not create a new UCharacterIterator");
1865         }
1866         text = newIter;
1867         reset();
1868     }
1869 
1870     /**
1871      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1872      * The iteration position is set to the beginning of the input text.
1873      * @param newText   The new string to be normalized.
1874      * @deprecated ICU 56
1875      * @hide original deprecated declaration
1876      */
1877     @Deprecated
setText(char[] newText)1878     public void setText(char[] newText) {
1879         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1880         if (newIter == null) {
1881             throw new IllegalStateException("Could not create a new UCharacterIterator");
1882         }
1883         text = newIter;
1884         reset();
1885     }
1886 
1887     /**
1888      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1889      * The iteration position is set to the beginning of the input text.
1890      * @param newText   The new string to be normalized.
1891      * @deprecated ICU 56
1892      * @hide original deprecated declaration
1893      */
1894     @Deprecated
setText(String newText)1895     public void setText(String newText) {
1896         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1897         if (newIter == null) {
1898             throw new IllegalStateException("Could not create a new UCharacterIterator");
1899         }
1900         text = newIter;
1901         reset();
1902     }
1903 
1904     /**
1905      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1906      * The iteration position is set to the beginning of the input text.
1907      * @param newText   The new string to be normalized.
1908      * @deprecated ICU 56
1909      * @hide original deprecated declaration
1910      */
1911     @Deprecated
setText(CharacterIterator newText)1912     public void setText(CharacterIterator newText) {
1913         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1914         if (newIter == null) {
1915             throw new IllegalStateException("Could not create a new UCharacterIterator");
1916         }
1917         text = newIter;
1918         reset();
1919     }
1920 
1921     /**
1922      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1923      * The iteration position is set to the beginning of the string.
1924      * @param newText   The new string to be normalized.
1925      * @deprecated ICU 56
1926      * @hide original deprecated declaration
1927      */
1928     @Deprecated
setText(UCharacterIterator newText)1929     public void setText(UCharacterIterator newText) {
1930         try{
1931             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1932             if (newIter == null) {
1933                 throw new IllegalStateException("Could not create a new UCharacterIterator");
1934             }
1935             text = newIter;
1936             reset();
1937         }catch(CloneNotSupportedException e) {
1938             throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
1939         }
1940     }
1941 
clearBuffer()1942     private void clearBuffer() {
1943         buffer.setLength(0);
1944         bufferPos=0;
1945     }
1946 
nextNormalize()1947     private boolean nextNormalize() {
1948         clearBuffer();
1949         currentIndex=nextIndex;
1950         text.setIndex(nextIndex);
1951         // Skip at least one character so we make progress.
1952         int c=text.nextCodePoint();
1953         if(c<0) {
1954             return false;
1955         }
1956         StringBuilder segment=new StringBuilder().appendCodePoint(c);
1957         while((c=text.nextCodePoint())>=0) {
1958             if(norm2.hasBoundaryBefore(c)) {
1959                 text.moveCodePointIndex(-1);
1960                 break;
1961             }
1962             segment.appendCodePoint(c);
1963         }
1964         nextIndex=text.getIndex();
1965         norm2.normalize(segment, buffer);
1966         return buffer.length()!=0;
1967     }
1968 
previousNormalize()1969     private boolean previousNormalize() {
1970         clearBuffer();
1971         nextIndex=currentIndex;
1972         text.setIndex(currentIndex);
1973         StringBuilder segment=new StringBuilder();
1974         int c;
1975         while((c=text.previousCodePoint())>=0) {
1976             if(c<=0xffff) {
1977                 segment.insert(0, (char)c);
1978             } else {
1979                 segment.insert(0, Character.toChars(c));
1980             }
1981             if(norm2.hasBoundaryBefore(c)) {
1982                 break;
1983             }
1984         }
1985         currentIndex=text.getIndex();
1986         norm2.normalize(segment, buffer);
1987         bufferPos=buffer.length();
1988         return buffer.length()!=0;
1989     }
1990 
1991     /* compare canonically equivalent ------------------------------------------- */
1992 
1993     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
internalCompare(CharSequence s1, CharSequence s2, int options)1994     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
1995         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
1996         options|= COMPARE_EQUIV;
1997 
1998         /*
1999          * UAX #21 Case Mappings, as fixed for Unicode version 4
2000          * (see Jitterbug 2021), defines a canonical caseless match as
2001          *
2002          * A string X is a canonical caseless match
2003          * for a string Y if and only if
2004          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
2005          *
2006          * For better performance, we check for FCD (or let the caller tell us that
2007          * both strings are in FCD) for the inner normalization.
2008          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
2009          * case-folding preserves the FCD-ness of a string.
2010          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
2011          * when there is a difference.
2012          *
2013          * Exception: When using the Turkic case-folding option, we do perform
2014          * full NFD first. This is because in the Turkic case precomposed characters
2015          * with 0049 capital I or 0069 small i fold differently whether they
2016          * are first decomposed or not, so an FCD check - a check only for
2017          * canonical order - is not sufficient.
2018          */
2019         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
2020             Normalizer2 n2;
2021             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
2022                 n2=NFD.getNormalizer2(normOptions);
2023             } else {
2024                 n2=FCD.getNormalizer2(normOptions);
2025             }
2026 
2027             // check if s1 and/or s2 fulfill the FCD conditions
2028             int spanQCYes1=n2.spanQuickCheckYes(s1);
2029             int spanQCYes2=n2.spanQuickCheckYes(s2);
2030 
2031             /*
2032              * ICU 2.4 had a further optimization:
2033              * If both strings were not in FCD, then they were both NFD'ed,
2034              * and the COMPARE_EQUIV option was turned off.
2035              * It is not entirely clear that this is valid with the current
2036              * definition of the canonical caseless match.
2037              * Therefore, ICU 2.6 removes that optimization.
2038              */
2039 
2040             if(spanQCYes1<s1.length()) {
2041                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
2042                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
2043             }
2044             if(spanQCYes2<s2.length()) {
2045                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
2046                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
2047             }
2048         }
2049 
2050         return cmpEquivFold(s1, s2, options);
2051     }
2052 
2053     /*
2054      * Compare two strings for canonical equivalence.
2055      * Further options include case-insensitive comparison and
2056      * code point order (as opposed to code unit order).
2057      *
2058      * In this function, canonical equivalence is optional as well.
2059      * If canonical equivalence is tested, then both strings must fulfill
2060      * the FCD check.
2061      *
2062      * Semantically, this is equivalent to
2063      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
2064      * where code point order, NFD and foldCase are all optional.
2065      *
2066      * String comparisons almost always yield results before processing both strings
2067      * completely.
2068      * They are generally more efficient working incrementally instead of
2069      * performing the sub-processing (strlen, normalization, case-folding)
2070      * on the entire strings first.
2071      *
     * It is also unnecessary to normalize identical characters.
2073      *
2074      * This function works in principle as follows:
2075      *
2076      * loop {
2077      *   get one code unit c1 from s1 (-1 if end of source)
2078      *   get one code unit c2 from s2 (-1 if end of source)
2079      *
2080      *   if(either string finished) {
2081      *     return result;
2082      *   }
2083      *   if(c1==c2) {
2084      *     continue;
2085      *   }
2086      *
2087      *   // c1!=c2
2088      *   try to decompose/case-fold c1/c2, and continue if one does;
2089      *
2090      *   // still c1!=c2 and neither decomposes/case-folds, return result
2091      *   return c1-c2;
2092      * }
2093      *
2094      * When a character decomposes, then the pointer for that source changes to
2095      * the decomposition, pushing the previous pointer onto a stack.
2096      * When the end of the decomposition is reached, then the code unit reader
2097      * pops the previous source from the stack.
2098      * (Same for case-folding.)
2099      *
2100      * This is complicated further by operating on variable-width UTF-16.
2101      * The top part of the loop works on code units, while lookups for decomposition
2102      * and case-folding need code points.
2103      * Code points are assembled after the equality/end-of-source part.
2104      * The source pointer is only advanced beyond all code units when the code point
2105      * actually decomposes/case-folds.
2106      *
2107      * If we were on a trail surrogate unit when assembling a code point,
2108      * and the code point decomposes/case-folds, then the decomposition/folding
2109      * result must be compared with the part of the other string that corresponds to
2110      * this string's lead surrogate.
2111      * Since we only assemble a code point when hitting a trail unit when the
2112      * preceding lead units were identical, we back up the other string by one unit
2113      * in such a case.
2114      *
2115      * The optional code point order comparison at the end works with
2116      * the same fix-up as the other code point order comparison functions.
2117      * See ustring.c and the comment near the end of this function.
2118      *
2119      * Assumption: A decomposition or case-folding result string never contains
2120      * a single surrogate. This is a safe assumption in the Unicode Standard.
2121      * Therefore, we do not need to check for surrogate pairs across
2122      * decomposition/case-folding boundaries.
2123      *
2124      * Further assumptions (see the verifications in tstnorm.cpp):
2125      * The API function checks for FCD first, while the core function
2126      * first case-folds and then decomposes. This requires that case-folding does not
2127      * un-FCD any strings.
2128      *
2129      * The API function may also NFD the input and turn off decomposition.
2130      * This requires that case-folding does not un-NFD strings either.
2131      *
2132      * TODO If any of the above two assumptions is violated,
2133      * then this entire code must be re-thought.
2134      * If this happens, then a simple solution is to case-fold both strings up front
2135      * and to turn off UNORM_INPUT_IS_FCD.
2136      * We already do this when not both strings are in FCD because makeFCD
2137      * would be a partial NFD before the case folding, which does not work.
2138      * Note that all of this is only a problem when case-folding _and_
2139      * canonical equivalence come together.
2140      * (Comments in unorm_compare() are more up to date than this TODO.)
2141      */
2142 
2143     /* stack element for previous-level source/decomposition pointers */
2144     private static final class CmpEquivLevel {
2145         CharSequence cs;
2146         int s;
2147     };
createCmpEquivLevelStack()2148     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2149         return new CmpEquivLevel[] {
2150             new CmpEquivLevel(), new CmpEquivLevel()
2151         };
2152     }
2153 
2154     /**
2155      * Internal option for unorm_cmpEquivFold() for decomposing.
2156      * If not set, just do strcasecmp().
          * <p>
          * In cmpEquivFold() this bit gates both fetching the NFC implementation
          * and looking up decompositions for code points that differ; when it is
          * clear, no decomposition is attempted.
2157      */
2158     private static final int COMPARE_EQUIV=0x80000;
2159 
2160     /*
          * Internal function; package visibility for use by UTF16.StringComparator.
          *
          * Compares cs1 with cs2 one UTF-16 code unit at a time. Equal code units are
          * skipped. When two code units differ, the code point around each of them is
          * assembled and, depending on the options, replaced by
          *   - its case folding (COMPARE_IGNORE_CASE; only while reading the original
          *     string, i.e. at level 0), or
          *   - its decomposition (COMPARE_EQUIV; only while the level is below 2).
          * The comparison then continues inside the replacement text and pops back to
          * the saved position once the replacement is exhausted.
          *
          * Returns 0 if both strings end at the same time, -1 if string 1 ends first,
          * 1 if string 2 ends first, and otherwise c1-c2 for the first pair of code units
          * that differ and cannot be folded/decomposed any further (after the surrogate
          * fix-up when COMPARE_CODE_POINT_ORDER is set).
          *
          * No argument checking is performed.
          */
cmpEquivFold(CharSequence cs1, CharSequence cs2, int options)2161     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2162         Normalizer2Impl nfcImpl;
2163         UCaseProps csp;
2164 
2165         /* current-level start/limit - s1/s2 as current */
2166         int s1, s2, limit1, limit2;
2167 
2168         /* decomposition and case folding variables */
2169         int length;
2170 
2171         /* stacks of previous-level text/position; allocated lazily on the first push */
2172         CmpEquivLevel[] stack1=null, stack2=null;
2173 
2174         /* decomposition strings returned by nfcImpl.getDecomposition() */
2175         String decomp1, decomp2;
2176 
2177         /* case folding buffers, only use current-level start/limit */
2178         StringBuilder fold1, fold2;
2179 
2180         /* track which is the current level per string */
2181         int level1, level2;
2182 
2183         /* current code units, and code points for lookups */
2184         int c1, c2, cp1, cp2;
2185 
2186         /* no argument error checking because this itself is not an API */
2187 
2188         /*
2189          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2190          * otherwise this function must behave exactly as uprv_strCompare()
2191          * not checking for that here makes testing this function easier
2192          */
2193 
2194         /* normalization/properties data loaded? */
2195         if((options&COMPARE_EQUIV)!=0) {
2196             nfcImpl=Norm2AllModes.getNFCInstance().impl;
2197         } else {
2198             nfcImpl=null;
2199         }
2200         if((options&COMPARE_IGNORE_CASE)!=0) {
2201             csp=UCaseProps.INSTANCE;
2202             fold1=new StringBuilder();
2203             fold2=new StringBuilder();
2204         } else {
2205             csp=null;
2206             fold1=fold2=null;
2207         }
2208 
2209         /* initialize */
2210         s1=0;
2211         limit1=cs1.length();
2212         s2=0;
2213         limit2=cs2.length();
2214 
2215         level1=level2=0;
2216         c1=c2=-1;
2217 
2218         /* comparison loop */
2219         for(;;) {
2220             /*
2221              * here a code unit value of -1 means "get another code unit"
2222              * below it will mean "this source is finished"
2223              */
2224 
2225             if(c1<0) {
2226                 /* get next code unit from string 1, post-increment */
2227                 for(;;) {
2228                     if(s1==limit1) {
2229                         if(level1==0) {
2230                             c1=-1;
2231                             break;
2232                         }
2233                     } else {
2234                         c1=cs1.charAt(s1++);
2235                         break;
2236                     }
2237 
2238                     /* reached end of level buffer, pop one level (skipping levels marked empty with cs==null) */
2239                     do {
2240                         --level1;
2241                         cs1=stack1[level1].cs;
2242                     } while(cs1==null);
2243                     s1=stack1[level1].s;
2244                     limit1=cs1.length();
2245                 }
2246             }
2247 
2248             if(c2<0) {
2249                 /* get next code unit from string 2, post-increment */
2250                 for(;;) {
2251                     if(s2==limit2) {
2252                         if(level2==0) {
2253                             c2=-1;
2254                             break;
2255                         }
2256                     } else {
2257                         c2=cs2.charAt(s2++);
2258                         break;
2259                     }
2260 
2261                     /* reached end of level buffer, pop one level (skipping levels marked empty with cs==null) */
2262                     do {
2263                         --level2;
2264                         cs2=stack2[level2].cs;
2265                     } while(cs2==null);
2266                     s2=stack2[level2].s;
2267                     limit2=cs2.length();
2268                 }
2269             }
2270 
2271             /*
2272              * compare c1 and c2
2273              * either variable c1, c2 is -1 only if the corresponding string is finished
2274              */
2275             if(c1==c2) {
2276                 if(c1<0) {
2277                     return 0;   /* c1==c2==-1 indicating end of strings */
2278                 }
2279                 c1=c2=-1;       /* make us fetch new code units */
2280                 continue;
2281             } else if(c1<0) {
2282                 return -1;      /* string 1 ends before string 2 */
2283             } else if(c2<0) {
2284                 return 1;       /* string 2 ends before string 1 */
2285             }
2286             /* c1!=c2 && c1>=0 && c2>=0 */
2287 
2288             /* get complete code points for c1, c2 for lookups if either is a surrogate */
2289             cp1=c1;
2290             if(UTF16.isSurrogate((char)c1)) {
2291                 char c;
2292 
2293                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2294                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2295                         /* advance ++s1; only below if cp1 decomposes/case-folds */
2296                         cp1=Character.toCodePoint((char)c1, c);
2297                     }
2298                 } else /* isTrail(c1) */ {
                         /* s1 was post-incremented past c1, so the preceding unit is at s1-2 */
2299                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2300                         cp1=Character.toCodePoint(c, (char)c1);
2301                     }
2302                 }
2303             }
2304 
2305             cp2=c2;
2306             if(UTF16.isSurrogate((char)c2)) {
2307                 char c;
2308 
2309                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2310                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2311                         /* advance ++s2; only below if cp2 decomposes/case-folds */
2312                         cp2=Character.toCodePoint((char)c2, c);
2313                     }
2314                 } else /* isTrail(c2) */ {
                         /* s2 was post-incremented past c2, so the preceding unit is at s2-2 */
2315                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2316                         cp2=Character.toCodePoint(c, (char)c2);
2317                     }
2318                 }
2319             }
2320 
2321             /*
2322              * go down one level for each string
2323              * continue with the main loop as soon as there is a real change
2324              */
2325 
                 /* case folding of string 1: only attempted on the original text (level 0) */
2326             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2327                 (length=csp.toFullFolding(cp1, fold1, options))>=0
2328             ) {
2329                 /* cp1 case-folds either to the code point "length" or to a string of "length" chars in fold1 */
2330                 if(UTF16.isSurrogate((char)c1)) {
2331                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2332                         /* advance beyond source surrogate pair if it case-folds */
2333                         ++s1;
2334                     } else /* isTrail(c1) */ {
2335                         /*
2336                          * we got a supplementary code point when hitting its trail surrogate,
2337                          * therefore the lead surrogate must have been the same as in the other string;
2338                          * compare this case folding result with the lead surrogate in the other string
2339                          * remember that this simulates bulk text replacement:
2340                          * the case folding result would replace the entire code point
2341                          */
2342                         --s2;
2343                         c2=cs2.charAt(s2-1);
2344                     }
2345                 }
2346 
2347                 /* push current level pointers */
2348                 if(stack1==null) {
2349                     stack1=createCmpEquivLevelStack();
2350                 }
2351                 stack1[0].cs=cs1;
2352                 stack1[0].s=s1;
2353                 ++level1;
2354 
2355                 /* copy the folding result to fold1[] */
2356                 /* Java: the buffer was probably not empty, remove the old contents */
2357                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
                         /* string result: keep only the last "length" chars of the buffer */
2358                     fold1.delete(0, fold1.length()-length);
2359                 } else {
                         /* single code point result: "length" is the code point itself */
2360                     fold1.setLength(0);
2361                     fold1.appendCodePoint(length);
2362                 }
2363 
2364                 /* set next level pointers to case folding */
2365                 cs1=fold1;
2366                 s1=0;
2367                 limit1=fold1.length();
2368 
2369                 /* get ready to read from case folding, continue with loop */
2370                 c1=-1;
2371                 continue;
2372             }
2373 
                 /* case folding of string 2: only attempted on the original text (level 0) */
2374             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2375                 (length=csp.toFullFolding(cp2, fold2, options))>=0
2376             ) {
2377                 /* cp2 case-folds either to the code point "length" or to a string of "length" chars in fold2 */
2378                 if(UTF16.isSurrogate((char)c2)) {
2379                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2380                         /* advance beyond source surrogate pair if it case-folds */
2381                         ++s2;
2382                     } else /* isTrail(c2) */ {
2383                         /*
2384                          * we got a supplementary code point when hitting its trail surrogate,
2385                          * therefore the lead surrogate must have been the same as in the other string;
2386                          * compare this case folding result with the lead surrogate in the other string
2387                          * remember that this simulates bulk text replacement:
2388                          * the case folding result would replace the entire code point
2389                          */
2390                         --s1;
2391                         c1=cs1.charAt(s1-1);
2392                     }
2393                 }
2394 
2395                 /* push current level pointers */
2396                 if(stack2==null) {
2397                     stack2=createCmpEquivLevelStack();
2398                 }
2399                 stack2[0].cs=cs2;
2400                 stack2[0].s=s2;
2401                 ++level2;
2402 
2403                 /* copy the folding result to fold2[] */
2404                 /* Java: the buffer was probably not empty, remove the old contents */
2405                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
                         /* string result: keep only the last "length" chars of the buffer */
2406                     fold2.delete(0, fold2.length()-length);
2407                 } else {
                         /* single code point result: "length" is the code point itself */
2408                     fold2.setLength(0);
2409                     fold2.appendCodePoint(length);
2410                 }
2411 
2412                 /* set next level pointers to case folding */
2413                 cs2=fold2;
2414                 s2=0;
2415                 limit2=fold2.length();
2416 
2417                 /* get ready to read from case folding, continue with loop */
2418                 c2=-1;
2419                 continue;
2420             }
2421 
                 /* decomposition of string 1: allowed on the original text or on a case folding (level<2) */
2422             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2423                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2424             ) {
2425                 /* cp1 decomposes into the string decomp1 */
2426                 if(UTF16.isSurrogate((char)c1)) {
2427                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2428                         /* advance beyond source surrogate pair if it decomposes */
2429                         ++s1;
2430                     } else /* isTrail(c1) */ {
2431                         /*
2432                          * we got a supplementary code point when hitting its trail surrogate,
2433                          * therefore the lead surrogate must have been the same as in the other string;
2434                          * compare this decomposition with the lead surrogate in the other string
2435                          * remember that this simulates bulk text replacement:
2436                          * the decomposition would replace the entire code point
2437                          */
2438                         --s2;
2439                         c2=cs2.charAt(s2-1);
2440                     }
2441                 }
2442 
2443                 /* push current level pointers */
2444                 if(stack1==null) {
2445                     stack1=createCmpEquivLevelStack();
2446                 }
2447                 stack1[level1].cs=cs1;
2448                 stack1[level1].s=s1;
2449                 ++level1;
2450 
2451                 /* set empty intermediate level if skipped */
2452                 if(level1<2) {
2453                     stack1[level1++].cs=null;
2454                 }
2455 
2456                 /* set next level pointers to decomposition */
2457                 cs1=decomp1;
2458                 s1=0;
2459                 limit1=decomp1.length();
2460 
2461                 /* get ready to read from decomposition, continue with loop */
2462                 c1=-1;
2463                 continue;
2464             }
2465 
                 /* decomposition of string 2: allowed on the original text or on a case folding (level<2) */
2466             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2467                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2468             ) {
2469                 /* cp2 decomposes into the string decomp2 */
2470                 if(UTF16.isSurrogate((char)c2)) {
2471                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2472                         /* advance beyond source surrogate pair if it decomposes */
2473                         ++s2;
2474                     } else /* isTrail(c2) */ {
2475                         /*
2476                          * we got a supplementary code point when hitting its trail surrogate,
2477                          * therefore the lead surrogate must have been the same as in the other string;
2478                          * compare this decomposition with the lead surrogate in the other string
2479                          * remember that this simulates bulk text replacement:
2480                          * the decomposition would replace the entire code point
2481                          */
2482                         --s1;
2483                         c1=cs1.charAt(s1-1);
2484                     }
2485                 }
2486 
2487                 /* push current level pointers */
2488                 if(stack2==null) {
2489                     stack2=createCmpEquivLevelStack();
2490                 }
2491                 stack2[level2].cs=cs2;
2492                 stack2[level2].s=s2;
2493                 ++level2;
2494 
2495                 /* set empty intermediate level if skipped */
2496                 if(level2<2) {
2497                     stack2[level2++].cs=null;
2498                 }
2499 
2500                 /* set next level pointers to decomposition */
2501                 cs2=decomp2;
2502                 s2=0;
2503                 limit2=decomp2.length();
2504 
2505                 /* get ready to read from decomposition, continue with loop */
2506                 c2=-1;
2507                 continue;
2508             }
2509 
2510             /*
2511              * no decomposition/case folding, max level for both sides:
2512              * return difference result
2513              *
2514              * code point order comparison must not just return cp1-cp2
2515              * because when single surrogates are present then the surrogate pairs
2516              * that formed cp1 and cp2 may be from different string indexes
2517              *
2518              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2519              * c1=d800 cp1=10001 c2=dc00 cp2=10000
2520              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2521              *
2522              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2523              * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
2524              * so we have slightly different pointer/start/limit comparisons here
2525              */
2526 
2527             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2528                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
2529                 if(
2530                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2531                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2532                 ) {
2533                     /* part of a surrogate pair, leave >=d800 */
2534                 } else {
2535                     /* BMP code point - may be surrogate code point - make <d800 */
2536                     c1-=0x2800;
2537                 }
2538 
2539                 if(
2540                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2541                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2542                 ) {
2543                     /* part of a surrogate pair, leave >=d800 */
2544                 } else {
2545                     /* BMP code point - may be surrogate code point - make <d800 */
2546                     c2-=0x2800;
2547                 }
2548             }
2549 
2550             return c1-c2;
2551         }
2552     }
2553 
2554     /**
2555      * An Appendable that writes into a char array with a capacity that may be
2556      * less than array.length.
2557      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2558      * <p>
2559      * An overflow is only reported at the end, for the old Normalizer API functions that write
2560      * to char arrays.
2561      */
2562     private static final class CharsAppendable implements Appendable {
CharsAppendable(char[] dest, int destStart, int destLimit)2563         public CharsAppendable(char[] dest, int destStart, int destLimit) {
2564             chars=dest;
2565             start=offset=destStart;
2566             limit=destLimit;
2567         }
length()2568         public int length() {
2569             int len=offset-start;
2570             if(offset<=limit) {
2571                 return len;
2572             } else {
2573                 throw new IndexOutOfBoundsException(Integer.toString(len));
2574             }
2575         }
2576         @Override
append(char c)2577         public Appendable append(char c) {
2578             if(offset<limit) {
2579                 chars[offset]=c;
2580             }
2581             ++offset;
2582             return this;
2583         }
2584         @Override
append(CharSequence s)2585         public Appendable append(CharSequence s) {
2586             return append(s, 0, s.length());
2587         }
2588         @Override
append(CharSequence s, int sStart, int sLimit)2589         public Appendable append(CharSequence s, int sStart, int sLimit) {
2590             int len=sLimit-sStart;
2591             if(len<=(limit-offset)) {
2592                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
2593                     chars[offset++]=s.charAt(sStart++);
2594                 }
2595             } else {
2596                 offset+=len;
2597             }
2598             return this;
2599         }
2600 
2601         private final char[] chars;
2602         private final int start, limit;
2603         private int offset;
2604     }
2605 }
2606