• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  * Copyright (C) 2000-2016, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8 import java.nio.CharBuffer;
9 import java.text.CharacterIterator;
10 
11 import com.ibm.icu.impl.Norm2AllModes;
12 import com.ibm.icu.impl.Normalizer2Impl;
13 import com.ibm.icu.impl.UCaseProps;
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.util.ICUCloneNotSupportedException;
16 
17 /**
18  * Old Unicode normalization API.
19  *
20  * <p>This API has been replaced by the {@link Normalizer2} class and is only available
21  * for backward compatibility. This class simply delegates to the Normalizer2 class.
22  * There are two exceptions: The new API does not provide a replacement for
23  * <code>QuickCheckResult</code> and <code>compare()</code>.
24  *
25  * <p><code>normalize</code> transforms Unicode text into an equivalent composed or
26  * decomposed form, allowing for easier sorting and searching of text.
27  * <code>normalize</code> supports the standard normalization forms described in
28  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
29  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
30  *
31  * <p>Characters with accents or other adornments can be encoded in
32  * several different ways in Unicode.  For example, take the character A-acute.
33  * In Unicode, this can be encoded as a single character (the
34  * "composed" form):
35  *
36  * <pre>
37  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
38  * </pre>
39  *
40  * or as two separate characters (the "decomposed" form):
41  *
42  * <pre>
43  *      0041    LATIN CAPITAL LETTER A
44  *      0301    COMBINING ACUTE ACCENT
45  * </pre>
46  *
47  * <p>To a user of your program, however, both of these sequences should be
48  * treated as the same "user-level" character "A with acute accent".  When you
49  * are searching or comparing text, you must ensure that these two sequences are
50  * treated equivalently.  In addition, you must handle characters with more than
51  * one accent.  Sometimes the order of a character's combining accents is
52  * significant, while in other cases accent sequences in different orders are
53  * really equivalent.
54  *
55  * <p>Similarly, the string "ffi" can be encoded as three separate letters:
56  *
57  * <pre>
58  *      0066    LATIN SMALL LETTER F
59  *      0066    LATIN SMALL LETTER F
60  *      0069    LATIN SMALL LETTER I
61  * </pre>
62  *
63  * or as the single character
64  *
65  * <pre>
66  *      FB03    LATIN SMALL LIGATURE FFI
67  * </pre>
68  *
69  * <p>The ffi ligature is not a distinct semantic character, and strictly speaking
70  * it shouldn't be in Unicode at all, but it was included for compatibility
71  * with existing character sets that already provided it.  The Unicode standard
72  * identifies such characters by giving them "compatibility" decompositions
73  * into the corresponding semantic characters.  When sorting and searching, you
74  * will often want to use these mappings.
75  *
76  * <p><code>normalize</code> helps solve these problems by transforming text into
77  * the canonical composed and decomposed forms as shown in the first example
78  * above. In addition, you can have it perform compatibility decompositions so
79  * that you can treat compatibility characters the same as their equivalents.
80  * Finally, <code>normalize</code> rearranges accents into the proper canonical
81  * order, so that you do not have to worry about accent rearrangement on your
82  * own.
83  *
84  * <p>Form FCD, "Fast C or D", is also designed for collation.
85  * It allows working on strings that are not necessarily normalized
86  * with an algorithm (like in collation) that works under "canonical closure",
87  * i.e., it treats precomposed characters and their decomposed equivalents the
88  * same.
89  *
90  * <p>It is not a normalization form because it does not provide for uniqueness of
91  * representation. Multiple strings may be canonically equivalent (their NFDs
92  * are identical) and may all conform to FCD without being identical themselves.
93  *
94  * <p>The form is defined such that the "raw decomposition", the recursive
95  * canonical decomposition of each character, results in a string that is
96  * canonically ordered. This means that precomposed characters are allowed for
97  * as long as their decompositions do not need canonical reordering.
98  *
99  * <p>Its advantage for a process like collation is that all NFD and most NFC texts
100  * - and many unnormalized texts - already conform to FCD and do not need to be
101  * normalized (NFD) for such a process. The FCD quick check will return YES for
102  * most strings in practice.
103  *
104  * <p>normalize(FCD) may be implemented with NFD.
105  *
106  * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
107  * http://www.unicode.org/notes/tn5/#FCD
108  *
109  * <p>ICU collation performs either NFD or FCD normalization automatically if
110  * normalization is turned on for the collator object. Beyond collation and
111  * string search, normalized strings may be useful for string equivalence
112  * comparisons, transliteration/transcription, unique representations, etc.
113  *
114  * <p>The W3C generally recommends exchanging text in NFC.
115  * Note also that most legacy character encodings use only precomposed forms and
116  * often do not encode any combining marks by themselves. For conversion to such
117  * character encodings the Unicode text needs to be normalized to NFC.
118  * For more usage examples, see the Unicode Standard Annex.
119  *
120  * <p>Note: The Normalizer class also provides API for iterative normalization.
121  * While the setIndex() and getIndex() refer to indices in the
122  * underlying Unicode input text, the next() and previous() methods
123  * iterate through characters in the normalized output.
124  * This means that there is not necessarily a one-to-one correspondence
125  * between characters returned by next() and previous() and the indices
126  * passed to and returned from setIndex() and getIndex().
127  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
128  *
129  * @stable ICU 2.8
130  */
131 public final class Normalizer implements Cloneable {
132     // The input text and our position in it
133     private UCharacterIterator  text;
134     private Normalizer2         norm2;
135     private Mode                mode;
136     private int                 options;
137 
138     // The normalization buffer is the result of normalization
139     // of the source in [currentIndex..nextIndex[ .
140     private int                 currentIndex;
141     private int                 nextIndex;
142 
143     // A buffer for holding intermediate results
144     private StringBuilder       buffer;
145     private int                 bufferPos;
146 
147     // Helper classes to defer loading of normalization data.
148     private static final class ModeImpl {
ModeImpl(Normalizer2 n2)149         private ModeImpl(Normalizer2 n2) {
150             normalizer2 = n2;
151         }
152         private final Normalizer2 normalizer2;
153     }
154     private static final class NFDModeImpl {
155         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
156     }
157     private static final class NFKDModeImpl {
158         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
159     }
160     private static final class NFCModeImpl {
161         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
162     }
163     private static final class NFKCModeImpl {
164         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
165     }
166     private static final class FCDModeImpl {
167         private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
168     }
169 
170     private static final class Unicode32 {
171         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
172     }
173     private static final class NFD32ModeImpl {
174         private static final ModeImpl INSTANCE =
175             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
176                                                  Unicode32.INSTANCE));
177     }
178     private static final class NFKD32ModeImpl {
179         private static final ModeImpl INSTANCE =
180             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
181                                                  Unicode32.INSTANCE));
182     }
183     private static final class NFC32ModeImpl {
184         private static final ModeImpl INSTANCE =
185             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
186                                                  Unicode32.INSTANCE));
187     }
188     private static final class NFKC32ModeImpl {
189         private static final ModeImpl INSTANCE =
190             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
191                                                  Unicode32.INSTANCE));
192     }
193     private static final class FCD32ModeImpl {
194         private static final ModeImpl INSTANCE =
195             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
196                                                  Unicode32.INSTANCE));
197     }
198 
199     /**
200      * Options bit set value to select Unicode 3.2 normalization
201      * (except NormalizationCorrections).
202      * At most one Unicode version can be selected at a time.
203      *
204      * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead.
205      */
206     @Deprecated
207     public static final int UNICODE_3_2=0x20;
208 
209     /**
210      * Constant indicating that the end of the iteration has been reached.
211      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
212      *
213      * @deprecated ICU 56
214      */
215     @Deprecated
216     public static final int DONE = UCharacterIterator.DONE;
217 
218     /**
219      * Constants for normalization modes.
220      * <p>
221      * The Mode class is not intended for public subclassing.
222      * Only the Mode constants provided by the Normalizer class should be used,
223      * and any fields or methods should not be called or overridden by users.
224      *
225      * @deprecated ICU 56 Use {@link Normalizer2} instead.
226      */
227     @Deprecated
228     public static abstract class Mode {
229         /**
230          * Sole constructor
231          * @internal
232          * @deprecated This API is ICU internal only.
233          */
234         @Deprecated
Mode()235         protected Mode() {
236         }
237 
238         /**
239          * @internal
240          * @deprecated This API is ICU internal only.
241          */
242         @Deprecated
getNormalizer2(int options)243         protected abstract Normalizer2 getNormalizer2(int options);
244     }
245 
246     private static final class NONEMode extends Mode {
getNormalizer2(int options)247         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
248     }
249     private static final class NFDMode extends Mode {
getNormalizer2(int options)250         protected Normalizer2 getNormalizer2(int options) {
251             return (options&UNICODE_3_2) != 0 ?
252                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
253         }
254     }
255     private static final class NFKDMode extends Mode {
getNormalizer2(int options)256         protected Normalizer2 getNormalizer2(int options) {
257             return (options&UNICODE_3_2) != 0 ?
258                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
259         }
260     }
261     private static final class NFCMode extends Mode {
getNormalizer2(int options)262         protected Normalizer2 getNormalizer2(int options) {
263             return (options&UNICODE_3_2) != 0 ?
264                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
265         }
266     }
267     private static final class NFKCMode extends Mode {
getNormalizer2(int options)268         protected Normalizer2 getNormalizer2(int options) {
269             return (options&UNICODE_3_2) != 0 ?
270                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
271         }
272     }
273     private static final class FCDMode extends Mode {
getNormalizer2(int options)274         protected Normalizer2 getNormalizer2(int options) {
275             return (options&UNICODE_3_2) != 0 ?
276                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
277         }
278     }
279 
280     /**
281      * No decomposition/composition.
282      *
283      * @deprecated ICU 56 Use {@link Normalizer2} instead.
284      */
285     @Deprecated
286     public static final Mode NONE = new NONEMode();
287 
288     /**
289      * Canonical decomposition.
290      *
291      * @deprecated ICU 56 Use {@link Normalizer2} instead.
292      */
293     @Deprecated
294     public static final Mode NFD = new NFDMode();
295 
296     /**
297      * Compatibility decomposition.
298      *
299      * @deprecated ICU 56 Use {@link Normalizer2} instead.
300      */
301     @Deprecated
302     public static final Mode NFKD = new NFKDMode();
303 
304     /**
305      * Canonical decomposition followed by canonical composition.
306      *
307      * @deprecated ICU 56 Use {@link Normalizer2} instead.
308      */
309     @Deprecated
310     public static final Mode NFC = new NFCMode();
311 
312     /**
313      * Default normalization.
314      *
315      * @deprecated ICU 56 Use {@link Normalizer2} instead.
316      */
317     @Deprecated
318     public static final Mode DEFAULT = NFC;
319 
320     /**
321      * Compatibility decomposition followed by canonical composition.
322      *
323      * @deprecated ICU 56 Use {@link Normalizer2} instead.
324      */
325     @Deprecated
326     public static final Mode NFKC =new NFKCMode();
327 
328     /**
329      * "Fast C or D" form.
330      *
331      * @deprecated ICU 56 Use {@link Normalizer2} instead.
332      */
333     @Deprecated
334     public static final Mode FCD = new FCDMode();
335 
336     /**
337      * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
338      * and the static {@link #normalize normalize} method.  This value tells
339      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
340      * from the underlying String or CharacterIterator.  If you have code which
341      * requires raw text at some times and normalized text at others, you can
342      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
343      * than having a separate code path that bypasses <tt>Normalizer</tt>
344      * altogether.
345      * <p>
346      * @see #setMode
347      * @deprecated ICU 2.8. Use Normalizer.NONE
348      * @see #NONE
349      */
350     @Deprecated
351     public static final Mode NO_OP = NONE;
352 
353     /**
354      * Canonical decomposition followed by canonical composition.  Used with the
355      * {@link com.ibm.icu.text.Normalizer constructors} and the static
356      * {@link #normalize normalize} method to determine the operation to be
357      * performed.
358      * <p>
359      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
360      * off, this operation produces output that is in
361      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
362      * Form</a>
363      * <b>C</b>.
364      * <p>
365      * @see #setMode
366      * @deprecated ICU 2.8. Use Normalizer.NFC
367      * @see #NFC
368      */
369     @Deprecated
370     public static final Mode COMPOSE = NFC;
371 
372     /**
373      * Compatibility decomposition followed by canonical composition.
374      * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
375      * {@link #normalize normalize} method to determine the operation to be
376      * performed.
377      * <p>
378      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
379      * off, this operation produces output that is in
380      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
381      * Form</a>
382      * <b>KC</b>.
383      * <p>
384      * @see #setMode
385      * @deprecated ICU 2.8. Use Normalizer.NFKC
386      * @see #NFKC
387      */
388     @Deprecated
389     public static final Mode COMPOSE_COMPAT = NFKC;
390 
391     /**
392      * Canonical decomposition.  This value is passed to the
393      * {@link com.ibm.icu.text.Normalizer constructors} and the static
394      * {@link #normalize normalize}
395      * method to determine the operation to be performed.
396      * <p>
397      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
398      * off, this operation produces output that is in
399      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
400      * Form</a>
401      * <b>D</b>.
402      * <p>
403      * @see #setMode
404      * @deprecated ICU 2.8. Use Normalizer.NFD
405      * @see #NFD
406      */
407     @Deprecated
408     public static final Mode DECOMP = NFD;
409 
410     /**
411      * Compatibility decomposition.  This value is passed to the
412      * {@link com.ibm.icu.text.Normalizer constructors} and the static
413      * {@link #normalize normalize}
414      * method to determine the operation to be performed.
415      * <p>
416      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
417      * off, this operation produces output that is in
418      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
419      * Form</a>
420      * <b>KD</b>.
421      * <p>
422      * @see #setMode
423      * @deprecated ICU 2.8. Use Normalizer.NFKD
424      * @see #NFKD
425      */
426     @Deprecated
427     public static final Mode DECOMP_COMPAT = NFKD;
428 
429     /**
430      * Option to disable Hangul/Jamo composition and decomposition.
431      * This option applies to Korean text,
432      * which can be represented either in the Jamo alphabet or in Hangul
433      * characters, which are really just two or three Jamo combined
434      * into one visual glyph.  Since Jamo takes up more storage space than
435      * Hangul, applications that process only Hangul text may wish to turn
436      * this option on when decomposing text.
437      * <p>
438      * The Unicode standard treats Hangul to Jamo conversion as a
439      * canonical decomposition, so this option must be turned <b>off</b> if you
440      * wish to transform strings into one of the standard
441      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
442      * Unicode Normalization Forms</a>.
443      * <p>
444      * @see #setOption
445      * @deprecated ICU 2.8. This option is no longer supported.
446      */
447     @Deprecated
448     public static final int IGNORE_HANGUL = 0x0001;
449 
450     /**
451      * Result values for quickCheck().
452      * For details see Unicode Technical Report 15.
453      * @stable ICU 2.8
454      */
455     public static final class QuickCheckResult{
456         //private int resultValue;
QuickCheckResult(int value)457         private QuickCheckResult(int value) {
458             //resultValue=value;
459         }
460     }
461     /**
462      * Indicates that string is not in the normalized format
463      * @stable ICU 2.8
464      */
465     public static final QuickCheckResult NO = new QuickCheckResult(0);
466 
467     /**
468      * Indicates that string is in the normalized format
469      * @stable ICU 2.8
470      */
471     public static final QuickCheckResult YES = new QuickCheckResult(1);
472 
473     /**
474      * Indicates it cannot be determined if string is in the normalized
475      * format without further thorough checks.
476      * @stable ICU 2.8
477      */
478     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
479 
480     /**
481      * Option bit for compare:
482      * Case sensitively compare the strings
483      * @stable ICU 2.8
484      */
485     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
486 
487     /**
488      * Option bit for compare:
489      * Both input strings are assumed to fulfill FCD conditions.
490      * @stable ICU 2.8
491      */
492     public static final int INPUT_IS_FCD    =      0x20000;
493 
494     /**
495      * Option bit for compare:
496      * Perform case-insensitive comparison.
497      * @stable ICU 2.8
498      */
499     public static final int COMPARE_IGNORE_CASE  =     0x10000;
500 
501     /**
502      * Option bit for compare:
503      * Compare strings in code point order instead of code unit order.
504      * @stable ICU 2.8
505      */
506     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
507 
508     /**
509      * Option value for case folding:
510      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
511      * and dotless i appropriately for Turkic languages (tr, az).
512      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
513      * @stable ICU 2.8
514      */
515     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
516 
517     /**
518      * Lowest-order bit number of compare() options bits corresponding to
519      * normalization options bits.
520      *
521      * The options parameter for compare() uses most bits for
522      * itself and for various comparison and folding flags.
523      * The most significant bits, however, are shifted down and passed on
524      * to the normalization implementation.
525      * (That is, from compare(..., options, ...),
526      * options&gt;&gt;COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
527      * internal normalization functions.)
528      *
529      * @see #compare
530      * @deprecated ICU 56 Use {@link Normalizer2} instead.
531      */
532     @Deprecated
533     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
534 
535     //-------------------------------------------------------------------------
536     // Iterator constructors
537     //-------------------------------------------------------------------------
538 
539     /**
540      * Creates a new <tt>Normalizer</tt> object for iterating over the
541      * normalized form of a given string.
542      * <p>
543      * The <tt>options</tt> parameter specifies which optional
544      * <tt>Normalizer</tt> features are to be enabled for this object.
545      * <p>
546      * @param str  The string to be normalized.  The normalization
547      *              will start at the beginning of the string.
548      *
549      * @param mode The normalization mode.
550      *
551      * @param opt Any optional features to be enabled.
552      *            Currently the only available option is {@link #UNICODE_3_2}.
553      *            If you want the default behavior corresponding to one of the
554      *            standard Unicode Normalization Forms, use 0 for this argument.
555      * @deprecated ICU 56 Use {@link Normalizer2} instead.
556      */
557     @Deprecated
Normalizer(String str, Mode mode, int opt)558     public Normalizer(String str, Mode mode, int opt) {
559         this.text = UCharacterIterator.getInstance(str);
560         this.mode = mode;
561         this.options=opt;
562         norm2 = mode.getNormalizer2(opt);
563         buffer = new StringBuilder();
564     }
565 
566     /**
567      * Creates a new <tt>Normalizer</tt> object for iterating over the
568      * normalized form of the given text.
569      * <p>
570      * @param iter  The input text to be normalized.  The normalization
571      *              will start at the beginning of the string.
572      *
573      * @param mode  The normalization mode.
574      *
575      * @param opt Any optional features to be enabled.
576      *            Currently the only available option is {@link #UNICODE_3_2}.
577      *            If you want the default behavior corresponding to one of the
578      *            standard Unicode Normalization Forms, use 0 for this argument.
579      * @deprecated ICU 56 Use {@link Normalizer2} instead.
580      */
581     @Deprecated
Normalizer(CharacterIterator iter, Mode mode, int opt)582     public Normalizer(CharacterIterator iter, Mode mode, int opt) {
583         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
584         this.mode = mode;
585         this.options = opt;
586         norm2 = mode.getNormalizer2(opt);
587         buffer = new StringBuilder();
588     }
589 
590     /**
591      * Creates a new <tt>Normalizer</tt> object for iterating over the
592      * normalized form of the given text.
593      * <p>
594      * @param iter  The input text to be normalized.  The normalization
595      *              will start at the beginning of the string.
596      *
597      * @param mode  The normalization mode.
598      * @param options The normalization options, ORed together (0 for no options).
599      * @deprecated ICU 56 Use {@link Normalizer2} instead.
600      */
601     @Deprecated
Normalizer(UCharacterIterator iter, Mode mode, int options)602     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
603         try {
604             this.text     = (UCharacterIterator)iter.clone();
605             this.mode     = mode;
606             this.options  = options;
607             norm2 = mode.getNormalizer2(options);
608             buffer = new StringBuilder();
609         } catch (CloneNotSupportedException e) {
610             throw new ICUCloneNotSupportedException(e);
611         }
612     }
613 
614     /**
615      * Clones this <tt>Normalizer</tt> object.  All properties of this
616      * object are duplicated in the new object, including the cloning of any
617      * {@link CharacterIterator} that was passed in to the constructor
618      * or to {@link #setText(CharacterIterator) setText}.
619      * However, the text storage underlying
620      * the <tt>CharacterIterator</tt> is not duplicated unless the
621      * iterator's <tt>clone</tt> method does so.
622      *
623      * @deprecated ICU 56 Use {@link Normalizer2} instead.
624      */
625     @Deprecated
626     @Override
clone()627     public Object clone() {
628         try {
629             Normalizer copy = (Normalizer) super.clone();
630             copy.text = (UCharacterIterator) text.clone();
631             copy.mode = mode;
632             copy.options = options;
633             copy.norm2 = norm2;
634             copy.buffer = new StringBuilder(buffer);
635             copy.bufferPos = bufferPos;
636             copy.currentIndex = currentIndex;
637             copy.nextIndex = nextIndex;
638             return copy;
639         }
640         catch (CloneNotSupportedException e) {
641             throw new ICUCloneNotSupportedException(e);
642         }
643     }
644 
645     //--------------------------------------------------------------------------
646     // Static Utility methods
647     //--------------------------------------------------------------------------
648 
getComposeNormalizer2(boolean compat, int options)649     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
650         return (compat ? NFKC : NFC).getNormalizer2(options);
651     }
getDecomposeNormalizer2(boolean compat, int options)652     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
653         return (compat ? NFKD : NFD).getNormalizer2(options);
654     }
655 
656     /**
657      * Compose a string.
658      * The string will be composed to according to the specified mode.
659      * @param str        The string to compose.
660      * @param compat     If true the string will be composed according to
661      *                    NFKC rules and if false will be composed according to
662      *                    NFC rules.
663      * @return String    The composed string
664      * @deprecated ICU 56 Use {@link Normalizer2} instead.
665      */
666     @Deprecated
compose(String str, boolean compat)667     public static String compose(String str, boolean compat) {
668         return compose(str,compat,0);
669     }
670 
671     /**
672      * Compose a string.
673      * The string will be composed to according to the specified mode.
674      * @param str        The string to compose.
675      * @param compat     If true the string will be composed according to
676      *                    NFKC rules and if false will be composed according to
677      *                    NFC rules.
678      * @param options    The only recognized option is UNICODE_3_2
679      * @return String    The composed string
680      * @deprecated ICU 56 Use {@link Normalizer2} instead.
681      */
682     @Deprecated
compose(String str, boolean compat, int options)683     public static String compose(String str, boolean compat, int options) {
684         return getComposeNormalizer2(compat, options).normalize(str);
685     }
686 
687     /**
688      * Compose a string.
689      * The string will be composed to according to the specified mode.
690      * @param source The char array to compose.
691      * @param target A char buffer to receive the normalized text.
692      * @param compat If true the char array will be composed according to
693      *                NFKC rules and if false will be composed according to
694      *                NFC rules.
695      * @param options The normalization options, ORed together (0 for no options).
696      * @return int   The total buffer size needed;if greater than length of
697      *                result, the output was truncated.
698      * @exception IndexOutOfBoundsException if target.length is less than the
699      *             required length
700      * @deprecated ICU 56 Use {@link Normalizer2} instead.
701      */
702     @Deprecated
compose(char[] source,char[] target, boolean compat, int options)703     public static int compose(char[] source,char[] target, boolean compat, int options) {
704         return compose(source, 0, source.length, target, 0, target.length, compat, options);
705     }
706 
707     /**
708      * Compose a string.
709      * The string will be composed to according to the specified mode.
710      * @param src       The char array to compose.
711      * @param srcStart  Start index of the source
712      * @param srcLimit  Limit index of the source
713      * @param dest      The char buffer to fill in
714      * @param destStart Start index of the destination buffer
715      * @param destLimit End index of the destination buffer
716      * @param compat If true the char array will be composed according to
717      *                NFKC rules and if false will be composed according to
718      *                NFC rules.
719      * @param options The normalization options, ORed together (0 for no options).
720      * @return int   The total buffer size needed;if greater than length of
721      *                result, the output was truncated.
722      * @exception IndexOutOfBoundsException if target.length is less than the
723      *             required length
724      * @deprecated ICU 56 Use {@link Normalizer2} instead.
725      */
726     @Deprecated
compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)727     public static int compose(char[] src,int srcStart, int srcLimit,
728                               char[] dest,int destStart, int destLimit,
729                               boolean compat, int options) {
730         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
731         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
732         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
733         return app.length();
734     }
735 
736     /**
737      * Decompose a string.
738      * The string will be decomposed to according to the specified mode.
739      * @param str       The string to decompose.
740      * @param compat    If true the string will be decomposed according to NFKD
741      *                   rules and if false will be decomposed according to NFD
742      *                   rules.
743      * @return String   The decomposed string
744      * @deprecated ICU 56 Use {@link Normalizer2} instead.
745      */
746     @Deprecated
decompose(String str, boolean compat)747     public static String decompose(String str, boolean compat) {
748         return decompose(str,compat,0);
749     }
750 
751     /**
752      * Decompose a string.
753      * The string will be decomposed to according to the specified mode.
754      * @param str     The string to decompose.
755      * @param compat  If true the string will be decomposed according to NFKD
756      *                 rules and if false will be decomposed according to NFD
757      *                 rules.
758      * @param options The normalization options, ORed together (0 for no options).
759      * @return String The decomposed string
760      * @deprecated ICU 56 Use {@link Normalizer2} instead.
761      */
762     @Deprecated
decompose(String str, boolean compat, int options)763     public static String decompose(String str, boolean compat, int options) {
764         return getDecomposeNormalizer2(compat, options).normalize(str);
765     }
766 
767     /**
768      * Decompose a string.
769      * The string will be decomposed to according to the specified mode.
770      * @param source The char array to decompose.
771      * @param target A char buffer to receive the normalized text.
772      * @param compat If true the char array will be decomposed according to NFKD
773      *                rules and if false will be decomposed according to
774      *                NFD rules.
775      * @return int   The total buffer size needed;if greater than length of
776      *                result,the output was truncated.
777      * @param options The normalization options, ORed together (0 for no options).
778      * @exception IndexOutOfBoundsException if the target capacity is less than
779      *             the required length
780      * @deprecated ICU 56 Use {@link Normalizer2} instead.
781      */
782     @Deprecated
decompose(char[] source,char[] target, boolean compat, int options)783     public static int decompose(char[] source,char[] target, boolean compat, int options) {
784         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
785     }
786 
787     /**
788      * Decompose a string.
789      * The string will be decomposed to according to the specified mode.
790      * @param src       The char array to compose.
791      * @param srcStart  Start index of the source
792      * @param srcLimit  Limit index of the source
793      * @param dest      The char buffer to fill in
794      * @param destStart Start index of the destination buffer
795      * @param destLimit End index of the destination buffer
796      * @param compat If true the char array will be decomposed according to NFKD
797      *                rules and if false will be decomposed according to
798      *                NFD rules.
799      * @param options The normalization options, ORed together (0 for no options).
800      * @return int   The total buffer size needed;if greater than length of
801      *                result,the output was truncated.
802      * @exception IndexOutOfBoundsException if the target capacity is less than
803      *             the required length
804      * @deprecated ICU 56 Use {@link Normalizer2} instead.
805      */
806     @Deprecated
decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)807     public static int decompose(char[] src,int srcStart, int srcLimit,
808                                 char[] dest,int destStart, int destLimit,
809                                 boolean compat, int options) {
810         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
811         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
812         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
813         return app.length();
814     }
815 
816     /**
817      * Normalizes a <tt>String</tt> using the given normalization operation.
818      * <p>
819      * The <tt>options</tt> parameter specifies which optional
820      * <tt>Normalizer</tt> features are to be enabled for this operation.
821      * Currently the only available option is {@link #UNICODE_3_2}.
822      * If you want the default behavior corresponding to one of the standard
823      * Unicode Normalization Forms, use 0 for this argument.
824      * <p>
825      * @param str       the input string to be normalized.
826      * @param mode      the normalization mode
827      * @param options   the optional features to be enabled.
828      * @return String   the normalized string
829      * @deprecated ICU 56 Use {@link Normalizer2} instead.
830      */
831     @Deprecated
normalize(String str, Mode mode, int options)832     public static String normalize(String str, Mode mode, int options) {
833         return mode.getNormalizer2(options).normalize(str);
834     }
835 
836     /**
837      * Normalize a string.
838      * The string will be normalized according to the specified normalization
839      * mode and options.
840      * @param src        The string to normalize.
841      * @param mode       The normalization mode; one of Normalizer.NONE,
842      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
843      *                    Normalizer.NFKD, Normalizer.DEFAULT
844      * @return the normalized string
845      * @deprecated ICU 56 Use {@link Normalizer2} instead.
846      */
847     @Deprecated
normalize(String src,Mode mode)848     public static String normalize(String src,Mode mode) {
849         return normalize(src, mode, 0);
850     }
851     /**
852      * Normalize a string.
853      * The string will be normalized according to the specified normalization
854      * mode and options.
855      * @param source The char array to normalize.
856      * @param target A char buffer to receive the normalized text.
857      * @param mode   The normalization mode; one of Normalizer.NONE,
858      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
859      *                Normalizer.NFKD, Normalizer.DEFAULT
860      * @param options The normalization options, ORed together (0 for no options).
861      * @return int   The total buffer size needed;if greater than length of
862      *                result, the output was truncated.
863      * @exception    IndexOutOfBoundsException if the target capacity is less
864      *                than the required length
865      * @deprecated ICU 56 Use {@link Normalizer2} instead.
866      */
867     @Deprecated
normalize(char[] source,char[] target, Mode mode, int options)868     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
869         return normalize(source,0,source.length,target,0,target.length,mode, options);
870     }
871 
872     /**
873      * Normalize a string.
874      * The string will be normalized according to the specified normalization
875      * mode and options.
876      * @param src       The char array to compose.
877      * @param srcStart  Start index of the source
878      * @param srcLimit  Limit index of the source
879      * @param dest      The char buffer to fill in
880      * @param destStart Start index of the destination buffer
881      * @param destLimit End index of the destination buffer
882      * @param mode      The normalization mode; one of Normalizer.NONE,
883      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
884      *                   Normalizer.NFKD, Normalizer.DEFAULT
885      * @param options The normalization options, ORed together (0 for no options).
886      * @return int      The total buffer size needed;if greater than length of
887      *                   result, the output was truncated.
888      * @exception       IndexOutOfBoundsException if the target capacity is
889      *                   less than the required length
890      * @deprecated ICU 56 Use {@link Normalizer2} instead.
891      */
892     @Deprecated
normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)893     public static int normalize(char[] src,int srcStart, int srcLimit,
894                                 char[] dest,int destStart, int destLimit,
895                                 Mode  mode, int options) {
896         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
897         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
898         mode.getNormalizer2(options).normalize(srcBuffer, app);
899         return app.length();
900     }
901 
902     /**
903      * Normalize a codepoint according to the given mode
904      * @param char32    The input string to be normalized.
905      * @param mode      The normalization mode
906      * @param options   Options for use with exclusion set and tailored Normalization
907      *                                   The only option that is currently recognized is UNICODE_3_2
908      * @return String   The normalized string
909      * @see #UNICODE_3_2
910      * @deprecated ICU 56 Use {@link Normalizer2} instead.
911      */
912     @Deprecated
normalize(int char32, Mode mode, int options)913     public static String normalize(int char32, Mode mode, int options) {
914         if(mode == NFD && options == 0) {
915             String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
916             if(decomposition == null) {
917                 decomposition = UTF16.valueOf(char32);
918             }
919             return decomposition;
920         }
921         return normalize(UTF16.valueOf(char32), mode, options);
922     }
923 
924     /**
925      * Convenience method to normalize a codepoint according to the given mode
926      * @param char32    The input string to be normalized.
927      * @param mode      The normalization mode
928      * @return String   The normalized string
929      * @deprecated ICU 56 Use {@link Normalizer2} instead.
930      */
931     @Deprecated
normalize(int char32, Mode mode)932     public static String normalize(int char32, Mode mode) {
933         return normalize(char32, mode, 0);
934     }
935 
936     /**
937      * Convenience method.
938      *
939      * @param source   string for determining if it is in a normalized format
940      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
941      *                  Normalizer.NFKC,Normalizer.NFKD)
942      * @return         Return code to specify if the text is normalized or not
943      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
944      * @deprecated ICU 56 Use {@link Normalizer2} instead.
945      */
946     @Deprecated
quickCheck(String source, Mode mode)947     public static QuickCheckResult quickCheck(String source, Mode mode) {
948         return quickCheck(source, mode, 0);
949     }
950 
951     /**
952      * Performing quick check on a string, to quickly determine if the string is
953      * in a particular normalization format.
954      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
955      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
956      * string is in the desired normalized format, Normalizer.NO determines that
957      * argument string is not in the desired normalized format. A
958      * Normalizer.MAYBE result indicates that a more thorough check is required,
959      * the user may have to put the string in its normalized form and compare
960      * the results.
961      *
962      * @param source   string for determining if it is in a normalized format
963      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
964      *                  Normalizer.NFKC,Normalizer.NFKD)
965      * @param options   Options for use with exclusion set and tailored Normalization
966      *                                   The only option that is currently recognized is UNICODE_3_2
967      * @return         Return code to specify if the text is normalized or not
968      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
969      * @deprecated ICU 56 Use {@link Normalizer2} instead.
970      */
971     @Deprecated
quickCheck(String source, Mode mode, int options)972     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
973         return mode.getNormalizer2(options).quickCheck(source);
974     }
975 
976     /**
977      * Convenience method.
978      *
979      * @param source Array of characters for determining if it is in a
980      *                normalized format
981      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
982      *                Normalizer.NFKC,Normalizer.NFKD)
983      * @param options   Options for use with exclusion set and tailored Normalization
984      *                                   The only option that is currently recognized is UNICODE_3_2
985      * @return       Return code to specify if the text is normalized or not
986      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
987      * @deprecated ICU 56 Use {@link Normalizer2} instead.
988      */
989     @Deprecated
quickCheck(char[] source, Mode mode, int options)990     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
991         return quickCheck(source, 0, source.length, mode, options);
992     }
993 
994     /**
995      * Performing quick check on a string, to quickly determine if the string is
996      * in a particular normalization format.
997      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
998      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
999      * string is in the desired normalized format, Normalizer.NO determines that
1000      * argument string is not in the desired normalized format. A
1001      * Normalizer.MAYBE result indicates that a more thorough check is required,
1002      * the user may have to put the string in its normalized form and compare
1003      * the results.
1004      *
1005      * @param source    string for determining if it is in a normalized format
1006      * @param start     the start index of the source
1007      * @param limit     the limit index of the source it is equal to the length
1008      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
1009      *                   Normalizer.NFKC,Normalizer.NFKD)
1010      * @param options   Options for use with exclusion set and tailored Normalization
1011      *                                   The only option that is currently recognized is UNICODE_3_2
1012      * @return          Return code to specify if the text is normalized or not
1013      *                   (Normalizer.YES, Normalizer.NO or
1014      *                   Normalizer.MAYBE)
1015      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1016      */
1017     @Deprecated
quickCheck(char[] source,int start, int limit, Mode mode,int options)1018     public static QuickCheckResult quickCheck(char[] source,int start,
1019                                               int limit, Mode mode,int options) {
1020         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
1021         return mode.getNormalizer2(options).quickCheck(srcBuffer);
1022     }
1023 
1024     /**
1025      * Test if a string is in a given normalization form.
1026      * This is semantically equivalent to source.equals(normalize(source, mode)).
1027      *
1028      * Unlike quickCheck(), this function returns a definitive result,
1029      * never a "maybe".
1030      * For NFD, NFKD, and FCD, both functions work exactly the same.
1031      * For NFC and NFKC where quickCheck may return "maybe", this function will
1032      * perform further tests to arrive at a true/false result.
1033      * @param src       The input array of characters to be checked to see if
1034      *                   it is normalized
1035      * @param start     The strart index in the source
1036      * @param limit     The limit index in the source
1037      * @param mode      the normalization mode
1038      * @param options   Options for use with exclusion set and tailored Normalization
1039      *                                   The only option that is currently recognized is UNICODE_3_2
1040      * @return Boolean value indicating whether the source string is in the
1041      *         "mode" normalization form
1042      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1043      */
1044     @Deprecated
isNormalized(char[] src,int start, int limit, Mode mode, int options)1045     public static boolean isNormalized(char[] src,int start,
1046                                        int limit, Mode mode,
1047                                        int options) {
1048         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
1049         return mode.getNormalizer2(options).isNormalized(srcBuffer);
1050     }
1051 
1052     /**
1053      * Test if a string is in a given normalization form.
1054      * This is semantically equivalent to source.equals(normalize(source, mode)).
1055      *
1056      * Unlike quickCheck(), this function returns a definitive result,
1057      * never a "maybe".
1058      * For NFD, NFKD, and FCD, both functions work exactly the same.
1059      * For NFC and NFKC where quickCheck may return "maybe", this function will
1060      * perform further tests to arrive at a true/false result.
1061      * @param str       the input string to be checked to see if it is
1062      *                   normalized
1063      * @param mode      the normalization mode
1064      * @param options   Options for use with exclusion set and tailored Normalization
1065      *                  The only option that is currently recognized is UNICODE_3_2
1066      * @see #isNormalized
1067      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1068      */
1069     @Deprecated
isNormalized(String str, Mode mode, int options)1070     public static boolean isNormalized(String str, Mode mode, int options) {
1071         return mode.getNormalizer2(options).isNormalized(str);
1072     }
1073 
1074     /**
1075      * Convenience Method
1076      * @param char32    the input code point to be checked to see if it is
1077      *                   normalized
1078      * @param mode      the normalization mode
1079      * @param options   Options for use with exclusion set and tailored Normalization
1080      *                  The only option that is currently recognized is UNICODE_3_2
1081      *
1082      * @see #isNormalized
1083      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1084      */
1085     @Deprecated
isNormalized(int char32, Mode mode,int options)1086     public static boolean isNormalized(int char32, Mode mode,int options) {
1087         return isNormalized(UTF16.valueOf(char32), mode, options);
1088     }
1089 
1090     /**
1091      * Compare two strings for canonical equivalence.
1092      * Further options include case-insensitive comparison and
1093      * code point order (as opposed to code unit order).
1094      *
1095      * Canonical equivalence between two strings is defined as their normalized
1096      * forms (NFD or NFC) being identical.
1097      * This function compares strings incrementally instead of normalizing
1098      * (and optionally case-folding) both strings entirely,
1099      * improving performance significantly.
1100      *
1101      * Bulk normalization is only necessary if the strings do not fulfill the
1102      * FCD conditions. Only in this case, and only if the strings are relatively
1103      * long, is memory allocated temporarily.
1104      * For FCD strings and short non-FCD strings there is no memory allocation.
1105      *
1106      * Semantically, this is equivalent to
1107      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1108      * where code point order and foldCase are all optional.
1109      *
1110      * @param s1        First source character array.
1111      * @param s1Start   start index of source
1112      * @param s1Limit   limit of the source
1113      *
1114      * @param s2        Second source character array.
1115      * @param s2Start   start index of the source
1116      * @param s2Limit   limit of the source
1117      *
1118      * @param options A bit set of options:
1119      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1120      *     Case-sensitive comparison in code unit order, and the input strings
1121      *     are quick-checked for FCD.
1122      *
1123      *   - INPUT_IS_FCD
1124      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1125      *     conditions.If not set, the function will quickCheck for FCD
1126      *     and normalize if necessary.
1127      *
1128      *   - COMPARE_CODE_POINT_ORDER
1129      *     Set to choose code point order instead of code unit order
1130      *
1131      *   - COMPARE_IGNORE_CASE
1132      *     Set to compare strings case-insensitively using case folding,
1133      *     instead of case-sensitively.
1134      *     If set, then the following case folding options are used.
1135      *
1136      *
1137      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1138      *
1139      * @see #normalize
1140      * @see #FCD
1141      * @stable ICU 2.8
1142      */
compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1143     public static int compare(char[] s1, int s1Start, int s1Limit,
1144                               char[] s2, int s2Start, int s2Limit,
1145                               int options) {
1146         if( s1==null || s1Start<0 || s1Limit<0 ||
1147             s2==null || s2Start<0 || s2Limit<0 ||
1148             s1Limit<s1Start || s2Limit<s2Start
1149         ) {
1150             throw new IllegalArgumentException();
1151         }
1152         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1153                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1154                                options);
1155     }
1156 
1157     /**
1158      * Compare two strings for canonical equivalence.
1159      * Further options include case-insensitive comparison and
1160      * code point order (as opposed to code unit order).
1161      *
1162      * Canonical equivalence between two strings is defined as their normalized
1163      * forms (NFD or NFC) being identical.
1164      * This function compares strings incrementally instead of normalizing
1165      * (and optionally case-folding) both strings entirely,
1166      * improving performance significantly.
1167      *
1168      * Bulk normalization is only necessary if the strings do not fulfill the
1169      * FCD conditions. Only in this case, and only if the strings are relatively
1170      * long, is memory allocated temporarily.
1171      * For FCD strings and short non-FCD strings there is no memory allocation.
1172      *
1173      * Semantically, this is equivalent to
1174      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1175      * where code point order and foldCase are all optional.
1176      *
1177      * @param s1 First source string.
1178      * @param s2 Second source string.
1179      *
1180      * @param options A bit set of options:
1181      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1182      *     Case-sensitive comparison in code unit order, and the input strings
1183      *     are quick-checked for FCD.
1184      *
1185      *   - INPUT_IS_FCD
1186      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1187      *     conditions. If not set, the function will quickCheck for FCD
1188      *     and normalize if necessary.
1189      *
1190      *   - COMPARE_CODE_POINT_ORDER
1191      *     Set to choose code point order instead of code unit order
1192      *
1193      *   - COMPARE_IGNORE_CASE
1194      *     Set to compare strings case-insensitively using case folding,
1195      *     instead of case-sensitively.
1196      *     If set, then the following case folding options are used.
1197      *
1198      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1199      *
1200      * @see #normalize
1201      * @see #FCD
1202      * @stable ICU 2.8
1203      */
compare(String s1, String s2, int options)1204     public static int compare(String s1, String s2, int options) {
1205         return internalCompare(s1, s2, options);
1206     }
1207 
1208     /**
1209      * Compare two strings for canonical equivalence.
1210      * Further options include case-insensitive comparison and
1211      * code point order (as opposed to code unit order).
1212      * Convenience method.
1213      *
1214      * @param s1 First source string.
1215      * @param s2 Second source string.
1216      *
1217      * @param options A bit set of options:
1218      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1219      *     Case-sensitive comparison in code unit order, and the input strings
1220      *     are quick-checked for FCD.
1221      *
1222      *   - INPUT_IS_FCD
1223      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1224      *     conditions. If not set, the function will quickCheck for FCD
1225      *     and normalize if necessary.
1226      *
1227      *   - COMPARE_CODE_POINT_ORDER
1228      *     Set to choose code point order instead of code unit order
1229      *
1230      *   - COMPARE_IGNORE_CASE
1231      *     Set to compare strings case-insensitively using case folding,
1232      *     instead of case-sensitively.
1233      *     If set, then the following case folding options are used.
1234      *
1235      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1236      *
1237      * @see #normalize
1238      * @see #FCD
1239      * @stable ICU 2.8
1240      */
compare(char[] s1, char[] s2, int options)1241     public static int compare(char[] s1, char[] s2, int options) {
1242         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1243     }
1244 
1245     /**
1246      * Convenience method that can have faster implementation
1247      * by not allocating buffers.
1248      * @param char32a    the first code point to be checked against the
1249      * @param char32b    the second code point
1250      * @param options    A bit set of options
1251      * @stable ICU 2.8
1252      */
compare(int char32a, int char32b, int options)1253     public static int compare(int char32a, int char32b, int options) {
1254         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1255     }
1256 
1257     /**
1258      * Convenience method that can have faster implementation
1259      * by not allocating buffers.
1260      * @param char32a   the first code point to be checked against
1261      * @param str2      the second string
1262      * @param options   A bit set of options
1263      * @stable ICU 2.8
1264      */
compare(int char32a, String str2, int options)1265     public static int compare(int char32a, String str2, int options) {
1266         return internalCompare(UTF16.valueOf(char32a), str2, options);
1267     }
1268 
1269     /* Concatenation of normalized strings --------------------------------- */
1270     /**
1271      * Concatenate normalized strings, making sure that the result is normalized
1272      * as well.
1273      *
1274      * If both the left and the right strings are in
1275      * the normalization form according to "mode",
1276      * then the result will be
1277      *
1278      * <code>
1279      *     dest=normalize(left+right, mode)
1280      * </code>
1281      *
1282      * With the input strings already being normalized,
1283      * this function will use next() and previous()
1284      * to find the adjacent end pieces of the input strings.
1285      * Only the concatenation of these end pieces will be normalized and
1286      * then concatenated with the remaining parts of the input strings.
1287      *
1288      * It is allowed to have dest==left to avoid copying the entire left string.
1289      *
1290      * @param left Left source array, may be same as dest.
1291      * @param leftStart start in the left array.
1292      * @param leftLimit limit in the left array (==length)
1293      * @param right Right source array.
1294      * @param rightStart start in the right array.
1295      * @param rightLimit limit in the right array (==length)
1296      * @param dest The output buffer; can be null if destStart==destLimit==0
1297      *              for pure preflighting.
1298      * @param destStart start in the destination array
1299      * @param destLimit limit in the destination array (==length)
1300      * @param mode The normalization mode.
1301      * @param options The normalization options, ORed together (0 for no options).
1302      * @return Length of output (number of chars) when successful or
1303      *          IndexOutOfBoundsException
1304      * @exception IndexOutOfBoundsException whose message has the string
1305      *             representation of destination capacity required.
1306      * @see #normalize
1307      * @see #next
1308      * @see #previous
1309      * @exception IndexOutOfBoundsException if target capacity is less than the
1310      *             required length
1311      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1312      */
1313     @Deprecated
concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1314     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1315                                   char[] right, int rightStart, int rightLimit,
1316                                   char[] dest,  int destStart,  int destLimit,
1317                                   Normalizer.Mode mode, int options) {
1318         if(dest == null) {
1319             throw new IllegalArgumentException();
1320         }
1321 
1322         /* check for overlapping right and destination */
1323         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1324             throw new IllegalArgumentException("overlapping right and dst ranges");
1325         }
1326 
1327         /* allow left==dest */
1328         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1329         destBuilder.append(left, leftStart, leftLimit-leftStart);
1330         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
1331         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1332         int destLength=destBuilder.length();
1333         if(destLength<=(destLimit-destStart)) {
1334             destBuilder.getChars(0, destLength, dest, destStart);
1335             return destLength;
1336         } else {
1337             throw new IndexOutOfBoundsException(Integer.toString(destLength));
1338         }
1339     }
1340 
1341     /**
1342      * Concatenate normalized strings, making sure that the result is normalized
1343      * as well.
1344      *
1345      * If both the left and the right strings are in
1346      * the normalization form according to "mode",
1347      * then the result will be
1348      *
1349      * <code>
1350      *     dest=normalize(left+right, mode)
1351      * </code>
1352      *
1353      * For details see concatenate
1354      *
1355      * @param left Left source string.
1356      * @param right Right source string.
1357      * @param mode The normalization mode.
1358      * @param options The normalization options, ORed together (0 for no options).
1359      * @return result
1360      *
1361      * @see #concatenate
1362      * @see #normalize
1363      * @see #next
1364      * @see #previous
1365      * @see #concatenate
1366      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1367      */
1368     @Deprecated
concatenate(char[] left, char[] right,Mode mode, int options)1369     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1370         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1371         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1372     }
1373 
1374     /**
1375      * Concatenate normalized strings, making sure that the result is normalized
1376      * as well.
1377      *
1378      * If both the left and the right strings are in
1379      * the normalization form according to "mode",
1380      * then the result will be
1381      *
1382      * <code>
1383      *     dest=normalize(left+right, mode)
1384      * </code>
1385      *
1386      * With the input strings already being normalized,
1387      * this function will use next() and previous()
1388      * to find the adjacent end pieces of the input strings.
1389      * Only the concatenation of these end pieces will be normalized and
1390      * then concatenated with the remaining parts of the input strings.
1391      *
1392      * @param left Left source string.
1393      * @param right Right source string.
1394      * @param mode The normalization mode.
1395      * @param options The normalization options, ORed together (0 for no options).
1396      * @return result
1397      *
1398      * @see #concatenate
1399      * @see #normalize
1400      * @see #next
1401      * @see #previous
1402      * @see #concatenate
1403      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1404      */
1405     @Deprecated
concatenate(String left, String right, Mode mode, int options)1406     public static String concatenate(String left, String right, Mode mode, int options) {
1407         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1408         return mode.getNormalizer2(options).append(dest, right).toString();
1409     }
1410 
1411     /**
1412      * Gets the FC_NFKC closure value.
1413      * @param c The code point whose closure value is to be retrieved
1414      * @param dest The char array to receive the closure value
1415      * @return the length of the closure value; 0 if there is none
1416      * @deprecated ICU 56
1417      */
1418     @Deprecated
getFC_NFKC_Closure(int c,char[] dest)1419     public static int getFC_NFKC_Closure(int c,char[] dest) {
1420         String closure=getFC_NFKC_Closure(c);
1421         int length=closure.length();
1422         if(length!=0 && dest!=null && length<=dest.length) {
1423             closure.getChars(0, length, dest, 0);
1424         }
1425         return length;
1426     }
1427     /**
1428      * Gets the FC_NFKC closure value.
1429      * @param c The code point whose closure value is to be retrieved
1430      * @return String representation of the closure value; "" if there is none
1431      * @deprecated ICU 56
1432      */
1433     @Deprecated
getFC_NFKC_Closure(int c)1434     public static String getFC_NFKC_Closure(int c) {
1435         // Compute the FC_NFKC_Closure on the fly:
1436         // We have the API for complete coverage of Unicode properties, although
1437         // this value by itself is not useful via API.
1438         // (What could be useful is a custom normalization table that combines
1439         // case folding and NFKC.)
1440         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1441         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1442         UCaseProps csp=UCaseProps.INSTANCE;
1443         // first: b = NFKC(Fold(a))
1444         StringBuilder folded=new StringBuilder();
1445         int folded1Length=csp.toFullFolding(c, folded, 0);
1446         if(folded1Length<0) {
1447             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1448             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1449                 return "";  // c does not change at all under CaseFolding+NFKC
1450             }
1451             folded.appendCodePoint(c);
1452         } else {
1453             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1454                 folded.appendCodePoint(folded1Length);
1455             }
1456         }
1457         String kc1=nfkc.normalize(folded);
1458         // second: c = NFKC(Fold(b))
1459         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1460         // if (c != b) add the mapping from a to c
1461         if(kc1.equals(kc2)) {
1462             return "";
1463         } else {
1464             return kc2;
1465         }
1466     }
1467 
1468     //-------------------------------------------------------------------------
1469     // Iteration API
1470     //-------------------------------------------------------------------------
1471 
1472     /**
1473      * Return the current character in the normalized text.
1474      * @return The codepoint as an int
1475      * @deprecated ICU 56
1476      */
1477     @Deprecated
current()1478     public int current() {
1479         if(bufferPos<buffer.length() || nextNormalize()) {
1480             return buffer.codePointAt(bufferPos);
1481         } else {
1482             return DONE;
1483         }
1484     }
1485 
1486     /**
1487      * Return the next character in the normalized text and advance
1488      * the iteration position by one.  If the end
1489      * of the text has already been reached, {@link #DONE} is returned.
1490      * @return The codepoint as an int
1491      * @deprecated ICU 56
1492      */
1493     @Deprecated
next()1494     public int next() {
1495         if(bufferPos<buffer.length() ||  nextNormalize()) {
1496             int c=buffer.codePointAt(bufferPos);
1497             bufferPos+=Character.charCount(c);
1498             return c;
1499         } else {
1500             return DONE;
1501         }
1502     }
1503 
1504 
1505     /**
1506      * Return the previous character in the normalized text and decrement
1507      * the iteration position by one.  If the beginning
1508      * of the text has already been reached, {@link #DONE} is returned.
1509      * @return The codepoint as an int
1510      * @deprecated ICU 56
1511      */
1512     @Deprecated
previous()1513     public int previous() {
1514         if(bufferPos>0 || previousNormalize()) {
1515             int c=buffer.codePointBefore(bufferPos);
1516             bufferPos-=Character.charCount(c);
1517             return c;
1518         } else {
1519             return DONE;
1520         }
1521     }
1522 
1523     /**
1524      * Reset the index to the beginning of the text.
1525      * This is equivalent to setIndexOnly(startIndex)).
1526      * @deprecated ICU 56
1527      */
1528     @Deprecated
reset()1529     public void reset() {
1530         text.setToStart();
1531         currentIndex=nextIndex=0;
1532         clearBuffer();
1533     }
1534 
1535     /**
1536      * Set the iteration position in the input text that is being normalized,
1537      * without any immediate normalization.
1538      * After setIndexOnly(), getIndex() will return the same index that is
1539      * specified here.
1540      *
1541      * @param index the desired index in the input text.
1542      * @deprecated ICU 56
1543      */
1544     @Deprecated
setIndexOnly(int index)1545     public void setIndexOnly(int index) {
1546         text.setIndex(index);  // validates index
1547         currentIndex=nextIndex=index;
1548         clearBuffer();
1549     }
1550 
1551     /**
1552      * Set the iteration position in the input text that is being normalized
1553      * and return the first normalized character at that position.
1554      * <p>
1555      * <b>Note:</b> This method sets the position in the <em>input</em> text,
1556      * while {@link #next} and {@link #previous} iterate through characters
1557      * in the normalized <em>output</em>.  This means that there is not
1558      * necessarily a one-to-one correspondence between characters returned
1559      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1560      * returned from <tt>setIndex</tt> and {@link #getIndex}.
1561      * <p>
1562      * @param index the desired index in the input text.
1563      *
1564      * @return   the first normalized character that is the result of iterating
1565      *            forward starting at the given index.
1566      *
1567      * @throws IllegalArgumentException if the given index is less than
1568      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1569      * @deprecated ICU 3.2
1570      * @obsolete ICU 3.2
1571      */
1572     @Deprecated
1573      ///CLOVER:OFF
setIndex(int index)1574      public int setIndex(int index) {
1575          setIndexOnly(index);
1576          return current();
1577      }
1578      ///CLOVER:ON
1579     /**
1580      * Retrieve the index of the start of the input text. This is the begin
1581      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1582      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1583      * @deprecated ICU 2.2. Use startIndex() instead.
1584      * @return The codepoint as an int
1585      * @see #startIndex
1586      */
1587     @Deprecated
getBeginIndex()1588     public int getBeginIndex() {
1589         return 0;
1590     }
1591 
1592     /**
1593      * Retrieve the index of the end of the input text.  This is the end index
1594      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1595      * over which this <tt>Normalizer</tt> is iterating
1596      * @deprecated ICU 2.2. Use endIndex() instead.
1597      * @return The codepoint as an int
1598      * @see #endIndex
1599      */
1600     @Deprecated
getEndIndex()1601     public int getEndIndex() {
1602         return endIndex();
1603     }
1604     /**
1605      * Return the first character in the normalized text.  This resets
1606      * the <tt>Normalizer's</tt> position to the beginning of the text.
1607      * @return The codepoint as an int
1608      * @deprecated ICU 56
1609      */
1610     @Deprecated
first()1611     public int first() {
1612         reset();
1613         return next();
1614     }
1615 
1616     /**
1617      * Return the last character in the normalized text.  This resets
1618      * the <tt>Normalizer's</tt> position to be just before the
1619      * the input text corresponding to that normalized character.
1620      * @return The codepoint as an int
1621      * @deprecated ICU 56
1622      */
1623     @Deprecated
last()1624     public int last() {
1625         text.setToLimit();
1626         currentIndex=nextIndex=text.getIndex();
1627         clearBuffer();
1628         return previous();
1629     }
1630 
1631     /**
1632      * Retrieve the current iteration position in the input text that is
1633      * being normalized.  This method is useful in applications such as
1634      * searching, where you need to be able to determine the position in
1635      * the input text that corresponds to a given normalized output character.
1636      * <p>
1637      * <b>Note:</b> This method sets the position in the <em>input</em>, while
1638      * {@link #next} and {@link #previous} iterate through characters in the
1639      * <em>output</em>.  This means that there is not necessarily a one-to-one
1640      * correspondence between characters returned by <tt>next</tt> and
1641      * <tt>previous</tt> and the indices passed to and returned from
1642      * <tt>setIndex</tt> and {@link #getIndex}.
1643      * @return The current iteration position
1644      * @deprecated ICU 56
1645      */
1646     @Deprecated
getIndex()1647     public int getIndex() {
1648         if(bufferPos<buffer.length()) {
1649             return currentIndex;
1650         } else {
1651             return nextIndex;
1652         }
1653     }
1654 
1655     /**
1656      * Retrieve the index of the start of the input text. This is the begin
1657      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1658      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1659      * @return The current iteration position
1660      * @deprecated ICU 56
1661      */
1662     @Deprecated
startIndex()1663     public int startIndex() {
1664         return 0;
1665     }
1666 
1667     /**
1668      * Retrieve the index of the end of the input text.  This is the end index
1669      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1670      * over which this <tt>Normalizer</tt> is iterating
1671      * @return The current iteration position
1672      * @deprecated ICU 56
1673      */
1674     @Deprecated
endIndex()1675     public int endIndex() {
1676         return text.getLength();
1677     }
1678 
1679     //-------------------------------------------------------------------------
1680     // Iterator attributes
1681     //-------------------------------------------------------------------------
1682     /**
1683      * Set the normalization mode for this object.
1684      * <p>
1685      * <b>Note:</b>If the normalization mode is changed while iterating
1686      * over a string, calls to {@link #next} and {@link #previous} may
1687      * return previously buffers characters in the old normalization mode
1688      * until the iteration is able to re-sync at the next base character.
1689      * It is safest to call {@link #setText setText()}, {@link #first},
1690      * {@link #last}, etc. after calling <tt>setMode</tt>.
1691      * <p>
1692      * @param newMode the new mode for this <tt>Normalizer</tt>.
1693      * The supported modes are:
1694      * <ul>
1695      *  <li>{@link #NFC}    - Unicode canonical decompositiion
1696      *                        followed by canonical composition.
1697      *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
1698      *                        follwed by canonical composition.
1699      *  <li>{@link #NFD}    - Unicode canonical decomposition
1700      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
1701      *  <li>{@link #NONE}   - Do nothing but return characters
1702      *                        from the underlying input text.
1703      * </ul>
1704      *
1705      * @see #getMode
1706      * @deprecated ICU 56
1707      */
1708     @Deprecated
setMode(Mode newMode)1709     public void setMode(Mode newMode) {
1710         mode = newMode;
1711         norm2 = mode.getNormalizer2(options);
1712     }
1713     /**
1714      * Return the basic operation performed by this <tt>Normalizer</tt>
1715      *
1716      * @see #setMode
1717      * @deprecated ICU 56
1718      */
1719     @Deprecated
getMode()1720     public Mode getMode() {
1721         return mode;
1722     }
1723     /**
1724      * Set options that affect this <tt>Normalizer</tt>'s operation.
1725      * Options do not change the basic composition or decomposition operation
1726      * that is being performed , but they control whether
1727      * certain optional portions of the operation are done.
1728      * Currently the only available option is:
1729      *
1730      * <ul>
1731      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1732      * </ul>
1733      *
1734      * @param   option  the option whose value is to be set.
1735      * @param   value   the new setting for the option.  Use <tt>true</tt> to
1736      *                  turn the option on and <tt>false</tt> to turn it off.
1737      *
1738      * @see #getOption
1739      * @deprecated ICU 56
1740      */
1741     @Deprecated
setOption(int option,boolean value)1742     public void setOption(int option,boolean value) {
1743         if (value) {
1744             options |= option;
1745         } else {
1746             options &= (~option);
1747         }
1748         norm2 = mode.getNormalizer2(options);
1749     }
1750 
1751     /**
1752      * Determine whether an option is turned on or off.
1753      * <p>
1754      * @see #setOption
1755      * @deprecated ICU 56
1756      */
1757     @Deprecated
getOption(int option)1758     public int getOption(int option) {
1759         if((options & option)!=0) {
1760             return 1 ;
1761         } else {
1762             return 0;
1763         }
1764     }
1765 
1766     /**
1767      * Gets the underlying text storage
1768      * @param fillIn the char buffer to fill the UTF-16 units.
1769      *         The length of the buffer should be equal to the length of the
1770      *         underlying text storage
1771      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1772      * @see   #getLength
1773      * @deprecated ICU 56
1774      */
1775     @Deprecated
getText(char[] fillIn)1776     public int getText(char[] fillIn) {
1777         return text.getText(fillIn);
1778     }
1779 
1780     /**
1781      * Gets the length of underlying text storage
1782      * @return the length
1783      * @deprecated ICU 56
1784      */
1785     @Deprecated
getLength()1786     public int getLength() {
1787         return text.getLength();
1788     }
1789 
1790     /**
1791      * Returns the text under iteration as a string
1792      * @return a copy of the text under iteration.
1793      * @deprecated ICU 56
1794      */
1795     @Deprecated
getText()1796     public String getText() {
1797         return text.getText();
1798     }
1799 
1800     /**
1801      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1802      * The iteration position is set to the beginning of the input text.
1803      * @param newText   The new string to be normalized.
1804      * @deprecated ICU 56
1805      */
1806     @Deprecated
setText(StringBuffer newText)1807     public void setText(StringBuffer newText) {
1808         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1809         if (newIter == null) {
1810             throw new IllegalStateException("Could not create a new UCharacterIterator");
1811         }
1812         text = newIter;
1813         reset();
1814     }
1815 
1816     /**
1817      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1818      * The iteration position is set to the beginning of the input text.
1819      * @param newText   The new string to be normalized.
1820      * @deprecated ICU 56
1821      */
1822     @Deprecated
setText(char[] newText)1823     public void setText(char[] newText) {
1824         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1825         if (newIter == null) {
1826             throw new IllegalStateException("Could not create a new UCharacterIterator");
1827         }
1828         text = newIter;
1829         reset();
1830     }
1831 
1832     /**
1833      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1834      * The iteration position is set to the beginning of the input text.
1835      * @param newText   The new string to be normalized.
1836      * @deprecated ICU 56
1837      */
1838     @Deprecated
setText(String newText)1839     public void setText(String newText) {
1840         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1841         if (newIter == null) {
1842             throw new IllegalStateException("Could not create a new UCharacterIterator");
1843         }
1844         text = newIter;
1845         reset();
1846     }
1847 
1848     /**
1849      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1850      * The iteration position is set to the beginning of the input text.
1851      * @param newText   The new string to be normalized.
1852      * @deprecated ICU 56
1853      */
1854     @Deprecated
setText(CharacterIterator newText)1855     public void setText(CharacterIterator newText) {
1856         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1857         if (newIter == null) {
1858             throw new IllegalStateException("Could not create a new UCharacterIterator");
1859         }
1860         text = newIter;
1861         reset();
1862     }
1863 
1864     /**
1865      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1866      * The iteration position is set to the beginning of the string.
1867      * @param newText   The new string to be normalized.
1868      * @deprecated ICU 56
1869      */
1870     @Deprecated
setText(UCharacterIterator newText)1871     public void setText(UCharacterIterator newText) {
1872         try{
1873             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1874             if (newIter == null) {
1875                 throw new IllegalStateException("Could not create a new UCharacterIterator");
1876             }
1877             text = newIter;
1878             reset();
1879         }catch(CloneNotSupportedException e) {
1880             throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
1881         }
1882     }
1883 
clearBuffer()1884     private void clearBuffer() {
1885         buffer.setLength(0);
1886         bufferPos=0;
1887     }
1888 
nextNormalize()1889     private boolean nextNormalize() {
1890         clearBuffer();
1891         currentIndex=nextIndex;
1892         text.setIndex(nextIndex);
1893         // Skip at least one character so we make progress.
1894         int c=text.nextCodePoint();
1895         if(c<0) {
1896             return false;
1897         }
1898         StringBuilder segment=new StringBuilder().appendCodePoint(c);
1899         while((c=text.nextCodePoint())>=0) {
1900             if(norm2.hasBoundaryBefore(c)) {
1901                 text.moveCodePointIndex(-1);
1902                 break;
1903             }
1904             segment.appendCodePoint(c);
1905         }
1906         nextIndex=text.getIndex();
1907         norm2.normalize(segment, buffer);
1908         return buffer.length()!=0;
1909     }
1910 
previousNormalize()1911     private boolean previousNormalize() {
1912         clearBuffer();
1913         nextIndex=currentIndex;
1914         text.setIndex(currentIndex);
1915         StringBuilder segment=new StringBuilder();
1916         int c;
1917         while((c=text.previousCodePoint())>=0) {
1918             if(c<=0xffff) {
1919                 segment.insert(0, (char)c);
1920             } else {
1921                 segment.insert(0, Character.toChars(c));
1922             }
1923             if(norm2.hasBoundaryBefore(c)) {
1924                 break;
1925             }
1926         }
1927         currentIndex=text.getIndex();
1928         norm2.normalize(segment, buffer);
1929         bufferPos=buffer.length();
1930         return buffer.length()!=0;
1931     }
1932 
1933     /* compare canonically equivalent ------------------------------------------- */
1934 
1935     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
internalCompare(CharSequence s1, CharSequence s2, int options)1936     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
1937         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
1938         options|= COMPARE_EQUIV;
1939 
1940         /*
1941          * UAX #21 Case Mappings, as fixed for Unicode version 4
1942          * (see Jitterbug 2021), defines a canonical caseless match as
1943          *
1944          * A string X is a canonical caseless match
1945          * for a string Y if and only if
1946          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1947          *
1948          * For better performance, we check for FCD (or let the caller tell us that
1949          * both strings are in FCD) for the inner normalization.
1950          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
1951          * case-folding preserves the FCD-ness of a string.
1952          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
1953          * when there is a difference.
1954          *
1955          * Exception: When using the Turkic case-folding option, we do perform
1956          * full NFD first. This is because in the Turkic case precomposed characters
1957          * with 0049 capital I or 0069 small i fold differently whether they
1958          * are first decomposed or not, so an FCD check - a check only for
1959          * canonical order - is not sufficient.
1960          */
1961         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1962             Normalizer2 n2;
1963             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1964                 n2=NFD.getNormalizer2(normOptions);
1965             } else {
1966                 n2=FCD.getNormalizer2(normOptions);
1967             }
1968 
1969             // check if s1 and/or s2 fulfill the FCD conditions
1970             int spanQCYes1=n2.spanQuickCheckYes(s1);
1971             int spanQCYes2=n2.spanQuickCheckYes(s2);
1972 
1973             /*
1974              * ICU 2.4 had a further optimization:
1975              * If both strings were not in FCD, then they were both NFD'ed,
1976              * and the COMPARE_EQUIV option was turned off.
1977              * It is not entirely clear that this is valid with the current
1978              * definition of the canonical caseless match.
1979              * Therefore, ICU 2.6 removes that optimization.
1980              */
1981 
1982             if(spanQCYes1<s1.length()) {
1983                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
1984                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
1985             }
1986             if(spanQCYes2<s2.length()) {
1987                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
1988                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
1989             }
1990         }
1991 
1992         return cmpEquivFold(s1, s2, options);
1993     }
1994 
1995     /*
1996      * Compare two strings for canonical equivalence.
1997      * Further options include case-insensitive comparison and
1998      * code point order (as opposed to code unit order).
1999      *
2000      * In this function, canonical equivalence is optional as well.
2001      * If canonical equivalence is tested, then both strings must fulfill
2002      * the FCD check.
2003      *
2004      * Semantically, this is equivalent to
2005      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
2006      * where code point order, NFD and foldCase are all optional.
2007      *
2008      * String comparisons almost always yield results before processing both strings
2009      * completely.
2010      * They are generally more efficient working incrementally instead of
2011      * performing the sub-processing (strlen, normalization, case-folding)
2012      * on the entire strings first.
2013      *
     * It is also unnecessary to normalize identical characters.
2015      *
2016      * This function works in principle as follows:
2017      *
2018      * loop {
2019      *   get one code unit c1 from s1 (-1 if end of source)
2020      *   get one code unit c2 from s2 (-1 if end of source)
2021      *
2022      *   if(either string finished) {
2023      *     return result;
2024      *   }
2025      *   if(c1==c2) {
2026      *     continue;
2027      *   }
2028      *
2029      *   // c1!=c2
2030      *   try to decompose/case-fold c1/c2, and continue if one does;
2031      *
2032      *   // still c1!=c2 and neither decomposes/case-folds, return result
2033      *   return c1-c2;
2034      * }
2035      *
2036      * When a character decomposes, then the pointer for that source changes to
2037      * the decomposition, pushing the previous pointer onto a stack.
2038      * When the end of the decomposition is reached, then the code unit reader
2039      * pops the previous source from the stack.
2040      * (Same for case-folding.)
2041      *
2042      * This is complicated further by operating on variable-width UTF-16.
2043      * The top part of the loop works on code units, while lookups for decomposition
2044      * and case-folding need code points.
2045      * Code points are assembled after the equality/end-of-source part.
2046      * The source pointer is only advanced beyond all code units when the code point
2047      * actually decomposes/case-folds.
2048      *
2049      * If we were on a trail surrogate unit when assembling a code point,
2050      * and the code point decomposes/case-folds, then the decomposition/folding
2051      * result must be compared with the part of the other string that corresponds to
2052      * this string's lead surrogate.
2053      * Since we only assemble a code point when hitting a trail unit when the
2054      * preceding lead units were identical, we back up the other string by one unit
2055      * in such a case.
2056      *
2057      * The optional code point order comparison at the end works with
2058      * the same fix-up as the other code point order comparison functions.
2059      * See ustring.c and the comment near the end of this function.
2060      *
2061      * Assumption: A decomposition or case-folding result string never contains
2062      * a single surrogate. This is a safe assumption in the Unicode Standard.
2063      * Therefore, we do not need to check for surrogate pairs across
2064      * decomposition/case-folding boundaries.
2065      *
2066      * Further assumptions (see verifications tstnorm.cpp):
2067      * The API function checks for FCD first, while the core function
2068      * first case-folds and then decomposes. This requires that case-folding does not
2069      * un-FCD any strings.
2070      *
2071      * The API function may also NFD the input and turn off decomposition.
2072      * This requires that case-folding does not un-NFD strings either.
2073      *
2074      * TODO If any of the above two assumptions is violated,
2075      * then this entire code must be re-thought.
2076      * If this happens, then a simple solution is to case-fold both strings up front
2077      * and to turn off UNORM_INPUT_IS_FCD.
2078      * We already do this when not both strings are in FCD because makeFCD
2079      * would be a partial NFD before the case folding, which does not work.
2080      * Note that all of this is only a problem when case-folding _and_
2081      * canonical equivalence come together.
2082      * (Comments in unorm_compare() are more up to date than this TODO.)
2083      */
2084 
2085     /* stack element for previous-level source/decomposition pointers */
2086     private static final class CmpEquivLevel {
2087         CharSequence cs;
2088         int s;
2089     };
createCmpEquivLevelStack()2090     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2091         return new CmpEquivLevel[] {
2092             new CmpEquivLevel(), new CmpEquivLevel()
2093         };
2094     }
2095 
2096     /**
2097      * Internal option bit for cmpEquivFold() (named unorm_cmpEquivFold() in the comments ported from C):
2098      * when set, cmpEquivFold() fetches the NFC implementation and looks up decompositions of differing code points.
2099      * When not set, no decomposition is attempted and only the optional case folding applies (like strcasecmp()).
         */
2100     private static final int COMPARE_EQUIV=0x80000;
2101 
2102     /* internal function; package visibility for use by UTF16.StringComparator */
         /*
          * Compares cs1 and cs2 code unit by code unit. Only where the two strings differ
          * is the differing code point replaced ("go down one level") by its case folding
          * and/or its decomposition, and the comparison continues inside that replacement.
          *
          * options is a bit set; the bits tested in this function are
          *   COMPARE_EQUIV            - look up decompositions via nfcImpl.getDecomposition()
          *   COMPARE_IGNORE_CASE      - look up case foldings via csp.toFullFolding(),
          *                              which also receives the full options value
          *   COMPARE_CODE_POINT_ORDER - apply the surrogate fix-up before the final subtraction
          *
          * Returns 0 if both inputs end without a remaining difference, -1 if string 1 ends
          * first, 1 if string 2 ends first, and otherwise c1-c2 for the first pair of
          * code units that differ and can neither be case-folded nor decomposed further.
          *
          * Per string there are at most three levels: level 0 is the original text,
          * case folding is only entered from level 0, and a decomposition always ends up
          * at level 2. Each stack has two slots holding the text and index to resume at.
          */
cmpEquivFold(CharSequence cs1, CharSequence cs2, int options)2103     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2104         Normalizer2Impl nfcImpl;
2105         UCaseProps csp;
2106
2107         /* current-level start/limit - s1/s2 as current */
2108         int s1, s2, limit1, limit2;
2109
2110         /* decomposition and case folding variables */
2111         int length;
2112
2113         /* stacks of previous-level start/current/limit; allocated lazily on first push */
2114         CmpEquivLevel[] stack1=null, stack2=null;
2115
2116         /* buffers for algorithmic decompositions */
2117         String decomp1, decomp2;
2118
2119         /* case folding buffers, only use current-level start/limit */
2120         StringBuilder fold1, fold2;
2121
2122         /* track which is the current level per string */
2123         int level1, level2;
2124
2125         /* current code units, and code points for lookups */
2126         int c1, c2, cp1, cp2;
2127
2128         /* no argument error checking because this itself is not an API */
2129
2130         /*
2131          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2132          * otherwise this function must behave exactly as uprv_strCompare()
2133          * not checking for that here makes testing this function easier
2134          */
2135
2136         /* fetch only the normalization/case properties objects that the options ask for */
2137         if((options&COMPARE_EQUIV)!=0) {
2138             nfcImpl=Norm2AllModes.getNFCInstance().impl;
2139         } else {
2140             nfcImpl=null;
2141         }
2142         if((options&COMPARE_IGNORE_CASE)!=0) {
2143             csp=UCaseProps.INSTANCE;
2144             fold1=new StringBuilder();
2145             fold2=new StringBuilder();
2146         } else {
2147             csp=null;
2148             fold1=fold2=null;
2149         }
2150
2151         /* initialize */
2152         s1=0;
2153         limit1=cs1.length();
2154         s2=0;
2155         limit2=cs2.length();
2156
2157         level1=level2=0;
2158         c1=c2=-1;
2159
2160         /* comparison loop */
2161         for(;;) {
2162             /*
2163              * here a code unit value of -1 means "get another code unit"
2164              * below it will mean "this source is finished"
2165              */
2166
2167             if(c1<0) {
2168                 /* get next code unit from string 1, post-increment */
2169                 for(;;) {
2170                     if(s1==limit1) {
2171                         if(level1==0) {
                                 /* the original string 1 is exhausted: leave c1 at -1 */
2172                             c1=-1;
2173                             break;
2174                         }
2175                     } else {
2176                         c1=cs1.charAt(s1++);
2177                         break;
2178                     }
2179
2180                     /* reached end of level buffer, pop one level (skipping an empty placeholder level) */
2181                     do {
2182                         --level1;
2183                         cs1=stack1[level1].cs;
2184                     } while(cs1==null);
2185                     s1=stack1[level1].s;
2186                     limit1=cs1.length();
2187                 }
2188             }
2189
2190             if(c2<0) {
2191                 /* get next code unit from string 2, post-increment */
2192                 for(;;) {
2193                     if(s2==limit2) {
2194                         if(level2==0) {
                                 /* the original string 2 is exhausted: leave c2 at -1 */
2195                             c2=-1;
2196                             break;
2197                         }
2198                     } else {
2199                         c2=cs2.charAt(s2++);
2200                         break;
2201                     }
2202
2203                     /* reached end of level buffer, pop one level (skipping an empty placeholder level) */
2204                     do {
2205                         --level2;
2206                         cs2=stack2[level2].cs;
2207                     } while(cs2==null);
2208                     s2=stack2[level2].s;
2209                     limit2=cs2.length();
2210                 }
2211             }
2212
2213             /*
2214              * compare c1 and c2
2215              * either variable c1, c2 is -1 only if the corresponding string is finished
2216              */
2217             if(c1==c2) {
2218                 if(c1<0) {
2219                     return 0;   /* c1==c2==-1 indicating end of strings */
2220                 }
2221                 c1=c2=-1;       /* make us fetch new code units */
2222                 continue;
2223             } else if(c1<0) {
2224                 return -1;      /* string 1 ends before string 2 */
2225             } else if(c2<0) {
2226                 return 1;       /* string 2 ends before string 1 */
2227             }
2228             /* c1!=c2 && c1>=0 && c2>=0 */
2229
2230             /* get complete code points for c1, c2 for lookups if either is a surrogate */
2231             cp1=c1;
2232             if(UTF16.isSurrogate((char)c1)) {
2233                 char c;
2234
2235                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                         /* s1 already points past c1, so charAt(s1) is the unit following the lead */
2236                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2237                         /* advance ++s1; only below if cp1 decomposes/case-folds */
2238                         cp1=Character.toCodePoint((char)c1, c);
2239                     }
2240                 } else /* isTrail(c1) */ {
                         /* c1 was read from index s1-1, so a matching lead would be at s1-2 */
2241                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2242                         cp1=Character.toCodePoint(c, (char)c1);
2243                     }
2244                 }
2245             }
2246
2247             cp2=c2;
2248             if(UTF16.isSurrogate((char)c2)) {
2249                 char c;
2250
2251                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                         /* s2 already points past c2, so charAt(s2) is the unit following the lead */
2252                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2253                         /* advance ++s2; only below if cp2 decomposes/case-folds */
2254                         cp2=Character.toCodePoint((char)c2, c);
2255                     }
2256                 } else /* isTrail(c2) */ {
                         /* c2 was read from index s2-1, so a matching lead would be at s2-2 */
2257                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2258                         cp2=Character.toCodePoint(c, (char)c2);
2259                     }
2260                 }
2261             }
2262
2263             /*
2264              * go down one level for each string
2265              * continue with the main loop as soon as there is a real change
2266              */
2267
                 /* case folding of string 1: only attempted while reading the original text (level 0) */
2268             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2269                 (length=csp.toFullFolding(cp1, fold1, options))>=0
2270             ) {
2271                 /* cp1 case-folds: "length" is either the folded code point or the length of the folding string */
2272                 if(UTF16.isSurrogate((char)c1)) {
2273                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2274                         /* advance beyond source surrogate pair if it case-folds */
2275                         ++s1;
2276                     } else /* isTrail(c1) */ {
2277                         /*
2278                          * we got a supplementary code point when hitting its trail surrogate,
2279                          * therefore the lead surrogate must have been the same as in the other string;
2280                          * compare this case folding with the lead surrogate in the other string
2281                          * remember that this simulates bulk text replacement:
2282                          * the case folding would replace the entire code point
2283                          */
                             /* back up string 2 by one unit and re-read its preceding unit as c2 */
2284                         --s2;
2285                         c2=cs2.charAt(s2-1);
2286                     }
2287                 }
2288
2289                 /* push current level pointers (level1==0 here, so slot 0 is used) */
2290                 if(stack1==null) {
2291                     stack1=createCmpEquivLevelStack();
2292                 }
2293                 stack1[0].cs=cs1;
2294                 stack1[0].s=s1;
2295                 ++level1;
2296
2297                 /* copy the folding result to fold1[] */
2298                 /* Java: the buffer was probably not empty, remove the old contents */
                     /* NOTE(review): presumably toFullFolding() appended a "length"-unit string to fold1; only those trailing units are kept - confirm */
2299                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2300                     fold1.delete(0, fold1.length()-length);
2301                 } else {
                         /* "length" is a single code point: make it the only buffer content */
2302                     fold1.setLength(0);
2303                     fold1.appendCodePoint(length);
2304                 }
2305
2306                 /* set next level pointers to case folding */
2307                 cs1=fold1;
2308                 s1=0;
2309                 limit1=fold1.length();
2310
2311                 /* get ready to read from case folding, continue with loop */
2312                 c1=-1;
2313                 continue;
2314             }
2315
                 /* case folding of string 2: only attempted while reading the original text (level 0) */
2316             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2317                 (length=csp.toFullFolding(cp2, fold2, options))>=0
2318             ) {
2319                 /* cp2 case-folds: "length" is either the folded code point or the length of the folding string */
2320                 if(UTF16.isSurrogate((char)c2)) {
2321                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2322                         /* advance beyond source surrogate pair if it case-folds */
2323                         ++s2;
2324                     } else /* isTrail(c2) */ {
2325                         /*
2326                          * we got a supplementary code point when hitting its trail surrogate,
2327                          * therefore the lead surrogate must have been the same as in the other string;
2328                          * compare this case folding with the lead surrogate in the other string
2329                          * remember that this simulates bulk text replacement:
2330                          * the case folding would replace the entire code point
2331                          */
                             /* back up string 1 by one unit and re-read its preceding unit as c1 */
2332                         --s1;
2333                         c1=cs1.charAt(s1-1);
2334                     }
2335                 }
2336
2337                 /* push current level pointers (level2==0 here, so slot 0 is used) */
2338                 if(stack2==null) {
2339                     stack2=createCmpEquivLevelStack();
2340                 }
2341                 stack2[0].cs=cs2;
2342                 stack2[0].s=s2;
2343                 ++level2;
2344
2345                 /* copy the folding result to fold2[] */
2346                 /* Java: the buffer was probably not empty, remove the old contents */
                     /* NOTE(review): presumably toFullFolding() appended a "length"-unit string to fold2; only those trailing units are kept - confirm */
2347                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2348                     fold2.delete(0, fold2.length()-length);
2349                 } else {
                         /* "length" is a single code point: make it the only buffer content */
2350                     fold2.setLength(0);
2351                     fold2.appendCodePoint(length);
2352                 }
2353
2354                 /* set next level pointers to case folding */
2355                 cs2=fold2;
2356                 s2=0;
2357                 limit2=fold2.length();
2358
2359                 /* get ready to read from case folding, continue with loop */
2360                 c2=-1;
2361                 continue;
2362             }
2363
                 /* decomposition of string 1: allowed from level 0 (original) or level 1 (case folding) */
2364             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2365                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2366             ) {
2367                 /* cp1 decomposes into decomp1 */
2368                 if(UTF16.isSurrogate((char)c1)) {
2369                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2370                         /* advance beyond source surrogate pair if it decomposes */
2371                         ++s1;
2372                     } else /* isTrail(c1) */ {
2373                         /*
2374                          * we got a supplementary code point when hitting its trail surrogate,
2375                          * therefore the lead surrogate must have been the same as in the other string;
2376                          * compare this decomposition with the lead surrogate in the other string
2377                          * remember that this simulates bulk text replacement:
2378                          * the decomposition would replace the entire code point
2379                          */
                             /* back up string 2 by one unit and re-read its preceding unit as c2 */
2380                         --s2;
2381                         c2=cs2.charAt(s2-1);
2382                     }
2383                 }
2384
2385                 /* push current level pointers */
2386                 if(stack1==null) {
2387                     stack1=createCmpEquivLevelStack();
2388                 }
2389                 stack1[level1].cs=cs1;
2390                 stack1[level1].s=s1;
2391                 ++level1;
2392
2393                 /* set empty intermediate level if skipped, so that decomposition is always level 2 */
2394                 if(level1<2) {
2395                     stack1[level1++].cs=null;
2396                 }
2397
2398                 /* set next level pointers to decomposition */
2399                 cs1=decomp1;
2400                 s1=0;
2401                 limit1=decomp1.length();
2402
2403                 /* get ready to read from decomposition, continue with loop */
2404                 c1=-1;
2405                 continue;
2406             }
2407
                 /* decomposition of string 2: allowed from level 0 (original) or level 1 (case folding) */
2408             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2409                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2410             ) {
2411                 /* cp2 decomposes into decomp2 */
2412                 if(UTF16.isSurrogate((char)c2)) {
2413                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2414                         /* advance beyond source surrogate pair if it decomposes */
2415                         ++s2;
2416                     } else /* isTrail(c2) */ {
2417                         /*
2418                          * we got a supplementary code point when hitting its trail surrogate,
2419                          * therefore the lead surrogate must have been the same as in the other string;
2420                          * compare this decomposition with the lead surrogate in the other string
2421                          * remember that this simulates bulk text replacement:
2422                          * the decomposition would replace the entire code point
2423                          */
                             /* back up string 1 by one unit and re-read its preceding unit as c1 */
2424                         --s1;
2425                         c1=cs1.charAt(s1-1);
2426                     }
2427                 }
2428
2429                 /* push current level pointers */
2430                 if(stack2==null) {
2431                     stack2=createCmpEquivLevelStack();
2432                 }
2433                 stack2[level2].cs=cs2;
2434                 stack2[level2].s=s2;
2435                 ++level2;
2436
2437                 /* set empty intermediate level if skipped, so that decomposition is always level 2 */
2438                 if(level2<2) {
2439                     stack2[level2++].cs=null;
2440                 }
2441
2442                 /* set next level pointers to decomposition */
2443                 cs2=decomp2;
2444                 s2=0;
2445                 limit2=decomp2.length();
2446
2447                 /* get ready to read from decomposition, continue with loop */
2448                 c2=-1;
2449                 continue;
2450             }
2451
2452             /*
2453              * no decomposition/case folding, max level for both sides:
2454              * return difference result
2455              *
2456              * code point order comparison must not just return cp1-cp2
2457              * because when single surrogates are present then the surrogate pairs
2458              * that formed cp1 and cp2 may be from different string indexes
2459              *
2460              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2461              * c1=d800 cp1=10001 c2=dc00 cp2=10000
2462              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2463              *
2464              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2465              * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
2466              * so we have slightly different pointer/start/limit comparisons here
2467              */
2468
2469             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2470                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
2471                 if(
2472                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2473                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2474                 ) {
2475                     /* part of a surrogate pair, leave >=d800 */
2476                 } else {
2477                     /* BMP code point - may be surrogate code point - make <d800 */
2478                     c1-=0x2800;
2479                 }
2480
2481                 if(
2482                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2483                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2484                 ) {
2485                     /* part of a surrogate pair, leave >=d800 */
2486                 } else {
2487                     /* BMP code point - may be surrogate code point - make <d800 */
2488                     c2-=0x2800;
2489                 }
2490             }
2491
                 /* both values are non-negative code units (possibly shifted down by 0x2800), so the subtraction cannot overflow */
2492             return c1-c2;
2493         }
2494     }
2495 
2496     /**
2497      * An Appendable that writes into a char array with a capacity that may be
2498      * less than array.length.
2499      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2500      * <p>
2501      * An overflow is only reported at the end, for the old Normalizer API functions that write
2502      * to char arrays.
2503      */
2504     private static final class CharsAppendable implements Appendable {
CharsAppendable(char[] dest, int destStart, int destLimit)2505         public CharsAppendable(char[] dest, int destStart, int destLimit) {
2506             chars=dest;
2507             start=offset=destStart;
2508             limit=destLimit;
2509         }
length()2510         public int length() {
2511             int len=offset-start;
2512             if(offset<=limit) {
2513                 return len;
2514             } else {
2515                 throw new IndexOutOfBoundsException(Integer.toString(len));
2516             }
2517         }
append(char c)2518         public Appendable append(char c) {
2519             if(offset<limit) {
2520                 chars[offset]=c;
2521             }
2522             ++offset;
2523             return this;
2524         }
append(CharSequence s)2525         public Appendable append(CharSequence s) {
2526             return append(s, 0, s.length());
2527         }
append(CharSequence s, int sStart, int sLimit)2528         public Appendable append(CharSequence s, int sStart, int sLimit) {
2529             int len=sLimit-sStart;
2530             if(len<=(limit-offset)) {
2531                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
2532                     chars[offset++]=s.charAt(sStart++);
2533                 }
2534             } else {
2535                 offset+=len;
2536             }
2537             return this;
2538         }
2539 
2540         private final char[] chars;
2541         private final int start, limit;
2542         private int offset;
2543     }
2544 }
2545