• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2000-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package ohos.global.icu.text;
11 import java.nio.CharBuffer;
12 import java.text.CharacterIterator;
13 
14 import ohos.global.icu.impl.Norm2AllModes;
15 import ohos.global.icu.impl.Normalizer2Impl;
16 import ohos.global.icu.impl.UCaseProps;
17 import ohos.global.icu.lang.UCharacter;
18 import ohos.global.icu.util.ICUCloneNotSupportedException;
19 
20 /**
21  * Old Unicode normalization API.
22  *
23  * <p>This API has been replaced by the {@link Normalizer2} class and is only available
24  * for backward compatibility. This class simply delegates to the Normalizer2 class.
25  * There are two exceptions: The new API does not provide a replacement for
26  * <code>QuickCheckResult</code> and <code>compare()</code>.
27  *
28  * <p><code>normalize</code> transforms Unicode text into an equivalent composed or
29  * decomposed form, allowing for easier sorting and searching of text.
30  * <code>normalize</code> supports the standard normalization forms described in
31  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
32  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
33  *
34  * <p>Characters with accents or other adornments can be encoded in
35  * several different ways in Unicode.  For example, take the character A-acute.
36  * In Unicode, this can be encoded as a single character (the
37  * "composed" form):
38  *
39  * <pre>
40  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
41  * </pre>
42  *
43  * or as two separate characters (the "decomposed" form):
44  *
45  * <pre>
46  *      0041    LATIN CAPITAL LETTER A
47  *      0301    COMBINING ACUTE ACCENT
48  * </pre>
49  *
50  * <p>To a user of your program, however, both of these sequences should be
51  * treated as the same "user-level" character "A with acute accent".  When you
52  * are searching or comparing text, you must ensure that these two sequences are
53  * treated equivalently.  In addition, you must handle characters with more than
54  * one accent.  Sometimes the order of a character's combining accents is
55  * significant, while in other cases accent sequences in different orders are
56  * really equivalent.
57  *
58  * <p>Similarly, the string "ffi" can be encoded as three separate letters:
59  *
60  * <pre>
61  *      0066    LATIN SMALL LETTER F
62  *      0066    LATIN SMALL LETTER F
63  *      0069    LATIN SMALL LETTER I
64  * </pre>
65  *
66  * or as the single character
67  *
68  * <pre>
69  *      FB03    LATIN SMALL LIGATURE FFI
70  * </pre>
71  *
72  * <p>The ffi ligature is not a distinct semantic character, and strictly speaking
73  * it shouldn't be in Unicode at all, but it was included for compatibility
74  * with existing character sets that already provided it.  The Unicode standard
75  * identifies such characters by giving them "compatibility" decompositions
76  * into the corresponding semantic characters.  When sorting and searching, you
77  * will often want to use these mappings.
78  *
79  * <p><code>normalize</code> helps solve these problems by transforming text into
80  * the canonical composed and decomposed forms as shown in the first example
81  * above. In addition, you can have it perform compatibility decompositions so
82  * that you can treat compatibility characters the same as their equivalents.
83  * Finally, <code>normalize</code> rearranges accents into the proper canonical
84  * order, so that you do not have to worry about accent rearrangement on your
85  * own.
86  *
87  * <p>Form FCD, "Fast C or D", is also designed for collation.
88  * It allows working on strings that are not necessarily normalized
89  * with an algorithm (like in collation) that works under "canonical closure",
90  * i.e., it treats precomposed characters and their decomposed equivalents the
91  * same.
92  *
93  * <p>It is not a normalization form because it does not provide for uniqueness of
94  * representation. Multiple strings may be canonically equivalent (their NFDs
95  * are identical) and may all conform to FCD without being identical themselves.
96  *
97  * <p>The form is defined such that the "raw decomposition", the recursive
98  * canonical decomposition of each character, results in a string that is
99  * canonically ordered. This means that precomposed characters are allowed for
100  * as long as their decompositions do not need canonical reordering.
101  *
102  * <p>Its advantage for a process like collation is that all NFD and most NFC texts
103  * - and many unnormalized texts - already conform to FCD and do not need to be
104  * normalized (NFD) for such a process. The FCD quick check will return YES for
105  * most strings in practice.
106  *
107  * <p>normalize(FCD) may be implemented with NFD.
108  *
109  * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
110  * http://www.unicode.org/notes/tn5/#FCD
111  *
112  * <p>ICU collation performs either NFD or FCD normalization automatically if
113  * normalization is turned on for the collator object. Beyond collation and
114  * string search, normalized strings may be useful for string equivalence
115  * comparisons, transliteration/transcription, unique representations, etc.
116  *
117  * <p>The W3C generally recommends exchanging texts in NFC.
118  * Note also that most legacy character encodings use only precomposed forms and
119  * often do not encode any combining marks by themselves. For conversion to such
120  * character encodings the Unicode text needs to be normalized to NFC.
121  * For more usage examples, see the Unicode Standard Annex.
122  *
123  * <p>Note: The Normalizer class also provides API for iterative normalization.
124  * While the setIndex() and getIndex() refer to indices in the
125  * underlying Unicode input text, the next() and previous() methods
126  * iterate through characters in the normalized output.
127  * This means that there is not necessarily a one-to-one correspondence
128  * between characters returned by next() and previous() and the indices
129  * passed to and returned from setIndex() and getIndex().
130  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
131  */
132 public final class Normalizer implements Cloneable {
133     // The input text and our position in it
134     private UCharacterIterator  text;
135     private Normalizer2         norm2;
136     private Mode                mode;
137     private int                 options;
138 
139     // The normalization buffer is the result of normalization
140     // of the source in [currentIndex..nextIndex[ .
141     private int                 currentIndex;
142     private int                 nextIndex;
143 
144     // A buffer for holding intermediate results
145     private StringBuilder       buffer;
146     private int                 bufferPos;
147 
148     // Helper classes to defer loading of normalization data.
149     private static final class ModeImpl {
ModeImpl(Normalizer2 n2)150         private ModeImpl(Normalizer2 n2) {
151             normalizer2 = n2;
152         }
153         private final Normalizer2 normalizer2;
154     }
155     private static final class NFDModeImpl {
156         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
157     }
158     private static final class NFKDModeImpl {
159         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
160     }
161     private static final class NFCModeImpl {
162         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
163     }
164     private static final class NFKCModeImpl {
165         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
166     }
167     private static final class FCDModeImpl {
168         private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
169     }
170 
171     private static final class Unicode32 {
172         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
173     }
174     private static final class NFD32ModeImpl {
175         private static final ModeImpl INSTANCE =
176             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
177                                                  Unicode32.INSTANCE));
178     }
179     private static final class NFKD32ModeImpl {
180         private static final ModeImpl INSTANCE =
181             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
182                                                  Unicode32.INSTANCE));
183     }
184     private static final class NFC32ModeImpl {
185         private static final ModeImpl INSTANCE =
186             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
187                                                  Unicode32.INSTANCE));
188     }
189     private static final class NFKC32ModeImpl {
190         private static final ModeImpl INSTANCE =
191             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
192                                                  Unicode32.INSTANCE));
193     }
194     private static final class FCD32ModeImpl {
195         private static final ModeImpl INSTANCE =
196             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
197                                                  Unicode32.INSTANCE));
198     }
199 
200     /**
201      * Options bit set value to select Unicode 3.2 normalization
202      * (except NormalizationCorrections).
203      * At most one Unicode version can be selected at a time.
204      *
205      * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead.
206      * @hide deprecated on icu4j-org
207      */
208     @Deprecated
209     public static final int UNICODE_3_2=0x20;
210 
211     /**
212      * Constant indicating that the end of the iteration has been reached.
213      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
214      *
215      * @deprecated ICU 56
216      * @hide deprecated on icu4j-org
217      */
218     @Deprecated
219     public static final int DONE = UCharacterIterator.DONE;
220 
221     /**
222      * Constants for normalization modes.
223      * <p>
224      * The Mode class is not intended for public subclassing.
225      * Only the Mode constants provided by the Normalizer class should be used,
226      * and any fields or methods should not be called or overridden by users.
227      *
228      * @deprecated ICU 56 Use {@link Normalizer2} instead.
229      * @hide exposed on OHOS
230      * @hide deprecated on icu4j-org
231      */
232     @Deprecated
233     public static abstract class Mode {
234         /**
235          * Sole constructor
236          * @deprecated This API is ICU internal only.
237          * @hide deprecated on icu4j-org
238          * @hide draft / provisional / internal are hidden on OHOS
239          */
240         @Deprecated
Mode()241         protected Mode() {
242         }
243 
244         /**
245          * @deprecated This API is ICU internal only.
246          * @hide deprecated on icu4j-org
247          * @hide draft / provisional / internal are hidden on OHOS
248          */
249         @Deprecated
getNormalizer2(int options)250         protected abstract Normalizer2 getNormalizer2(int options);
251     }
252 
253     private static final class NONEMode extends Mode {
254         @Override
getNormalizer2(int options)255         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
256     }
257     private static final class NFDMode extends Mode {
258         @Override
getNormalizer2(int options)259         protected Normalizer2 getNormalizer2(int options) {
260             return (options&UNICODE_3_2) != 0 ?
261                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
262         }
263     }
264     private static final class NFKDMode extends Mode {
265         @Override
getNormalizer2(int options)266         protected Normalizer2 getNormalizer2(int options) {
267             return (options&UNICODE_3_2) != 0 ?
268                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
269         }
270     }
271     private static final class NFCMode extends Mode {
272         @Override
getNormalizer2(int options)273         protected Normalizer2 getNormalizer2(int options) {
274             return (options&UNICODE_3_2) != 0 ?
275                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
276         }
277     }
278     private static final class NFKCMode extends Mode {
279         @Override
getNormalizer2(int options)280         protected Normalizer2 getNormalizer2(int options) {
281             return (options&UNICODE_3_2) != 0 ?
282                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
283         }
284     }
285     private static final class FCDMode extends Mode {
286         @Override
getNormalizer2(int options)287         protected Normalizer2 getNormalizer2(int options) {
288             return (options&UNICODE_3_2) != 0 ?
289                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
290         }
291     }
292 
293     /**
294      * No decomposition/composition.
295      *
296      * @deprecated ICU 56 Use {@link Normalizer2} instead.
297      * @hide deprecated on icu4j-org
298      */
299     @Deprecated
300     public static final Mode NONE = new NONEMode();
301 
302     /**
303      * Canonical decomposition.
304      *
305      * @deprecated ICU 56 Use {@link Normalizer2} instead.
306      * @hide deprecated on icu4j-org
307      */
308     @Deprecated
309     public static final Mode NFD = new NFDMode();
310 
311     /**
312      * Compatibility decomposition.
313      *
314      * @deprecated ICU 56 Use {@link Normalizer2} instead.
315      * @hide deprecated on icu4j-org
316      */
317     @Deprecated
318     public static final Mode NFKD = new NFKDMode();
319 
320     /**
321      * Canonical decomposition followed by canonical composition.
322      *
323      * @deprecated ICU 56 Use {@link Normalizer2} instead.
324      * @hide deprecated on icu4j-org
325      */
326     @Deprecated
327     public static final Mode NFC = new NFCMode();
328 
329     /**
330      * Default normalization.
331      *
332      * @deprecated ICU 56 Use {@link Normalizer2} instead.
333      * @hide deprecated on icu4j-org
334      */
335     @Deprecated
336     public static final Mode DEFAULT = NFC;
337 
338     /**
339      * Compatibility decomposition followed by canonical composition.
340      *
341      * @deprecated ICU 56 Use {@link Normalizer2} instead.
342      * @hide deprecated on icu4j-org
343      */
344     @Deprecated
345     public static final Mode NFKC =new NFKCMode();
346 
347     /**
348      * "Fast C or D" form.
349      *
350      * @deprecated ICU 56 Use {@link Normalizer2} instead.
351      * @hide deprecated on icu4j-org
352      */
353     @Deprecated
354     public static final Mode FCD = new FCDMode();
355 
356     /**
357      * Null operation for use with the {@link ohos.global.icu.text.Normalizer constructors}
358      * and the static {@link #normalize normalize} method.  This value tells
359      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
360      * from the underlying String or CharacterIterator.  If you have code which
361      * requires raw text at some times and normalized text at others, you can
362      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
363      * than having a separate code path that bypasses <tt>Normalizer</tt>
364      * altogether.
365      * <p>
366      * @see #setMode
367      * @deprecated ICU 2.8. Use Normalizer.NONE
368      * @see #NONE
369      * @hide deprecated on icu4j-org
370      */
371     @Deprecated
372     public static final Mode NO_OP = NONE;
373 
374     /**
375      * Canonical decomposition followed by canonical composition.  Used with the
376      * {@link ohos.global.icu.text.Normalizer constructors} and the static
377      * {@link #normalize normalize} method to determine the operation to be
378      * performed.
379      * <p>
380      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
381      * off, this operation produces output that is in
382      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
383      * Form</a>
384      * <b>C</b>.
385      * <p>
386      * @see #setMode
387      * @deprecated ICU 2.8. Use Normalizer.NFC
388      * @see #NFC
389      * @hide deprecated on icu4j-org
390      */
391     @Deprecated
392     public static final Mode COMPOSE = NFC;
393 
394     /**
395      * Compatibility decomposition followed by canonical composition.
396      * Used with the {@link ohos.global.icu.text.Normalizer constructors} and the static
397      * {@link #normalize normalize} method to determine the operation to be
398      * performed.
399      * <p>
400      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
401      * off, this operation produces output that is in
402      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
403      * Form</a>
404      * <b>KC</b>.
405      * <p>
406      * @see #setMode
407      * @deprecated ICU 2.8. Use Normalizer.NFKC
408      * @see #NFKC
409      * @hide deprecated on icu4j-org
410      */
411     @Deprecated
412     public static final Mode COMPOSE_COMPAT = NFKC;
413 
414     /**
415      * Canonical decomposition.  This value is passed to the
416      * {@link ohos.global.icu.text.Normalizer constructors} and the static
417      * {@link #normalize normalize}
418      * method to determine the operation to be performed.
419      * <p>
420      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
421      * off, this operation produces output that is in
422      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
423      * Form</a>
424      * <b>D</b>.
425      * <p>
426      * @see #setMode
427      * @deprecated ICU 2.8. Use Normalizer.NFD
428      * @see #NFD
429      * @hide deprecated on icu4j-org
430      */
431     @Deprecated
432     public static final Mode DECOMP = NFD;
433 
434     /**
435      * Compatibility decomposition.  This value is passed to the
436      * {@link ohos.global.icu.text.Normalizer constructors} and the static
437      * {@link #normalize normalize}
438      * method to determine the operation to be performed.
439      * <p>
440      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
441      * off, this operation produces output that is in
442      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
443      * Form</a>
444      * <b>KD</b>.
445      * <p>
446      * @see #setMode
447      * @deprecated ICU 2.8. Use Normalizer.NFKD
448      * @see #NFKD
449      * @hide deprecated on icu4j-org
450      */
451     @Deprecated
452     public static final Mode DECOMP_COMPAT = NFKD;
453 
454     /**
455      * Option to disable Hangul/Jamo composition and decomposition.
456      * This option applies to Korean text,
457      * which can be represented either in the Jamo alphabet or in Hangul
458      * characters, which are really just two or three Jamo combined
459      * into one visual glyph.  Since Jamo takes up more storage space than
460      * Hangul, applications that process only Hangul text may wish to turn
461      * this option on when decomposing text.
462      * <p>
463      * The Unicode standard treats Hangul to Jamo conversion as a
464      * canonical decomposition, so this option must be turned <b>off</b> if you
465      * wish to transform strings into one of the standard
466      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
467      * Unicode Normalization Forms</a>.
468      * <p>
469      * @see #setOption
470      * @deprecated ICU 2.8. This option is no longer supported.
471      * @hide deprecated on icu4j-org
472      */
473     @Deprecated
474     public static final int IGNORE_HANGUL = 0x0001;
475 
476     /**
477      * Result values for quickCheck().
478      * For details see Unicode Technical Report 15.
479      */
480     public static final class QuickCheckResult{
481         //private int resultValue;
QuickCheckResult(int value)482         private QuickCheckResult(int value) {
483             //resultValue=value;
484         }
485     }
486     /**
487      * Indicates that string is not in the normalized format
488      */
489     public static final QuickCheckResult NO = new QuickCheckResult(0);
490 
491     /**
492      * Indicates that string is in the normalized format
493      */
494     public static final QuickCheckResult YES = new QuickCheckResult(1);
495 
496     /**
497      * Indicates it cannot be determined if string is in the normalized
498      * format without further thorough checks.
499      */
500     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
501 
502     /**
503      * Option bit for compare:
504      * Case sensitively compare the strings
505      */
506     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
507 
508     /**
509      * Option bit for compare:
510      * Both input strings are assumed to fulfill FCD conditions.
511      */
512     public static final int INPUT_IS_FCD    =      0x20000;
513 
514     /**
515      * Option bit for compare:
516      * Perform case-insensitive comparison.
517      */
518     public static final int COMPARE_IGNORE_CASE  =     0x10000;
519 
520     /**
521      * Option bit for compare:
522      * Compare strings in code point order instead of code unit order.
523      */
524     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
525 
526     /**
527      * Option value for case folding:
528      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
529      * and dotless i appropriately for Turkic languages (tr, az).
530      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
531      */
532     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
533 
534     /**
535      * Lowest-order bit number of compare() options bits corresponding to
536      * normalization options bits.
537      *
538      * The options parameter for compare() uses most bits for
539      * itself and for various comparison and folding flags.
540      * The most significant bits, however, are shifted down and passed on
541      * to the normalization implementation.
542      * (That is, from compare(..., options, ...),
543      * options&gt;&gt;COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
544      * internal normalization functions.)
545      *
546      * @see #compare
547      * @deprecated ICU 56 Use {@link Normalizer2} instead.
548      * @hide deprecated on icu4j-org
549      */
550     @Deprecated
551     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
552 
553     //-------------------------------------------------------------------------
554     // Iterator constructors
555     //-------------------------------------------------------------------------
556 
557     /**
558      * Creates a new <tt>Normalizer</tt> object for iterating over the
559      * normalized form of a given string.
560      * <p>
561      * The <tt>options</tt> parameter specifies which optional
562      * <tt>Normalizer</tt> features are to be enabled for this object.
563      * <p>
564      * @param str  The string to be normalized.  The normalization
565      *              will start at the beginning of the string.
566      *
567      * @param mode The normalization mode.
568      *
569      * @param opt Any optional features to be enabled.
570      *            Currently the only available option is {@link #UNICODE_3_2}.
571      *            If you want the default behavior corresponding to one of the
572      *            standard Unicode Normalization Forms, use 0 for this argument.
573      * @deprecated ICU 56 Use {@link Normalizer2} instead.
574      * @hide deprecated on icu4j-org
575      */
576     @Deprecated
Normalizer(String str, Mode mode, int opt)577     public Normalizer(String str, Mode mode, int opt) {
578         this.text = UCharacterIterator.getInstance(str);
579         this.mode = mode;
580         this.options=opt;
581         norm2 = mode.getNormalizer2(opt);
582         buffer = new StringBuilder();
583     }
584 
585     /**
586      * Creates a new <tt>Normalizer</tt> object for iterating over the
587      * normalized form of the given text.
588      * <p>
589      * @param iter  The input text to be normalized.  The normalization
590      *              will start at the beginning of the string.
591      *
592      * @param mode  The normalization mode.
593      *
594      * @param opt Any optional features to be enabled.
595      *            Currently the only available option is {@link #UNICODE_3_2}.
596      *            If you want the default behavior corresponding to one of the
597      *            standard Unicode Normalization Forms, use 0 for this argument.
598      * @deprecated ICU 56 Use {@link Normalizer2} instead.
599      * @hide deprecated on icu4j-org
600      */
601     @Deprecated
Normalizer(CharacterIterator iter, Mode mode, int opt)602     public Normalizer(CharacterIterator iter, Mode mode, int opt) {
603         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
604         this.mode = mode;
605         this.options = opt;
606         norm2 = mode.getNormalizer2(opt);
607         buffer = new StringBuilder();
608     }
609 
610     /**
611      * Creates a new <tt>Normalizer</tt> object for iterating over the
612      * normalized form of the given text.
613      * <p>
614      * @param iter  The input text to be normalized.  The normalization
615      *              will start at the beginning of the string.
616      *
617      * @param mode  The normalization mode.
618      * @param options The normalization options, ORed together (0 for no options).
619      * @deprecated ICU 56 Use {@link Normalizer2} instead.
620      * @hide deprecated on icu4j-org
621      */
622     @Deprecated
Normalizer(UCharacterIterator iter, Mode mode, int options)623     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
624         try {
625             this.text     = (UCharacterIterator)iter.clone();
626             this.mode     = mode;
627             this.options  = options;
628             norm2 = mode.getNormalizer2(options);
629             buffer = new StringBuilder();
630         } catch (CloneNotSupportedException e) {
631             throw new ICUCloneNotSupportedException(e);
632         }
633     }
634 
635     /**
636      * Clones this <tt>Normalizer</tt> object.  All properties of this
637      * object are duplicated in the new object, including the cloning of any
638      * {@link CharacterIterator} that was passed in to the constructor
639      * or to {@link #setText(CharacterIterator) setText}.
640      * However, the text storage underlying
641      * the <tt>CharacterIterator</tt> is not duplicated unless the
642      * iterator's <tt>clone</tt> method does so.
643      *
644      * @deprecated ICU 56 Use {@link Normalizer2} instead.
645      * @hide deprecated on icu4j-org
646      */
647     @Deprecated
648     @Override
clone()649     public Object clone() {
650         try {
651             Normalizer copy = (Normalizer) super.clone();
652             copy.text = (UCharacterIterator) text.clone();
653             copy.mode = mode;
654             copy.options = options;
655             copy.norm2 = norm2;
656             copy.buffer = new StringBuilder(buffer);
657             copy.bufferPos = bufferPos;
658             copy.currentIndex = currentIndex;
659             copy.nextIndex = nextIndex;
660             return copy;
661         }
662         catch (CloneNotSupportedException e) {
663             throw new ICUCloneNotSupportedException(e);
664         }
665     }
666 
667     //--------------------------------------------------------------------------
668     // Static Utility methods
669     //--------------------------------------------------------------------------
670 
getComposeNormalizer2(boolean compat, int options)671     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
672         return (compat ? NFKC : NFC).getNormalizer2(options);
673     }
getDecomposeNormalizer2(boolean compat, int options)674     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
675         return (compat ? NFKD : NFD).getNormalizer2(options);
676     }
677 
678     /**
679      * Compose a string.
680      * The string will be composed to according to the specified mode.
681      * @param str        The string to compose.
682      * @param compat     If true the string will be composed according to
683      *                    NFKC rules and if false will be composed according to
684      *                    NFC rules.
685      * @return String    The composed string
686      * @deprecated ICU 56 Use {@link Normalizer2} instead.
687      * @hide deprecated on icu4j-org
688      */
689     @Deprecated
compose(String str, boolean compat)690     public static String compose(String str, boolean compat) {
691         return compose(str,compat,0);
692     }
693 
694     /**
695      * Compose a string.
696      * The string will be composed to according to the specified mode.
697      * @param str        The string to compose.
698      * @param compat     If true the string will be composed according to
699      *                    NFKC rules and if false will be composed according to
700      *                    NFC rules.
701      * @param options    The only recognized option is UNICODE_3_2
702      * @return String    The composed string
703      * @deprecated ICU 56 Use {@link Normalizer2} instead.
704      * @hide deprecated on icu4j-org
705      */
706     @Deprecated
compose(String str, boolean compat, int options)707     public static String compose(String str, boolean compat, int options) {
708         return getComposeNormalizer2(compat, options).normalize(str);
709     }
710 
711     /**
712      * Compose a string.
713      * The string will be composed to according to the specified mode.
714      * @param source The char array to compose.
715      * @param target A char buffer to receive the normalized text.
716      * @param compat If true the char array will be composed according to
717      *                NFKC rules and if false will be composed according to
718      *                NFC rules.
719      * @param options The normalization options, ORed together (0 for no options).
720      * @return int   The total buffer size needed;if greater than length of
721      *                result, the output was truncated.
722      * @exception IndexOutOfBoundsException if target.length is less than the
723      *             required length
724      * @deprecated ICU 56 Use {@link Normalizer2} instead.
725      * @hide deprecated on icu4j-org
726      */
727     @Deprecated
compose(char[] source,char[] target, boolean compat, int options)728     public static int compose(char[] source,char[] target, boolean compat, int options) {
729         return compose(source, 0, source.length, target, 0, target.length, compat, options);
730     }
731 
732     /**
733      * Compose a string.
734      * The string will be composed to according to the specified mode.
735      * @param src       The char array to compose.
736      * @param srcStart  Start index of the source
737      * @param srcLimit  Limit index of the source
738      * @param dest      The char buffer to fill in
739      * @param destStart Start index of the destination buffer
740      * @param destLimit End index of the destination buffer
741      * @param compat If true the char array will be composed according to
742      *                NFKC rules and if false will be composed according to
743      *                NFC rules.
744      * @param options The normalization options, ORed together (0 for no options).
745      * @return int   The total buffer size needed;if greater than length of
746      *                result, the output was truncated.
747      * @exception IndexOutOfBoundsException if target.length is less than the
748      *             required length
749      * @deprecated ICU 56 Use {@link Normalizer2} instead.
750      * @hide deprecated on icu4j-org
751      */
752     @Deprecated
compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)753     public static int compose(char[] src,int srcStart, int srcLimit,
754                               char[] dest,int destStart, int destLimit,
755                               boolean compat, int options) {
756         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
757         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
758         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
759         return app.length();
760     }
761 
762     /**
763      * Decompose a string.
764      * The string will be decomposed to according to the specified mode.
765      * @param str       The string to decompose.
766      * @param compat    If true the string will be decomposed according to NFKD
767      *                   rules and if false will be decomposed according to NFD
768      *                   rules.
769      * @return String   The decomposed string
770      * @deprecated ICU 56 Use {@link Normalizer2} instead.
771      * @hide deprecated on icu4j-org
772      */
773     @Deprecated
decompose(String str, boolean compat)774     public static String decompose(String str, boolean compat) {
775         return decompose(str,compat,0);
776     }
777 
778     /**
779      * Decompose a string.
780      * The string will be decomposed to according to the specified mode.
781      * @param str     The string to decompose.
782      * @param compat  If true the string will be decomposed according to NFKD
783      *                 rules and if false will be decomposed according to NFD
784      *                 rules.
785      * @param options The normalization options, ORed together (0 for no options).
786      * @return String The decomposed string
787      * @deprecated ICU 56 Use {@link Normalizer2} instead.
788      * @hide deprecated on icu4j-org
789      */
790     @Deprecated
decompose(String str, boolean compat, int options)791     public static String decompose(String str, boolean compat, int options) {
792         return getDecomposeNormalizer2(compat, options).normalize(str);
793     }
794 
795     /**
796      * Decompose a string.
797      * The string will be decomposed to according to the specified mode.
798      * @param source The char array to decompose.
799      * @param target A char buffer to receive the normalized text.
800      * @param compat If true the char array will be decomposed according to NFKD
801      *                rules and if false will be decomposed according to
802      *                NFD rules.
803      * @return int   The total buffer size needed;if greater than length of
804      *                result,the output was truncated.
805      * @param options The normalization options, ORed together (0 for no options).
806      * @exception IndexOutOfBoundsException if the target capacity is less than
807      *             the required length
808      * @deprecated ICU 56 Use {@link Normalizer2} instead.
809      * @hide deprecated on icu4j-org
810      */
811     @Deprecated
decompose(char[] source,char[] target, boolean compat, int options)812     public static int decompose(char[] source,char[] target, boolean compat, int options) {
813         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
814     }
815 
816     /**
817      * Decompose a string.
818      * The string will be decomposed to according to the specified mode.
819      * @param src       The char array to compose.
820      * @param srcStart  Start index of the source
821      * @param srcLimit  Limit index of the source
822      * @param dest      The char buffer to fill in
823      * @param destStart Start index of the destination buffer
824      * @param destLimit End index of the destination buffer
825      * @param compat If true the char array will be decomposed according to NFKD
826      *                rules and if false will be decomposed according to
827      *                NFD rules.
828      * @param options The normalization options, ORed together (0 for no options).
829      * @return int   The total buffer size needed;if greater than length of
830      *                result,the output was truncated.
831      * @exception IndexOutOfBoundsException if the target capacity is less than
832      *             the required length
833      * @deprecated ICU 56 Use {@link Normalizer2} instead.
834      * @hide deprecated on icu4j-org
835      */
836     @Deprecated
decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)837     public static int decompose(char[] src,int srcStart, int srcLimit,
838                                 char[] dest,int destStart, int destLimit,
839                                 boolean compat, int options) {
840         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
841         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
842         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
843         return app.length();
844     }
845 
846     /**
847      * Normalizes a <tt>String</tt> using the given normalization operation.
848      * <p>
849      * The <tt>options</tt> parameter specifies which optional
850      * <tt>Normalizer</tt> features are to be enabled for this operation.
851      * Currently the only available option is {@link #UNICODE_3_2}.
852      * If you want the default behavior corresponding to one of the standard
853      * Unicode Normalization Forms, use 0 for this argument.
854      * <p>
855      * @param str       the input string to be normalized.
856      * @param mode      the normalization mode
857      * @param options   the optional features to be enabled.
858      * @return String   the normalized string
859      * @deprecated ICU 56 Use {@link Normalizer2} instead.
860      * @hide deprecated on icu4j-org
861      */
862     @Deprecated
normalize(String str, Mode mode, int options)863     public static String normalize(String str, Mode mode, int options) {
864         return mode.getNormalizer2(options).normalize(str);
865     }
866 
867     /**
868      * Normalize a string.
869      * The string will be normalized according to the specified normalization
870      * mode and options.
871      * @param src        The string to normalize.
872      * @param mode       The normalization mode; one of Normalizer.NONE,
873      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
874      *                    Normalizer.NFKD, Normalizer.DEFAULT
875      * @return the normalized string
876      * @deprecated ICU 56 Use {@link Normalizer2} instead.
877      * @hide deprecated on icu4j-org
878      */
879     @Deprecated
normalize(String src,Mode mode)880     public static String normalize(String src,Mode mode) {
881         return normalize(src, mode, 0);
882     }
883     /**
884      * Normalize a string.
885      * The string will be normalized according to the specified normalization
886      * mode and options.
887      * @param source The char array to normalize.
888      * @param target A char buffer to receive the normalized text.
889      * @param mode   The normalization mode; one of Normalizer.NONE,
890      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
891      *                Normalizer.NFKD, Normalizer.DEFAULT
892      * @param options The normalization options, ORed together (0 for no options).
893      * @return int   The total buffer size needed;if greater than length of
894      *                result, the output was truncated.
895      * @exception    IndexOutOfBoundsException if the target capacity is less
896      *                than the required length
897      * @deprecated ICU 56 Use {@link Normalizer2} instead.
898      * @hide deprecated on icu4j-org
899      */
900     @Deprecated
normalize(char[] source,char[] target, Mode mode, int options)901     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
902         return normalize(source,0,source.length,target,0,target.length,mode, options);
903     }
904 
905     /**
906      * Normalize a string.
907      * The string will be normalized according to the specified normalization
908      * mode and options.
909      * @param src       The char array to compose.
910      * @param srcStart  Start index of the source
911      * @param srcLimit  Limit index of the source
912      * @param dest      The char buffer to fill in
913      * @param destStart Start index of the destination buffer
914      * @param destLimit End index of the destination buffer
915      * @param mode      The normalization mode; one of Normalizer.NONE,
916      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
917      *                   Normalizer.NFKD, Normalizer.DEFAULT
918      * @param options The normalization options, ORed together (0 for no options).
919      * @return int      The total buffer size needed;if greater than length of
920      *                   result, the output was truncated.
921      * @exception       IndexOutOfBoundsException if the target capacity is
922      *                   less than the required length
923      * @deprecated ICU 56 Use {@link Normalizer2} instead.
924      * @hide deprecated on icu4j-org
925      */
926     @Deprecated
normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)927     public static int normalize(char[] src,int srcStart, int srcLimit,
928                                 char[] dest,int destStart, int destLimit,
929                                 Mode  mode, int options) {
930         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
931         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
932         mode.getNormalizer2(options).normalize(srcBuffer, app);
933         return app.length();
934     }
935 
936     /**
937      * Normalize a codepoint according to the given mode
938      * @param char32    The input string to be normalized.
939      * @param mode      The normalization mode
940      * @param options   Options for use with exclusion set and tailored Normalization
941      *                                   The only option that is currently recognized is UNICODE_3_2
942      * @return String   The normalized string
943      * @see #UNICODE_3_2
944      * @deprecated ICU 56 Use {@link Normalizer2} instead.
945      * @hide deprecated on icu4j-org
946      */
947     @Deprecated
normalize(int char32, Mode mode, int options)948     public static String normalize(int char32, Mode mode, int options) {
949         if(mode == NFD && options == 0) {
950             String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
951             if(decomposition == null) {
952                 decomposition = UTF16.valueOf(char32);
953             }
954             return decomposition;
955         }
956         return normalize(UTF16.valueOf(char32), mode, options);
957     }
958 
959     /**
960      * Convenience method to normalize a codepoint according to the given mode
961      * @param char32    The input string to be normalized.
962      * @param mode      The normalization mode
963      * @return String   The normalized string
964      * @deprecated ICU 56 Use {@link Normalizer2} instead.
965      * @hide deprecated on icu4j-org
966      */
967     @Deprecated
normalize(int char32, Mode mode)968     public static String normalize(int char32, Mode mode) {
969         return normalize(char32, mode, 0);
970     }
971 
972     /**
973      * Convenience method.
974      *
975      * @param source   string for determining if it is in a normalized format
976      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
977      *                  Normalizer.NFKC,Normalizer.NFKD)
978      * @return         Return code to specify if the text is normalized or not
979      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
980      * @deprecated ICU 56 Use {@link Normalizer2} instead.
981      * @hide deprecated on icu4j-org
982      */
983     @Deprecated
quickCheck(String source, Mode mode)984     public static QuickCheckResult quickCheck(String source, Mode mode) {
985         return quickCheck(source, mode, 0);
986     }
987 
988     /**
989      * Performing quick check on a string, to quickly determine if the string is
990      * in a particular normalization format.
991      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
992      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
993      * string is in the desired normalized format, Normalizer.NO determines that
994      * argument string is not in the desired normalized format. A
995      * Normalizer.MAYBE result indicates that a more thorough check is required,
996      * the user may have to put the string in its normalized form and compare
997      * the results.
998      *
999      * @param source   string for determining if it is in a normalized format
1000      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
1001      *                  Normalizer.NFKC,Normalizer.NFKD)
1002      * @param options   Options for use with exclusion set and tailored Normalization
1003      *                                   The only option that is currently recognized is UNICODE_3_2
1004      * @return         Return code to specify if the text is normalized or not
1005      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1006      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1007      * @hide deprecated on icu4j-org
1008      */
1009     @Deprecated
quickCheck(String source, Mode mode, int options)1010     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
1011         return mode.getNormalizer2(options).quickCheck(source);
1012     }
1013 
1014     /**
1015      * Convenience method.
1016      *
1017      * @param source Array of characters for determining if it is in a
1018      *                normalized format
1019      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
1020      *                Normalizer.NFKC,Normalizer.NFKD)
1021      * @param options   Options for use with exclusion set and tailored Normalization
1022      *                                   The only option that is currently recognized is UNICODE_3_2
1023      * @return       Return code to specify if the text is normalized or not
1024      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1025      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1026      * @hide deprecated on icu4j-org
1027      */
1028     @Deprecated
quickCheck(char[] source, Mode mode, int options)1029     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
1030         return quickCheck(source, 0, source.length, mode, options);
1031     }
1032 
1033     /**
1034      * Performing quick check on a string, to quickly determine if the string is
1035      * in a particular normalization format.
1036      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
1037      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
1038      * string is in the desired normalized format, Normalizer.NO determines that
1039      * argument string is not in the desired normalized format. A
1040      * Normalizer.MAYBE result indicates that a more thorough check is required,
1041      * the user may have to put the string in its normalized form and compare
1042      * the results.
1043      *
1044      * @param source    string for determining if it is in a normalized format
1045      * @param start     the start index of the source
1046      * @param limit     the limit index of the source it is equal to the length
1047      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
1048      *                   Normalizer.NFKC,Normalizer.NFKD)
1049      * @param options   Options for use with exclusion set and tailored Normalization
1050      *                                   The only option that is currently recognized is UNICODE_3_2
1051      * @return          Return code to specify if the text is normalized or not
1052      *                   (Normalizer.YES, Normalizer.NO or
1053      *                   Normalizer.MAYBE)
1054      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1055      * @hide deprecated on icu4j-org
1056      */
1057     @Deprecated
quickCheck(char[] source,int start, int limit, Mode mode,int options)1058     public static QuickCheckResult quickCheck(char[] source,int start,
1059                                               int limit, Mode mode,int options) {
1060         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
1061         return mode.getNormalizer2(options).quickCheck(srcBuffer);
1062     }
1063 
1064     /**
1065      * Test if a string is in a given normalization form.
1066      * This is semantically equivalent to source.equals(normalize(source, mode)).
1067      *
1068      * Unlike quickCheck(), this function returns a definitive result,
1069      * never a "maybe".
1070      * For NFD, NFKD, and FCD, both functions work exactly the same.
1071      * For NFC and NFKC where quickCheck may return "maybe", this function will
1072      * perform further tests to arrive at a true/false result.
1073      * @param src       The input array of characters to be checked to see if
1074      *                   it is normalized
1075      * @param start     The strart index in the source
1076      * @param limit     The limit index in the source
1077      * @param mode      the normalization mode
1078      * @param options   Options for use with exclusion set and tailored Normalization
1079      *                                   The only option that is currently recognized is UNICODE_3_2
1080      * @return Boolean value indicating whether the source string is in the
1081      *         "mode" normalization form
1082      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1083      * @hide deprecated on icu4j-org
1084      */
1085     @Deprecated
isNormalized(char[] src,int start, int limit, Mode mode, int options)1086     public static boolean isNormalized(char[] src,int start,
1087                                        int limit, Mode mode,
1088                                        int options) {
1089         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
1090         return mode.getNormalizer2(options).isNormalized(srcBuffer);
1091     }
1092 
1093     /**
1094      * Test if a string is in a given normalization form.
1095      * This is semantically equivalent to source.equals(normalize(source, mode)).
1096      *
1097      * Unlike quickCheck(), this function returns a definitive result,
1098      * never a "maybe".
1099      * For NFD, NFKD, and FCD, both functions work exactly the same.
1100      * For NFC and NFKC where quickCheck may return "maybe", this function will
1101      * perform further tests to arrive at a true/false result.
1102      * @param str       the input string to be checked to see if it is
1103      *                   normalized
1104      * @param mode      the normalization mode
1105      * @param options   Options for use with exclusion set and tailored Normalization
1106      *                  The only option that is currently recognized is UNICODE_3_2
1107      * @see #isNormalized
1108      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1109      * @hide deprecated on icu4j-org
1110      */
1111     @Deprecated
isNormalized(String str, Mode mode, int options)1112     public static boolean isNormalized(String str, Mode mode, int options) {
1113         return mode.getNormalizer2(options).isNormalized(str);
1114     }
1115 
1116     /**
1117      * Convenience Method
1118      * @param char32    the input code point to be checked to see if it is
1119      *                   normalized
1120      * @param mode      the normalization mode
1121      * @param options   Options for use with exclusion set and tailored Normalization
1122      *                  The only option that is currently recognized is UNICODE_3_2
1123      *
1124      * @see #isNormalized
1125      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1126      * @hide deprecated on icu4j-org
1127      */
1128     @Deprecated
isNormalized(int char32, Mode mode,int options)1129     public static boolean isNormalized(int char32, Mode mode,int options) {
1130         return isNormalized(UTF16.valueOf(char32), mode, options);
1131     }
1132 
1133     /**
1134      * Compare two strings for canonical equivalence.
1135      * Further options include case-insensitive comparison and
1136      * code point order (as opposed to code unit order).
1137      *
1138      * Canonical equivalence between two strings is defined as their normalized
1139      * forms (NFD or NFC) being identical.
1140      * This function compares strings incrementally instead of normalizing
1141      * (and optionally case-folding) both strings entirely,
1142      * improving performance significantly.
1143      *
1144      * Bulk normalization is only necessary if the strings do not fulfill the
1145      * FCD conditions. Only in this case, and only if the strings are relatively
1146      * long, is memory allocated temporarily.
1147      * For FCD strings and short non-FCD strings there is no memory allocation.
1148      *
1149      * Semantically, this is equivalent to
1150      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1151      * where code point order and foldCase are all optional.
1152      *
1153      * @param s1        First source character array.
1154      * @param s1Start   start index of source
1155      * @param s1Limit   limit of the source
1156      *
1157      * @param s2        Second source character array.
1158      * @param s2Start   start index of the source
1159      * @param s2Limit   limit of the source
1160      *
1161      * @param options A bit set of options:
1162      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1163      *     Case-sensitive comparison in code unit order, and the input strings
1164      *     are quick-checked for FCD.
1165      *
1166      *   - INPUT_IS_FCD
1167      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1168      *     conditions.If not set, the function will quickCheck for FCD
1169      *     and normalize if necessary.
1170      *
1171      *   - COMPARE_CODE_POINT_ORDER
1172      *     Set to choose code point order instead of code unit order
1173      *
1174      *   - COMPARE_IGNORE_CASE
1175      *     Set to compare strings case-insensitively using case folding,
1176      *     instead of case-sensitively.
1177      *     If set, then the following case folding options are used.
1178      *
1179      *
1180      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1181      *
1182      * @see #normalize
1183      * @see #FCD
1184      */
compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1185     public static int compare(char[] s1, int s1Start, int s1Limit,
1186                               char[] s2, int s2Start, int s2Limit,
1187                               int options) {
1188         if( s1==null || s1Start<0 || s1Limit<0 ||
1189             s2==null || s2Start<0 || s2Limit<0 ||
1190             s1Limit<s1Start || s2Limit<s2Start
1191         ) {
1192             throw new IllegalArgumentException();
1193         }
1194         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1195                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1196                                options);
1197     }
1198 
1199     /**
1200      * Compare two strings for canonical equivalence.
1201      * Further options include case-insensitive comparison and
1202      * code point order (as opposed to code unit order).
1203      *
1204      * Canonical equivalence between two strings is defined as their normalized
1205      * forms (NFD or NFC) being identical.
1206      * This function compares strings incrementally instead of normalizing
1207      * (and optionally case-folding) both strings entirely,
1208      * improving performance significantly.
1209      *
1210      * Bulk normalization is only necessary if the strings do not fulfill the
1211      * FCD conditions. Only in this case, and only if the strings are relatively
1212      * long, is memory allocated temporarily.
1213      * For FCD strings and short non-FCD strings there is no memory allocation.
1214      *
1215      * Semantically, this is equivalent to
1216      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1217      * where code point order and foldCase are all optional.
1218      *
1219      * @param s1 First source string.
1220      * @param s2 Second source string.
1221      *
1222      * @param options A bit set of options:
1223      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1224      *     Case-sensitive comparison in code unit order, and the input strings
1225      *     are quick-checked for FCD.
1226      *
1227      *   - INPUT_IS_FCD
1228      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1229      *     conditions. If not set, the function will quickCheck for FCD
1230      *     and normalize if necessary.
1231      *
1232      *   - COMPARE_CODE_POINT_ORDER
1233      *     Set to choose code point order instead of code unit order
1234      *
1235      *   - COMPARE_IGNORE_CASE
1236      *     Set to compare strings case-insensitively using case folding,
1237      *     instead of case-sensitively.
1238      *     If set, then the following case folding options are used.
1239      *
1240      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1241      *
1242      * @see #normalize
1243      * @see #FCD
1244      */
compare(String s1, String s2, int options)1245     public static int compare(String s1, String s2, int options) {
1246         return internalCompare(s1, s2, options);
1247     }
1248 
1249     /**
1250      * Compare two strings for canonical equivalence.
1251      * Further options include case-insensitive comparison and
1252      * code point order (as opposed to code unit order).
1253      * Convenience method.
1254      *
1255      * @param s1 First source string.
1256      * @param s2 Second source string.
1257      *
1258      * @param options A bit set of options:
1259      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1260      *     Case-sensitive comparison in code unit order, and the input strings
1261      *     are quick-checked for FCD.
1262      *
1263      *   - INPUT_IS_FCD
1264      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1265      *     conditions. If not set, the function will quickCheck for FCD
1266      *     and normalize if necessary.
1267      *
1268      *   - COMPARE_CODE_POINT_ORDER
1269      *     Set to choose code point order instead of code unit order
1270      *
1271      *   - COMPARE_IGNORE_CASE
1272      *     Set to compare strings case-insensitively using case folding,
1273      *     instead of case-sensitively.
1274      *     If set, then the following case folding options are used.
1275      *
1276      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
1277      *
1278      * @see #normalize
1279      * @see #FCD
1280      */
compare(char[] s1, char[] s2, int options)1281     public static int compare(char[] s1, char[] s2, int options) {
1282         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1283     }
1284 
1285     /**
1286      * Convenience method that can have faster implementation
1287      * by not allocating buffers.
1288      * @param char32a    the first code point to be checked against the
1289      * @param char32b    the second code point
1290      * @param options    A bit set of options
1291      */
compare(int char32a, int char32b, int options)1292     public static int compare(int char32a, int char32b, int options) {
1293         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1294     }
1295 
1296     /**
1297      * Convenience method that can have faster implementation
1298      * by not allocating buffers.
1299      * @param char32a   the first code point to be checked against
1300      * @param str2      the second string
1301      * @param options   A bit set of options
1302      */
compare(int char32a, String str2, int options)1303     public static int compare(int char32a, String str2, int options) {
1304         return internalCompare(UTF16.valueOf(char32a), str2, options);
1305     }
1306 
1307     /* Concatenation of normalized strings --------------------------------- */
1308     /**
1309      * Concatenate normalized strings, making sure that the result is normalized
1310      * as well.
1311      *
1312      * If both the left and the right strings are in
1313      * the normalization form according to "mode",
1314      * then the result will be
1315      *
1316      * <code>
1317      *     dest=normalize(left+right, mode)
1318      * </code>
1319      *
1320      * With the input strings already being normalized,
1321      * this function will use next() and previous()
1322      * to find the adjacent end pieces of the input strings.
1323      * Only the concatenation of these end pieces will be normalized and
1324      * then concatenated with the remaining parts of the input strings.
1325      *
1326      * It is allowed to have dest==left to avoid copying the entire left string.
1327      *
1328      * @param left Left source array, may be same as dest.
1329      * @param leftStart start in the left array.
1330      * @param leftLimit limit in the left array (==length)
1331      * @param right Right source array.
1332      * @param rightStart start in the right array.
1333      * @param rightLimit limit in the right array (==length)
1334      * @param dest The output buffer; can be null if destStart==destLimit==0
1335      *              for pure preflighting.
1336      * @param destStart start in the destination array
1337      * @param destLimit limit in the destination array (==length)
1338      * @param mode The normalization mode.
1339      * @param options The normalization options, ORed together (0 for no options).
1340      * @return Length of output (number of chars) when successful or
1341      *          IndexOutOfBoundsException
1342      * @exception IndexOutOfBoundsException whose message has the string
1343      *             representation of destination capacity required.
1344      * @see #normalize
1345      * @see #next
1346      * @see #previous
1347      * @exception IndexOutOfBoundsException if target capacity is less than the
1348      *             required length
1349      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1350      * @hide deprecated on icu4j-org
1351      */
1352     @Deprecated
concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1353     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1354                                   char[] right, int rightStart, int rightLimit,
1355                                   char[] dest,  int destStart,  int destLimit,
1356                                   Normalizer.Mode mode, int options) {
1357         if(dest == null) {
1358             throw new IllegalArgumentException();
1359         }
1360 
1361         /* check for overlapping right and destination */
1362         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1363             throw new IllegalArgumentException("overlapping right and dst ranges");
1364         }
1365 
1366         /* allow left==dest */
1367         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1368         destBuilder.append(left, leftStart, leftLimit-leftStart);
1369         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
1370         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1371         int destLength=destBuilder.length();
1372         if(destLength<=(destLimit-destStart)) {
1373             destBuilder.getChars(0, destLength, dest, destStart);
1374             return destLength;
1375         } else {
1376             throw new IndexOutOfBoundsException(Integer.toString(destLength));
1377         }
1378     }
1379 
1380     /**
1381      * Concatenate normalized strings, making sure that the result is normalized
1382      * as well.
1383      *
1384      * If both the left and the right strings are in
1385      * the normalization form according to "mode",
1386      * then the result will be
1387      *
1388      * <code>
1389      *     dest=normalize(left+right, mode)
1390      * </code>
1391      *
1392      * For details see concatenate
1393      *
1394      * @param left Left source string.
1395      * @param right Right source string.
1396      * @param mode The normalization mode.
1397      * @param options The normalization options, ORed together (0 for no options).
1398      * @return result
1399      *
1400      * @see #concatenate
1401      * @see #normalize
1402      * @see #next
1403      * @see #previous
1404      * @see #concatenate
1405      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1406      * @hide deprecated on icu4j-org
1407      */
1408     @Deprecated
concatenate(char[] left, char[] right,Mode mode, int options)1409     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1410         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1411         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1412     }
1413 
1414     /**
1415      * Concatenate normalized strings, making sure that the result is normalized
1416      * as well.
1417      *
1418      * If both the left and the right strings are in
1419      * the normalization form according to "mode",
1420      * then the result will be
1421      *
1422      * <code>
1423      *     dest=normalize(left+right, mode)
1424      * </code>
1425      *
1426      * With the input strings already being normalized,
1427      * this function will use next() and previous()
1428      * to find the adjacent end pieces of the input strings.
1429      * Only the concatenation of these end pieces will be normalized and
1430      * then concatenated with the remaining parts of the input strings.
1431      *
1432      * @param left Left source string.
1433      * @param right Right source string.
1434      * @param mode The normalization mode.
1435      * @param options The normalization options, ORed together (0 for no options).
1436      * @return result
1437      *
1438      * @see #concatenate
1439      * @see #normalize
1440      * @see #next
1441      * @see #previous
1442      * @see #concatenate
1443      * @deprecated ICU 56 Use {@link Normalizer2} instead.
1444      * @hide deprecated on icu4j-org
1445      */
1446     @Deprecated
concatenate(String left, String right, Mode mode, int options)1447     public static String concatenate(String left, String right, Mode mode, int options) {
1448         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1449         return mode.getNormalizer2(options).append(dest, right).toString();
1450     }
1451 
1452     /**
1453      * Gets the FC_NFKC closure value.
1454      * @param c The code point whose closure value is to be retrieved
1455      * @param dest The char array to receive the closure value
1456      * @return the length of the closure value; 0 if there is none
1457      * @deprecated ICU 56
1458      * @hide deprecated on icu4j-org
1459      */
1460     @Deprecated
getFC_NFKC_Closure(int c,char[] dest)1461     public static int getFC_NFKC_Closure(int c,char[] dest) {
1462         String closure=getFC_NFKC_Closure(c);
1463         int length=closure.length();
1464         if(length!=0 && dest!=null && length<=dest.length) {
1465             closure.getChars(0, length, dest, 0);
1466         }
1467         return length;
1468     }
1469     /**
1470      * Gets the FC_NFKC closure value.
1471      * @param c The code point whose closure value is to be retrieved
1472      * @return String representation of the closure value; "" if there is none
1473      * @deprecated ICU 56
1474      * @hide deprecated on icu4j-org
1475      */
1476     @Deprecated
getFC_NFKC_Closure(int c)1477     public static String getFC_NFKC_Closure(int c) {
1478         // Compute the FC_NFKC_Closure on the fly:
1479         // We have the API for complete coverage of Unicode properties, although
1480         // this value by itself is not useful via API.
1481         // (What could be useful is a custom normalization table that combines
1482         // case folding and NFKC.)
1483         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1484         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1485         UCaseProps csp=UCaseProps.INSTANCE;
1486         // first: b = NFKC(Fold(a))
1487         StringBuilder folded=new StringBuilder();
1488         int folded1Length=csp.toFullFolding(c, folded, 0);
1489         if(folded1Length<0) {
1490             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1491             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1492                 return "";  // c does not change at all under CaseFolding+NFKC
1493             }
1494             folded.appendCodePoint(c);
1495         } else {
1496             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1497                 folded.appendCodePoint(folded1Length);
1498             }
1499         }
1500         String kc1=nfkc.normalize(folded);
1501         // second: c = NFKC(Fold(b))
1502         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1503         // if (c != b) add the mapping from a to c
1504         if(kc1.equals(kc2)) {
1505             return "";
1506         } else {
1507             return kc2;
1508         }
1509     }
1510 
1511     //-------------------------------------------------------------------------
1512     // Iteration API
1513     //-------------------------------------------------------------------------
1514 
1515     /**
1516      * Return the current character in the normalized text.
1517      * @return The codepoint as an int
1518      * @deprecated ICU 56
1519      * @hide deprecated on icu4j-org
1520      */
1521     @Deprecated
current()1522     public int current() {
1523         if(bufferPos<buffer.length() || nextNormalize()) {
1524             return buffer.codePointAt(bufferPos);
1525         } else {
1526             return DONE;
1527         }
1528     }
1529 
1530     /**
1531      * Return the next character in the normalized text and advance
1532      * the iteration position by one.  If the end
1533      * of the text has already been reached, {@link #DONE} is returned.
1534      * @return The codepoint as an int
1535      * @deprecated ICU 56
1536      * @hide deprecated on icu4j-org
1537      */
1538     @Deprecated
next()1539     public int next() {
1540         if(bufferPos<buffer.length() ||  nextNormalize()) {
1541             int c=buffer.codePointAt(bufferPos);
1542             bufferPos+=Character.charCount(c);
1543             return c;
1544         } else {
1545             return DONE;
1546         }
1547     }
1548 
1549 
1550     /**
1551      * Return the previous character in the normalized text and decrement
1552      * the iteration position by one.  If the beginning
1553      * of the text has already been reached, {@link #DONE} is returned.
1554      * @return The codepoint as an int
1555      * @deprecated ICU 56
1556      * @hide deprecated on icu4j-org
1557      */
1558     @Deprecated
previous()1559     public int previous() {
1560         if(bufferPos>0 || previousNormalize()) {
1561             int c=buffer.codePointBefore(bufferPos);
1562             bufferPos-=Character.charCount(c);
1563             return c;
1564         } else {
1565             return DONE;
1566         }
1567     }
1568 
1569     /**
1570      * Reset the index to the beginning of the text.
1571      * This is equivalent to setIndexOnly(startIndex)).
1572      * @deprecated ICU 56
1573      * @hide deprecated on icu4j-org
1574      */
1575     @Deprecated
reset()1576     public void reset() {
1577         text.setToStart();
1578         currentIndex=nextIndex=0;
1579         clearBuffer();
1580     }
1581 
1582     /**
1583      * Set the iteration position in the input text that is being normalized,
1584      * without any immediate normalization.
1585      * After setIndexOnly(), getIndex() will return the same index that is
1586      * specified here.
1587      *
1588      * @param index the desired index in the input text.
1589      * @deprecated ICU 56
1590      * @hide deprecated on icu4j-org
1591      */
1592     @Deprecated
setIndexOnly(int index)1593     public void setIndexOnly(int index) {
1594         text.setIndex(index);  // validates index
1595         currentIndex=nextIndex=index;
1596         clearBuffer();
1597     }
1598 
1599     /**
1600      * Set the iteration position in the input text that is being normalized
1601      * and return the first normalized character at that position.
1602      * <p>
1603      * <b>Note:</b> This method sets the position in the <em>input</em> text,
1604      * while {@link #next} and {@link #previous} iterate through characters
1605      * in the normalized <em>output</em>.  This means that there is not
1606      * necessarily a one-to-one correspondence between characters returned
1607      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1608      * returned from <tt>setIndex</tt> and {@link #getIndex}.
1609      * <p>
1610      * @param index the desired index in the input text.
1611      *
1612      * @return   the first normalized character that is the result of iterating
1613      *            forward starting at the given index.
1614      *
1615      * @throws IllegalArgumentException if the given index is less than
1616      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1617      * @deprecated ICU 3.2
1618      * @obsolete ICU 3.2
1619      * @hide deprecated on icu4j-org
1620      */
1621     @Deprecated
1622      ///CLOVER:OFF
setIndex(int index)1623      public int setIndex(int index) {
1624          setIndexOnly(index);
1625          return current();
1626      }
1627      ///CLOVER:ON
1628     /**
1629      * Retrieve the index of the start of the input text. This is the begin
1630      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1631      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1632      * @deprecated ICU 2.2. Use startIndex() instead.
1633      * @return The codepoint as an int
1634      * @see #startIndex
1635      * @hide deprecated on icu4j-org
1636      */
1637     @Deprecated
getBeginIndex()1638     public int getBeginIndex() {
1639         return 0;
1640     }
1641 
1642     /**
1643      * Retrieve the index of the end of the input text.  This is the end index
1644      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1645      * over which this <tt>Normalizer</tt> is iterating
1646      * @deprecated ICU 2.2. Use endIndex() instead.
1647      * @return The codepoint as an int
1648      * @see #endIndex
1649      * @hide deprecated on icu4j-org
1650      */
1651     @Deprecated
getEndIndex()1652     public int getEndIndex() {
1653         return endIndex();
1654     }
1655     /**
1656      * Return the first character in the normalized text.  This resets
1657      * the <tt>Normalizer's</tt> position to the beginning of the text.
1658      * @return The codepoint as an int
1659      * @deprecated ICU 56
1660      * @hide deprecated on icu4j-org
1661      */
1662     @Deprecated
first()1663     public int first() {
1664         reset();
1665         return next();
1666     }
1667 
1668     /**
1669      * Return the last character in the normalized text.  This resets
1670      * the <tt>Normalizer's</tt> position to be just before the
1671      * the input text corresponding to that normalized character.
1672      * @return The codepoint as an int
1673      * @deprecated ICU 56
1674      * @hide deprecated on icu4j-org
1675      */
1676     @Deprecated
last()1677     public int last() {
1678         text.setToLimit();
1679         currentIndex=nextIndex=text.getIndex();
1680         clearBuffer();
1681         return previous();
1682     }
1683 
1684     /**
1685      * Retrieve the current iteration position in the input text that is
1686      * being normalized.  This method is useful in applications such as
1687      * searching, where you need to be able to determine the position in
1688      * the input text that corresponds to a given normalized output character.
1689      * <p>
1690      * <b>Note:</b> This method sets the position in the <em>input</em>, while
1691      * {@link #next} and {@link #previous} iterate through characters in the
1692      * <em>output</em>.  This means that there is not necessarily a one-to-one
1693      * correspondence between characters returned by <tt>next</tt> and
1694      * <tt>previous</tt> and the indices passed to and returned from
1695      * <tt>setIndex</tt> and {@link #getIndex}.
1696      * @return The current iteration position
1697      * @deprecated ICU 56
1698      * @hide deprecated on icu4j-org
1699      */
1700     @Deprecated
getIndex()1701     public int getIndex() {
1702         if(bufferPos<buffer.length()) {
1703             return currentIndex;
1704         } else {
1705             return nextIndex;
1706         }
1707     }
1708 
1709     /**
1710      * Retrieve the index of the start of the input text. This is the begin
1711      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1712      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1713      * @return The current iteration position
1714      * @deprecated ICU 56
1715      * @hide deprecated on icu4j-org
1716      */
1717     @Deprecated
startIndex()1718     public int startIndex() {
1719         return 0;
1720     }
1721 
1722     /**
1723      * Retrieve the index of the end of the input text.  This is the end index
1724      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1725      * over which this <tt>Normalizer</tt> is iterating
1726      * @return The current iteration position
1727      * @deprecated ICU 56
1728      * @hide deprecated on icu4j-org
1729      */
1730     @Deprecated
endIndex()1731     public int endIndex() {
1732         return text.getLength();
1733     }
1734 
1735     //-------------------------------------------------------------------------
1736     // Iterator attributes
1737     //-------------------------------------------------------------------------
1738     /**
1739      * Set the normalization mode for this object.
1740      * <p>
1741      * <b>Note:</b>If the normalization mode is changed while iterating
1742      * over a string, calls to {@link #next} and {@link #previous} may
1743      * return previously buffers characters in the old normalization mode
1744      * until the iteration is able to re-sync at the next base character.
1745      * It is safest to call {@link #setText setText()}, {@link #first},
1746      * {@link #last}, etc. after calling <tt>setMode</tt>.
1747      * <p>
1748      * @param newMode the new mode for this <tt>Normalizer</tt>.
1749      * The supported modes are:
1750      * <ul>
1751      *  <li>{@link #NFC}    - Unicode canonical decompositiion
1752      *                        followed by canonical composition.
1753      *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
1754      *                        follwed by canonical composition.
1755      *  <li>{@link #NFD}    - Unicode canonical decomposition
1756      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
1757      *  <li>{@link #NONE}   - Do nothing but return characters
1758      *                        from the underlying input text.
1759      * </ul>
1760      *
1761      * @see #getMode
1762      * @deprecated ICU 56
1763      * @hide deprecated on icu4j-org
1764      */
1765     @Deprecated
setMode(Mode newMode)1766     public void setMode(Mode newMode) {
1767         mode = newMode;
1768         norm2 = mode.getNormalizer2(options);
1769     }
1770     /**
1771      * Return the basic operation performed by this <tt>Normalizer</tt>
1772      *
1773      * @see #setMode
1774      * @deprecated ICU 56
1775      * @hide deprecated on icu4j-org
1776      */
1777     @Deprecated
getMode()1778     public Mode getMode() {
1779         return mode;
1780     }
1781     /**
1782      * Set options that affect this <tt>Normalizer</tt>'s operation.
1783      * Options do not change the basic composition or decomposition operation
1784      * that is being performed , but they control whether
1785      * certain optional portions of the operation are done.
1786      * Currently the only available option is:
1787      *
1788      * <ul>
1789      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1790      * </ul>
1791      *
1792      * @param   option  the option whose value is to be set.
1793      * @param   value   the new setting for the option.  Use <tt>true</tt> to
1794      *                  turn the option on and <tt>false</tt> to turn it off.
1795      *
1796      * @see #getOption
1797      * @deprecated ICU 56
1798      * @hide deprecated on icu4j-org
1799      */
1800     @Deprecated
setOption(int option,boolean value)1801     public void setOption(int option,boolean value) {
1802         if (value) {
1803             options |= option;
1804         } else {
1805             options &= (~option);
1806         }
1807         norm2 = mode.getNormalizer2(options);
1808     }
1809 
1810     /**
1811      * Determine whether an option is turned on or off.
1812      * <p>
1813      * @see #setOption
1814      * @deprecated ICU 56
1815      * @hide deprecated on icu4j-org
1816      */
1817     @Deprecated
getOption(int option)1818     public int getOption(int option) {
1819         if((options & option)!=0) {
1820             return 1 ;
1821         } else {
1822             return 0;
1823         }
1824     }
1825 
1826     /**
1827      * Gets the underlying text storage
1828      * @param fillIn the char buffer to fill the UTF-16 units.
1829      *         The length of the buffer should be equal to the length of the
1830      *         underlying text storage
1831      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1832      * @see   #getLength
1833      * @deprecated ICU 56
1834      * @hide deprecated on icu4j-org
1835      */
1836     @Deprecated
getText(char[] fillIn)1837     public int getText(char[] fillIn) {
1838         return text.getText(fillIn);
1839     }
1840 
1841     /**
1842      * Gets the length of underlying text storage
1843      * @return the length
1844      * @deprecated ICU 56
1845      * @hide deprecated on icu4j-org
1846      */
1847     @Deprecated
getLength()1848     public int getLength() {
1849         return text.getLength();
1850     }
1851 
1852     /**
1853      * Returns the text under iteration as a string
1854      * @return a copy of the text under iteration.
1855      * @deprecated ICU 56
1856      * @hide deprecated on icu4j-org
1857      */
1858     @Deprecated
getText()1859     public String getText() {
1860         return text.getText();
1861     }
1862 
1863     /**
1864      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1865      * The iteration position is set to the beginning of the input text.
1866      * @param newText   The new string to be normalized.
1867      * @deprecated ICU 56
1868      * @hide deprecated on icu4j-org
1869      */
1870     @Deprecated
setText(StringBuffer newText)1871     public void setText(StringBuffer newText) {
1872         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1873         if (newIter == null) {
1874             throw new IllegalStateException("Could not create a new UCharacterIterator");
1875         }
1876         text = newIter;
1877         reset();
1878     }
1879 
1880     /**
1881      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1882      * The iteration position is set to the beginning of the input text.
1883      * @param newText   The new string to be normalized.
1884      * @deprecated ICU 56
1885      * @hide deprecated on icu4j-org
1886      */
1887     @Deprecated
setText(char[] newText)1888     public void setText(char[] newText) {
1889         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1890         if (newIter == null) {
1891             throw new IllegalStateException("Could not create a new UCharacterIterator");
1892         }
1893         text = newIter;
1894         reset();
1895     }
1896 
1897     /**
1898      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1899      * The iteration position is set to the beginning of the input text.
1900      * @param newText   The new string to be normalized.
1901      * @deprecated ICU 56
1902      * @hide deprecated on icu4j-org
1903      */
1904     @Deprecated
setText(String newText)1905     public void setText(String newText) {
1906         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1907         if (newIter == null) {
1908             throw new IllegalStateException("Could not create a new UCharacterIterator");
1909         }
1910         text = newIter;
1911         reset();
1912     }
1913 
1914     /**
1915      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1916      * The iteration position is set to the beginning of the input text.
1917      * @param newText   The new string to be normalized.
1918      * @deprecated ICU 56
1919      * @hide deprecated on icu4j-org
1920      */
1921     @Deprecated
setText(CharacterIterator newText)1922     public void setText(CharacterIterator newText) {
1923         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1924         if (newIter == null) {
1925             throw new IllegalStateException("Could not create a new UCharacterIterator");
1926         }
1927         text = newIter;
1928         reset();
1929     }
1930 
1931     /**
1932      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1933      * The iteration position is set to the beginning of the string.
1934      * @param newText   The new string to be normalized.
1935      * @deprecated ICU 56
1936      * @hide deprecated on icu4j-org
1937      */
1938     @Deprecated
setText(UCharacterIterator newText)1939     public void setText(UCharacterIterator newText) {
1940         try{
1941             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1942             if (newIter == null) {
1943                 throw new IllegalStateException("Could not create a new UCharacterIterator");
1944             }
1945             text = newIter;
1946             reset();
1947         }catch(CloneNotSupportedException e) {
1948             throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
1949         }
1950     }
1951 
clearBuffer()1952     private void clearBuffer() {
1953         buffer.setLength(0);
1954         bufferPos=0;
1955     }
1956 
nextNormalize()1957     private boolean nextNormalize() {
1958         clearBuffer();
1959         currentIndex=nextIndex;
1960         text.setIndex(nextIndex);
1961         // Skip at least one character so we make progress.
1962         int c=text.nextCodePoint();
1963         if(c<0) {
1964             return false;
1965         }
1966         StringBuilder segment=new StringBuilder().appendCodePoint(c);
1967         while((c=text.nextCodePoint())>=0) {
1968             if(norm2.hasBoundaryBefore(c)) {
1969                 text.moveCodePointIndex(-1);
1970                 break;
1971             }
1972             segment.appendCodePoint(c);
1973         }
1974         nextIndex=text.getIndex();
1975         norm2.normalize(segment, buffer);
1976         return buffer.length()!=0;
1977     }
1978 
previousNormalize()1979     private boolean previousNormalize() {
1980         clearBuffer();
1981         nextIndex=currentIndex;
1982         text.setIndex(currentIndex);
1983         StringBuilder segment=new StringBuilder();
1984         int c;
1985         while((c=text.previousCodePoint())>=0) {
1986             if(c<=0xffff) {
1987                 segment.insert(0, (char)c);
1988             } else {
1989                 segment.insert(0, Character.toChars(c));
1990             }
1991             if(norm2.hasBoundaryBefore(c)) {
1992                 break;
1993             }
1994         }
1995         currentIndex=text.getIndex();
1996         norm2.normalize(segment, buffer);
1997         bufferPos=buffer.length();
1998         return buffer.length()!=0;
1999     }
2000 
2001     /* compare canonically equivalent ------------------------------------------- */
2002 
2003     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
internalCompare(CharSequence s1, CharSequence s2, int options)2004     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
2005         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
2006         options|= COMPARE_EQUIV;
2007 
2008         /*
2009          * UAX #21 Case Mappings, as fixed for Unicode version 4
2010          * (see Jitterbug 2021), defines a canonical caseless match as
2011          *
2012          * A string X is a canonical caseless match
2013          * for a string Y if and only if
2014          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
2015          *
2016          * For better performance, we check for FCD (or let the caller tell us that
2017          * both strings are in FCD) for the inner normalization.
2018          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
2019          * case-folding preserves the FCD-ness of a string.
2020          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
2021          * when there is a difference.
2022          *
2023          * Exception: When using the Turkic case-folding option, we do perform
2024          * full NFD first. This is because in the Turkic case precomposed characters
2025          * with 0049 capital I or 0069 small i fold differently whether they
2026          * are first decomposed or not, so an FCD check - a check only for
2027          * canonical order - is not sufficient.
2028          */
2029         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
2030             Normalizer2 n2;
2031             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
2032                 n2=NFD.getNormalizer2(normOptions);
2033             } else {
2034                 n2=FCD.getNormalizer2(normOptions);
2035             }
2036 
2037             // check if s1 and/or s2 fulfill the FCD conditions
2038             int spanQCYes1=n2.spanQuickCheckYes(s1);
2039             int spanQCYes2=n2.spanQuickCheckYes(s2);
2040 
2041             /*
2042              * ICU 2.4 had a further optimization:
2043              * If both strings were not in FCD, then they were both NFD'ed,
2044              * and the COMPARE_EQUIV option was turned off.
2045              * It is not entirely clear that this is valid with the current
2046              * definition of the canonical caseless match.
2047              * Therefore, ICU 2.6 removes that optimization.
2048              */
2049 
2050             if(spanQCYes1<s1.length()) {
2051                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
2052                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
2053             }
2054             if(spanQCYes2<s2.length()) {
2055                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
2056                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
2057             }
2058         }
2059 
2060         return cmpEquivFold(s1, s2, options);
2061     }
2062 
2063     /*
2064      * Compare two strings for canonical equivalence.
2065      * Further options include case-insensitive comparison and
2066      * code point order (as opposed to code unit order).
2067      *
2068      * In this function, canonical equivalence is optional as well.
2069      * If canonical equivalence is tested, then both strings must fulfill
2070      * the FCD check.
2071      *
2072      * Semantically, this is equivalent to
2073      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
2074      * where code point order, NFD and foldCase are all optional.
2075      *
2076      * String comparisons almost always yield results before processing both strings
2077      * completely.
2078      * They are generally more efficient working incrementally instead of
2079      * performing the sub-processing (strlen, normalization, case-folding)
2080      * on the entire strings first.
2081      *
2082      * It is also unnecessary to normalize identical characters.
2083      *
2084      * This function works in principle as follows:
2085      *
2086      * loop {
2087      *   get one code unit c1 from s1 (-1 if end of source)
2088      *   get one code unit c2 from s2 (-1 if end of source)
2089      *
2090      *   if(either string finished) {
2091      *     return result;
2092      *   }
2093      *   if(c1==c2) {
2094      *     continue;
2095      *   }
2096      *
2097      *   // c1!=c2
2098      *   try to decompose/case-fold c1/c2, and continue if one does;
2099      *
2100      *   // still c1!=c2 and neither decomposes/case-folds, return result
2101      *   return c1-c2;
2102      * }
2103      *
2104      * When a character decomposes, then the pointer for that source changes to
2105      * the decomposition, pushing the previous pointer onto a stack.
2106      * When the end of the decomposition is reached, then the code unit reader
2107      * pops the previous source from the stack.
2108      * (Same for case-folding.)
2109      *
2110      * This is complicated further by operating on variable-width UTF-16.
2111      * The top part of the loop works on code units, while lookups for decomposition
2112      * and case-folding need code points.
2113      * Code points are assembled after the equality/end-of-source part.
2114      * The source pointer is only advanced beyond all code units when the code point
2115      * actually decomposes/case-folds.
2116      *
2117      * If we were on a trail surrogate unit when assembling a code point,
2118      * and the code point decomposes/case-folds, then the decomposition/folding
2119      * result must be compared with the part of the other string that corresponds to
2120      * this string's lead surrogate.
2121      * Since we only assemble a code point when hitting a trail unit when the
2122      * preceding lead units were identical, we back up the other string by one unit
2123      * in such a case.
2124      *
2125      * The optional code point order comparison at the end works with
2126      * the same fix-up as the other code point order comparison functions.
2127      * See ustring.c and the comment near the end of this function.
2128      *
2129      * Assumption: A decomposition or case-folding result string never contains
2130      * a single surrogate. This is a safe assumption in the Unicode Standard.
2131      * Therefore, we do not need to check for surrogate pairs across
2132      * decomposition/case-folding boundaries.
2133      *
2134      * Further assumptions (see verifications in tstnorm.cpp):
2135      * The API function checks for FCD first, while the core function
2136      * first case-folds and then decomposes. This requires that case-folding does not
2137      * un-FCD any strings.
2138      *
2139      * The API function may also NFD the input and turn off decomposition.
2140      * This requires that case-folding does not un-NFD strings either.
2141      *
2142      * TODO If any of the above two assumptions is violated,
2143      * then this entire code must be re-thought.
2144      * If this happens, then a simple solution is to case-fold both strings up front
2145      * and to turn off UNORM_INPUT_IS_FCD.
2146      * We already do this when not both strings are in FCD because makeFCD
2147      * would be a partial NFD before the case folding, which does not work.
2148      * Note that all of this is only a problem when case-folding _and_
2149      * canonical equivalence come together.
2150      * (Comments in unorm_compare() are more up to date than this TODO.)
2151      */
2152 
2153     /* stack element for previous-level source/decomposition pointers */
2154     private static final class CmpEquivLevel {
2155         CharSequence cs;
2156         int s;
2157     };
createCmpEquivLevelStack()2158     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2159         return new CmpEquivLevel[] {
2160             new CmpEquivLevel(), new CmpEquivLevel()
2161         };
2162     }
2163 
2164     /**
2165      * Internal option bit for cmpEquivFold(): when set, differing code points are
2166      * decomposed (via the NFC implementation data) before they are declared different.
          * If not set, cmpEquivFold() never decomposes and only applies case folding,
          * and that only when COMPARE_IGNORE_CASE is set.
2167      */
2168     private static final int COMPARE_EQUIV=0x80000;
2169 
2170     /*
          * Internal function; package visibility for use by UTF16.StringComparator.
          *
          * Compares cs1 and cs2 incrementally, one code unit at a time. When two code
          * units differ, the code points they belong to are first case-folded
          * (if COMPARE_IGNORE_CASE is set in options) and then decomposed
          * (if COMPARE_EQUIV is set in options); reading continues from the replacement
          * text, and the position in the suspended text is kept on a small per-string stack.
          *
          * Parameters:
          *   cs1, cs2 - the texts to compare; not checked, cs1.length()/cs2.length()
          *              are called right away, so neither may be null
          *   options  - bit set; this function tests COMPARE_EQUIV, COMPARE_IGNORE_CASE and
          *              COMPARE_CODE_POINT_ORDER, and passes the whole value on to
          *              csp.toFullFolding()
          *
          * Returns 0 if both texts are exhausted together, -1 if cs1 ends first,
          * 1 if cs2 ends first, and otherwise the difference c1-c2 of the first code units
          * that still differ (after the optional code point order fix-up).
          */
cmpEquivFold(CharSequence cs1, CharSequence cs2, int options)2171     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2172         Normalizer2Impl nfcImpl;
2173         UCaseProps csp;
2174 
2175         /* current-level start/limit - s1/s2 as current */
2176         int s1, s2, limit1, limit2;
2177 
2178         /* decomposition and case folding variables */
2179         int length;
2180 
2181         /* stacks of previous-level start/current/limit */
2182         CmpEquivLevel[] stack1=null, stack2=null;
2183 
2184         /* buffers for algorithmic decompositions */
2185         String decomp1, decomp2;
2186 
2187         /* case folding buffers, only use current-level start/limit */
2188         StringBuilder fold1, fold2;
2189 
2190         /* track which is the current level per string */
              /* level 0: the original text; level 1: case-folding buffer; level 2: decomposition */
2191         int level1, level2;
2192 
2193         /* current code units, and code points for lookups */
2194         int c1, c2, cp1, cp2;
2195 
2196         /* no argument error checking because this itself is not an API */
2197 
2198         /*
2199          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2200          * otherwise this function must behave exactly as uprv_strCompare()
2201          * not checking for that here makes testing this function easier
2202          */
2203 
2204         /* normalization/properties data loaded? */
              /*
               * nfcImpl and csp (with fold1/fold2) stay null when their option is off;
               * below they are only dereferenced behind the same option test
               */
2205         if((options&COMPARE_EQUIV)!=0) {
2206             nfcImpl=Norm2AllModes.getNFCInstance().impl;
2207         } else {
2208             nfcImpl=null;
2209         }
2210         if((options&COMPARE_IGNORE_CASE)!=0) {
2211             csp=UCaseProps.INSTANCE;
2212             fold1=new StringBuilder();
2213             fold2=new StringBuilder();
2214         } else {
2215             csp=null;
2216             fold1=fold2=null;
2217         }
2218 
2219         /* initialize */
2220         s1=0;
2221         limit1=cs1.length();
2222         s2=0;
2223         limit2=cs2.length();
2224 
2225         level1=level2=0;
2226         c1=c2=-1;
2227 
2228         /* comparison loop */
2229         for(;;) {
2230             /*
2231              * here a code unit value of -1 means "get another code unit"
2232              * below it will mean "this source is finished"
2233              */
2234 
2235             if(c1<0) {
2236                 /* get next code unit from string 1, post-increment */
2237                 for(;;) {
2238                     if(s1==limit1) {
2239                         if(level1==0) {
2240                             c1=-1;
2241                             break;
2242                         }
2243                     } else {
2244                         c1=cs1.charAt(s1++);
2245                         break;
2246                     }
2247 
2248                     /* reached end of level buffer, pop one level */
                          /* a null cs marks an intermediate level that was skipped when pushing; pop past it */
2249                     do {
2250                         --level1;
2251                         cs1=stack1[level1].cs;
2252                     } while(cs1==null);
2253                     s1=stack1[level1].s;
2254                     limit1=cs1.length();
2255                 }
2256             }
2257 
2258             if(c2<0) {
2259                 /* get next code unit from string 2, post-increment */
2260                 for(;;) {
2261                     if(s2==limit2) {
2262                         if(level2==0) {
2263                             c2=-1;
2264                             break;
2265                         }
2266                     } else {
2267                         c2=cs2.charAt(s2++);
2268                         break;
2269                     }
2270 
2271                     /* reached end of level buffer, pop one level */
                          /* a null cs marks an intermediate level that was skipped when pushing; pop past it */
2272                     do {
2273                         --level2;
2274                         cs2=stack2[level2].cs;
2275                     } while(cs2==null);
2276                     s2=stack2[level2].s;
2277                     limit2=cs2.length();
2278                 }
2279             }
2280 
2281             /*
2282              * compare c1 and c2
2283              * either variable c1, c2 is -1 only if the corresponding string is finished
2284              */
2285             if(c1==c2) {
2286                 if(c1<0) {
2287                     return 0;   /* c1==c2==-1 indicating end of strings */
2288                 }
2289                 c1=c2=-1;       /* make us fetch new code units */
2290                 continue;
2291             } else if(c1<0) {
2292                 return -1;      /* string 1 ends before string 2 */
2293             } else if(c2<0) {
2294                 return 1;       /* string 2 ends before string 1 */
2295             }
2296             /* c1!=c2 && c1>=0 && c2>=0 */
2297 
2298             /* get complete code points for c1, c2 for lookups if either is a surrogate */
2299             cp1=c1;
2300             if(UTF16.isSurrogate((char)c1)) {
2301                 char c;
2302 
2303                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2304                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2305                         /* advance ++s1; only below if cp1 decomposes/case-folds */
2306                         cp1=Character.toCodePoint((char)c1, c);
2307                     }
2308                 } else /* isTrail(c1) */ {
                          /* s1 was already post-incremented past c1, so a preceding lead unit sits at s1-2 */
2309                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2310                         cp1=Character.toCodePoint(c, (char)c1);
2311                     }
2312                 }
2313             }
2314 
2315             cp2=c2;
2316             if(UTF16.isSurrogate((char)c2)) {
2317                 char c;
2318 
2319                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2320                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2321                         /* advance ++s2; only below if cp2 decomposes/case-folds */
2322                         cp2=Character.toCodePoint((char)c2, c);
2323                     }
2324                 } else /* isTrail(c2) */ {
                          /* s2 was already post-incremented past c2, so a preceding lead unit sits at s2-2 */
2325                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2326                         cp2=Character.toCodePoint(c, (char)c2);
2327                     }
2328                 }
2329             }
2330 
2331             /*
2332              * go down one level for each string
2333              * continue with the main loop as soon as there is a real change
2334              */
2335 
2336             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2337                 (length=csp.toFullFolding(cp1, fold1, options))>=0
2338             ) {
2339                 /* cp1 case-folds to the code point "length" or to p[length] */
2340                 if(UTF16.isSurrogate((char)c1)) {
2341                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2342                         /* advance beyond source surrogate pair if it case-folds */
2343                         ++s1;
2344                     } else /* isTrail(c1) */ {
2345                         /*
2346                          * we got a supplementary code point when hitting its trail surrogate,
2347                          * therefore the lead surrogate must have been the same as in the other string;
2348                          * compare this decomposition with the lead surrogate in the other string
2349                          * remember that this simulates bulk text replacement:
2350                          * the decomposition would replace the entire code point
2351                          */
                              /* back up string 2 by one unit and re-read the unit before the new s2 into c2 */
2352                         --s2;
2353                         c2=cs2.charAt(s2-1);
2354                     }
2355                 }
2356 
2357                 /* push current level pointers */
                      /* case folding only starts from level 0 (see the condition above), hence slot 0 */
2358                 if(stack1==null) {
2359                     stack1=createCmpEquivLevelStack();
2360                 }
2361                 stack1[0].cs=cs1;
2362                 stack1[0].s=s1;
2363                 ++level1;
2364 
2365                 /* copy the folding result to fold1[] */
2366                 /* Java: the buffer was probably not empty, remove the old contents */
                      /*
                       * NOTE(review): the delete() below presumably relies on toFullFolding() having
                       * appended the length-char folding string to fold1, so that dropping the older
                       * prefix leaves only the new result - confirm against UCaseProps.
                       * In the else branch, length is itself the folded code point.
                       */
2367                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2368                     fold1.delete(0, fold1.length()-length);
2369                 } else {
2370                     fold1.setLength(0);
2371                     fold1.appendCodePoint(length);
2372                 }
2373 
2374                 /* set next level pointers to case folding */
2375                 cs1=fold1;
2376                 s1=0;
2377                 limit1=fold1.length();
2378 
2379                 /* get ready to read from decomposition, continue with loop */
2380                 c1=-1;
2381                 continue;
2382             }
2383 
2384             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2385                 (length=csp.toFullFolding(cp2, fold2, options))>=0
2386             ) {
2387                 /* cp2 case-folds to the code point "length" or to p[length] */
2388                 if(UTF16.isSurrogate((char)c2)) {
2389                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2390                         /* advance beyond source surrogate pair if it case-folds */
2391                         ++s2;
2392                     } else /* isTrail(c2) */ {
2393                         /*
2394                          * we got a supplementary code point when hitting its trail surrogate,
2395                          * therefore the lead surrogate must have been the same as in the other string;
2396                          * compare this decomposition with the lead surrogate in the other string
2397                          * remember that this simulates bulk text replacement:
2398                          * the decomposition would replace the entire code point
2399                          */
                              /* back up string 1 by one unit and re-read the unit before the new s1 into c1 */
2400                         --s1;
2401                         c1=cs1.charAt(s1-1);
2402                     }
2403                 }
2404 
2405                 /* push current level pointers */
                      /* case folding only starts from level 0 (see the condition above), hence slot 0 */
2406                 if(stack2==null) {
2407                     stack2=createCmpEquivLevelStack();
2408                 }
2409                 stack2[0].cs=cs2;
2410                 stack2[0].s=s2;
2411                 ++level2;
2412 
2413                 /* copy the folding result to fold2[] */
2414                 /* Java: the buffer was probably not empty, remove the old contents */
                      /*
                       * NOTE(review): the delete() below presumably relies on toFullFolding() having
                       * appended the length-char folding string to fold2, so that dropping the older
                       * prefix leaves only the new result - confirm against UCaseProps.
                       * In the else branch, length is itself the folded code point.
                       */
2415                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2416                     fold2.delete(0, fold2.length()-length);
2417                 } else {
2418                     fold2.setLength(0);
2419                     fold2.appendCodePoint(length);
2420                 }
2421 
2422                 /* set next level pointers to case folding */
2423                 cs2=fold2;
2424                 s2=0;
2425                 limit2=fold2.length();
2426 
2427                 /* get ready to read from decomposition, continue with loop */
2428                 c2=-1;
2429                 continue;
2430             }
2431 
                  /* decomposition may start from level 0 or 1 and always ends up at level 2 */
2432             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2433                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2434             ) {
2435                 /* cp1 decomposes into p[length] */
2436                 if(UTF16.isSurrogate((char)c1)) {
2437                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2438                         /* advance beyond source surrogate pair if it decomposes */
2439                         ++s1;
2440                     } else /* isTrail(c1) */ {
2441                         /*
2442                          * we got a supplementary code point when hitting its trail surrogate,
2443                          * therefore the lead surrogate must have been the same as in the other string;
2444                          * compare this decomposition with the lead surrogate in the other string
2445                          * remember that this simulates bulk text replacement:
2446                          * the decomposition would replace the entire code point
2447                          */
                              /* back up string 2 by one unit and re-read the unit before the new s2 into c2 */
2448                         --s2;
2449                         c2=cs2.charAt(s2-1);
2450                     }
2451                 }
2452 
2453                 /* push current level pointers */
2454                 if(stack1==null) {
2455                     stack1=createCmpEquivLevelStack();
2456                 }
2457                 stack1[level1].cs=cs1;
2458                 stack1[level1].s=s1;
2459                 ++level1;
2460 
2461                 /* set empty intermediate level if skipped */
                      /* the null entry is what the pop loop at the top of the main loop steps over */
2462                 if(level1<2) {
2463                     stack1[level1++].cs=null;
2464                 }
2465 
2466                 /* set next level pointers to decomposition */
2467                 cs1=decomp1;
2468                 s1=0;
2469                 limit1=decomp1.length();
2470 
2471                 /* get ready to read from decomposition, continue with loop */
2472                 c1=-1;
2473                 continue;
2474             }
2475 
2476             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2477                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2478             ) {
2479                 /* cp2 decomposes into p[length] */
2480                 if(UTF16.isSurrogate((char)c2)) {
2481                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2482                         /* advance beyond source surrogate pair if it decomposes */
2483                         ++s2;
2484                     } else /* isTrail(c2) */ {
2485                         /*
2486                          * we got a supplementary code point when hitting its trail surrogate,
2487                          * therefore the lead surrogate must have been the same as in the other string;
2488                          * compare this decomposition with the lead surrogate in the other string
2489                          * remember that this simulates bulk text replacement:
2490                          * the decomposition would replace the entire code point
2491                          */
                              /* back up string 1 by one unit and re-read the unit before the new s1 into c1 */
2492                         --s1;
2493                         c1=cs1.charAt(s1-1);
2494                     }
2495                 }
2496 
2497                 /* push current level pointers */
2498                 if(stack2==null) {
2499                     stack2=createCmpEquivLevelStack();
2500                 }
2501                 stack2[level2].cs=cs2;
2502                 stack2[level2].s=s2;
2503                 ++level2;
2504 
2505                 /* set empty intermediate level if skipped */
                      /* the null entry is what the pop loop at the top of the main loop steps over */
2506                 if(level2<2) {
2507                     stack2[level2++].cs=null;
2508                 }
2509 
2510                 /* set next level pointers to decomposition */
2511                 cs2=decomp2;
2512                 s2=0;
2513                 limit2=decomp2.length();
2514 
2515                 /* get ready to read from decomposition, continue with loop */
2516                 c2=-1;
2517                 continue;
2518             }
2519 
2520             /*
2521              * no decomposition/case folding, max level for both sides:
2522              * return difference result
2523              *
2524              * code point order comparison must not just return cp1-cp2
2525              * because when single surrogates are present then the surrogate pairs
2526              * that formed cp1 and cp2 may be from different string indexes
2527              *
2528              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2529              * c1=d800 cp1=10001 c2=dc00 cp2=10000
2530              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2531              *
2532              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2533              * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
2534              * so we have slightly different pointer/start/limit comparisons here
2535              */
2536 
2537             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2538                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
                      /* the 0!=(s-1) tests below guard the charAt(s-2) look-back; s is at least 1 here */
2539                 if(
2540                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2541                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2542                 ) {
2543                     /* part of a surrogate pair, leave >=d800 */
2544                 } else {
2545                     /* BMP code point - may be surrogate code point - make <d800 */
2546                     c1-=0x2800;
2547                 }
2548 
2549                 if(
2550                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2551                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2552                 ) {
2553                     /* part of a surrogate pair, leave >=d800 */
2554                 } else {
2555                     /* BMP code point - may be surrogate code point - make <d800 */
2556                     c2-=0x2800;
2557                 }
2558             }
2559 
                  /* both values are non-negative and below 0x10000 here, so the subtraction cannot overflow */
2560             return c1-c2;
2561         }
2562     }
2563 
2564     /**
2565      * An Appendable that writes into a char array with a capacity that may be
2566      * less than array.length.
2567      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2568      * <p>
2569      * An overflow is only reported at the end, for the old Normalizer API functions that write
2570      * to char arrays.
2571      */
2572     private static final class CharsAppendable implements Appendable {
CharsAppendable(char[] dest, int destStart, int destLimit)2573         public CharsAppendable(char[] dest, int destStart, int destLimit) {
2574             chars=dest;
2575             start=offset=destStart;
2576             limit=destLimit;
2577         }
length()2578         public int length() {
2579             int len=offset-start;
2580             if(offset<=limit) {
2581                 return len;
2582             } else {
2583                 throw new IndexOutOfBoundsException(Integer.toString(len));
2584             }
2585         }
2586         @Override
append(char c)2587         public Appendable append(char c) {
2588             if(offset<limit) {
2589                 chars[offset]=c;
2590             }
2591             ++offset;
2592             return this;
2593         }
2594         @Override
append(CharSequence s)2595         public Appendable append(CharSequence s) {
2596             return append(s, 0, s.length());
2597         }
2598         @Override
append(CharSequence s, int sStart, int sLimit)2599         public Appendable append(CharSequence s, int sStart, int sLimit) {
2600             int len=sLimit-sStart;
2601             if(len<=(limit-offset)) {
2602                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
2603                     chars[offset++]=s.charAt(sStart++);
2604                 }
2605             } else {
2606                 offset+=len;
2607             }
2608             return this;
2609         }
2610 
2611         private final char[] chars;
2612         private final int start, limit;
2613         private int offset;
2614     }
2615 }
2616