• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 *   Copyright (C) 2009-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */
10 
11 package android.icu.text;
12 
13 import java.io.IOException;
14 import java.io.InputStream;
15 import java.nio.ByteBuffer;
16 
17 import android.icu.impl.ICUBinary;
18 import android.icu.impl.Norm2AllModes;
19 import android.icu.util.ICUUncheckedIOException;
20 
21 /**
22  * Unicode normalization functionality for standard Unicode normalization or
23  * for using custom mapping tables.
24  * All instances of this class are unmodifiable/immutable.
25  * The Normalizer2 class is not intended for public subclassing.
26  * <p>
27  * The primary functions are to produce a normalized string and to detect whether
28  * a string is already normalized.
29  * The most commonly used normalization forms are those defined in
30  * https://www.unicode.org/reports/tr15/
31  * However, this API supports additional normalization forms for specialized purposes.
32  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
33  * and can be used in implementations of UTS #46.
34  * <p>
35  * Not only are the standard compose and decompose modes supplied,
36  * but additional modes are provided as documented in the Mode enum.
37  * <p>
38  * Some of the functions in this class identify normalization boundaries.
39  * At a normalization boundary, the portions of the string
40  * before it and starting from it do not interact and can be handled independently.
41  * <p>
42  * The spanQuickCheckYes() stops at a normalization boundary.
43  * When the goal is a normalized string, then the text before the boundary
44  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
45  * <p>
46  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
47  * a character is guaranteed to be at a normalization boundary,
48  * regardless of context.
49  * This is used for moving from one normalization boundary to the next
50  * or preceding boundary, and for performing iterative normalization.
51  * <p>
52  * Iterative normalization is useful when only a small portion of a
53  * longer string needs to be processed.
54  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
55  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
56  * (to process only the substring for which sort key bytes are computed).
57  * <p>
58  * The set of normalization boundaries returned by these functions may not be
59  * complete: There may be more boundaries that could be returned.
60  * Different functions may return different boundaries.
61  * @author Markus W. Scherer
62  */
63 public abstract class Normalizer2 {
64     /**
65      * Constants for normalization modes.
66      * For details about standard Unicode normalization forms
67      * and about the algorithms which are also used with custom mapping tables
68      * see https://www.unicode.org/reports/tr15/
69      */
70     public enum Mode {
71         /**
72          * Decomposition followed by composition.
73          * Same as standard NFC when using an "nfc" instance.
74          * Same as standard NFKC when using an "nfkc" instance.
75          * For details about standard Unicode normalization forms
76          * see https://www.unicode.org/reports/tr15/
77          */
78         COMPOSE,
79         /**
80          * Map, and reorder canonically.
81          * Same as standard NFD when using an "nfc" instance.
82          * Same as standard NFKD when using an "nfkc" instance.
83          * For details about standard Unicode normalization forms
84          * see https://www.unicode.org/reports/tr15/
85          */
86         DECOMPOSE,
87         /**
88          * "Fast C or D" form.
89          * If a string is in this form, then further decomposition <i>without reordering</i>
90          * would yield the same form as DECOMPOSE.
91          * Text in "Fast C or D" form can be processed efficiently with data tables
92          * that are "canonically closed", that is, that provide equivalent data for
93          * equivalent text, without having to be fully normalized.<br>
94          * Not a standard Unicode normalization form.<br>
95          * Not a unique form: Different FCD strings can be canonically equivalent.<br>
96          * For details see http://www.unicode.org/notes/tn5/#FCD
97          */
98         FCD,
99         /**
100          * Compose only contiguously.
101          * Also known as "FCC" or "Fast C Contiguous".
102          * The result will often but not always be in NFC.
103          * The result will conform to FCD which is useful for processing.<br>
104          * Not a standard Unicode normalization form.<br>
105          * For details see http://www.unicode.org/notes/tn5/#FCC
106          */
107         COMPOSE_CONTIGUOUS
108     };
109 
110     /**
111      * Returns a Normalizer2 instance for Unicode NFC normalization.
112      * Same as getInstance(null, "nfc", Mode.COMPOSE).
113      * Returns an unmodifiable singleton instance.
114      * @return the requested Normalizer2, if successful
115      */
getNFCInstance()116     public static Normalizer2 getNFCInstance() {
117         return Norm2AllModes.getNFCInstance().comp;
118     }
119 
120     /**
121      * Returns a Normalizer2 instance for Unicode NFD normalization.
122      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
123      * Returns an unmodifiable singleton instance.
124      * @return the requested Normalizer2, if successful
125      */
getNFDInstance()126     public static Normalizer2 getNFDInstance() {
127         return Norm2AllModes.getNFCInstance().decomp;
128     }
129 
130     /**
131      * Returns a Normalizer2 instance for Unicode NFKC normalization.
132      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
133      * Returns an unmodifiable singleton instance.
134      * @return the requested Normalizer2, if successful
135      */
getNFKCInstance()136     public static Normalizer2 getNFKCInstance() {
137         return Norm2AllModes.getNFKCInstance().comp;
138     }
139 
140     /**
141      * Returns a Normalizer2 instance for Unicode NFKD normalization.
142      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
143      * Returns an unmodifiable singleton instance.
144      * @return the requested Normalizer2, if successful
145      */
getNFKDInstance()146     public static Normalizer2 getNFKDInstance() {
147         return Norm2AllModes.getNFKCInstance().decomp;
148     }
149 
150     /**
151      * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
152      * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
153      * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
154      *
155      * <p>Same as getInstance(null, "nfkc_cf", Mode.COMPOSE).
156      * Returns an unmodifiable singleton instance.
157      * @return the requested Normalizer2, if successful
158      */
getNFKCCasefoldInstance()159     public static Normalizer2 getNFKCCasefoldInstance() {
160         return Norm2AllModes.getNFKC_CFInstance().comp;
161     }
162 
163     /**
164      * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
165      * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
166      * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
167      *
168      * <p>Same as getInstance(null, "nfkc_scf", Mode.COMPOSE).
169      * Returns an unmodifiable singleton instance.
170      * @return the requested Normalizer2, if successful
171      */
172     @android.annotation.FlaggedApi(com.android.icu.Flags.FLAG_ICU_25Q2_API)
getNFKCSimpleCasefoldInstance()173     public static Normalizer2 getNFKCSimpleCasefoldInstance() {
174         return Norm2AllModes.getNFKC_SCFInstance().comp;
175     }
176 
177     /**
178      * Returns a Normalizer2 instance which uses the specified data file
179      * (an ICU data file if data=null, or else custom binary data)
180      * and which composes or decomposes text according to the specified mode.
181      * Returns an unmodifiable singleton instance.
182      * <ul>
183      * <li>Use data=null for data files that are part of ICU's own data.
184      * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD.
185      * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD.
186      * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
187      * </ul>
188      * If data!=null, then the binary data is read once and cached using the provided
189      * name as the key.
190      * If you know or expect the data to be cached already, you can use data!=null
191      * for non-ICU data as well.
192      * <p>Any {@link java.io.IOException} is wrapped into a {@link android.icu.util.ICUUncheckedIOException}.
193      * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data
194      * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
195      * @param mode normalization mode (compose or decompose etc.)
196      * @return the requested Normalizer2, if successful
197      * @deprecated Don't use because the binary {@code data} format is not stable across API levels.
198      */
199     @Deprecated
getInstance(InputStream data, String name, Mode mode)200     public static Normalizer2 getInstance(InputStream data, String name, Mode mode) {
201         // TODO: If callers really use this API, then we should add an overload that takes a ByteBuffer.
202         ByteBuffer bytes = null;
203         if (data != null) {
204             try {
205                 bytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(data);
206             } catch (IOException e) {
207                 throw new ICUUncheckedIOException(e);
208             }
209         }
210         Norm2AllModes all2Modes=Norm2AllModes.getInstance(bytes, name);
211         switch(mode) {
212         case COMPOSE: return all2Modes.comp;
213         case DECOMPOSE: return all2Modes.decomp;
214         case FCD: return all2Modes.fcd;
215         case COMPOSE_CONTIGUOUS: return all2Modes.fcc;
216         default: return null;  // will not occur
217         }
218     }
219 
220     /**
221      * Returns the normalized form of the source string.
222      * @param src source string
223      * @return normalized src
224      */
normalize(CharSequence src)225     public String normalize(CharSequence src) {
226         if(src instanceof String) {
227             // Fastpath: Do not construct a new String if the src is a String
228             // and is already normalized.
229             int spanLength=spanQuickCheckYes(src);
230             if(spanLength==src.length()) {
231                 return (String)src;
232             }
233             if (spanLength != 0) {
234                 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
235                 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
236             }
237         }
238         return normalize(src, new StringBuilder(src.length())).toString();
239     }
240 
241     /**
242      * Writes the normalized form of the source string to the destination string
243      * (replacing its contents) and returns the destination string.
244      * The source and destination strings must be different objects.
245      * @param src source string
246      * @param dest destination string; its contents is replaced with normalized src
247      * @return dest
248      */
normalize(CharSequence src, StringBuilder dest)249     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
250 
251     /**
252      * Writes the normalized form of the source string to the destination Appendable
253      * and returns the destination Appendable.
254      * The source and destination strings must be different objects.
255      *
256      * <p>Any {@link java.io.IOException} is wrapped into a {@link android.icu.util.ICUUncheckedIOException}.
257      *
258      * @param src source string
259      * @param dest destination Appendable; gets normalized src appended
260      * @return dest
261      */
normalize(CharSequence src, Appendable dest)262     public abstract Appendable normalize(CharSequence src, Appendable dest);
263 
264     /**
265      * Appends the normalized form of the second string to the first string
266      * (merging them at the boundary) and returns the first string.
267      * The result is normalized if the first string was normalized.
268      * The first and second strings must be different objects.
269      * @param first string, should be normalized
270      * @param second string, will be normalized
271      * @return first
272      */
normalizeSecondAndAppend( StringBuilder first, CharSequence second)273     public abstract StringBuilder normalizeSecondAndAppend(
274             StringBuilder first, CharSequence second);
275 
276     /**
277      * Appends the second string to the first string
278      * (merging them at the boundary) and returns the first string.
279      * The result is normalized if both the strings were normalized.
280      * The first and second strings must be different objects.
281      * @param first string, should be normalized
282      * @param second string, should be normalized
283      * @return first
284      */
append(StringBuilder first, CharSequence second)285     public abstract StringBuilder append(StringBuilder first, CharSequence second);
286 
287     /**
288      * Gets the decomposition mapping of c.
289      * Roughly equivalent to normalizing the String form of c
290      * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
291      * returns null if c does not have a decomposition mapping in this instance's data.
292      * This function is independent of the mode of the Normalizer2.
293      * @param c code point
294      * @return c's decomposition mapping, if any; otherwise null
295      */
getDecomposition(int c)296     public abstract String getDecomposition(int c);
297 
298     /**
299      * Gets the raw decomposition mapping of c.
300      *
301      * <p>This is similar to the getDecomposition() method but returns the
302      * raw decomposition mapping as specified in UnicodeData.txt or
303      * (for custom data) in the mapping files processed by the gennorm2 tool.
304      * By contrast, getDecomposition() returns the processed,
305      * recursively-decomposed version of this mapping.
306      *
307      * <p>When used on a standard NFKC Normalizer2 instance,
308      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
309      *
310      * <p>When used on a standard NFC Normalizer2 instance,
311      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
312      * in this case, the result contains either one or two code points (=1..4 Java chars).
313      *
314      * <p>This function is independent of the mode of the Normalizer2.
315      * The default implementation returns null.
316      * @param c code point
317      * @return c's raw decomposition mapping, if any; otherwise null
318      */
getRawDecomposition(int c)319     public String getRawDecomposition(int c) { return null; }
320 
321     /**
322      * Performs pairwise composition of a &amp; b and returns the composite if there is one.
323      *
324      * <p>Returns a composite code point c only if c has a two-way mapping to a+b.
325      * In standard Unicode normalization, this means that
326      * c has a canonical decomposition to a+b
327      * and c does not have the Full_Composition_Exclusion property.
328      *
329      * <p>This function is independent of the mode of the Normalizer2.
330      * The default implementation returns a negative value.
331      * @param a A (normalization starter) code point.
332      * @param b Another code point.
333      * @return The non-negative composite code point if there is one; otherwise a negative value.
334      */
composePair(int a, int b)335     public int composePair(int a, int b) { return -1; }
336 
337     /**
338      * Gets the combining class of c.
339      * The default implementation returns 0
340      * but all standard implementations return the Unicode Canonical_Combining_Class value.
341      * @param c code point
342      * @return c's combining class
343      */
getCombiningClass(int c)344     public int getCombiningClass(int c) { return 0; }
345 
346     /**
347      * Tests if the string is normalized.
348      * Internally, in cases where the quickCheck() method would return "maybe"
349      * (which is only possible for the two COMPOSE modes) this method
350      * resolves to "yes" or "no" to provide a definitive result,
351      * at the cost of doing more work in those cases.
352      * @param s input string
353      * @return true if s is normalized
354      */
isNormalized(CharSequence s)355     public abstract boolean isNormalized(CharSequence s);
356 
357     /**
358      * Tests if the string is normalized.
359      * For the two COMPOSE modes, the result could be "maybe" in cases that
360      * would take a little more work to resolve definitively.
361      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
362      * combination of quick check + normalization, to avoid
363      * re-checking the "yes" prefix.
364      * @param s input string
365      * @return the quick check result
366      */
quickCheck(CharSequence s)367     public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s);
368 
369     /**
370      * Returns the end of the normalized substring of the input string.
371      * In other words, with <code>end=spanQuickCheckYes(s);</code>
372      * the substring <code>s.subSequence(0, end)</code>
373      * will pass the quick check with a "yes" result.
374      * <p>
375      * The returned end index is usually one or more characters before the
376      * "no" or "maybe" character: The end index is at a normalization boundary.
377      * (See the class documentation for more about normalization boundaries.)
378      * <p>
379      * When the goal is a normalized string and most input strings are expected
380      * to be normalized already, then call this method,
381      * and if it returns a prefix shorter than the input string,
382      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
383      * @param s input string
384      * @return "yes" span end index
385      */
spanQuickCheckYes(CharSequence s)386     public abstract int spanQuickCheckYes(CharSequence s);
387 
388     /**
389      * Tests if the character always has a normalization boundary before it,
390      * regardless of context.
391      * If true, then the character does not normalization-interact with
392      * preceding characters.
393      * In other words, a string containing this character can be normalized
394      * by processing portions before this character and starting from this
395      * character independently.
396      * This is used for iterative normalization. See the class documentation for details.
397      * @param c character to test
398      * @return true if c has a normalization boundary before it
399      */
hasBoundaryBefore(int c)400     public abstract boolean hasBoundaryBefore(int c);
401 
402     /**
403      * Tests if the character always has a normalization boundary after it,
404      * regardless of context.
405      * If true, then the character does not normalization-interact with
406      * following characters.
407      * In other words, a string containing this character can be normalized
408      * by processing portions up to this character and after this
409      * character independently.
410      * This is used for iterative normalization. See the class documentation for details.
411      * <p>
412      * Note that this operation may be significantly slower than hasBoundaryBefore().
413      * @param c character to test
414      * @return true if c has a normalization boundary after it
415      */
hasBoundaryAfter(int c)416     public abstract boolean hasBoundaryAfter(int c);
417 
418     /**
419      * Tests if the character is normalization-inert.
420      * If true, then the character does not change, nor normalization-interact with
421      * preceding or following characters.
422      * In other words, a string containing this character can be normalized
423      * by processing portions before this character and after this
424      * character independently.
425      * This is used for iterative normalization. See the class documentation for details.
426      * <p>
427      * Note that this operation may be significantly slower than hasBoundaryBefore().
428      * @param c character to test
429      * @return true if c is normalization-inert
430      */
isInert(int c)431     public abstract boolean isInert(int c);
432 
433     /**
434      * Sole constructor.  (For invocation by subclass constructors,
435      * typically implicit.)
436      * @deprecated This API is ICU internal only.
437      * @hide original deprecated declaration
438      * @hide draft / provisional / internal are hidden on Android
439      */
440     @Deprecated
Normalizer2()441     protected Normalizer2() {
442     }
443 }
444