• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  *   Copyright (C) 2009-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.nio.ByteBuffer;
15 
16 import com.ibm.icu.impl.ICUBinary;
17 import com.ibm.icu.impl.Norm2AllModes;
18 import com.ibm.icu.util.ICUUncheckedIOException;
19 
20 /**
21  * Unicode normalization functionality for standard Unicode normalization or
22  * for using custom mapping tables.
23  * All instances of this class are unmodifiable/immutable.
24  * The Normalizer2 class is not intended for public subclassing.
25  * <p>
26  * The primary functions are to produce a normalized string and to detect whether
27  * a string is already normalized.
28  * The most commonly used normalization forms are those defined in
29  * https://www.unicode.org/reports/tr15/
30  * However, this API supports additional normalization forms for specialized purposes.
31  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
32  * and can be used in implementations of UTS #46.
33  * <p>
34  * Not only are the standard compose and decompose modes supplied,
35  * but additional modes are provided as documented in the Mode enum.
36  * <p>
37  * Some of the functions in this class identify normalization boundaries.
38  * At a normalization boundary, the portions of the string
39  * before it and starting from it do not interact and can be handled independently.
40  * <p>
41  * The spanQuickCheckYes() stops at a normalization boundary.
42  * When the goal is a normalized string, then the text before the boundary
43  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
44  * <p>
45  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
46  * a character is guaranteed to be at a normalization boundary,
47  * regardless of context.
48  * This is used for moving from one normalization boundary to the next
49  * or preceding boundary, and for performing iterative normalization.
50  * <p>
51  * Iterative normalization is useful when only a small portion of a
52  * longer string needs to be processed.
53  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
54  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
55  * (to process only the substring for which sort key bytes are computed).
56  * <p>
57  * The set of normalization boundaries returned by these functions may not be
58  * complete: There may be more boundaries that could be returned.
59  * Different functions may return different boundaries.
60  * @stable ICU 4.4
61  * @author Markus W. Scherer
62  */
63 public abstract class Normalizer2 {
/**
 * The normalization modes that a Normalizer2 instance can implement.
 * The standard Unicode normalization forms, and the algorithms which are
 * also applied to custom mapping tables, are described in
 * https://www.unicode.org/reports/tr15/
 * @stable ICU 4.4
 */
public enum Mode {
    /**
     * Decompose first, then compose.
     * With an "nfc" instance this is standard NFC.
     * With an "nfkc" instance this is standard NFKC.
     * The standard Unicode normalization forms are described in
     * https://www.unicode.org/reports/tr15/
     * @stable ICU 4.4
     */
    COMPOSE,

    /**
     * Apply the mappings and reorder canonically.
     * With an "nfc" instance this is standard NFD.
     * With an "nfkc" instance this is standard NFKD.
     * The standard Unicode normalization forms are described in
     * https://www.unicode.org/reports/tr15/
     * @stable ICU 4.4
     */
    DECOMPOSE,

    /**
     * The "Fast C or D" form.
     * A string in this form yields the same result as DECOMPOSE when it is
     * decomposed further <i>without reordering</i>.
     * Such text can be processed efficiently with data tables that are
     * "canonically closed", that is, tables that provide equivalent data for
     * equivalent text, without the text having to be fully normalized.<br>
     * This is not a standard Unicode normalization form.<br>
     * This is not a unique form: Different FCD strings can be canonically equivalent.<br>
     * For details see http://www.unicode.org/notes/tn5/#FCD
     * @stable ICU 4.4
     */
    FCD,

    /**
     * Contiguous-only composition,
     * also known as "FCC" or "Fast C Contiguous".
     * The result is often, but not always, in NFC.
     * The result conforms to FCD, which is useful for processing.<br>
     * This is not a standard Unicode normalization form.<br>
     * For details see http://www.unicode.org/notes/tn5/#FCC
     * @stable ICU 4.4
     */
    COMPOSE_CONTIGUOUS
}
114 
115     /**
116      * Returns a Normalizer2 instance for Unicode NFC normalization.
117      * Same as getInstance(null, "nfc", Mode.COMPOSE).
118      * Returns an unmodifiable singleton instance.
119      * @return the requested Normalizer2, if successful
120      * @stable ICU 49
121      */
getNFCInstance()122     public static Normalizer2 getNFCInstance() {
123         return Norm2AllModes.getNFCInstance().comp;
124     }
125 
126     /**
127      * Returns a Normalizer2 instance for Unicode NFD normalization.
128      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
129      * Returns an unmodifiable singleton instance.
130      * @return the requested Normalizer2, if successful
131      * @stable ICU 49
132      */
getNFDInstance()133     public static Normalizer2 getNFDInstance() {
134         return Norm2AllModes.getNFCInstance().decomp;
135     }
136 
137     /**
138      * Returns a Normalizer2 instance for Unicode NFKC normalization.
139      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
140      * Returns an unmodifiable singleton instance.
141      * @return the requested Normalizer2, if successful
142      * @stable ICU 49
143      */
getNFKCInstance()144     public static Normalizer2 getNFKCInstance() {
145         return Norm2AllModes.getNFKCInstance().comp;
146     }
147 
148     /**
149      * Returns a Normalizer2 instance for Unicode NFKD normalization.
150      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
151      * Returns an unmodifiable singleton instance.
152      * @return the requested Normalizer2, if successful
153      * @stable ICU 49
154      */
getNFKDInstance()155     public static Normalizer2 getNFKDInstance() {
156         return Norm2AllModes.getNFKCInstance().decomp;
157     }
158 
159     /**
160      * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
161      * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
162      * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
163      *
164      * <p>Same as getInstance(null, "nfkc_cf", Mode.COMPOSE).
165      * Returns an unmodifiable singleton instance.
166      * @return the requested Normalizer2, if successful
167      * @stable ICU 49
168      */
getNFKCCasefoldInstance()169     public static Normalizer2 getNFKCCasefoldInstance() {
170         return Norm2AllModes.getNFKC_CFInstance().comp;
171     }
172 
173     /**
174      * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
175      * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
176      * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
177      *
178      * <p>Same as getInstance(null, "nfkc_scf", Mode.COMPOSE).
179      * Returns an unmodifiable singleton instance.
180      * @return the requested Normalizer2, if successful
181      * @draft ICU 74
182      */
getNFKCSimpleCasefoldInstance()183     public static Normalizer2 getNFKCSimpleCasefoldInstance() {
184         return Norm2AllModes.getNFKC_SCFInstance().comp;
185     }
186 
187     /**
188      * Returns a Normalizer2 instance which uses the specified data file
189      * (an ICU data file if data=null, or else custom binary data)
190      * and which composes or decomposes text according to the specified mode.
191      * Returns an unmodifiable singleton instance.
192      * <ul>
193      * <li>Use data=null for data files that are part of ICU's own data.
194      * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD.
195      * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD.
196      * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
197      * </ul>
198      * If data!=null, then the binary data is read once and cached using the provided
199      * name as the key.
200      * If you know or expect the data to be cached already, you can use data!=null
201      * for non-ICU data as well.
202      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
203      * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data
204      * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
205      * @param mode normalization mode (compose or decompose etc.)
206      * @return the requested Normalizer2, if successful
207      * @stable ICU 4.4
208      */
getInstance(InputStream data, String name, Mode mode)209     public static Normalizer2 getInstance(InputStream data, String name, Mode mode) {
210         // TODO: If callers really use this API, then we should add an overload that takes a ByteBuffer.
211         ByteBuffer bytes = null;
212         if (data != null) {
213             try {
214                 bytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(data);
215             } catch (IOException e) {
216                 throw new ICUUncheckedIOException(e);
217             }
218         }
219         Norm2AllModes all2Modes=Norm2AllModes.getInstance(bytes, name);
220         switch(mode) {
221         case COMPOSE: return all2Modes.comp;
222         case DECOMPOSE: return all2Modes.decomp;
223         case FCD: return all2Modes.fcd;
224         case COMPOSE_CONTIGUOUS: return all2Modes.fcc;
225         default: return null;  // will not occur
226         }
227     }
228 
229     /**
230      * Returns the normalized form of the source string.
231      * @param src source string
232      * @return normalized src
233      * @stable ICU 4.4
234      */
normalize(CharSequence src)235     public String normalize(CharSequence src) {
236         if(src instanceof String) {
237             // Fastpath: Do not construct a new String if the src is a String
238             // and is already normalized.
239             int spanLength=spanQuickCheckYes(src);
240             if(spanLength==src.length()) {
241                 return (String)src;
242             }
243             if (spanLength != 0) {
244                 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
245                 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
246             }
247         }
248         return normalize(src, new StringBuilder(src.length())).toString();
249     }
250 
/**
 * Writes the normalized form of the source string to the destination string
 * (replacing its contents) and returns the destination string.
 * The source and destination strings must be different objects.
 * @param src source string
 * @param dest destination string; its contents are replaced with normalized src
 * @return dest
 * @stable ICU 4.4
 */
public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
261 
/**
 * Writes the normalized form of the source string to the destination Appendable
 * and returns the destination Appendable.
 * The normalized text is appended to {@code dest}.
 * The source and destination strings must be different objects.
 *
 * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
 *
 * @param src source string
 * @param dest destination Appendable; gets normalized src appended
 * @return dest
 * @stable ICU 4.6
 */
public abstract Appendable normalize(CharSequence src, Appendable dest);
275 
/**
 * Appends the normalized form of the second string to the first string
 * (merging them at the boundary) and returns the first string.
 * The result is normalized if the first string was normalized.
 * The first and second strings must be different objects.
 * @param first string, should be normalized; receives the appended text
 * @param second string, will be normalized
 * @return first
 * @stable ICU 4.4
 */
public abstract StringBuilder normalizeSecondAndAppend(
        StringBuilder first, CharSequence second);
288 
/**
 * Appends the second string to the first string
 * (merging them at the boundary) and returns the first string.
 * The result is normalized if both strings were normalized.
 * The first and second strings must be different objects.
 * @param first string, should be normalized; receives the appended text
 * @param second string, should be normalized
 * @return first
 * @stable ICU 4.4
 */
public abstract StringBuilder append(StringBuilder first, CharSequence second);
300 
/**
 * Gets the decomposition mapping of c.
 * Roughly equivalent to normalizing the String form of c
 * on a DECOMPOSE Normalizer2 instance, but much faster, and with the difference
 * that this function returns null if c does not have a decomposition mapping
 * in this instance's data.
 * This function is independent of the mode of the Normalizer2.
 * @param c code point
 * @return c's decomposition mapping, if any; otherwise null
 * @stable ICU 4.6
 */
public abstract String getDecomposition(int c);
312 
313     /**
314      * Gets the raw decomposition mapping of c.
315      *
316      * <p>This is similar to the getDecomposition() method but returns the
317      * raw decomposition mapping as specified in UnicodeData.txt or
318      * (for custom data) in the mapping files processed by the gennorm2 tool.
319      * By contrast, getDecomposition() returns the processed,
320      * recursively-decomposed version of this mapping.
321      *
322      * <p>When used on a standard NFKC Normalizer2 instance,
323      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
324      *
325      * <p>When used on a standard NFC Normalizer2 instance,
326      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
327      * in this case, the result contains either one or two code points (=1..4 Java chars).
328      *
329      * <p>This function is independent of the mode of the Normalizer2.
330      * The default implementation returns null.
331      * @param c code point
332      * @return c's raw decomposition mapping, if any; otherwise null
333      * @stable ICU 49
334      */
getRawDecomposition(int c)335     public String getRawDecomposition(int c) { return null; }
336 
337     /**
338      * Performs pairwise composition of a &amp; b and returns the composite if there is one.
339      *
340      * <p>Returns a composite code point c only if c has a two-way mapping to a+b.
341      * In standard Unicode normalization, this means that
342      * c has a canonical decomposition to a+b
343      * and c does not have the Full_Composition_Exclusion property.
344      *
345      * <p>This function is independent of the mode of the Normalizer2.
346      * The default implementation returns a negative value.
347      * @param a A (normalization starter) code point.
348      * @param b Another code point.
349      * @return The non-negative composite code point if there is one; otherwise a negative value.
350      * @stable ICU 49
351      */
composePair(int a, int b)352     public int composePair(int a, int b) { return -1; }
353 
354     /**
355      * Gets the combining class of c.
356      * The default implementation returns 0
357      * but all standard implementations return the Unicode Canonical_Combining_Class value.
358      * @param c code point
359      * @return c's combining class
360      * @stable ICU 49
361      */
getCombiningClass(int c)362     public int getCombiningClass(int c) { return 0; }
363 
/**
 * Tests if the string is normalized.
 * Internally, in cases where the quickCheck() method would return "maybe"
 * (which is only possible for the two COMPOSE modes), this method
 * resolves to "yes" or "no" to provide a definitive result,
 * at the cost of doing more work in those cases.
 * @param s input string
 * @return true if s is normalized
 * @stable ICU 4.4
 */
public abstract boolean isNormalized(CharSequence s);
375 
/**
 * Tests if the string is normalized, without always resolving uncertain cases.
 * For the two COMPOSE modes, the result could be "maybe" in cases that
 * would take a little more work to resolve definitively.
 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
 * combination of quick check + normalization, to avoid
 * re-checking the "yes" prefix.
 * @param s input string
 * @return the quick check result
 * @stable ICU 4.4
 */
public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s);
388 
/**
 * Returns the end of the normalized substring of the input string.
 * In other words, with <code>end=spanQuickCheckYes(s);</code>
 * the substring <code>s.subSequence(0, end)</code>
 * will pass the quick check with a "yes" result.
 * <p>
 * The returned end index is usually one or more characters before the
 * "no" or "maybe" character: The end index is at a normalization boundary.
 * (See the class documentation for more about normalization boundaries.)
 * <p>
 * When the goal is a normalized string and most input strings are expected
 * to be normalized already, then call this method,
 * and if it returns a prefix shorter than the input string,
 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
 * @param s input string
 * @return "yes" span end index, as an index into s
 * @stable ICU 4.4
 */
public abstract int spanQuickCheckYes(CharSequence s);
408 
/**
 * Tests if the character always has a normalization boundary before it,
 * regardless of context.
 * If true, then the character does not normalization-interact with
 * preceding characters.
 * In other words, a string containing this character can be normalized
 * by independently processing the portion before this character and the
 * portion starting from this character.
 * This is used for iterative normalization. See the class documentation for details.
 * @param c character (code point) to test
 * @return true if c has a normalization boundary before it
 * @stable ICU 4.4
 */
public abstract boolean hasBoundaryBefore(int c);
423 
/**
 * Tests if the character always has a normalization boundary after it,
 * regardless of context.
 * If true, then the character does not normalization-interact with
 * following characters.
 * In other words, a string containing this character can be normalized
 * by independently processing the portion up to this character and the
 * portion after this character.
 * This is used for iterative normalization. See the class documentation for details.
 * <p>
 * Note that this operation may be significantly slower than hasBoundaryBefore().
 * @param c character (code point) to test
 * @return true if c has a normalization boundary after it
 * @stable ICU 4.4
 */
public abstract boolean hasBoundaryAfter(int c);
440 
/**
 * Tests if the character is normalization-inert.
 * If true, then the character does not change, nor does it normalization-interact
 * with preceding or following characters.
 * In other words, a string containing this character can be normalized
 * by independently processing the portion before this character and the
 * portion after this character.
 * This is used for iterative normalization. See the class documentation for details.
 * <p>
 * Note that this operation may be significantly slower than hasBoundaryBefore().
 * @param c character (code point) to test
 * @return true if c is normalization-inert
 * @stable ICU 4.4
 */
public abstract boolean isInert(int c);
456 
457     /**
458      * Sole constructor.  (For invocation by subclass constructors,
459      * typically implicit.)
460      * @internal
461      * @deprecated This API is ICU internal only.
462      */
463     @Deprecated
Normalizer2()464     protected Normalizer2() {
465     }
466 }
467